From 89cf6574bc4bcdb99894e9401ecea0063d8212ce Mon Sep 17 00:00:00 2001 From: Zheyu Ma Date: Tue, 2 Jul 2024 23:18:35 +0200 Subject: [PATCH 01/61] hw/virtio/virtio-crypto: Fix op_code assignment in virtio_crypto_create_asym_session Currently, if the function fails during the key_len check, the op_code does not have a proper value, causing virtio_crypto_free_create_session_req not to free the memory correctly, leading to a memory leak. By setting the op_code before performing any checks, we ensure that virtio_crypto_free_create_session_req has the correct context to perform cleanup operations properly, thus preventing memory leaks. ASAN log: ==3055068==ERROR: LeakSanitizer: detected memory leaks Direct leak of 512 byte(s) in 1 object(s) allocated from: #0 0x5586a75e6ddd in malloc llvm/compiler-rt/lib/asan/asan_malloc_linux.cpp:129:3 #1 0x7fb6b63b6738 in g_malloc (/lib/x86_64-linux-gnu/libglib-2.0.so.0+0x5e738) #2 0x5586a864bbde in virtio_crypto_handle_ctrl hw/virtio/virtio-crypto.c:407:19 #3 0x5586a94fc84c in virtio_queue_notify_vq hw/virtio/virtio.c:2277:9 #4 0x5586a94fc0a2 in virtio_queue_host_notifier_read hw/virtio/virtio.c:3641:9 Signed-off-by: Zheyu Ma Message-Id: <20240702211835.3064505-1-zheyuma97@gmail.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio-crypto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/virtio/virtio-crypto.c b/hw/virtio/virtio-crypto.c index bbe8aa4b99..5034768bff 100644 --- a/hw/virtio/virtio-crypto.c +++ b/hw/virtio/virtio-crypto.c @@ -205,6 +205,7 @@ virtio_crypto_create_asym_session(VirtIOCrypto *vcrypto, int queue_index; uint32_t algo, keytype, keylen; + sreq->info.op_code = opcode; algo = ldl_le_p(&sess_req->para.algo); keytype = ldl_le_p(&sess_req->para.keytype); keylen = ldl_le_p(&sess_req->para.keylen); @@ -224,7 +225,6 @@ virtio_crypto_create_asym_session(VirtIOCrypto *vcrypto, iov_discard_front(&iov, &out_num, keylen); } - sreq->info.op_code = opcode; asym_info = &sreq->info.u.asym_sess_info; asym_info->algo = algo; asym_info->keytype = keytype; From 7967b7e0b17cb5e700ca040e0b68ba8fb0035c0d Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 4 Jul 2024 10:13:36 +0200 Subject: [PATCH 02/61] MAINTAINERS: add Stefano Garzarella as vhost/vhost-user reviewer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I have recently been working on supporting vhost-user on any POSIX, so I want to help maintain it. Cc: Michael S. Tsirkin Signed-off-by: Stefano Garzarella Message-Id: <20240704081336.21208-1-sgarzare@redhat.com> Reviewed-by: Alex Bennée Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 7d9811458c..b7e9ced3e8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2209,6 +2209,7 @@ F: docs/devel/vfio-iommufd.rst vhost M: Michael S. Tsirkin +R: Stefano Garzarella S: Supported F: hw/*/*vhost* F: docs/interop/vhost-user.json From b7dbfe4f47884e67a309c206b3eadec339a0d3b6 Mon Sep 17 00:00:00 2001 From: Fan Ni Date: Fri, 5 Jul 2024 12:39:55 +0100 Subject: [PATCH 03/61] hw/cxl/cxl-mailbox-utils: remove unneeded mailbox output payload space zeroing The whole mailbox output payload space is already zeroed after copying out the input payload, which happens before processing the specific mailbox command: https://elixir.bootlin.com/qemu/v8.2.1/source/hw/cxl/cxl-device-utils.c#L204 Signed-off-by: Fan Ni Link: https://lore.kernel.org/r/20240221221824.1092966-1-nifan.cxl@gmail.com Signed-off-by: Jonathan Cameron Message-Id: <20240705113956.941732-3-Jonathan.Cameron@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/cxl/cxl-mailbox-utils.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c index 74eeb6fde7..facec42dc8 100644 --- a/hw/cxl/cxl-mailbox-utils.c +++ b/hw/cxl/cxl-mailbox-utils.c @@ -235,7 +235,6 @@ static CXLRetCode cmd_events_get_records(const struct cxl_cmd *cmd, log_type = payload_in[0]; pl = (CXLGetEventPayload *)payload_out; - memset(pl, 0, sizeof(*pl)); max_recs = (cxlds->payload_size - CXL_EVENT_PAYLOAD_HDR_SIZE) / CXL_EVENT_RECORD_SIZE; @@ -273,7 +272,6 @@ static CXLRetCode cmd_events_get_interrupt_policy(const struct cxl_cmd *cmd, CXLEventLog *log; policy = (CXLEventInterruptPolicy *)payload_out; - memset(policy, 0, sizeof(*policy)); log = &cxlds->event_logs[CXL_EVENT_TYPE_INFO]; if (log->irq_enabled) { @@ -372,7 +370,6 @@ static CXLRetCode cmd_infostat_identify(const struct cxl_cmd *cmd, QEMU_BUILD_BUG_ON(sizeof(*is_identify) != 18); is_identify = (void *)payload_out; - memset(is_identify, 0, sizeof(*is_identify)); is_identify->pcie_vid = class->vendor_id; is_identify->pcie_did = class->device_id; if (object_dynamic_cast(OBJECT(cci->d), TYPE_CXL_USP)) { @@ -606,7 +603,6 @@ static CXLRetCode cmd_infostat_bg_op_sts(const struct cxl_cmd *cmd, QEMU_BUILD_BUG_ON(sizeof(*bg_op_status) != 8); bg_op_status = (void *)payload_out; - memset(bg_op_status, 0, sizeof(*bg_op_status)); bg_op_status->status = cci->bg.complete_pct << 1; if (cci->bg.runtime > 0) { bg_op_status->status |= 1U << 0; @@ -647,7 +643,6 @@ static CXLRetCode cmd_firmware_update_get_info(const struct cxl_cmd *cmd, } fw_info = (void *)payload_out; - memset(fw_info, 0, sizeof(*fw_info)); fw_info->slots_supported = 2; fw_info->slot_info = BIT(0) | BIT(3); @@ -805,7 +800,6 @@ static CXLRetCode cmd_identify_memory_device(const struct cxl_cmd *cmd, } id = (void *)payload_out; - memset(id, 0, sizeof(*id)); snprintf(id->fw_revision, 0x10, "BWFW VERSION %02d", 0); @@ -1095,7 +1089,6 @@ static CXLRetCode cmd_media_get_poison_list(const struct cxl_cmd *cmd, out_pl_len = sizeof(*out) + record_count * sizeof(out->records[0]); assert(out_pl_len <= CXL_MAILBOX_MAX_PAYLOAD_SIZE); - memset(out, 0, out_pl_len); QLIST_FOREACH(ent, poison_list, node) { uint64_t start, stop; From 9de2049c95e1a94e27430e71a9d9ace07ec8eda6 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 5 Jul 2024 12:39:56 +0100 Subject: [PATCH 04/61] hw/cxl: Check for multiple mappings of memory backends. Similar protection to that provided for -numa memdev=x to make sure that memory used to back a type3 device is not also mapped as normal RAM, or for multiple type3 devices. This is an easy footgun to remove and seems multiple people have run into it. Signed-off-by: Jonathan Cameron Message-Id: <20240705113956.941732-4-Jonathan.Cameron@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/mem/cxl_type3.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c index 35ac59883a..e7fbbb4d51 100644 --- a/hw/mem/cxl_type3.c +++ b/hw/mem/cxl_type3.c @@ -737,6 +737,11 @@ static bool cxl_setup_memory(CXLType3Dev *ct3d, Error **errp) error_setg(errp, "volatile memdev must have backing device"); return false; } + if (host_memory_backend_is_mapped(ct3d->hostvmem)) { + error_setg(errp, "memory backend %s can't be used multiple times.", + object_get_canonical_path_component(OBJECT(ct3d->hostvmem))); + return false; + } memory_region_set_nonvolatile(vmr, false); memory_region_set_enabled(vmr, true); host_memory_backend_set_mapped(ct3d->hostvmem, true); @@ -760,6 +765,11 @@ static bool cxl_setup_memory(CXLType3Dev *ct3d, Error **errp) error_setg(errp, "persistent memdev must have backing device"); return false; } + if (host_memory_backend_is_mapped(ct3d->hostpmem)) { + error_setg(errp, "memory backend %s can't be used multiple times.", + object_get_canonical_path_component(OBJECT(ct3d->hostpmem))); + return false; + } memory_region_set_nonvolatile(pmr, true); memory_region_set_enabled(pmr, true); host_memory_backend_set_mapped(ct3d->hostpmem, true); @@ -790,6 +800,11 @@ static bool cxl_setup_memory(CXLType3Dev *ct3d, Error **errp) return false; } + if (host_memory_backend_is_mapped(ct3d->dc.host_dc)) { + error_setg(errp, "memory backend %s can't be used multiple times.", + object_get_canonical_path_component(OBJECT(ct3d->dc.host_dc))); + return false; + } /* * Set DC regions as volatile for now, non-volatile support can * be added in the future if needed. From a207d5f87d66f7933b50677e047498fc4af63e1f Mon Sep 17 00:00:00 2001 From: Zhao Liu Date: Fri, 5 Jul 2024 12:39:54 +0100 Subject: [PATCH 05/61] hw/cxl/cxl-host: Fix segmentation fault when getting cxl-fmw property QEMU crashes (Segmentation fault) when getting cxl-fmw property via qmp: (QEMU) qom-get path=machine property=cxl-fmw This issue is caused by accessing wrong callback (opaque) type in machine_get_cfmw(). cxl_machine_init() sets the callback as `CXLState *` type but machine_get_cfmw() treats the callback as `CXLFixedMemoryWindowOptionsList **`. Fix this error by casting opaque to `CXLState *` type in machine_get_cfmw(). Fixes: 03b39fcf64bc ("hw/cxl: Make the CXL fixed memory window setup a machine parameter.") Signed-off-by: Zhao Liu Reviewed-by: Li Zhijian Reviewed-by: Xingtao Yao Link: https://lore.kernel.org/r/20240704093404.1848132-1-zhao1.liu@linux.intel.com Signed-off-by: Jonathan Cameron Message-Id: <20240705113956.941732-2-Jonathan.Cameron@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/cxl/cxl-host.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/cxl/cxl-host.c b/hw/cxl/cxl-host.c index c5f5fcfd64..e9f2543c43 100644 --- a/hw/cxl/cxl-host.c +++ b/hw/cxl/cxl-host.c @@ -315,7 +315,8 @@ static void machine_set_cxl(Object *obj, Visitor *v, const char *name, static void machine_get_cfmw(Object *obj, Visitor *v, const char *name, void *opaque, Error **errp) { - CXLFixedMemoryWindowOptionsList **list = opaque; + CXLState *state = opaque; + CXLFixedMemoryWindowOptionsList **list = &state->cfmw_list; visit_type_CXLFixedMemoryWindowOptionsList(v, name, list, errp); } From d61cc5b6a8d373376a8bf5ec29cfc3267a29efbe Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 5 Jul 2024 13:06:40 +0100 Subject: [PATCH 06/61] hw/cxl: Add get scan media capabilities cmd support Use simple heuristics to determine the cost of scanning any given chunk, assuming cost is equal across the whole device, without differentiating between volatile or persistent partitions. This is aligned to the fact that these constraints are not enforced in respective poison query commands. Signed-off-by: Davidlohr Bueso Link: https://lore.kernel.org/r/20230908073152.4386-3-dave@stgolabs.net Signed-off-by: Jonathan Cameron Message-Id: <20240705120643.959422-2-Jonathan.Cameron@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/cxl/cxl-mailbox-utils.c | 211 +++++++++++++++++++++++++++++++++++- hw/mem/cxl_type3.c | 22 ++-- include/hw/cxl/cxl_device.h | 9 ++ 3 files changed, 233 insertions(+), 9 deletions(-) diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c index facec42dc8..8ae9c6a699 100644 --- a/hw/cxl/cxl-mailbox-utils.c +++ b/hw/cxl/cxl-mailbox-utils.c @@ -83,6 +83,8 @@ enum { #define GET_POISON_LIST 0x0 #define INJECT_POISON 0x1 #define CLEAR_POISON 0x2 + #define GET_SCAN_MEDIA_CAPABILITIES 0x3 + #define SCAN_MEDIA 0x4 DCD_CONFIG = 0x48, #define GET_DC_CONFIG 0x0 #define GET_DYN_CAP_EXT_LIST 0x1 @@ -1110,6 +1112,10 @@ static CXLRetCode cmd_media_get_poison_list(const struct cxl_cmd *cmd, out->flags = (1 << 1); stq_le_p(&out->overflow_timestamp, ct3d->poison_list_overflow_ts); } + if (scan_media_running(cci)) { + out->flags |= (1 << 2); + } + stw_le_p(&out->count, record_count); *len_out = out_pl_len; return CXL_MBOX_SUCCESS; @@ -1139,6 +1145,16 @@ static CXLRetCode cmd_media_inject_poison(const struct cxl_cmd *cmd, return CXL_MBOX_SUCCESS; } } + /* + * Freeze the list if there is an on-going scan media operation. + */ + if (scan_media_running(cci)) { + /* + * XXX: Spec is ambiguous - is this case considered + * a successful return despite not adding to the list? + */ + goto success; + } if (ct3d->poison_list_cnt == CXL_POISON_LIST_LIMIT) { return CXL_MBOX_INJECT_POISON_LIMIT; @@ -1154,6 +1170,7 @@ static CXLRetCode cmd_media_inject_poison(const struct cxl_cmd *cmd, */ QLIST_INSERT_HEAD(poison_list, p, node); ct3d->poison_list_cnt++; +success: *len_out = 0; return CXL_MBOX_SUCCESS; @@ -1193,6 +1210,17 @@ static CXLRetCode cmd_media_clear_poison(const struct cxl_cmd *cmd, } } + /* + * Freeze the list if there is an on-going scan media operation. + */ + if (scan_media_running(cci)) { + /* + * XXX: Spec is ambiguous - is this case considered + * a successful return despite not removing from the list? + */ + goto success; + } + QLIST_FOREACH(ent, poison_list, node) { /* * Test for contained in entry. Simpler than general case @@ -1203,7 +1231,7 @@ static CXLRetCode cmd_media_clear_poison(const struct cxl_cmd *cmd, } } if (!ent) { - return CXL_MBOX_SUCCESS; + goto success; } QLIST_REMOVE(ent, node); @@ -1240,11 +1268,180 @@ static CXLRetCode cmd_media_clear_poison(const struct cxl_cmd *cmd, } /* Any fragments have been added, free original entry */ g_free(ent); +success: *len_out = 0; return CXL_MBOX_SUCCESS; } +/* + * CXL r3.1 section 8.2.9.9.4.4: Get Scan Media Capabilities + */ +static CXLRetCode +cmd_media_get_scan_media_capabilities(const struct cxl_cmd *cmd, + uint8_t *payload_in, + size_t len_in, + uint8_t *payload_out, + size_t *len_out, + CXLCCI *cci) +{ + struct get_scan_media_capabilities_pl { + uint64_t pa; + uint64_t length; + } QEMU_PACKED; + + struct get_scan_media_capabilities_out_pl { + uint32_t estimated_runtime_ms; + }; + + CXLType3Dev *ct3d = CXL_TYPE3(cci->d); + CXLDeviceState *cxl_dstate = &ct3d->cxl_dstate; + struct get_scan_media_capabilities_pl *in = (void *)payload_in; + struct get_scan_media_capabilities_out_pl *out = (void *)payload_out; + uint64_t query_start; + uint64_t query_length; + + query_start = ldq_le_p(&in->pa); + /* 64 byte alignment required */ + if (query_start & 0x3f) { + return CXL_MBOX_INVALID_INPUT; + } + query_length = ldq_le_p(&in->length) * CXL_CACHE_LINE_SIZE; + + if (query_start + query_length > cxl_dstate->static_mem_size) { + return CXL_MBOX_INVALID_PA; + } + + /* + * Just use 400 nanosecond access/read latency + 100 ns for + * the cost of updating the poison list. For small enough + * chunks return at least 1 ms. + */ + stl_le_p(&out->estimated_runtime_ms, + MAX(1, query_length * (0.0005L / 64))); + + *len_out = sizeof(*out); + return CXL_MBOX_SUCCESS; +} + +static void __do_scan_media(CXLType3Dev *ct3d) +{ + CXLPoison *ent; + unsigned int results_cnt = 0; + + QLIST_FOREACH(ent, &ct3d->scan_media_results, node) { + results_cnt++; + } + + /* only scan media may clear the overflow */ + if (ct3d->poison_list_overflowed && + ct3d->poison_list_cnt == results_cnt) { + cxl_clear_poison_list_overflowed(ct3d); + } +} + +/* + * CXL r3.1 section 8.2.9.9.4.5: Scan Media + */ +static CXLRetCode cmd_media_scan_media(const struct cxl_cmd *cmd, + uint8_t *payload_in, + size_t len_in, + uint8_t *payload_out, + size_t *len_out, + CXLCCI *cci) +{ + struct scan_media_pl { + uint64_t pa; + uint64_t length; + uint8_t flags; + } QEMU_PACKED; + + struct scan_media_pl *in = (void *)payload_in; + CXLType3Dev *ct3d = CXL_TYPE3(cci->d); + CXLDeviceState *cxl_dstate = &ct3d->cxl_dstate; + uint64_t query_start; + uint64_t query_length; + CXLPoison *ent, *next; + + query_start = ldq_le_p(&in->pa); + /* 64 byte alignment required */ + if (query_start & 0x3f) { + return CXL_MBOX_INVALID_INPUT; + } + query_length = ldq_le_p(&in->length) * CXL_CACHE_LINE_SIZE; + + if (query_start + query_length > cxl_dstate->static_mem_size) { + return CXL_MBOX_INVALID_PA; + } + if (ct3d->dc.num_regions && query_start + query_length >= + cxl_dstate->static_mem_size + ct3d->dc.total_capacity) { + return CXL_MBOX_INVALID_PA; + } + + if (in->flags == 0) { /* TODO */ + qemu_log_mask(LOG_UNIMP, + "Scan Media Event Log is unsupported\n"); + } + + /* any previous results are discarded upon a new Scan Media */ + QLIST_FOREACH_SAFE(ent, &ct3d->scan_media_results, node, next) { + QLIST_REMOVE(ent, node); + g_free(ent); + } + + /* kill the poison list - it will be recreated */ + if (ct3d->poison_list_overflowed) { + QLIST_FOREACH_SAFE(ent, &ct3d->poison_list, node, next) { + QLIST_REMOVE(ent, node); + g_free(ent); + ct3d->poison_list_cnt--; + } + } + + /* + * Scan the backup list and move corresponding entries + * into the results list, updating the poison list + * when possible. + */ + QLIST_FOREACH_SAFE(ent, &ct3d->poison_list_bkp, node, next) { + CXLPoison *res; + + if (ent->start >= query_start + query_length || + ent->start + ent->length <= query_start) { + continue; + } + + /* + * If a Get Poison List cmd comes in while this + * scan is being done, it will see the new complete + * list, while setting the respective flag. + */ + if (ct3d->poison_list_cnt < CXL_POISON_LIST_LIMIT) { + CXLPoison *p = g_new0(CXLPoison, 1); + + p->start = ent->start; + p->length = ent->length; + p->type = ent->type; + QLIST_INSERT_HEAD(&ct3d->poison_list, p, node); + ct3d->poison_list_cnt++; + } + + res = g_new0(CXLPoison, 1); + res->start = ent->start; + res->length = ent->length; + res->type = ent->type; + QLIST_INSERT_HEAD(&ct3d->scan_media_results, res, node); + + QLIST_REMOVE(ent, node); + g_free(ent); + } + + cci->bg.runtime = MAX(1, query_length * (0.0005L / 64)); + *len_out = 0; + + return CXL_MBOX_BG_STARTED; +} + /* * CXL r3.1 section 8.2.9.9.9.1: Get Dynamic Capacity Configuration * (Opcode: 4800h) @@ -1857,6 +2054,11 @@ static const struct cxl_cmd cxl_cmd_set[256][256] = { cmd_media_inject_poison, 8, 0 }, [MEDIA_AND_POISON][CLEAR_POISON] = { "MEDIA_AND_POISON_CLEAR_POISON", cmd_media_clear_poison, 72, 0 }, + [MEDIA_AND_POISON][GET_SCAN_MEDIA_CAPABILITIES] = { + "MEDIA_AND_POISON_GET_SCAN_MEDIA_CAPABILITIES", + cmd_media_get_scan_media_capabilities, 16, 0 }, + [MEDIA_AND_POISON][SCAN_MEDIA] = { "MEDIA_AND_POISON_SCAN_MEDIA", + cmd_media_scan_media, 17, BACKGROUND_OPERATION }, }; static const struct cxl_cmd cxl_cmd_set_dcd[256][256] = { @@ -1988,8 +2190,13 @@ static void bg_timercb(void *opaque) cxl_dev_enable_media(&ct3d->cxl_dstate); } break; - case 0x4304: /* TODO: scan media */ + case 0x4304: /* scan media */ + { + CXLType3Dev *ct3d = CXL_TYPE3(cci->d); + + __do_scan_media(ct3d); break; + } default: __builtin_unreachable(); break; diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c index e7fbbb4d51..e763b9bff0 100644 --- a/hw/mem/cxl_type3.c +++ b/hw/mem/cxl_type3.c @@ -1319,6 +1319,12 @@ void cxl_set_poison_list_overflowed(CXLType3Dev *ct3d) cxl_device_get_timestamp(&ct3d->cxl_dstate); } +void cxl_clear_poison_list_overflowed(CXLType3Dev *ct3d) +{ + ct3d->poison_list_overflowed = false; + ct3d->poison_list_overflow_ts = 0; +} + void qmp_cxl_inject_poison(const char *path, uint64_t start, uint64_t length, Error **errp) { @@ -1355,19 +1361,21 @@ void qmp_cxl_inject_poison(const char *path, uint64_t start, uint64_t length, } } - if (ct3d->poison_list_cnt == CXL_POISON_LIST_LIMIT) { - cxl_set_poison_list_overflowed(ct3d); - return; - } - p = g_new0(CXLPoison, 1); p->length = length; p->start = start; /* Different from injected via the mbox */ p->type = CXL_POISON_TYPE_INTERNAL; - QLIST_INSERT_HEAD(&ct3d->poison_list, p, node); - ct3d->poison_list_cnt++; + if (ct3d->poison_list_cnt < CXL_POISON_LIST_LIMIT) { + QLIST_INSERT_HEAD(&ct3d->poison_list, p, node); + ct3d->poison_list_cnt++; + } else { + if (!ct3d->poison_list_overflowed) { + cxl_set_poison_list_overflowed(ct3d); + } + QLIST_INSERT_HEAD(&ct3d->poison_list_bkp, p, node); + } } /* For uncorrectable errors include support for multiple header recording */ diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h index 0a4fcb2800..b5beb7f90e 100644 --- a/include/hw/cxl/cxl_device.h +++ b/include/hw/cxl/cxl_device.h @@ -397,6 +397,11 @@ static inline void __toggle_media(CXLDeviceState *cxl_dstate, int val) #define cxl_dev_enable_media(cxlds) \ do { __toggle_media((cxlds), 0x1); } while (0) +static inline bool scan_media_running(CXLCCI *cci) +{ + return !!cci->bg.runtime && cci->bg.opcode == 0x4304; +} + static inline bool sanitize_running(CXLCCI *cci) { return !!cci->bg.runtime && cci->bg.opcode == 0x4400; @@ -491,6 +496,9 @@ struct CXLType3Dev { unsigned int poison_list_cnt; bool poison_list_overflowed; uint64_t poison_list_overflow_ts; + /* Poison Injection - backup */ + CXLPoisonList poison_list_bkp; + CXLPoisonList scan_media_results; struct dynamic_capacity { HostMemoryBackend *host_dc; @@ -558,6 +566,7 @@ CXLRetCode cxl_event_clear_records(CXLDeviceState *cxlds, void cxl_event_irq_assert(CXLType3Dev *ct3d); void cxl_set_poison_list_overflowed(CXLType3Dev *ct3d); +void cxl_clear_poison_list_overflowed(CXLType3Dev *ct3d); CXLDCRegion *cxl_find_dc_region(CXLType3Dev *ct3d, uint64_t dpa, uint64_t len); From 75b800dd3bd8042503ddd4e8a4169f34349325e2 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Fri, 5 Jul 2024 13:06:41 +0100 Subject: [PATCH 07/61] hw/cxl/mbox: replace sanitize_running() with cxl_dev_media_disabled() The spec states that reads/writes should have no effect and a part of commands should be ignored when the media is disabled, not when the sanitize command is running. Introduce cxl_dev_media_disabled() to check if the media is disabled and replace sanitize_running() with it. Make sure that the media has been correctly disabled during sanitation by adding an assert to __toggle_media(). Now, enabling when already enabled or vice versa results in an assert() failure. Suggested-by: Davidlohr Bueso Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Link: https://lore.kernel.org/r/20231222090051.3265307-4-42.hyeyoo@gmail.com Signed-off-by: Jonathan Cameron Message-Id: <20240705120643.959422-3-Jonathan.Cameron@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/cxl/cxl-mailbox-utils.c | 29 +++++++++++++++++------------ hw/mem/cxl_type3.c | 4 ++-- include/hw/cxl/cxl_device.h | 10 +++++----- 3 files changed, 24 insertions(+), 19 deletions(-) diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c index 8ae9c6a699..522d9aa589 100644 --- a/hw/cxl/cxl-mailbox-utils.c +++ b/hw/cxl/cxl-mailbox-utils.c @@ -2108,6 +2108,7 @@ int cxl_process_cci_message(CXLCCI *cci, uint8_t set, uint8_t cmd, int ret; const struct cxl_cmd *cxl_cmd; opcode_handler h; + CXLDeviceState *cxl_dstate; *len_out = 0; cxl_cmd = &cci->cxl_cmd_set[set][cmd]; @@ -2128,18 +2129,22 @@ int cxl_process_cci_message(CXLCCI *cci, uint8_t set, uint8_t cmd, return CXL_MBOX_BUSY; } - /* forbid any selected commands while overwriting */ - if (sanitize_running(cci)) { - if (h == cmd_events_get_records || - h == cmd_ccls_get_partition_info || - h == cmd_ccls_set_lsa || - h == cmd_ccls_get_lsa || - h == cmd_logs_get_log || - h == cmd_media_get_poison_list || - h == cmd_media_inject_poison || - h == cmd_media_clear_poison || - h == cmd_sanitize_overwrite) { - return CXL_MBOX_MEDIA_DISABLED; + /* forbid any selected commands while the media is disabled */ + if (object_dynamic_cast(OBJECT(cci->d), TYPE_CXL_TYPE3)) { + cxl_dstate = &CXL_TYPE3(cci->d)->cxl_dstate; + + if (cxl_dev_media_disabled(cxl_dstate)) { + if (h == cmd_events_get_records || + h == cmd_ccls_get_partition_info || + h == cmd_ccls_set_lsa || + h == cmd_ccls_get_lsa || + h == cmd_logs_get_log || + h == cmd_media_get_poison_list || + h == cmd_media_inject_poison || + h == cmd_media_clear_poison || + h == cmd_sanitize_overwrite) { + return CXL_MBOX_MEDIA_DISABLED; + } } } diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c index e763b9bff0..c7910687ae 100644 --- a/hw/mem/cxl_type3.c +++ b/hw/mem/cxl_type3.c @@ -1142,7 +1142,7 @@ MemTxResult cxl_type3_read(PCIDevice *d, hwaddr host_addr, uint64_t *data, return MEMTX_ERROR; } - if (sanitize_running(&ct3d->cci)) { + if (cxl_dev_media_disabled(&ct3d->cxl_dstate)) { qemu_guest_getrandom_nofail(data, size); return MEMTX_OK; } @@ -1164,7 +1164,7 @@ MemTxResult cxl_type3_write(PCIDevice *d, hwaddr host_addr, uint64_t data, return MEMTX_ERROR; } - if (sanitize_running(&ct3d->cci)) { + if (cxl_dev_media_disabled(&ct3d->cxl_dstate)) { return MEMTX_OK; } diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h index b5beb7f90e..42a622197e 100644 --- a/include/hw/cxl/cxl_device.h +++ b/include/hw/cxl/cxl_device.h @@ -397,16 +397,16 @@ static inline void __toggle_media(CXLDeviceState *cxl_dstate, int val) #define cxl_dev_enable_media(cxlds) \ do { __toggle_media((cxlds), 0x1); } while (0) +static inline bool cxl_dev_media_disabled(CXLDeviceState *cxl_dstate) +{ + uint64_t dev_status_reg = cxl_dstate->mbox_reg_state64[R_CXL_MEM_DEV_STS]; + return FIELD_EX64(dev_status_reg, CXL_MEM_DEV_STS, MEDIA_STATUS) == 0x3; +} static inline bool scan_media_running(CXLCCI *cci) { return !!cci->bg.runtime && cci->bg.opcode == 0x4304; } -static inline bool sanitize_running(CXLCCI *cci) -{ - return !!cci->bg.runtime && cci->bg.opcode == 0x4400; -} - typedef struct CXLError { QTAILQ_ENTRY(CXLError) node; int type; /* Error code as per FE definition */ From 7d65874ba0ea8cdb2a5ac51c397d721d7d49d828 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Fri, 5 Jul 2024 13:06:42 +0100 Subject: [PATCH 08/61] hw/cxl/events: discard all event records during sanitation Per CXL r3.1 Section 8.2.9.9.5.1: Sanitize (Opcode 4400h), the sanitize command should delete all event logs. Introduce cxl_discard_all_event_logs() and call this in __do_sanitization(). Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Davidlohr Bueso Link: https://lore.kernel.org/r/20231222090051.3265307-5-42.hyeyoo@gmail.com Signed-off-by: Jonathan Cameron Message-Id: <20240705120643.959422-4-Jonathan.Cameron@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/cxl/cxl-events.c | 13 +++++++++++++ hw/cxl/cxl-mailbox-utils.c | 1 + include/hw/cxl/cxl_device.h | 1 + 3 files changed, 15 insertions(+) diff --git a/hw/cxl/cxl-events.c b/hw/cxl/cxl-events.c index d397718b1b..12dee2e467 100644 --- a/hw/cxl/cxl-events.c +++ b/hw/cxl/cxl-events.c @@ -139,6 +139,19 @@ bool cxl_event_insert(CXLDeviceState *cxlds, CXLEventLogType log_type, return cxl_event_count(log) == 1; } +void cxl_discard_all_event_records(CXLDeviceState *cxlds) +{ + CXLEventLogType log_type; + CXLEventLog *log; + + for (log_type = 0; log_type < CXL_EVENT_TYPE_MAX; log_type++) { + log = &cxlds->event_logs[log_type]; + while (!cxl_event_empty(log)) { + cxl_event_delete_head(cxlds, log_type, log); + } + } +} + CXLRetCode cxl_event_get_records(CXLDeviceState *cxlds, CXLGetEventPayload *pl, uint8_t log_type, int max_recs, size_t *len) diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c index 522d9aa589..3c9600c39c 100644 --- a/hw/cxl/cxl-mailbox-utils.c +++ b/hw/cxl/cxl-mailbox-utils.c @@ -949,6 +949,7 @@ static void __do_sanitization(CXLType3Dev *ct3d) memset(lsa, 0, memory_region_size(mr)); } } + cxl_discard_all_event_records(&ct3d->cxl_dstate); } /* diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h index 42a622197e..0509d961c3 100644 --- a/include/hw/cxl/cxl_device.h +++ b/include/hw/cxl/cxl_device.h @@ -562,6 +562,7 @@ CXLRetCode cxl_event_get_records(CXLDeviceState *cxlds, CXLGetEventPayload *pl, size_t *len); CXLRetCode cxl_event_clear_records(CXLDeviceState *cxlds, CXLClearEventPayload *pl); +void cxl_discard_all_event_records(CXLDeviceState *cxlds); void cxl_event_irq_assert(CXLType3Dev *ct3d); From 89b5cfcc31e655a40919698a7f95a9208d6f12a3 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 5 Jul 2024 13:06:43 +0100 Subject: [PATCH 09/61] hw/cxl: Add get scan media results cmd support Iterate over the list keeping the output payload size into account, returning the results from a previous scan media operation. The scan media operation does not fail prematurely due to device being out of storage, so this implementation does not deal with the retry/restart functionality. Signed-off-by: Davidlohr Bueso Link: https://lore.kernel.org/r/20230908073152.4386-5-dave@stgolabs.net Signed-off-by: Jonathan Cameron Message-Id: <20240705120643.959422-5-Jonathan.Cameron@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/cxl/cxl-mailbox-utils.c | 85 +++++++++++++++++++++++++++++++++++++ include/hw/cxl/cxl_device.h | 1 + 2 files changed, 86 insertions(+) diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c index 3c9600c39c..82120a6e7b 100644 --- a/hw/cxl/cxl-mailbox-utils.c +++ b/hw/cxl/cxl-mailbox-utils.c @@ -85,6 +85,7 @@ enum { #define CLEAR_POISON 0x2 #define GET_SCAN_MEDIA_CAPABILITIES 0x3 #define SCAN_MEDIA 0x4 + #define GET_SCAN_MEDIA_RESULTS 0x5 DCD_CONFIG = 0x48, #define GET_DC_CONFIG 0x0 #define GET_DYN_CAP_EXT_LIST 0x1 @@ -1339,6 +1340,8 @@ static void __do_scan_media(CXLType3Dev *ct3d) ct3d->poison_list_cnt == results_cnt) { cxl_clear_poison_list_overflowed(ct3d); } + /* scan media has run since last conventional reset */ + ct3d->scan_media_hasrun = true; } /* @@ -1443,6 +1446,85 @@ static CXLRetCode cmd_media_scan_media(const struct cxl_cmd *cmd, return CXL_MBOX_BG_STARTED; } +/* + * CXL r3.1 section 8.2.9.9.4.6: Get Scan Media Results + */ +static CXLRetCode cmd_media_get_scan_media_results(const struct cxl_cmd *cmd, + uint8_t *payload_in, + size_t len_in, + uint8_t *payload_out, + size_t *len_out, + CXLCCI *cci) +{ + struct get_scan_media_results_out_pl { + uint64_t dpa_restart; + uint64_t length; + uint8_t flags; + uint8_t rsvd1; + uint16_t count; + uint8_t rsvd2[0xc]; + struct { + uint64_t addr; + uint32_t length; + uint32_t resv; + } QEMU_PACKED records[]; + } QEMU_PACKED; + + struct get_scan_media_results_out_pl *out = (void *)payload_out; + CXLType3Dev *ct3d = CXL_TYPE3(cci->d); + CXLPoisonList *scan_media_results = &ct3d->scan_media_results; + CXLPoison *ent, *next; + uint16_t total_count = 0, record_count = 0, i = 0; + uint16_t out_pl_len; + + if (!ct3d->scan_media_hasrun) { + return CXL_MBOX_UNSUPPORTED; + } + + /* + * Calculate limits, all entries are within the same address range of the + * last scan media call. + */ + QLIST_FOREACH(ent, scan_media_results, node) { + size_t rec_size = record_count * sizeof(out->records[0]); + + if (sizeof(*out) + rec_size < CXL_MAILBOX_MAX_PAYLOAD_SIZE) { + record_count++; + } + total_count++; + } + + out_pl_len = sizeof(*out) + record_count * sizeof(out->records[0]); + assert(out_pl_len <= CXL_MAILBOX_MAX_PAYLOAD_SIZE); + + memset(out, 0, out_pl_len); + QLIST_FOREACH_SAFE(ent, scan_media_results, node, next) { + uint64_t start, stop; + + if (i == record_count) { + break; + } + + start = ROUND_DOWN(ent->start, 64ull); + stop = ROUND_DOWN(ent->start, 64ull) + ent->length; + stq_le_p(&out->records[i].addr, start | (ent->type & 0x7)); + stl_le_p(&out->records[i].length, (stop - start) / CXL_CACHE_LINE_SIZE); + i++; + + /* consume the returning entry */ + QLIST_REMOVE(ent, node); + g_free(ent); + } + + stw_le_p(&out->count, record_count); + if (total_count > record_count) { + out->flags = (1 << 0); /* More Media Error Records */ + } + + *len_out = out_pl_len; + return CXL_MBOX_SUCCESS; +} + /* * CXL r3.1 section 8.2.9.9.9.1: Get Dynamic Capacity Configuration * (Opcode: 4800h) @@ -2060,6 +2142,9 @@ static const struct cxl_cmd cxl_cmd_set[256][256] = { cmd_media_get_scan_media_capabilities, 16, 0 }, [MEDIA_AND_POISON][SCAN_MEDIA] = { "MEDIA_AND_POISON_SCAN_MEDIA", cmd_media_scan_media, 17, BACKGROUND_OPERATION }, + [MEDIA_AND_POISON][GET_SCAN_MEDIA_RESULTS] = { + "MEDIA_AND_POISON_GET_SCAN_MEDIA_RESULTS", + cmd_media_get_scan_media_results, 0, 0 }, }; static const struct cxl_cmd cxl_cmd_set_dcd[256][256] = { diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h index 0509d961c3..cc98553583 100644 --- a/include/hw/cxl/cxl_device.h +++ b/include/hw/cxl/cxl_device.h @@ -499,6 +499,7 @@ struct CXLType3Dev { /* Poison Injection - backup */ CXLPoisonList poison_list_bkp; CXLPoisonList scan_media_results; + bool scan_media_hasrun; struct dynamic_capacity { HostMemoryBackend *host_dc; From 25da36d5d04deb8869bd6ad1223832f09f3efe79 Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Fri, 5 Jul 2024 13:30:35 +0100 Subject: [PATCH 10/61] cxl/mailbox: move mailbox effect definitions to a header Preparation for allowing devices to define their own CCI commands Signed-off-by: Gregory Price Link: https://lore.kernel.org/r/20230906001517.324380-2-gregory.price@memverge.com Signed-off-by: Jonathan Cameron Message-Id: <20240705123039.963781-2-Jonathan.Cameron@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/cxl/cxl-mailbox-utils.c | 34 +++++++++++++++------------------- include/hw/cxl/cxl_mailbox.h | 18 ++++++++++++++++++ 2 files changed, 33 insertions(+), 19 deletions(-) create mode 100644 include/hw/cxl/cxl_mailbox.h diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c index 82120a6e7b..dbaa83a110 100644 --- a/hw/cxl/cxl-mailbox-utils.c +++ b/hw/cxl/cxl-mailbox-utils.c @@ -12,6 +12,7 @@ #include "hw/pci/msix.h" #include "hw/cxl/cxl.h" #include "hw/cxl/cxl_events.h" +#include "hw/cxl/cxl_mailbox.h" #include "hw/pci/pci.h" #include "hw/pci-bridge/cxl_upstream_port.h" #include "qemu/cutils.h" @@ -2095,28 +2096,21 @@ static CXLRetCode cmd_dcd_release_dyn_cap(const struct cxl_cmd *cmd, return CXL_MBOX_SUCCESS; } -#define IMMEDIATE_CONFIG_CHANGE (1 << 1) -#define IMMEDIATE_DATA_CHANGE (1 << 2) -#define IMMEDIATE_POLICY_CHANGE (1 << 3) -#define IMMEDIATE_LOG_CHANGE (1 << 4) -#define SECURITY_STATE_CHANGE (1 << 5) -#define BACKGROUND_OPERATION (1 << 6) - static const struct cxl_cmd cxl_cmd_set[256][256] = { [EVENTS][GET_RECORDS] = { "EVENTS_GET_RECORDS", cmd_events_get_records, 1, 0 }, [EVENTS][CLEAR_RECORDS] = { "EVENTS_CLEAR_RECORDS", - cmd_events_clear_records, ~0, IMMEDIATE_LOG_CHANGE }, + cmd_events_clear_records, ~0, CXL_MBOX_IMMEDIATE_LOG_CHANGE }, [EVENTS][GET_INTERRUPT_POLICY] = { "EVENTS_GET_INTERRUPT_POLICY", cmd_events_get_interrupt_policy, 0, 0 }, [EVENTS][SET_INTERRUPT_POLICY] = { "EVENTS_SET_INTERRUPT_POLICY", cmd_events_set_interrupt_policy, - ~0, IMMEDIATE_CONFIG_CHANGE }, + ~0, CXL_MBOX_IMMEDIATE_CONFIG_CHANGE }, [FIRMWARE_UPDATE][GET_INFO] = { "FIRMWARE_UPDATE_GET_INFO", cmd_firmware_update_get_info, 0, 0 }, [TIMESTAMP][GET] = { "TIMESTAMP_GET", cmd_timestamp_get, 0, 0 }, [TIMESTAMP][SET] = { "TIMESTAMP_SET", cmd_timestamp_set, - 8, IMMEDIATE_POLICY_CHANGE }, + 8, CXL_MBOX_IMMEDIATE_POLICY_CHANGE }, [LOGS][GET_SUPPORTED] = { "LOGS_GET_SUPPORTED", cmd_logs_get_supported, 0, 0 }, [LOGS][GET_LOG] = { "LOGS_GET_LOG", cmd_logs_get_log, 0x18, 0 }, @@ -2126,9 +2120,11 @@ static const struct cxl_cmd cxl_cmd_set[256][256] = { cmd_ccls_get_partition_info, 0, 0 }, [CCLS][GET_LSA] = { "CCLS_GET_LSA", cmd_ccls_get_lsa, 8, 0 }, [CCLS][SET_LSA] = { "CCLS_SET_LSA", cmd_ccls_set_lsa, - ~0, IMMEDIATE_CONFIG_CHANGE | IMMEDIATE_DATA_CHANGE }, + ~0, CXL_MBOX_IMMEDIATE_CONFIG_CHANGE | CXL_MBOX_IMMEDIATE_DATA_CHANGE }, [SANITIZE][OVERWRITE] = { "SANITIZE_OVERWRITE", cmd_sanitize_overwrite, 0, - IMMEDIATE_DATA_CHANGE | SECURITY_STATE_CHANGE | BACKGROUND_OPERATION }, + (CXL_MBOX_IMMEDIATE_DATA_CHANGE | + CXL_MBOX_SECURITY_STATE_CHANGE | + CXL_MBOX_BACKGROUND_OPERATION)}, [PERSISTENT_MEM][GET_SECURITY_STATE] = { "GET_SECURITY_STATE", cmd_get_security_state, 0, 0 }, [MEDIA_AND_POISON][GET_POISON_LIST] = { "MEDIA_AND_POISON_GET_POISON_LIST", @@ -2141,7 +2137,7 @@ static const struct cxl_cmd cxl_cmd_set[256][256] = { "MEDIA_AND_POISON_GET_SCAN_MEDIA_CAPABILITIES", cmd_media_get_scan_media_capabilities, 16, 0 }, [MEDIA_AND_POISON][SCAN_MEDIA] = { "MEDIA_AND_POISON_SCAN_MEDIA", - cmd_media_scan_media, 17, BACKGROUND_OPERATION }, + cmd_media_scan_media, 17, CXL_MBOX_BACKGROUND_OPERATION }, [MEDIA_AND_POISON][GET_SCAN_MEDIA_RESULTS] = { "MEDIA_AND_POISON_GET_SCAN_MEDIA_RESULTS", cmd_media_get_scan_media_results, 0, 0 }, @@ -2155,10 +2151,10 @@ static const struct cxl_cmd cxl_cmd_set_dcd[256][256] = { 8, 0 }, [DCD_CONFIG][ADD_DYN_CAP_RSP] = { "DCD_ADD_DYNAMIC_CAPACITY_RESPONSE", cmd_dcd_add_dyn_cap_rsp, - ~0, IMMEDIATE_DATA_CHANGE }, + ~0, CXL_MBOX_IMMEDIATE_DATA_CHANGE }, [DCD_CONFIG][RELEASE_DYN_CAP] = { "DCD_RELEASE_DYNAMIC_CAPACITY", cmd_dcd_release_dyn_cap, - ~0, IMMEDIATE_DATA_CHANGE }, + ~0, CXL_MBOX_IMMEDIATE_DATA_CHANGE }, }; static const struct cxl_cmd cxl_cmd_set_sw[256][256] = { @@ -2166,8 +2162,8 @@ static const struct cxl_cmd cxl_cmd_set_sw[256][256] = { [INFOSTAT][BACKGROUND_OPERATION_STATUS] = { "BACKGROUND_OPERATION_STATUS", cmd_infostat_bg_op_sts, 0, 0 }, [TIMESTAMP][GET] = { "TIMESTAMP_GET", cmd_timestamp_get, 0, 0 }, - [TIMESTAMP][SET] = { "TIMESTAMP_SET", cmd_timestamp_set, 0, - IMMEDIATE_POLICY_CHANGE }, + [TIMESTAMP][SET] = { "TIMESTAMP_SET", cmd_timestamp_set, 8, + CXL_MBOX_IMMEDIATE_POLICY_CHANGE }, [LOGS][GET_SUPPORTED] = { "LOGS_GET_SUPPORTED", cmd_logs_get_supported, 0, 0 }, [LOGS][GET_LOG] = { "LOGS_GET_LOG", cmd_logs_get_log, 0x18, 0 }, @@ -2210,7 +2206,7 @@ int cxl_process_cci_message(CXLCCI *cci, uint8_t set, uint8_t cmd, } /* Only one bg command at a time */ - if ((cxl_cmd->effect & BACKGROUND_OPERATION) && + if ((cxl_cmd->effect & CXL_MBOX_BACKGROUND_OPERATION) && cci->bg.runtime > 0) { return CXL_MBOX_BUSY; } @@ -2235,7 +2231,7 @@ int cxl_process_cci_message(CXLCCI *cci, uint8_t set, uint8_t cmd, } ret = (*h)(cxl_cmd, pl_in, len_in, pl_out, len_out, cci); - if ((cxl_cmd->effect & BACKGROUND_OPERATION) && + if ((cxl_cmd->effect & CXL_MBOX_BACKGROUND_OPERATION) && ret == CXL_MBOX_BG_STARTED) { *bg_started = true; } else { diff --git a/include/hw/cxl/cxl_mailbox.h b/include/hw/cxl/cxl_mailbox.h new file mode 100644 index 0000000000..beb048052e --- /dev/null +++ b/include/hw/cxl/cxl_mailbox.h @@ -0,0 +1,18 @@ +/* + * QEMU CXL Mailbox + * + * This work is licensed under the terms of the GNU GPL, version 2. See the + * COPYING file in the top-level directory. + */ + +#ifndef CXL_MAILBOX_H +#define CXL_MAILBOX_H + +#define CXL_MBOX_IMMEDIATE_CONFIG_CHANGE (1 << 1) +#define CXL_MBOX_IMMEDIATE_DATA_CHANGE (1 << 2) +#define CXL_MBOX_IMMEDIATE_POLICY_CHANGE (1 << 3) +#define CXL_MBOX_IMMEDIATE_LOG_CHANGE (1 << 4) +#define CXL_MBOX_SECURITY_STATE_CHANGE (1 << 5) +#define CXL_MBOX_BACKGROUND_OPERATION (1 << 6) + +#endif From d80378943a6b88b8d211e1468073d9dd44239e27 Mon Sep 17 00:00:00 2001 From: Shiju Jose Date: Fri, 5 Jul 2024 13:30:36 +0100 Subject: [PATCH 11/61] hw/cxl/cxl-mailbox-utils: Add support for feature commands (8.2.9.6) CXL spec 3.1 section 8.2.9.6 describes optional device specific features. CXL devices supports features with changeable attributes. Get Supported Features retrieves the list of supported device specific features. The settings of a feature can be retrieved using Get Feature and optionally modified using Set Feature. Reviewed-by: Davidlohr Bueso Reviewed-by: Fan Ni Signed-off-by: Shiju Jose Link: https://lore.kernel.org/r/20240223085902.1549-2-shiju.jose@huawei.com Signed-off-by: Jonathan Cameron Message-Id: <20240705123039.963781-3-Jonathan.Cameron@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/cxl/cxl-mailbox-utils.c | 258 ++++++++++++++++++++++++++++++++++++ include/hw/cxl/cxl_device.h | 10 ++ 2 files changed, 268 insertions(+) diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c index dbaa83a110..7b7d8e5eae 100644 --- a/hw/cxl/cxl-mailbox-utils.c +++ b/hw/cxl/cxl-mailbox-utils.c @@ -69,6 +69,10 @@ enum { LOGS = 0x04, #define GET_SUPPORTED 0x0 #define GET_LOG 0x1 + FEATURES = 0x05, + #define GET_SUPPORTED 0x0 + #define GET_FEATURE 0x1 + #define SET_FEATURE 0x2 IDENTIFY = 0x40, #define MEMORY_DEVICE 0x0 CCLS = 0x41, @@ -767,6 +771,248 @@ static CXLRetCode cmd_logs_get_log(const struct cxl_cmd *cmd, return CXL_MBOX_SUCCESS; } +/* CXL r3.1 section 8.2.9.6: Features */ +/* + * Get Supported Features output payload + * CXL r3.1 section 8.2.9.6.1 Table 8-96 + */ +typedef struct CXLSupportedFeatureHeader { + uint16_t entries; + uint16_t nsuppfeats_dev; + uint32_t reserved; +} QEMU_PACKED CXLSupportedFeatureHeader; + +/* + * Get Supported Features Supported Feature Entry + * CXL r3.1 section 8.2.9.6.1 Table 8-97 + */ +typedef struct CXLSupportedFeatureEntry { + QemuUUID uuid; + uint16_t feat_index; + uint16_t get_feat_size; + uint16_t set_feat_size; + uint32_t attr_flags; + uint8_t get_feat_version; + uint8_t set_feat_version; + uint16_t set_feat_effects; + uint8_t rsvd[18]; +} QEMU_PACKED CXLSupportedFeatureEntry; + +/* + * Get Supported Features Supported Feature Entry + * CXL rev 3.1 section 8.2.9.6.1 Table 8-97 + */ +/* Supported Feature Entry : attribute flags */ +#define CXL_FEAT_ENTRY_ATTR_FLAG_CHANGABLE BIT(0) +#define CXL_FEAT_ENTRY_ATTR_FLAG_DEEPEST_RESET_PERSISTENCE_MASK GENMASK(3, 1) +#define CXL_FEAT_ENTRY_ATTR_FLAG_PERSIST_ACROSS_FIRMWARE_UPDATE BIT(4) +#define CXL_FEAT_ENTRY_ATTR_FLAG_SUPPORT_DEFAULT_SELECTION BIT(5) +#define CXL_FEAT_ENTRY_ATTR_FLAG_SUPPORT_SAVED_SELECTION BIT(6) + +/* Supported Feature Entry : set feature effects */ +#define CXL_FEAT_ENTRY_SFE_CONFIG_CHANGE_COLD_RESET BIT(0) +#define CXL_FEAT_ENTRY_SFE_IMMEDIATE_CONFIG_CHANGE BIT(1) +#define CXL_FEAT_ENTRY_SFE_IMMEDIATE_DATA_CHANGE BIT(2) +#define CXL_FEAT_ENTRY_SFE_IMMEDIATE_POLICY_CHANGE BIT(3) +#define CXL_FEAT_ENTRY_SFE_IMMEDIATE_LOG_CHANGE BIT(4) +#define CXL_FEAT_ENTRY_SFE_SECURITY_STATE_CHANGE BIT(5) +#define CXL_FEAT_ENTRY_SFE_BACKGROUND_OPERATION BIT(6) +#define CXL_FEAT_ENTRY_SFE_SUPPORT_SECONDARY_MAILBOX BIT(7) +#define CXL_FEAT_ENTRY_SFE_SUPPORT_ABORT_BACKGROUND_OPERATION BIT(8) +#define CXL_FEAT_ENTRY_SFE_CEL_VALID BIT(9) +#define CXL_FEAT_ENTRY_SFE_CONFIG_CHANGE_CONV_RESET BIT(10) +#define CXL_FEAT_ENTRY_SFE_CONFIG_CHANGE_CXL_RESET BIT(11) + +enum CXL_SUPPORTED_FEATURES_LIST { + CXL_FEATURE_MAX +}; + +/* Get Feature CXL 3.1 Spec 8.2.9.6.2 */ +/* + * Get Feature input payload + * CXL r3.1 section 8.2.9.6.2 Table 8-99 + */ +/* Get Feature : Payload in selection */ +enum CXL_GET_FEATURE_SELECTION { + CXL_GET_FEATURE_SEL_CURRENT_VALUE, + CXL_GET_FEATURE_SEL_DEFAULT_VALUE, + CXL_GET_FEATURE_SEL_SAVED_VALUE, + CXL_GET_FEATURE_SEL_MAX +}; + +/* Set Feature CXL 3.1 Spec 8.2.9.6.3 */ +/* + * Set Feature input payload + * CXL r3.1 section 8.2.9.6.3 Table 8-101 + */ +typedef struct CXLSetFeatureInHeader { + QemuUUID uuid; + uint32_t flags; + uint16_t offset; + uint8_t version; + uint8_t rsvd[9]; +} QEMU_PACKED QEMU_ALIGNED(16) CXLSetFeatureInHeader; + +/* Set Feature : Payload in flags */ +#define CXL_SET_FEATURE_FLAG_DATA_TRANSFER_MASK 0x7 +enum CXL_SET_FEATURE_FLAG_DATA_TRANSFER { + CXL_SET_FEATURE_FLAG_FULL_DATA_TRANSFER, + CXL_SET_FEATURE_FLAG_INITIATE_DATA_TRANSFER, + CXL_SET_FEATURE_FLAG_CONTINUE_DATA_TRANSFER, + CXL_SET_FEATURE_FLAG_FINISH_DATA_TRANSFER, + CXL_SET_FEATURE_FLAG_ABORT_DATA_TRANSFER, + CXL_SET_FEATURE_FLAG_DATA_TRANSFER_MAX +}; +#define CXL_SET_FEAT_DATA_SAVED_ACROSS_RESET BIT(3) + +/* CXL r3.1 section 8.2.9.6.1: Get Supported Features (Opcode 0500h) */ +static CXLRetCode cmd_features_get_supported(const struct cxl_cmd *cmd, + uint8_t *payload_in, + size_t len_in, + uint8_t *payload_out, + size_t *len_out, + CXLCCI *cci) +{ + struct { + uint32_t count; + uint16_t start_index; + uint16_t reserved; + } QEMU_PACKED QEMU_ALIGNED(16) * get_feats_in = (void *)payload_in; + + struct { + CXLSupportedFeatureHeader hdr; + CXLSupportedFeatureEntry feat_entries[]; + } QEMU_PACKED QEMU_ALIGNED(16) * get_feats_out = (void *)payload_out; + uint16_t index, req_entries; + uint16_t entry; + + if (!object_dynamic_cast(OBJECT(cci->d), TYPE_CXL_TYPE3)) { + return CXL_MBOX_UNSUPPORTED; + } + if (get_feats_in->count < sizeof(CXLSupportedFeatureHeader) || + /* + * Temporary: suppress compiler error due to unsigned + * comparioson to zero. + */ + true /*get_feats_in->start_index >= CXL_FEATURE_MAX*/) { + return CXL_MBOX_INVALID_INPUT; + } + + req_entries = (get_feats_in->count - + sizeof(CXLSupportedFeatureHeader)) / + sizeof(CXLSupportedFeatureEntry); + req_entries = MIN(req_entries, + (CXL_FEATURE_MAX - get_feats_in->start_index)); + + for (entry = 0, index = get_feats_in->start_index; + entry < req_entries; index++) { + switch (index) { + default: + __builtin_unreachable(); + } + } + get_feats_out->hdr.nsuppfeats_dev = CXL_FEATURE_MAX; + get_feats_out->hdr.entries = req_entries; + *len_out = sizeof(CXLSupportedFeatureHeader) + + req_entries * sizeof(CXLSupportedFeatureEntry); + + return CXL_MBOX_SUCCESS; +} + +/* CXL r3.1 section 8.2.9.6.2: Get Feature (Opcode 0501h) */ +static CXLRetCode cmd_features_get_feature(const struct cxl_cmd *cmd, + uint8_t *payload_in, + size_t len_in, + uint8_t *payload_out, + size_t *len_out, + CXLCCI *cci) +{ + struct { + QemuUUID uuid; + uint16_t offset; + uint16_t count; + uint8_t selection; + } QEMU_PACKED QEMU_ALIGNED(16) * get_feature; + uint16_t bytes_to_copy = 0; + CXLType3Dev *ct3d; + CXLSetFeatureInfo *set_feat_info; + + if (!object_dynamic_cast(OBJECT(cci->d), TYPE_CXL_TYPE3)) { + return CXL_MBOX_UNSUPPORTED; + } + + ct3d = CXL_TYPE3(cci->d); + get_feature = (void *)payload_in; + + set_feat_info = &ct3d->set_feat_info; + if (qemu_uuid_is_equal(&get_feature->uuid, &set_feat_info->uuid)) { + return CXL_MBOX_FEATURE_TRANSFER_IN_PROGRESS; + } + + if (get_feature->selection != CXL_GET_FEATURE_SEL_CURRENT_VALUE) { + return CXL_MBOX_UNSUPPORTED; + } + if (get_feature->offset + get_feature->count > cci->payload_max) { + return CXL_MBOX_INVALID_INPUT; + } + + *len_out = bytes_to_copy; + + return CXL_MBOX_SUCCESS; +} + +/* CXL r3.1 section 8.2.9.6.3: Set Feature (Opcode 0502h) */ +static CXLRetCode cmd_features_set_feature(const struct cxl_cmd *cmd, + uint8_t *payload_in, + size_t len_in, + uint8_t *payload_out, + size_t *len_out, + CXLCCI *cci) +{ + CXLSetFeatureInHeader *hdr = (void *)payload_in; + CXLSetFeatureInfo *set_feat_info; + uint8_t data_transfer_flag; + CXLType3Dev *ct3d; + + + if (!object_dynamic_cast(OBJECT(cci->d), TYPE_CXL_TYPE3)) { + return CXL_MBOX_UNSUPPORTED; + } + ct3d = CXL_TYPE3(cci->d); + set_feat_info = &ct3d->set_feat_info; + + if (!qemu_uuid_is_null(&set_feat_info->uuid) && + !qemu_uuid_is_equal(&hdr->uuid, &set_feat_info->uuid)) { + return CXL_MBOX_FEATURE_TRANSFER_IN_PROGRESS; + } + if (hdr->flags & CXL_SET_FEAT_DATA_SAVED_ACROSS_RESET) { + set_feat_info->data_saved_across_reset = true; + } else { + set_feat_info->data_saved_across_reset = false; + } + + data_transfer_flag = + hdr->flags & CXL_SET_FEATURE_FLAG_DATA_TRANSFER_MASK; + if (data_transfer_flag == CXL_SET_FEATURE_FLAG_INITIATE_DATA_TRANSFER) { + set_feat_info->uuid = hdr->uuid; + set_feat_info->data_size = 0; + } + set_feat_info->data_transfer_flag = data_transfer_flag; + set_feat_info->data_offset = hdr->offset; + + if (data_transfer_flag == CXL_SET_FEATURE_FLAG_FULL_DATA_TRANSFER || + data_transfer_flag == CXL_SET_FEATURE_FLAG_FINISH_DATA_TRANSFER || + data_transfer_flag == CXL_SET_FEATURE_FLAG_ABORT_DATA_TRANSFER) { + memset(&set_feat_info->uuid, 0, sizeof(QemuUUID)); + set_feat_info->data_transfer_flag = 0; + set_feat_info->data_saved_across_reset = false; + set_feat_info->data_offset = 0; + set_feat_info->data_size = 0; + } + + return CXL_MBOX_SUCCESS; +} + /* CXL r3.1 Section 8.2.9.9.1.1: Identify Memory Device (Opcode 4000h) */ static CXLRetCode cmd_identify_memory_device(const struct cxl_cmd *cmd, uint8_t *payload_in, @@ -2114,6 +2360,18 @@ static const struct cxl_cmd cxl_cmd_set[256][256] = { [LOGS][GET_SUPPORTED] = { "LOGS_GET_SUPPORTED", cmd_logs_get_supported, 0, 0 }, [LOGS][GET_LOG] = { "LOGS_GET_LOG", cmd_logs_get_log, 0x18, 0 }, + [FEATURES][GET_SUPPORTED] = { "FEATURES_GET_SUPPORTED", + cmd_features_get_supported, 0x8, 0 }, + [FEATURES][GET_FEATURE] = { "FEATURES_GET_FEATURE", + cmd_features_get_feature, 0x15, 0 }, + [FEATURES][SET_FEATURE] = { "FEATURES_SET_FEATURE", + cmd_features_set_feature, + ~0, + (CXL_MBOX_IMMEDIATE_CONFIG_CHANGE | + CXL_MBOX_IMMEDIATE_DATA_CHANGE | + CXL_MBOX_IMMEDIATE_POLICY_CHANGE | + CXL_MBOX_IMMEDIATE_LOG_CHANGE | + CXL_MBOX_SECURITY_STATE_CHANGE)}, [IDENTIFY][MEMORY_DEVICE] = { "IDENTIFY_MEMORY_DEVICE", cmd_identify_memory_device, 0, 0 }, [CCLS][GET_PARTITION_INFO] = { "CCLS_GET_PARTITION_INFO", diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h index cc98553583..48ed0d9240 100644 --- a/include/hw/cxl/cxl_device.h +++ b/include/hw/cxl/cxl_device.h @@ -464,6 +464,14 @@ typedef struct CXLDCRegion { unsigned long *blk_bitmap; } CXLDCRegion; +typedef struct CXLSetFeatureInfo { + QemuUUID uuid; + uint8_t data_transfer_flag; + bool data_saved_across_reset; + uint16_t data_offset; + size_t data_size; +} CXLSetFeatureInfo; + struct CXLType3Dev { /* Private */ PCIDevice parent_obj; @@ -501,6 +509,8 @@ struct CXLType3Dev { CXLPoisonList scan_media_results; bool scan_media_hasrun; + CXLSetFeatureInfo set_feat_info; + struct dynamic_capacity { HostMemoryBackend *host_dc; AddressSpace host_dc_as; From d88f667414106c7216485774293d0831c2482d20 Mon Sep 17 00:00:00 2001 From: Shiju Jose Date: Fri, 5 Jul 2024 13:30:37 +0100 Subject: [PATCH 12/61] hw/cxl/cxl-mailbox-utils: Add device patrol scrub control feature CXL spec 3.1 section 8.2.9.9.11.1 describes the device patrol scrub control feature. The device patrol scrub proactively locates and makes corrections to errors in regular cycle. The patrol scrub control allows the request to configure patrol scrub input configurations. The patrol scrub control allows the requester to specify the number of hours for which the patrol scrub cycles must be completed, provided that the requested number is not less than the minimum number of hours for the patrol scrub cycle that the device is capable of. In addition, the patrol scrub controls allow the host to disable and enable the feature in case disabling of the feature is needed for other purposes such as performance-aware operations which require the background operations to be turned off. Reviewed-by: Davidlohr Bueso Reviewed-by: Fan Ni Signed-off-by: Shiju Jose Link: https://lore.kernel.org/r/20240223085902.1549-3-shiju.jose@huawei.com Signed-off-by: Jonathan Cameron Message-Id: <20240705123039.963781-4-Jonathan.Cameron@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/cxl/cxl-mailbox-utils.c | 79 ++++++++++++++++++++++++++++++++++--- hw/mem/cxl_type3.c | 9 +++++ include/hw/cxl/cxl_device.h | 24 +++++++++++ 3 files changed, 107 insertions(+), 5 deletions(-) diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c index 7b7d8e5eae..485beb9dba 100644 --- a/hw/cxl/cxl-mailbox-utils.c +++ b/hw/cxl/cxl-mailbox-utils.c @@ -824,6 +824,7 @@ typedef struct CXLSupportedFeatureEntry { #define CXL_FEAT_ENTRY_SFE_CONFIG_CHANGE_CXL_RESET BIT(11) enum CXL_SUPPORTED_FEATURES_LIST { + CXL_FEATURE_PATROL_SCRUB = 0, CXL_FEATURE_MAX }; @@ -865,6 +866,17 @@ enum CXL_SET_FEATURE_FLAG_DATA_TRANSFER { }; #define CXL_SET_FEAT_DATA_SAVED_ACROSS_RESET BIT(3) +/* CXL r3.1 section 8.2.9.9.11.1: Device Patrol Scrub Control Feature */ +static const QemuUUID patrol_scrub_uuid = { + .data = UUID(0x96dad7d6, 0xfde8, 0x482b, 0xa7, 0x33, + 0x75, 0x77, 0x4e, 0x06, 0xdb, 0x8a) +}; + +typedef struct CXLMemPatrolScrubSetFeature { + CXLSetFeatureInHeader hdr; + CXLMemPatrolScrubWriteAttrs feat_data; +} QEMU_PACKED QEMU_ALIGNED(16) CXLMemPatrolScrubSetFeature; + /* CXL r3.1 section 8.2.9.6.1: Get Supported Features (Opcode 0500h) */ static CXLRetCode cmd_features_get_supported(const struct cxl_cmd *cmd, uint8_t *payload_in, @@ -890,11 +902,7 @@ static CXLRetCode cmd_features_get_supported(const struct cxl_cmd *cmd, return CXL_MBOX_UNSUPPORTED; } if (get_feats_in->count < sizeof(CXLSupportedFeatureHeader) || - /* - * Temporary: suppress compiler error due to unsigned - * comparioson to zero. - */ - true /*get_feats_in->start_index >= CXL_FEATURE_MAX*/) { + get_feats_in->start_index >= CXL_FEATURE_MAX) { return CXL_MBOX_INVALID_INPUT; } @@ -907,6 +915,21 @@ static CXLRetCode cmd_features_get_supported(const struct cxl_cmd *cmd, for (entry = 0, index = get_feats_in->start_index; entry < req_entries; index++) { switch (index) { + case CXL_FEATURE_PATROL_SCRUB: + /* Fill supported feature entry for device patrol scrub control */ + get_feats_out->feat_entries[entry++] = + (struct CXLSupportedFeatureEntry) { + .uuid = patrol_scrub_uuid, + .feat_index = index, + .get_feat_size = sizeof(CXLMemPatrolScrubReadAttrs), + .set_feat_size = sizeof(CXLMemPatrolScrubWriteAttrs), + .attr_flags = CXL_FEAT_ENTRY_ATTR_FLAG_CHANGABLE, + .get_feat_version = CXL_MEMDEV_PS_GET_FEATURE_VERSION, + .set_feat_version = CXL_MEMDEV_PS_SET_FEATURE_VERSION, + .set_feat_effects = CXL_FEAT_ENTRY_SFE_IMMEDIATE_CONFIG_CHANGE | + CXL_FEAT_ENTRY_SFE_CEL_VALID, + }; + break; default: __builtin_unreachable(); } @@ -956,6 +979,20 @@ static CXLRetCode cmd_features_get_feature(const struct cxl_cmd *cmd, return CXL_MBOX_INVALID_INPUT; } + if (qemu_uuid_is_equal(&get_feature->uuid, &patrol_scrub_uuid)) { + if (get_feature->offset >= sizeof(CXLMemPatrolScrubReadAttrs)) { + return CXL_MBOX_INVALID_INPUT; + } + bytes_to_copy = sizeof(CXLMemPatrolScrubReadAttrs) - + get_feature->offset; + bytes_to_copy = MIN(bytes_to_copy, get_feature->count); + memcpy(payload_out, + (uint8_t *)&ct3d->patrol_scrub_attrs + get_feature->offset, + bytes_to_copy); + } else { + return CXL_MBOX_UNSUPPORTED; + } + *len_out = bytes_to_copy; return CXL_MBOX_SUCCESS; @@ -970,7 +1007,10 @@ static CXLRetCode cmd_features_set_feature(const struct cxl_cmd *cmd, CXLCCI *cci) { CXLSetFeatureInHeader *hdr = (void *)payload_in; + CXLMemPatrolScrubWriteAttrs *ps_write_attrs; + CXLMemPatrolScrubSetFeature *ps_set_feature; CXLSetFeatureInfo *set_feat_info; + uint16_t bytes_to_copy = 0; uint8_t data_transfer_flag; CXLType3Dev *ct3d; @@ -999,11 +1039,40 @@ static CXLRetCode cmd_features_set_feature(const struct cxl_cmd *cmd, } set_feat_info->data_transfer_flag = data_transfer_flag; set_feat_info->data_offset = hdr->offset; + bytes_to_copy = len_in - sizeof(CXLSetFeatureInHeader); + + if (qemu_uuid_is_equal(&hdr->uuid, &patrol_scrub_uuid)) { + if (hdr->version != CXL_MEMDEV_PS_SET_FEATURE_VERSION) { + return CXL_MBOX_UNSUPPORTED; + } + + ps_set_feature = (void *)payload_in; + ps_write_attrs = &ps_set_feature->feat_data; + memcpy((uint8_t *)&ct3d->patrol_scrub_wr_attrs + hdr->offset, + ps_write_attrs, + bytes_to_copy); + set_feat_info->data_size += bytes_to_copy; + + if (data_transfer_flag == CXL_SET_FEATURE_FLAG_FULL_DATA_TRANSFER || + data_transfer_flag == CXL_SET_FEATURE_FLAG_FINISH_DATA_TRANSFER) { + ct3d->patrol_scrub_attrs.scrub_cycle &= ~0xFF; + ct3d->patrol_scrub_attrs.scrub_cycle |= + ct3d->patrol_scrub_wr_attrs.scrub_cycle_hr & 0xFF; + ct3d->patrol_scrub_attrs.scrub_flags &= ~0x1; + ct3d->patrol_scrub_attrs.scrub_flags |= + ct3d->patrol_scrub_wr_attrs.scrub_flags & 0x1; + } + } else { + return CXL_MBOX_UNSUPPORTED; + } if (data_transfer_flag == CXL_SET_FEATURE_FLAG_FULL_DATA_TRANSFER || data_transfer_flag == CXL_SET_FEATURE_FLAG_FINISH_DATA_TRANSFER || data_transfer_flag == CXL_SET_FEATURE_FLAG_ABORT_DATA_TRANSFER) { memset(&set_feat_info->uuid, 0, sizeof(QemuUUID)); + if (qemu_uuid_is_equal(&hdr->uuid, &patrol_scrub_uuid)) { + memset(&ct3d->patrol_scrub_wr_attrs, 0, set_feat_info->data_size); + } set_feat_info->data_transfer_flag = 0; set_feat_info->data_saved_across_reset = false; set_feat_info->data_offset = 0; diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c index c7910687ae..7c583d80f5 100644 --- a/hw/mem/cxl_type3.c +++ b/hw/mem/cxl_type3.c @@ -908,6 +908,15 @@ static void ct3_realize(PCIDevice *pci_dev, Error **errp) } cxl_event_init(&ct3d->cxl_dstate, 2); + /* Set default value for patrol scrub attributes */ + ct3d->patrol_scrub_attrs.scrub_cycle_cap = + CXL_MEMDEV_PS_SCRUB_CYCLE_CHANGE_CAP_DEFAULT | + CXL_MEMDEV_PS_SCRUB_REALTIME_REPORT_CAP_DEFAULT; + ct3d->patrol_scrub_attrs.scrub_cycle = + CXL_MEMDEV_PS_CUR_SCRUB_CYCLE_DEFAULT | + (CXL_MEMDEV_PS_MIN_SCRUB_CYCLE_DEFAULT << 8); + ct3d->patrol_scrub_attrs.scrub_flags = CXL_MEMDEV_PS_ENABLE_DEFAULT; + return; err_release_cdat: diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h index 48ed0d9240..2c1df25453 100644 --- a/include/hw/cxl/cxl_device.h +++ b/include/hw/cxl/cxl_device.h @@ -427,6 +427,26 @@ typedef struct CXLPoison { typedef QLIST_HEAD(, CXLPoison) CXLPoisonList; #define CXL_POISON_LIST_LIMIT 256 +/* CXL memory device patrol scrub control attributes */ +typedef struct CXLMemPatrolScrubReadAttrs { + uint8_t scrub_cycle_cap; + uint16_t scrub_cycle; + uint8_t scrub_flags; +} QEMU_PACKED CXLMemPatrolScrubReadAttrs; + +typedef struct CXLMemPatrolScrubWriteAttrs { + uint8_t scrub_cycle_hr; + uint8_t scrub_flags; +} QEMU_PACKED CXLMemPatrolScrubWriteAttrs; + +#define CXL_MEMDEV_PS_GET_FEATURE_VERSION 0x01 +#define CXL_MEMDEV_PS_SET_FEATURE_VERSION 0x01 +#define CXL_MEMDEV_PS_SCRUB_CYCLE_CHANGE_CAP_DEFAULT BIT(0) +#define CXL_MEMDEV_PS_SCRUB_REALTIME_REPORT_CAP_DEFAULT BIT(1) +#define CXL_MEMDEV_PS_CUR_SCRUB_CYCLE_DEFAULT 12 +#define CXL_MEMDEV_PS_MIN_SCRUB_CYCLE_DEFAULT 1 +#define CXL_MEMDEV_PS_ENABLE_DEFAULT 0 + #define DCD_MAX_NUM_REGION 8 typedef struct CXLDCExtentRaw { @@ -511,6 +531,10 @@ struct CXLType3Dev { CXLSetFeatureInfo set_feat_info; + /* Patrol scrub control attributes */ + CXLMemPatrolScrubReadAttrs patrol_scrub_attrs; + CXLMemPatrolScrubWriteAttrs patrol_scrub_wr_attrs; + struct dynamic_capacity { HostMemoryBackend *host_dc; AddressSpace host_dc_as; From 2d41ce38fb9af3e66f85c8b8f9c3f83148c3d549 Mon Sep 17 00:00:00 2001 From: Shiju Jose Date: Fri, 5 Jul 2024 13:30:38 +0100 Subject: [PATCH 13/61] hw/cxl/cxl-mailbox-utils: Add device DDR5 ECS control feature CXL spec 3.1 section 8.2.9.9.11.2 describes the DDR5 Error Check Scrub (ECS) control feature. The Error Check Scrub (ECS) is a feature defined in JEDEC DDR5 SDRAM Specification (JESD79-5) and allows the DRAM to internally read, correct single-bit errors, and write back corrected data bits to the DRAM array while providing transparency to error counts. The ECS control feature allows the request to configure ECS input configurations during system boot or at run-time. The ECS control allows the requester to change the log entry type, the ECS threshold count provided that the request is within the definition specified in DDR5 mode registers, change mode between codeword mode and row count mode, and reset the ECS counter. Reviewed-by: Davidlohr Bueso Reviewed-by: Fan Ni Signed-off-by: Shiju Jose Link: https://lore.kernel.org/r/20240223085902.1549-4-shiju.jose@huawei.com Signed-off-by: Jonathan Cameron Message-Id: <20240705123039.963781-5-Jonathan.Cameron@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/cxl/cxl-mailbox-utils.c | 71 +++++++++++++++++++++++++++++++++++++ hw/mem/cxl_type3.c | 14 ++++++++ include/hw/cxl/cxl_device.h | 24 +++++++++++++ 3 files changed, 109 insertions(+) diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c index 485beb9dba..0621f686f4 100644 --- a/hw/cxl/cxl-mailbox-utils.c +++ b/hw/cxl/cxl-mailbox-utils.c @@ -825,6 +825,7 @@ typedef struct CXLSupportedFeatureEntry { enum CXL_SUPPORTED_FEATURES_LIST { CXL_FEATURE_PATROL_SCRUB = 0, + CXL_FEATURE_ECS, CXL_FEATURE_MAX }; @@ -877,6 +878,20 @@ typedef struct CXLMemPatrolScrubSetFeature { CXLMemPatrolScrubWriteAttrs feat_data; } QEMU_PACKED QEMU_ALIGNED(16) CXLMemPatrolScrubSetFeature; +/* + * CXL r3.1 section 8.2.9.9.11.2: + * DDR5 Error Check Scrub (ECS) Control Feature + */ +static const QemuUUID ecs_uuid = { + .data = UUID(0xe5b13f22, 0x2328, 0x4a14, 0xb8, 0xba, + 0xb9, 0x69, 0x1e, 0x89, 0x33, 0x86) +}; + +typedef struct CXLMemECSSetFeature { + CXLSetFeatureInHeader hdr; + CXLMemECSWriteAttrs feat_data[]; +} QEMU_PACKED QEMU_ALIGNED(16) CXLMemECSSetFeature; + /* CXL r3.1 section 8.2.9.6.1: Get Supported Features (Opcode 0500h) */ static CXLRetCode cmd_features_get_supported(const struct cxl_cmd *cmd, uint8_t *payload_in, @@ -930,6 +945,23 @@ static CXLRetCode cmd_features_get_supported(const struct cxl_cmd *cmd, CXL_FEAT_ENTRY_SFE_CEL_VALID, }; break; + case CXL_FEATURE_ECS: + /* Fill supported feature entry for device DDR5 ECS control */ + get_feats_out->feat_entries[entry++] = + (struct CXLSupportedFeatureEntry) { + .uuid = ecs_uuid, + .feat_index = index, + .get_feat_size = CXL_ECS_NUM_MEDIA_FRUS * + sizeof(CXLMemECSReadAttrs), + .set_feat_size = CXL_ECS_NUM_MEDIA_FRUS * + sizeof(CXLMemECSWriteAttrs), + .attr_flags = CXL_FEAT_ENTRY_ATTR_FLAG_CHANGABLE, + .get_feat_version = CXL_ECS_GET_FEATURE_VERSION, + .set_feat_version = CXL_ECS_SET_FEATURE_VERSION, + .set_feat_effects = CXL_FEAT_ENTRY_SFE_IMMEDIATE_CONFIG_CHANGE | + CXL_FEAT_ENTRY_SFE_CEL_VALID, + }; + break; default: __builtin_unreachable(); } @@ -989,6 +1021,18 @@ static CXLRetCode cmd_features_get_feature(const struct cxl_cmd *cmd, memcpy(payload_out, (uint8_t *)&ct3d->patrol_scrub_attrs + get_feature->offset, bytes_to_copy); + } else if (qemu_uuid_is_equal(&get_feature->uuid, &ecs_uuid)) { + if (get_feature->offset >= CXL_ECS_NUM_MEDIA_FRUS * + sizeof(CXLMemECSReadAttrs)) { + return CXL_MBOX_INVALID_INPUT; + } + bytes_to_copy = CXL_ECS_NUM_MEDIA_FRUS * + sizeof(CXLMemECSReadAttrs) - + get_feature->offset; + bytes_to_copy = MIN(bytes_to_copy, get_feature->count); + memcpy(payload_out, + (uint8_t *)&ct3d->ecs_attrs + get_feature->offset, + bytes_to_copy); } else { return CXL_MBOX_UNSUPPORTED; } @@ -1009,10 +1053,13 @@ static CXLRetCode cmd_features_set_feature(const struct cxl_cmd *cmd, CXLSetFeatureInHeader *hdr = (void *)payload_in; CXLMemPatrolScrubWriteAttrs *ps_write_attrs; CXLMemPatrolScrubSetFeature *ps_set_feature; + CXLMemECSWriteAttrs *ecs_write_attrs; + CXLMemECSSetFeature *ecs_set_feature; CXLSetFeatureInfo *set_feat_info; uint16_t bytes_to_copy = 0; uint8_t data_transfer_flag; CXLType3Dev *ct3d; + uint16_t count; if (!object_dynamic_cast(OBJECT(cci->d), TYPE_CXL_TYPE3)) { @@ -1062,6 +1109,28 @@ static CXLRetCode cmd_features_set_feature(const struct cxl_cmd *cmd, ct3d->patrol_scrub_attrs.scrub_flags |= ct3d->patrol_scrub_wr_attrs.scrub_flags & 0x1; } + } else if (qemu_uuid_is_equal(&hdr->uuid, + &ecs_uuid)) { + if (hdr->version != CXL_ECS_SET_FEATURE_VERSION) { + return CXL_MBOX_UNSUPPORTED; + } + + ecs_set_feature = (void *)payload_in; + ecs_write_attrs = ecs_set_feature->feat_data; + memcpy((uint8_t *)ct3d->ecs_wr_attrs + hdr->offset, + ecs_write_attrs, + bytes_to_copy); + set_feat_info->data_size += bytes_to_copy; + + if (data_transfer_flag == CXL_SET_FEATURE_FLAG_FULL_DATA_TRANSFER || + data_transfer_flag == CXL_SET_FEATURE_FLAG_FINISH_DATA_TRANSFER) { + for (count = 0; count < CXL_ECS_NUM_MEDIA_FRUS; count++) { + ct3d->ecs_attrs[count].ecs_log_cap = + ct3d->ecs_wr_attrs[count].ecs_log_cap; + ct3d->ecs_attrs[count].ecs_config = + ct3d->ecs_wr_attrs[count].ecs_config & 0x1F; + } + } } else { return CXL_MBOX_UNSUPPORTED; } @@ -1072,6 +1141,8 @@ static CXLRetCode cmd_features_set_feature(const struct cxl_cmd *cmd, memset(&set_feat_info->uuid, 0, sizeof(QemuUUID)); if (qemu_uuid_is_equal(&hdr->uuid, &patrol_scrub_uuid)) { memset(&ct3d->patrol_scrub_wr_attrs, 0, set_feat_info->data_size); + } else if (qemu_uuid_is_equal(&hdr->uuid, &ecs_uuid)) { + memset(ct3d->ecs_wr_attrs, 0, set_feat_info->data_size); } set_feat_info->data_transfer_flag = 0; set_feat_info->data_saved_across_reset = false; diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c index 7c583d80f5..d648192ab9 100644 --- a/hw/mem/cxl_type3.c +++ b/hw/mem/cxl_type3.c @@ -844,6 +844,7 @@ static void ct3_realize(PCIDevice *pci_dev, Error **errp) uint8_t *pci_conf = pci_dev->config; unsigned short msix_num = 6; int i, rc; + uint16_t count; QTAILQ_INIT(&ct3d->error_list); @@ -917,6 +918,19 @@ static void ct3_realize(PCIDevice *pci_dev, Error **errp) (CXL_MEMDEV_PS_MIN_SCRUB_CYCLE_DEFAULT << 8); ct3d->patrol_scrub_attrs.scrub_flags = CXL_MEMDEV_PS_ENABLE_DEFAULT; + /* Set default value for DDR5 ECS read attributes */ + for (count = 0; count < CXL_ECS_NUM_MEDIA_FRUS; count++) { + ct3d->ecs_attrs[count].ecs_log_cap = + CXL_ECS_LOG_ENTRY_TYPE_DEFAULT; + ct3d->ecs_attrs[count].ecs_cap = + CXL_ECS_REALTIME_REPORT_CAP_DEFAULT; + ct3d->ecs_attrs[count].ecs_config = + CXL_ECS_THRESHOLD_COUNT_DEFAULT | + (CXL_ECS_MODE_DEFAULT << 3); + /* Reserved */ + ct3d->ecs_attrs[count].ecs_flags = 0; + } + return; err_release_cdat: diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h index 2c1df25453..5cae7159e6 100644 --- a/include/hw/cxl/cxl_device.h +++ b/include/hw/cxl/cxl_device.h @@ -447,6 +447,27 @@ typedef struct CXLMemPatrolScrubWriteAttrs { #define CXL_MEMDEV_PS_MIN_SCRUB_CYCLE_DEFAULT 1 #define CXL_MEMDEV_PS_ENABLE_DEFAULT 0 +/* CXL memory device DDR5 ECS control attributes */ +typedef struct CXLMemECSReadAttrs { + uint8_t ecs_log_cap; + uint8_t ecs_cap; + uint16_t ecs_config; + uint8_t ecs_flags; +} QEMU_PACKED CXLMemECSReadAttrs; + +typedef struct CXLMemECSWriteAttrs { + uint8_t ecs_log_cap; + uint16_t ecs_config; +} QEMU_PACKED CXLMemECSWriteAttrs; + +#define CXL_ECS_GET_FEATURE_VERSION 0x01 +#define CXL_ECS_SET_FEATURE_VERSION 0x01 +#define CXL_ECS_LOG_ENTRY_TYPE_DEFAULT 0x01 +#define CXL_ECS_REALTIME_REPORT_CAP_DEFAULT 1 +#define CXL_ECS_THRESHOLD_COUNT_DEFAULT 3 /* 3: 256, 4: 1024, 5: 4096 */ +#define CXL_ECS_MODE_DEFAULT 0 +#define CXL_ECS_NUM_MEDIA_FRUS 3 /* Default */ + #define DCD_MAX_NUM_REGION 8 typedef struct CXLDCExtentRaw { @@ -534,6 +555,9 @@ struct CXLType3Dev { /* Patrol scrub control attributes */ CXLMemPatrolScrubReadAttrs patrol_scrub_attrs; CXLMemPatrolScrubWriteAttrs patrol_scrub_wr_attrs; + /* ECS control attributes */ + CXLMemECSReadAttrs ecs_attrs[CXL_ECS_NUM_MEDIA_FRUS]; + CXLMemECSWriteAttrs ecs_wr_attrs[CXL_ECS_NUM_MEDIA_FRUS]; struct dynamic_capacity { HostMemoryBackend *host_dc; From 3c1e1e5e240e683f0611b96cd325471639f22c6d Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 5 Jul 2024 13:59:15 +0100 Subject: [PATCH 14/61] hw/cxl: Support firmware updates Implement transfer and activate functionality per 3.1 spec for supporting update metadata (no actual buffers). Transfer times are arbitrarily set to ten and two seconds for full and part transfers, respectively. cxl update-firmware mem0 -F fw.img cxl update-firmware mem0 "memdev":"mem0", "pmem_size":"1024.00 MiB (1073.74 MB)", "serial":"0", "host":"0000:0d:00.0", "firmware":{ "num_slots":2, "active_slot":1, "online_activate_capable":true, "slot_1_version":"BWFW VERSION 0", "fw_update_in_progress":true, "remaining_size":22400 } } cxl update-firmware mem0 { "memdev":"mem0", "pmem_size":"1024.00 MiB (1073.74 MB)", "serial":"0", "host":"0000:0d:00.0", "firmware":{ "num_slots":2, "active_slot":1, "staged_slot":2, "online_activate_capable":true, "slot_1_version":"BWFW VERSION 0", "slot_2_version":"BWFW VERSION 1", "fw_update_in_progress":false } } Signed-off-by: Davidlohr Bueso Link: https://lore.kernel.org/r/20240627164912.25630-1-dave@stgolabs.net Signed-off-by: Jonathan Cameron Message-Id: <20240705125915.991672-2-Jonathan.Cameron@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/cxl/cxl-mailbox-utils.c | 205 +++++++++++++++++++++++++++++++++++- include/hw/cxl/cxl_device.h | 15 +++ 2 files changed, 215 insertions(+), 5 deletions(-) diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c index 0621f686f4..b752920ec8 100644 --- a/hw/cxl/cxl-mailbox-utils.c +++ b/hw/cxl/cxl-mailbox-utils.c @@ -63,6 +63,8 @@ enum { #define SET_INTERRUPT_POLICY 0x3 FIRMWARE_UPDATE = 0x02, #define GET_INFO 0x0 + #define TRANSFER 0x1 + #define ACTIVATE 0x2 TIMESTAMP = 0x03, #define GET 0x0 #define SET 0x1 @@ -622,6 +624,9 @@ static CXLRetCode cmd_infostat_bg_op_sts(const struct cxl_cmd *cmd, return CXL_MBOX_SUCCESS; } +#define CXL_FW_SLOTS 2 +#define CXL_FW_SIZE 0x02000000 /* 32 mb */ + /* CXL r3.1 Section 8.2.9.3.1: Get FW Info (Opcode 0200h) */ static CXLRetCode cmd_firmware_update_get_info(const struct cxl_cmd *cmd, uint8_t *payload_in, @@ -652,15 +657,192 @@ static CXLRetCode cmd_firmware_update_get_info(const struct cxl_cmd *cmd, fw_info = (void *)payload_out; - fw_info->slots_supported = 2; - fw_info->slot_info = BIT(0) | BIT(3); - fw_info->caps = 0; - pstrcpy(fw_info->fw_rev1, sizeof(fw_info->fw_rev1), "BWFW VERSION 0"); + fw_info->slots_supported = CXL_FW_SLOTS; + fw_info->slot_info = (cci->fw.active_slot & 0x7) | + ((cci->fw.staged_slot & 0x7) << 3); + fw_info->caps = BIT(0); /* online update supported */ + + if (cci->fw.slot[0]) { + pstrcpy(fw_info->fw_rev1, sizeof(fw_info->fw_rev1), "BWFW VERSION 0"); + } + if (cci->fw.slot[1]) { + pstrcpy(fw_info->fw_rev2, sizeof(fw_info->fw_rev2), "BWFW VERSION 1"); + } *len_out = sizeof(*fw_info); return CXL_MBOX_SUCCESS; } +/* CXL r3.1 section 8.2.9.3.2: Transfer FW (Opcode 0201h) */ +#define CXL_FW_XFER_ALIGNMENT 128 + +#define CXL_FW_XFER_ACTION_FULL 0x0 +#define CXL_FW_XFER_ACTION_INIT 0x1 +#define CXL_FW_XFER_ACTION_CONTINUE 0x2 +#define CXL_FW_XFER_ACTION_END 0x3 +#define CXL_FW_XFER_ACTION_ABORT 0x4 + +static CXLRetCode cmd_firmware_update_transfer(const struct cxl_cmd *cmd, + uint8_t *payload_in, + size_t len, + uint8_t *payload_out, + size_t *len_out, + CXLCCI *cci) +{ + struct { + uint8_t action; + uint8_t slot; + uint8_t rsvd1[2]; + uint32_t offset; + uint8_t rsvd2[0x78]; + uint8_t data[]; + } QEMU_PACKED *fw_transfer = (void *)payload_in; + size_t offset, length; + + if (fw_transfer->action == CXL_FW_XFER_ACTION_ABORT) { + /* + * At this point there aren't any on-going transfers + * running in the bg - this is serialized before this + * call altogether. Just mark the state machine and + * disregard any other input. + */ + cci->fw.transferring = false; + return CXL_MBOX_SUCCESS; + } + + offset = fw_transfer->offset * CXL_FW_XFER_ALIGNMENT; + length = len - sizeof(*fw_transfer); + if (offset + length > CXL_FW_SIZE) { + return CXL_MBOX_INVALID_INPUT; + } + + if (cci->fw.transferring) { + if (fw_transfer->action == CXL_FW_XFER_ACTION_FULL || + fw_transfer->action == CXL_FW_XFER_ACTION_INIT) { + return CXL_MBOX_FW_XFER_IN_PROGRESS; + } + /* + * Abort partitioned package transfer if over 30 secs + * between parts. As opposed to the explicit ABORT action, + * semantically treat this condition as an error - as + * if a part action were passed without a previous INIT. + */ + if (difftime(time(NULL), cci->fw.last_partxfer) > 30.0) { + cci->fw.transferring = false; + return CXL_MBOX_INVALID_INPUT; + } + } else if (fw_transfer->action == CXL_FW_XFER_ACTION_CONTINUE || + fw_transfer->action == CXL_FW_XFER_ACTION_END) { + return CXL_MBOX_INVALID_INPUT; + } + + /* allow back-to-back retransmission */ + if ((offset != cci->fw.prev_offset || length != cci->fw.prev_len) && + (fw_transfer->action == CXL_FW_XFER_ACTION_CONTINUE || + fw_transfer->action == CXL_FW_XFER_ACTION_END)) { + /* verify no overlaps */ + if (offset < cci->fw.prev_offset + cci->fw.prev_len) { + return CXL_MBOX_FW_XFER_OUT_OF_ORDER; + } + } + + switch (fw_transfer->action) { + case CXL_FW_XFER_ACTION_FULL: /* ignores offset */ + case CXL_FW_XFER_ACTION_END: + if (fw_transfer->slot == 0 || + fw_transfer->slot == cci->fw.active_slot || + fw_transfer->slot > CXL_FW_SLOTS) { + return CXL_MBOX_FW_INVALID_SLOT; + } + + /* mark the slot used upon bg completion */ + break; + case CXL_FW_XFER_ACTION_INIT: + if (offset != 0) { + return CXL_MBOX_INVALID_INPUT; + } + + cci->fw.transferring = true; + cci->fw.prev_offset = offset; + cci->fw.prev_len = length; + break; + case CXL_FW_XFER_ACTION_CONTINUE: + cci->fw.prev_offset = offset; + cci->fw.prev_len = length; + break; + default: + return CXL_MBOX_INVALID_INPUT; + } + + if (fw_transfer->action == CXL_FW_XFER_ACTION_FULL) { + cci->bg.runtime = 10 * 1000UL; + } else { + cci->bg.runtime = 2 * 1000UL; + } + /* keep relevant context for bg completion */ + cci->fw.curr_action = fw_transfer->action; + cci->fw.curr_slot = fw_transfer->slot; + *len_out = 0; + + return CXL_MBOX_BG_STARTED; +} + +static void __do_firmware_xfer(CXLCCI *cci) +{ + switch (cci->fw.curr_action) { + case CXL_FW_XFER_ACTION_FULL: + case CXL_FW_XFER_ACTION_END: + cci->fw.slot[cci->fw.curr_slot - 1] = true; + cci->fw.transferring = false; + break; + case CXL_FW_XFER_ACTION_INIT: + case CXL_FW_XFER_ACTION_CONTINUE: + time(&cci->fw.last_partxfer); + break; + default: + break; + } +} + +/* CXL r3.1 section 8.2.9.3.3: Activate FW (Opcode 0202h) */ +static CXLRetCode cmd_firmware_update_activate(const struct cxl_cmd *cmd, + uint8_t *payload_in, + size_t len, + uint8_t *payload_out, + size_t *len_out, + CXLCCI *cci) +{ + struct { + uint8_t action; + uint8_t slot; + } QEMU_PACKED *fw_activate = (void *)payload_in; + QEMU_BUILD_BUG_ON(sizeof(*fw_activate) != 0x2); + + if (fw_activate->slot == 0 || + fw_activate->slot == cci->fw.active_slot || + fw_activate->slot > CXL_FW_SLOTS) { + return CXL_MBOX_FW_INVALID_SLOT; + } + + /* ensure that an actual fw package is there */ + if (!cci->fw.slot[fw_activate->slot - 1]) { + return CXL_MBOX_FW_INVALID_SLOT; + } + + switch (fw_activate->action) { + case 0: /* online */ + cci->fw.active_slot = fw_activate->slot; + break; + case 1: /* reset */ + cci->fw.staged_slot = fw_activate->slot; + break; + default: + return CXL_MBOX_INVALID_INPUT; + } + + return CXL_MBOX_SUCCESS; +} + /* CXL r3.1 Section 8.2.9.4.1: Get Timestamp (Opcode 0300h) */ static CXLRetCode cmd_timestamp_get(const struct cxl_cmd *cmd, uint8_t *payload_in, @@ -2494,6 +2676,10 @@ static const struct cxl_cmd cxl_cmd_set[256][256] = { ~0, CXL_MBOX_IMMEDIATE_CONFIG_CHANGE }, [FIRMWARE_UPDATE][GET_INFO] = { "FIRMWARE_UPDATE_GET_INFO", cmd_firmware_update_get_info, 0, 0 }, + [FIRMWARE_UPDATE][TRANSFER] = { "FIRMWARE_UPDATE_TRANSFER", + cmd_firmware_update_transfer, ~0, CXL_MBOX_BACKGROUND_OPERATION }, + [FIRMWARE_UPDATE][ACTIVATE] = { "FIRMWARE_UPDATE_ACTIVATE", + cmd_firmware_update_activate, 2, CXL_MBOX_BACKGROUND_OPERATION }, [TIMESTAMP][GET] = { "TIMESTAMP_GET", cmd_timestamp_get, 0, 0 }, [TIMESTAMP][SET] = { "TIMESTAMP_SET", cmd_timestamp_set, 8, CXL_MBOX_IMMEDIATE_POLICY_CHANGE }, @@ -2622,7 +2808,9 @@ int cxl_process_cci_message(CXLCCI *cci, uint8_t set, uint8_t cmd, h == cmd_media_get_poison_list || h == cmd_media_inject_poison || h == cmd_media_clear_poison || - h == cmd_sanitize_overwrite) { + h == cmd_sanitize_overwrite || + h == cmd_firmware_update_transfer || + h == cmd_firmware_update_activate) { return CXL_MBOX_MEDIA_DISABLED; } } @@ -2667,6 +2855,9 @@ static void bg_timercb(void *opaque) cci->bg.complete_pct = 100; cci->bg.ret_code = ret; switch (cci->bg.opcode) { + case 0x0201: /* fw transfer */ + __do_firmware_xfer(cci); + break; case 0x4400: /* sanitize */ { CXLType3Dev *ct3d = CXL_TYPE3(cci->d); @@ -2738,6 +2929,10 @@ void cxl_init_cci(CXLCCI *cci, size_t payload_max) cci->bg.runtime = 0; cci->bg.timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, bg_timercb, cci); + + memset(&cci->fw, 0, sizeof(cci->fw)); + cci->fw.active_slot = 1; + cci->fw.slot[cci->fw.active_slot - 1] = true; } static void cxl_copy_cci_commands(CXLCCI *cci, const struct cxl_cmd (*cxl_cmds)[256]) diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h index 5cae7159e6..fdd0f4e62b 100644 --- a/include/hw/cxl/cxl_device.h +++ b/include/hw/cxl/cxl_device.h @@ -181,6 +181,21 @@ typedef struct CXLCCI { uint64_t runtime; QEMUTimer *timer; } bg; + + /* firmware update */ + struct { + uint8_t active_slot; + uint8_t staged_slot; + bool slot[4]; + uint8_t curr_action; + uint8_t curr_slot; + /* handle partial transfers */ + bool transferring; + size_t prev_offset; + size_t prev_len; + time_t last_partxfer; + } fw; + size_t payload_max; /* Pointer to device hosting the CCI */ DeviceState *d; From e3f15732c479e6c4baa56af65f52d99fbfb5c417 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Sun, 7 Jul 2024 20:21:12 -0700 Subject: [PATCH 15/61] MAINTAINERS: Add myself as a VT-d reviewer Signed-off-by: Yi Liu Message-Id: <20240708032112.796339-1-yi.l.liu@intel.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index b7e9ced3e8..8ad64ff76b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3659,6 +3659,7 @@ F: tests/uefi-test-tools/ VT-d Emulation M: Michael S. Tsirkin R: Jason Wang +R: Yi Liu S: Supported F: hw/i386/intel_iommu.c F: hw/i386/intel_iommu_internal.h From 98e77e3dd8dd6e7aa9a7dffa60f49c8c8a49d4e3 Mon Sep 17 00:00:00 2001 From: Manos Pitsidianakis Date: Mon, 8 Jul 2024 10:09:49 +0300 Subject: [PATCH 16/61] virtio-snd: add max size bounds check in input cb MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When reading input audio in the virtio-snd input callback, virtio_snd_pcm_in_cb(), we do not check whether the iov can actually fit the data buffer. This is because we use the buffer->size field as a total-so-far accumulator instead of byte-size-left like in TX buffers. This triggers an out of bounds write if the size of the virtio queue element is equal to virtio_snd_pcm_status, which makes the available space for audio data zero. This commit adds a check for reaching the maximum buffer size before attempting any writes. Reported-by: Zheyu Ma Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2427 Signed-off-by: Manos Pitsidianakis Message-Id: Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/audio/virtio-snd.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/hw/audio/virtio-snd.c b/hw/audio/virtio-snd.c index 5993f4f040..e6432ac959 100644 --- a/hw/audio/virtio-snd.c +++ b/hw/audio/virtio-snd.c @@ -1261,7 +1261,7 @@ static void virtio_snd_pcm_in_cb(void *data, int available) { VirtIOSoundPCMStream *stream = data; VirtIOSoundPCMBuffer *buffer; - size_t size; + size_t size, max_size; WITH_QEMU_LOCK_GUARD(&stream->queue_mutex) { while (!QSIMPLEQ_EMPTY(&stream->queue)) { @@ -1275,7 +1275,12 @@ static void virtio_snd_pcm_in_cb(void *data, int available) continue; } + max_size = iov_size(buffer->elem->in_sg, buffer->elem->in_num); for (;;) { + if (buffer->size >= max_size) { + return_rx_buffer(stream, buffer); + break; + } size = AUD_read(stream->voice.in, buffer->data + buffer->size, MIN(available, (stream->params.period_bytes - From 9b6083465fb8311f2410615f8303a41f580a2a20 Mon Sep 17 00:00:00 2001 From: Manos Pitsidianakis Date: Thu, 11 Jul 2024 10:38:49 +0300 Subject: [PATCH 17/61] virtio-snd: check for invalid param shift operands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When setting the parameters of a PCM stream, we compute the bit flag with the format and rate values as shift operand to check if they are set in supported_formats and supported_rates. If the guest provides a format/rate value which when shifting 1 results in a value bigger than the number of bits in supported_formats/supported_rates, we must report an error. Previously, this ended up triggering the not reached assertions later when converting to internal QEMU values. Reported-by: Zheyu Ma Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2416 Signed-off-by: Manos Pitsidianakis Message-Id: Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/audio/virtio-snd.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/hw/audio/virtio-snd.c b/hw/audio/virtio-snd.c index e6432ac959..e5196aa4bb 100644 --- a/hw/audio/virtio-snd.c +++ b/hw/audio/virtio-snd.c @@ -282,11 +282,13 @@ uint32_t virtio_snd_set_pcm_params(VirtIOSound *s, error_report("Number of channels is not supported."); return cpu_to_le32(VIRTIO_SND_S_NOT_SUPP); } - if (!(supported_formats & BIT(params->format))) { + if (BIT(params->format) > sizeof(supported_formats) || + !(supported_formats & BIT(params->format))) { error_report("Stream format is not supported."); return cpu_to_le32(VIRTIO_SND_S_NOT_SUPP); } - if (!(supported_rates & BIT(params->rate))) { + if (BIT(params->rate) > sizeof(supported_rates) || + !(supported_rates & BIT(params->rate))) { error_report("Stream rate is not supported."); return cpu_to_le32(VIRTIO_SND_S_NOT_SUPP); } From a3c8d7e38550c3d5a46e6fa94ffadfa625a4861d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Mathieu--Drif?= Date: Tue, 9 Jul 2024 14:26:08 +0000 Subject: [PATCH 18/61] intel_iommu: fix FRCD construction macro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The constant must be unsigned, otherwise the two's complement overrides the other fields when a PASID is present. Fixes: 1b2b12376c8a ("intel-iommu: PASID support") Signed-off-by: Clément Mathieu--Drif Reviewed-by: Yi Liu Reviewed-by: Zhenzhong Duan Reviewed-by: Minwoo Im Message-Id: <20240709142557.317271-2-clement.mathieu--drif@eviden.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/intel_iommu_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h index f8cf99bddf..cbc4030031 100644 --- a/hw/i386/intel_iommu_internal.h +++ b/hw/i386/intel_iommu_internal.h @@ -267,7 +267,7 @@ /* For the low 64-bit of 128-bit */ #define VTD_FRCD_FI(val) ((val) & ~0xfffULL) #define VTD_FRCD_PV(val) (((val) & 0xffffULL) << 40) -#define VTD_FRCD_PP(val) (((val) & 0x1) << 31) +#define VTD_FRCD_PP(val) (((val) & 0x1ULL) << 31) #define VTD_FRCD_IR_IDX(val) (((val) & 0xffffULL) << 48) /* DMA Remapping Fault Conditions */ From 3a23554f91c01cf75705a36a5eed3ebef6636d41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Mathieu--Drif?= Date: Tue, 9 Jul 2024 14:26:09 +0000 Subject: [PATCH 19/61] intel_iommu: move VTD_FRCD_PV and VTD_FRCD_PP declarations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These 2 macros are for high 64-bit of the FRCD registers. Declarations have to be moved accordingly. Signed-off-by: Clément Mathieu--Drif Reviewed-by: Minwoo Im Reviewed-by: Yi Liu Message-Id: <20240709142557.317271-3-clement.mathieu--drif@eviden.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/intel_iommu_internal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h index cbc4030031..faea23e8d6 100644 --- a/hw/i386/intel_iommu_internal.h +++ b/hw/i386/intel_iommu_internal.h @@ -264,10 +264,10 @@ #define VTD_FRCD_FR(val) (((val) & 0xffULL) << 32) #define VTD_FRCD_SID_MASK 0xffffULL #define VTD_FRCD_SID(val) ((val) & VTD_FRCD_SID_MASK) -/* For the low 64-bit of 128-bit */ -#define VTD_FRCD_FI(val) ((val) & ~0xfffULL) #define VTD_FRCD_PV(val) (((val) & 0xffffULL) << 40) #define VTD_FRCD_PP(val) (((val) & 0x1ULL) << 31) +/* For the low 64-bit of 128-bit */ +#define VTD_FRCD_FI(val) ((val) & ~0xfffULL) #define VTD_FRCD_IR_IDX(val) (((val) & 0xffffULL) << 48) /* DMA Remapping Fault Conditions */ From d7258f7a250716231d23d5412dd6caf923936549 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Mathieu--Drif?= Date: Tue, 9 Jul 2024 14:26:10 +0000 Subject: [PATCH 20/61] intel_iommu: fix type of the mask field in VTDIOTLBPageInvInfo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per the below code, it can overflow as am can be larger than 8 according to the CH 6.5.2.3 IOTLB Invalidate. Use uint64_t to avoid overflows. Fixes: b5a280c00840 ("intel-iommu: add IOTLB using hash table") Signed-off-by: Clément Mathieu--Drif Reviewed-by: Minwoo Im Reviewed-by: Yi Liu Message-Id: <20240709142557.317271-4-clement.mathieu--drif@eviden.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/intel_iommu_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h index faea23e8d6..5f32c36943 100644 --- a/hw/i386/intel_iommu_internal.h +++ b/hw/i386/intel_iommu_internal.h @@ -436,7 +436,7 @@ struct VTDIOTLBPageInvInfo { uint16_t domain_id; uint32_t pasid; uint64_t addr; - uint8_t mask; + uint64_t mask; }; typedef struct VTDIOTLBPageInvInfo VTDIOTLBPageInvInfo; From bb3a23d5b0b43bed3c9a6ecf5a6871e2871be883 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Mathieu--Drif?= Date: Tue, 9 Jul 2024 14:26:10 +0000 Subject: [PATCH 21/61] intel_iommu: make type match MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'level' field in vtd_iotlb_key is an unsigned integer. We don't need to store level as an int in vtd_lookup_iotlb. This is not an issue by itself, but using unsigned here seems cleaner. Signed-off-by: Clément Mathieu--Drif Reviewed-by: Yi Liu Message-Id: <20240709142557.317271-5-clement.mathieu--drif@eviden.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/intel_iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 37c21a0aec..be0cb39b5c 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -358,7 +358,7 @@ static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id, { struct vtd_iotlb_key key; VTDIOTLBEntry *entry; - int level; + unsigned level; for (level = VTD_SL_PT_LEVEL; level < VTD_SL_PML4_LEVEL; level++) { key.gfn = vtd_get_iotlb_gfn(addr, level); From c303aa0942589427b42192ef7cff75a79ef8646b Mon Sep 17 00:00:00 2001 From: Jonah Palmer Date: Wed, 10 Jul 2024 08:55:14 -0400 Subject: [PATCH 22/61] virtio: Add bool to VirtQueueElement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the boolean 'in_order_filled' member to the VirtQueueElement structure. The use of this boolean will signify whether the element has been processed and is ready to be flushed (so long as the element is in-order). This boolean is used to support the VIRTIO_F_IN_ORDER feature. Reviewed-by: Eugenio Pérez Signed-off-by: Jonah Palmer Message-Id: <20240710125522.4168043-2-jonah.palmer@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- include/hw/virtio/virtio.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index 7512afbc84..fdc827f82e 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -69,6 +69,8 @@ typedef struct VirtQueueElement unsigned int ndescs; unsigned int out_num; unsigned int in_num; + /* Element has been processed (VIRTIO_F_IN_ORDER) */ + bool in_order_filled; hwaddr *in_addr; hwaddr *out_addr; struct iovec *in_sg; From 2256e8482b2bba88abcc734dbc6951b825773f0b Mon Sep 17 00:00:00 2001 From: Jonah Palmer Date: Wed, 10 Jul 2024 08:55:15 -0400 Subject: [PATCH 23/61] virtio: virtqueue_pop - VIRTIO_F_IN_ORDER support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add VIRTIO_F_IN_ORDER feature support in virtqueue_split_pop and virtqueue_packed_pop. VirtQueueElements popped from the available/descritpor ring are added to the VirtQueue's used_elems array in-order and in the same fashion as they would be added the used and descriptor rings, respectively. This will allow us to keep track of the current order, what elements have been written, as well as an element's essential data after being processed. Reviewed-by: Eugenio Pérez Signed-off-by: Jonah Palmer Message-Id: <20240710125522.4168043-3-jonah.palmer@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 583a224163..98eb601b09 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -1505,7 +1505,7 @@ static void *virtqueue_alloc_element(size_t sz, unsigned out_num, unsigned in_nu static void *virtqueue_split_pop(VirtQueue *vq, size_t sz) { - unsigned int i, head, max; + unsigned int i, head, max, idx; VRingMemoryRegionCaches *caches; MemoryRegionCache indirect_desc_cache; MemoryRegionCache *desc_cache; @@ -1629,6 +1629,13 @@ static void *virtqueue_split_pop(VirtQueue *vq, size_t sz) elem->in_sg[i] = iov[out_num + i]; } + if (virtio_vdev_has_feature(vdev, VIRTIO_F_IN_ORDER)) { + idx = (vq->last_avail_idx - 1) % vq->vring.num; + vq->used_elems[idx].index = elem->index; + vq->used_elems[idx].len = elem->len; + vq->used_elems[idx].ndescs = elem->ndescs; + } + vq->inuse++; trace_virtqueue_pop(vq, elem, elem->in_num, elem->out_num); @@ -1762,6 +1769,13 @@ static void *virtqueue_packed_pop(VirtQueue *vq, size_t sz) elem->index = id; elem->ndescs = (desc_cache == &indirect_desc_cache) ? 1 : elem_entries; + + if (virtio_vdev_has_feature(vdev, VIRTIO_F_IN_ORDER)) { + vq->used_elems[vq->last_avail_idx].index = elem->index; + vq->used_elems[vq->last_avail_idx].len = elem->len; + vq->used_elems[vq->last_avail_idx].ndescs = elem->ndescs; + } + vq->last_avail_idx += elem->ndescs; vq->inuse += elem->ndescs; From b44135daa3721778f4656ca3c15bd9b3060f1176 Mon Sep 17 00:00:00 2001 From: Jonah Palmer Date: Wed, 10 Jul 2024 08:55:16 -0400 Subject: [PATCH 24/61] virtio: virtqueue_ordered_fill - VIRTIO_F_IN_ORDER support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add VIRTIO_F_IN_ORDER feature support for the virtqueue_fill operation. The goal of the virtqueue_ordered_fill operation when the VIRTIO_F_IN_ORDER feature has been negotiated is to search for this now-used element, set its length, and mark the element as filled in the VirtQueue's used_elems array. By marking the element as filled, it will indicate that this element has been processed and is ready to be flushed, so long as the element is in-order. Reviewed-by: Eugenio Pérez Signed-off-by: Jonah Palmer Message-Id: <20240710125522.4168043-4-jonah.palmer@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio.c | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 98eb601b09..0000a7b41c 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -872,6 +872,46 @@ static void virtqueue_packed_fill(VirtQueue *vq, const VirtQueueElement *elem, vq->used_elems[idx].ndescs = elem->ndescs; } +static void virtqueue_ordered_fill(VirtQueue *vq, const VirtQueueElement *elem, + unsigned int len) +{ + unsigned int i, steps, max_steps; + + i = vq->used_idx % vq->vring.num; + steps = 0; + /* + * We shouldn't need to increase 'i' by more than the distance + * between used_idx and last_avail_idx. + */ + max_steps = (vq->last_avail_idx - vq->used_idx) % vq->vring.num; + + /* Search for element in vq->used_elems */ + while (steps <= max_steps) { + /* Found element, set length and mark as filled */ + if (vq->used_elems[i].index == elem->index) { + vq->used_elems[i].len = len; + vq->used_elems[i].in_order_filled = true; + break; + } + + i += vq->used_elems[i].ndescs; + steps += vq->used_elems[i].ndescs; + + if (i >= vq->vring.num) { + i -= vq->vring.num; + } + } + + /* + * We should be able to find a matching VirtQueueElement in + * used_elems. If we don't, this is an error. + */ + if (steps >= max_steps) { + qemu_log_mask(LOG_GUEST_ERROR, "%s: %s cannot fill buffer id %u\n", + __func__, vq->vdev->name, elem->index); + } +} + static void virtqueue_packed_fill_desc(VirtQueue *vq, const VirtQueueElement *elem, unsigned int idx, @@ -922,7 +962,9 @@ void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem, return; } - if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) { + if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_IN_ORDER)) { + virtqueue_ordered_fill(vq, elem, len); + } else if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) { virtqueue_packed_fill(vq, elem, len, idx); } else { virtqueue_split_fill(vq, elem, len, idx); From 844619147c5110f335cbbbad51cda6a898c7ef33 Mon Sep 17 00:00:00 2001 From: Jonah Palmer Date: Wed, 10 Jul 2024 08:55:17 -0400 Subject: [PATCH 25/61] virtio: virtqueue_ordered_flush - VIRTIO_F_IN_ORDER support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add VIRTIO_F_IN_ORDER feature support for the virtqueue_flush operation. The goal of the virtqueue_ordered_flush operation when the VIRTIO_F_IN_ORDER feature has been negotiated is to write elements to the used/descriptor ring in-order and then update used_idx. The function iterates through the VirtQueueElement used_elems array in-order starting at vq->used_idx. If the element is valid (filled), the element is written to the used/descriptor ring. This process continues until we find an invalid (not filled) element. For packed VQs, the first entry (at vq->used_idx) is written to the descriptor ring last so the guest doesn't see any invalid descriptors. If any elements were written, the used_idx is updated. Signed-off-by: Jonah Palmer Message-Id: <20240710125522.4168043-5-jonah.palmer@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Acked-by: Eugenio Pérez --- hw/virtio/virtio.c | 71 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 70 insertions(+), 1 deletion(-) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 0000a7b41c..397c261c3c 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -1023,6 +1023,73 @@ static void virtqueue_packed_flush(VirtQueue *vq, unsigned int count) } } +static void virtqueue_ordered_flush(VirtQueue *vq) +{ + unsigned int i = vq->used_idx % vq->vring.num; + unsigned int ndescs = 0; + uint16_t old = vq->used_idx; + uint16_t new; + bool packed; + VRingUsedElem uelem; + + packed = virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED); + + if (packed) { + if (unlikely(!vq->vring.desc)) { + return; + } + } else if (unlikely(!vq->vring.used)) { + return; + } + + /* First expected in-order element isn't ready, nothing to do */ + if (!vq->used_elems[i].in_order_filled) { + return; + } + + /* Search for filled elements in-order */ + while (vq->used_elems[i].in_order_filled) { + /* + * First entry for packed VQs is written last so the guest + * doesn't see invalid descriptors. + */ + if (packed && i != vq->used_idx) { + virtqueue_packed_fill_desc(vq, &vq->used_elems[i], ndescs, false); + } else if (!packed) { + uelem.id = vq->used_elems[i].index; + uelem.len = vq->used_elems[i].len; + vring_used_write(vq, &uelem, i); + } + + vq->used_elems[i].in_order_filled = false; + ndescs += vq->used_elems[i].ndescs; + i += vq->used_elems[i].ndescs; + if (i >= vq->vring.num) { + i -= vq->vring.num; + } + } + + if (packed) { + virtqueue_packed_fill_desc(vq, &vq->used_elems[vq->used_idx], 0, true); + vq->used_idx += ndescs; + if (vq->used_idx >= vq->vring.num) { + vq->used_idx -= vq->vring.num; + vq->used_wrap_counter ^= 1; + vq->signalled_used_valid = false; + } + } else { + /* Make sure buffer is written before we update index. */ + smp_wmb(); + new = old + ndescs; + vring_used_idx_set(vq, new); + if (unlikely((int16_t)(new - vq->signalled_used) < + (uint16_t)(new - old))) { + vq->signalled_used_valid = false; + } + } + vq->inuse -= ndescs; +} + void virtqueue_flush(VirtQueue *vq, unsigned int count) { if (virtio_device_disabled(vq->vdev)) { @@ -1030,7 +1097,9 @@ void virtqueue_flush(VirtQueue *vq, unsigned int count) return; } - if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) { + if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_IN_ORDER)) { + virtqueue_ordered_flush(vq); + } else if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) { virtqueue_packed_flush(vq, count); } else { virtqueue_split_flush(vq, count); From c03213fdc9d7b680cc575cd1e725750702a10b09 Mon Sep 17 00:00:00 2001 From: Jonah Palmer Date: Wed, 10 Jul 2024 08:55:18 -0400 Subject: [PATCH 26/61] vhost,vhost-user: Add VIRTIO_F_IN_ORDER to vhost feature bits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for the VIRTIO_F_IN_ORDER feature across a variety of vhost devices. The inclusion of VIRTIO_F_IN_ORDER in the feature bits arrays for these devices ensures that the backend is capable of offering and providing support for this feature, and that it can be disabled if the backend does not support it. Acked-by: Eugenio Pérez Signed-off-by: Jonah Palmer Message-Id: <20240710125522.4168043-6-jonah.palmer@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/block/vhost-user-blk.c | 1 + hw/net/vhost_net.c | 2 ++ hw/scsi/vhost-scsi.c | 1 + hw/scsi/vhost-user-scsi.c | 1 + hw/virtio/vhost-user-fs.c | 1 + hw/virtio/vhost-user-vsock.c | 1 + net/vhost-vdpa.c | 1 + 7 files changed, 8 insertions(+) diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c index fdbc30b9ce..5b7f46bbb0 100644 --- a/hw/block/vhost-user-blk.c +++ b/hw/block/vhost-user-blk.c @@ -51,6 +51,7 @@ static const int user_feature_bits[] = { VIRTIO_F_RING_PACKED, VIRTIO_F_IOMMU_PLATFORM, VIRTIO_F_RING_RESET, + VIRTIO_F_IN_ORDER, VIRTIO_F_NOTIFICATION_DATA, VHOST_INVALID_FEATURE_BIT }; diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index 18898afe81..a788e6937e 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -48,6 +48,7 @@ static const int kernel_feature_bits[] = { VIRTIO_F_IOMMU_PLATFORM, VIRTIO_F_RING_PACKED, VIRTIO_F_RING_RESET, + VIRTIO_F_IN_ORDER, VIRTIO_F_NOTIFICATION_DATA, VIRTIO_NET_F_HASH_REPORT, VHOST_INVALID_FEATURE_BIT @@ -78,6 +79,7 @@ static const int user_feature_bits[] = { VIRTIO_F_IOMMU_PLATFORM, VIRTIO_F_RING_PACKED, VIRTIO_F_RING_RESET, + VIRTIO_F_IN_ORDER, VIRTIO_NET_F_RSS, VIRTIO_NET_F_HASH_REPORT, VIRTIO_NET_F_GUEST_USO4, diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c index 3d5fe0994d..49cff2a0cb 100644 --- a/hw/scsi/vhost-scsi.c +++ b/hw/scsi/vhost-scsi.c @@ -38,6 +38,7 @@ static const int kernel_feature_bits[] = { VIRTIO_RING_F_EVENT_IDX, VIRTIO_SCSI_F_HOTPLUG, VIRTIO_F_RING_RESET, + VIRTIO_F_IN_ORDER, VIRTIO_F_NOTIFICATION_DATA, VHOST_INVALID_FEATURE_BIT }; diff --git a/hw/scsi/vhost-user-scsi.c b/hw/scsi/vhost-user-scsi.c index cc91ade525..55e4be5b34 100644 --- a/hw/scsi/vhost-user-scsi.c +++ b/hw/scsi/vhost-user-scsi.c @@ -36,6 +36,7 @@ static const int user_feature_bits[] = { VIRTIO_RING_F_EVENT_IDX, VIRTIO_SCSI_F_HOTPLUG, VIRTIO_F_RING_RESET, + VIRTIO_F_IN_ORDER, VIRTIO_F_NOTIFICATION_DATA, VHOST_INVALID_FEATURE_BIT }; diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c index ae48cc1c96..32ee7f496d 100644 --- a/hw/virtio/vhost-user-fs.c +++ b/hw/virtio/vhost-user-fs.c @@ -33,6 +33,7 @@ static const int user_feature_bits[] = { VIRTIO_F_RING_PACKED, VIRTIO_F_IOMMU_PLATFORM, VIRTIO_F_RING_RESET, + VIRTIO_F_IN_ORDER, VIRTIO_F_NOTIFICATION_DATA, VHOST_INVALID_FEATURE_BIT }; diff --git a/hw/virtio/vhost-user-vsock.c b/hw/virtio/vhost-user-vsock.c index 802b44a07d..da3b0e0229 100644 --- a/hw/virtio/vhost-user-vsock.c +++ b/hw/virtio/vhost-user-vsock.c @@ -21,6 +21,7 @@ static const int user_feature_bits[] = { VIRTIO_RING_F_INDIRECT_DESC, VIRTIO_RING_F_EVENT_IDX, VIRTIO_F_NOTIFY_ON_EMPTY, + VIRTIO_F_IN_ORDER, VIRTIO_F_NOTIFICATION_DATA, VHOST_INVALID_FEATURE_BIT }; diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c index daa38428c5..03457ead66 100644 --- a/net/vhost-vdpa.c +++ b/net/vhost-vdpa.c @@ -62,6 +62,7 @@ const int vdpa_feature_bits[] = { VIRTIO_F_RING_PACKED, VIRTIO_F_RING_RESET, VIRTIO_F_VERSION_1, + VIRTIO_F_IN_ORDER, VIRTIO_F_NOTIFICATION_DATA, VIRTIO_NET_F_CSUM, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, From 0b0bb34f342f04ab8255a64edb7c6aec7105dc94 Mon Sep 17 00:00:00 2001 From: Jonah Palmer Date: Wed, 10 Jul 2024 08:55:19 -0400 Subject: [PATCH 27/61] virtio: Add VIRTIO_F_IN_ORDER property definition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend the virtio device property definitions to include the VIRTIO_F_IN_ORDER feature. The default state of this feature is disabled, allowing it to be explicitly enabled where it's supported. Acked-by: Eugenio Pérez Signed-off-by: Jonah Palmer Message-Id: <20240710125522.4168043-7-jonah.palmer@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- include/hw/virtio/virtio.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index fdc827f82e..d2a1938757 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -373,7 +373,9 @@ typedef struct VirtIORNGConf VirtIORNGConf; DEFINE_PROP_BIT64("packed", _state, _field, \ VIRTIO_F_RING_PACKED, false), \ DEFINE_PROP_BIT64("queue_reset", _state, _field, \ - VIRTIO_F_RING_RESET, true) + VIRTIO_F_RING_RESET, true), \ + DEFINE_PROP_BIT64("in_order", _state, _field, \ + VIRTIO_F_IN_ORDER, false) hwaddr virtio_queue_get_desc_addr(VirtIODevice *vdev, int n); bool virtio_queue_enabled_legacy(VirtIODevice *vdev, int n); From 99d7c1b99a3fdb69213c09da6b7614243f877bee Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 12 Jul 2024 17:38:57 +0200 Subject: [PATCH 28/61] contrib/vhost-user-blk: fix overflowing expression Coverity reported: >>> CID 1549454: Integer handling issues (OVERFLOW_BEFORE_WIDEN) >>> Potentially overflowing expression "le32_to_cpu(desc->num_sectors) << 9" with type "uint32_t" (32 bits, unsigned) is evaluated using 32-bit arithmetic, and then used in a context that expects an expression of type "uint64_t" (64 bits, unsigned). 199 le32_to_cpu(desc->num_sectors) << 9 }; Coverity noticed this issue after commit ab04420c3 ("contrib/vhost-user-*: use QEMU bswap helper functions"), but it was pre-existing and introduced from the beginning by commit caa1ee4313 ("vhost-user-blk: add discard/write zeroes features support"). Explicitly cast the 32-bit value before the shift to fix this issue. Fixes: Coverity CID 1549454 Fixes: 5ab04420c3 ("contrib/vhost-user-*: use QEMU bswap helper functions") Fixes: caa1ee4313 ("vhost-user-blk: add discard/write zeroes features support") Cc: changpeng.liu@intel.com Suggested-by: Peter Maydell Signed-off-by: Stefano Garzarella Message-Id: <20240712153857.207440-1-sgarzare@redhat.com> Reviewed-by: Peter Maydell Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- contrib/vhost-user-blk/vhost-user-blk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/vhost-user-blk/vhost-user-blk.c b/contrib/vhost-user-blk/vhost-user-blk.c index 9492146855..6cc18a1c04 100644 --- a/contrib/vhost-user-blk/vhost-user-blk.c +++ b/contrib/vhost-user-blk/vhost-user-blk.c @@ -196,7 +196,7 @@ vub_discard_write_zeroes(VubReq *req, struct iovec *iov, uint32_t iovcnt, VubDev *vdev_blk = req->vdev_blk; desc = buf; uint64_t range[2] = { le64_to_cpu(desc->sector) << 9, - le32_to_cpu(desc->num_sectors) << 9 }; + (uint64_t)le32_to_cpu(desc->num_sectors) << 9 }; if (type == VIRTIO_BLK_T_DISCARD) { if (ioctl(vdev_blk->blk_fd, BLKDISCARD, range) == 0) { g_free(buf); From ca6dd3aef8a103138c99788bcba8195d4905ddc5 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Mon, 15 Jul 2024 14:19:08 +0900 Subject: [PATCH 29/61] hw/pci: Fix SR-IOV VF number calculation pci_config_get_bar_addr() had a division by vf_stride. vf_stride needs to be non-zero when there are multiple VFs, but the specification does not prohibit to make it zero when there is only one VF. Do not perform the division for the first VF to avoid division by zero. Signed-off-by: Akihiko Odaki Message-Id: <20240715-sriov-v5-2-3f5539093ffc@daynix.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/pci/pci.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 4c7be52951..cf2794879d 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -1437,7 +1437,11 @@ static pcibus_t pci_config_get_bar_addr(PCIDevice *d, int reg, pci_get_word(pf->config + sriov_cap + PCI_SRIOV_VF_OFFSET); uint16_t vf_stride = pci_get_word(pf->config + sriov_cap + PCI_SRIOV_VF_STRIDE); - uint32_t vf_num = (d->devfn - (pf->devfn + vf_offset)) / vf_stride; + uint32_t vf_num = d->devfn - (pf->devfn + vf_offset); + + if (vf_num) { + vf_num /= vf_stride; + } if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) { new_addr = pci_get_quad(pf->config + bar); From 78f9d7fd1989311040beff54979bcb2a1ba0aff2 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Mon, 15 Jul 2024 14:19:09 +0900 Subject: [PATCH 30/61] pcie_sriov: Ensure PF and VF are mutually exclusive A device cannot be a SR-IOV PF and a VF at the same time. Signed-off-by: Akihiko Odaki Message-Id: <20240715-sriov-v5-3-3f5539093ffc@daynix.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/pci/pcie_sriov.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/hw/pci/pcie_sriov.c b/hw/pci/pcie_sriov.c index 56523ab4e8..6c79658b4c 100644 --- a/hw/pci/pcie_sriov.c +++ b/hw/pci/pcie_sriov.c @@ -42,6 +42,11 @@ bool pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset, uint8_t *cfg = dev->config + offset; uint8_t *wmask; + if (pci_is_vf(dev)) { + error_setg(errp, "a device cannot be both an SR-IOV PF and a VF"); + return false; + } + if (total_vfs) { uint16_t ari_cap = pcie_find_capability(dev, PCI_EXT_CAP_ID_ARI); uint16_t first_vf_devfn = dev->devfn + vf_offset; From 47cc753e50076c25334091783738be9f716253b1 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Mon, 15 Jul 2024 14:19:10 +0900 Subject: [PATCH 31/61] pcie_sriov: Check PCI Express for SR-IOV PF SR-IOV requires PCI Express. Signed-off-by: Akihiko Odaki Message-Id: <20240715-sriov-v5-4-3f5539093ffc@daynix.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/pci/pcie_sriov.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/hw/pci/pcie_sriov.c b/hw/pci/pcie_sriov.c index 6c79658b4c..15a4aac1f4 100644 --- a/hw/pci/pcie_sriov.c +++ b/hw/pci/pcie_sriov.c @@ -42,6 +42,11 @@ bool pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset, uint8_t *cfg = dev->config + offset; uint8_t *wmask; + if (!pci_is_express(dev)) { + error_setg(errp, "PCI Express is required for SR-IOV PF"); + return false; + } + if (pci_is_vf(dev)) { error_setg(errp, "a device cannot be both an SR-IOV PF and a VF"); return false; From 122173a5830f7757f8a94a3b1559582f312e140b Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Mon, 15 Jul 2024 14:19:11 +0900 Subject: [PATCH 32/61] pcie_sriov: Allow user to create SR-IOV device A user can create a SR-IOV device by specifying the PF with the sriov-pf property of the VFs. The VFs must be added before the PF. A user-creatable VF must have PCIDeviceClass::sriov_vf_user_creatable set. Such a VF cannot refer to the PF because it is created before the PF. A PF that user-creatable VFs can be attached calls pcie_sriov_pf_init_from_user_created_vfs() during realization and pcie_sriov_pf_exit() when exiting. Signed-off-by: Akihiko Odaki Message-Id: <20240715-sriov-v5-5-3f5539093ffc@daynix.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/pci/pci.c | 62 +++++--- hw/pci/pcie_sriov.c | 290 ++++++++++++++++++++++++++++-------- include/hw/pci/pci_device.h | 6 +- include/hw/pci/pcie_sriov.h | 18 +++ 4 files changed, 293 insertions(+), 83 deletions(-) diff --git a/hw/pci/pci.c b/hw/pci/pci.c index cf2794879d..8ad5d7e2d8 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -85,6 +85,7 @@ static Property pci_props[] = { QEMU_PCIE_ERR_UNC_MASK_BITNR, true), DEFINE_PROP_BIT("x-pcie-ari-nextfn-1", PCIDevice, cap_present, QEMU_PCIE_ARI_NEXTFN_1_BITNR, false), + DEFINE_PROP_STRING("sriov-pf", PCIDevice, sriov_pf), DEFINE_PROP_END_OF_LIST() }; @@ -959,13 +960,8 @@ static void pci_init_multifunction(PCIBus *bus, PCIDevice *dev, Error **errp) dev->config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION; } - /* - * With SR/IOV and ARI, a device at function 0 need not be a multifunction - * device, as it may just be a VF that ended up with function 0 in - * the legacy PCI interpretation. Avoid failing in such cases: - */ - if (pci_is_vf(dev) && - dev->exp.sriov_vf.pf->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) { + /* SR/IOV is not handled here. */ + if (pci_is_vf(dev)) { return; } @@ -998,7 +994,8 @@ static void pci_init_multifunction(PCIBus *bus, PCIDevice *dev, Error **errp) } /* function 0 indicates single function, so function > 0 must be NULL */ for (func = 1; func < PCI_FUNC_MAX; ++func) { - if (bus->devices[PCI_DEVFN(slot, func)]) { + PCIDevice *device = bus->devices[PCI_DEVFN(slot, func)]; + if (device && !pci_is_vf(device)) { error_setg(errp, "PCI: %x.0 indicates single function, " "but %x.%x is already populated.", slot, slot, func); @@ -1283,6 +1280,7 @@ static void pci_qdev_unrealize(DeviceState *dev) pci_unregister_io_regions(pci_dev); pci_del_option_rom(pci_dev); + pcie_sriov_unregister_device(pci_dev); if (pc->exit) { pc->exit(pci_dev); @@ -1314,7 +1312,6 @@ void pci_register_bar(PCIDevice *pci_dev, int region_num, pcibus_t size = memory_region_size(memory); uint8_t hdr_type; - assert(!pci_is_vf(pci_dev)); /* VFs must use pcie_sriov_vf_register_bar */ assert(region_num >= 0); assert(region_num < PCI_NUM_REGIONS); assert(is_power_of_2(size)); @@ -1325,7 +1322,6 @@ void pci_register_bar(PCIDevice *pci_dev, int region_num, assert(hdr_type != PCI_HEADER_TYPE_BRIDGE || region_num < 2); r = &pci_dev->io_regions[region_num]; - r->addr = PCI_BAR_UNMAPPED; r->size = size; r->type = type; r->memory = memory; @@ -1333,22 +1329,35 @@ void pci_register_bar(PCIDevice *pci_dev, int region_num, ? pci_get_bus(pci_dev)->address_space_io : pci_get_bus(pci_dev)->address_space_mem; - wmask = ~(size - 1); - if (region_num == PCI_ROM_SLOT) { - /* ROM enable bit is writable */ - wmask |= PCI_ROM_ADDRESS_ENABLE; - } + if (pci_is_vf(pci_dev)) { + PCIDevice *pf = pci_dev->exp.sriov_vf.pf; + assert(!pf || type == pf->exp.sriov_pf.vf_bar_type[region_num]); - addr = pci_bar(pci_dev, region_num); - pci_set_long(pci_dev->config + addr, type); - - if (!(r->type & PCI_BASE_ADDRESS_SPACE_IO) && - r->type & PCI_BASE_ADDRESS_MEM_TYPE_64) { - pci_set_quad(pci_dev->wmask + addr, wmask); - pci_set_quad(pci_dev->cmask + addr, ~0ULL); + r->addr = pci_bar_address(pci_dev, region_num, r->type, r->size); + if (r->addr != PCI_BAR_UNMAPPED) { + memory_region_add_subregion_overlap(r->address_space, + r->addr, r->memory, 1); + } } else { - pci_set_long(pci_dev->wmask + addr, wmask & 0xffffffff); - pci_set_long(pci_dev->cmask + addr, 0xffffffff); + r->addr = PCI_BAR_UNMAPPED; + + wmask = ~(size - 1); + if (region_num == PCI_ROM_SLOT) { + /* ROM enable bit is writable */ + wmask |= PCI_ROM_ADDRESS_ENABLE; + } + + addr = pci_bar(pci_dev, region_num); + pci_set_long(pci_dev->config + addr, type); + + if (!(r->type & PCI_BASE_ADDRESS_SPACE_IO) && + r->type & PCI_BASE_ADDRESS_MEM_TYPE_64) { + pci_set_quad(pci_dev->wmask + addr, wmask); + pci_set_quad(pci_dev->cmask + addr, ~0ULL); + } else { + pci_set_long(pci_dev->wmask + addr, wmask & 0xffffffff); + pci_set_long(pci_dev->cmask + addr, 0xffffffff); + } } } @@ -2109,6 +2118,11 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp) } } + if (!pcie_sriov_register_device(pci_dev, errp)) { + pci_qdev_unrealize(DEVICE(pci_dev)); + return; + } + /* * A PCIe Downstream Port that do not have ARI Forwarding enabled must * associate only Device 0 with the device attached to the bus diff --git a/hw/pci/pcie_sriov.c b/hw/pci/pcie_sriov.c index 15a4aac1f4..0fc9f810b9 100644 --- a/hw/pci/pcie_sriov.c +++ b/hw/pci/pcie_sriov.c @@ -20,6 +20,8 @@ #include "qapi/error.h" #include "trace.h" +static GHashTable *pfs; + static void unparent_vfs(PCIDevice *dev, uint16_t total_vfs) { for (uint16_t i = 0; i < total_vfs; i++) { @@ -31,14 +33,49 @@ static void unparent_vfs(PCIDevice *dev, uint16_t total_vfs) dev->exp.sriov_pf.vf = NULL; } -bool pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset, - const char *vfname, uint16_t vf_dev_id, - uint16_t init_vfs, uint16_t total_vfs, - uint16_t vf_offset, uint16_t vf_stride, - Error **errp) +static void clear_ctrl_vfe(PCIDevice *dev) +{ + uint8_t *ctrl = dev->config + dev->exp.sriov_cap + PCI_SRIOV_CTRL; + pci_set_word(ctrl, pci_get_word(ctrl) & ~PCI_SRIOV_CTRL_VFE); +} + +static void register_vfs(PCIDevice *dev) +{ + uint16_t num_vfs; + uint16_t i; + uint16_t sriov_cap = dev->exp.sriov_cap; + + assert(sriov_cap > 0); + num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF); + if (num_vfs > pci_get_word(dev->config + sriov_cap + PCI_SRIOV_TOTAL_VF)) { + clear_ctrl_vfe(dev); + return; + } + + trace_sriov_register_vfs(dev->name, PCI_SLOT(dev->devfn), + PCI_FUNC(dev->devfn), num_vfs); + for (i = 0; i < num_vfs; i++) { + pci_set_enabled(dev->exp.sriov_pf.vf[i], true); + } +} + +static void unregister_vfs(PCIDevice *dev) +{ + uint16_t i; + uint8_t *cfg = dev->config + dev->exp.sriov_cap; + + trace_sriov_unregister_vfs(dev->name, PCI_SLOT(dev->devfn), + PCI_FUNC(dev->devfn)); + for (i = 0; i < pci_get_word(cfg + PCI_SRIOV_TOTAL_VF); i++) { + pci_set_enabled(dev->exp.sriov_pf.vf[i], false); + } +} + +static bool pcie_sriov_pf_init_common(PCIDevice *dev, uint16_t offset, + uint16_t vf_dev_id, uint16_t init_vfs, + uint16_t total_vfs, uint16_t vf_offset, + uint16_t vf_stride, Error **errp) { - BusState *bus = qdev_get_parent_bus(&dev->qdev); - int32_t devfn = dev->devfn + vf_offset; uint8_t *cfg = dev->config + offset; uint8_t *wmask; @@ -100,6 +137,28 @@ bool pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset, qdev_prop_set_bit(&dev->qdev, "multifunction", true); + return true; +} + +bool pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset, + const char *vfname, uint16_t vf_dev_id, + uint16_t init_vfs, uint16_t total_vfs, + uint16_t vf_offset, uint16_t vf_stride, + Error **errp) +{ + BusState *bus = qdev_get_parent_bus(&dev->qdev); + int32_t devfn = dev->devfn + vf_offset; + + if (pfs && g_hash_table_contains(pfs, dev->qdev.id)) { + error_setg(errp, "attaching user-created SR-IOV VF unsupported"); + return false; + } + + if (!pcie_sriov_pf_init_common(dev, offset, vf_dev_id, init_vfs, + total_vfs, vf_offset, vf_stride, errp)) { + return false; + } + dev->exp.sriov_pf.vf = g_new(PCIDevice *, total_vfs); for (uint16_t i = 0; i < total_vfs; i++) { @@ -129,7 +188,24 @@ void pcie_sriov_pf_exit(PCIDevice *dev) { uint8_t *cfg = dev->config + dev->exp.sriov_cap; - unparent_vfs(dev, pci_get_word(cfg + PCI_SRIOV_TOTAL_VF)); + if (dev->exp.sriov_pf.vf_user_created) { + uint16_t ven_id = pci_get_word(dev->config + PCI_VENDOR_ID); + uint16_t total_vfs = pci_get_word(dev->config + PCI_SRIOV_TOTAL_VF); + uint16_t vf_dev_id = pci_get_word(dev->config + PCI_SRIOV_VF_DID); + + unregister_vfs(dev); + + for (uint16_t i = 0; i < total_vfs; i++) { + PCIDevice *vf = dev->exp.sriov_pf.vf[i]; + + vf->exp.sriov_vf.pf = NULL; + + pci_config_set_vendor_id(vf->config, ven_id); + pci_config_set_device_id(vf->config, vf_dev_id); + } + } else { + unparent_vfs(dev, pci_get_word(cfg + PCI_SRIOV_TOTAL_VF)); + } } void pcie_sriov_pf_init_vf_bar(PCIDevice *dev, int region_num, @@ -162,74 +238,172 @@ void pcie_sriov_pf_init_vf_bar(PCIDevice *dev, int region_num, void pcie_sriov_vf_register_bar(PCIDevice *dev, int region_num, MemoryRegion *memory) { - PCIIORegion *r; - PCIBus *bus = pci_get_bus(dev); uint8_t type; - pcibus_t size = memory_region_size(memory); - assert(pci_is_vf(dev)); /* PFs must use pci_register_bar */ - assert(region_num >= 0); - assert(region_num < PCI_NUM_REGIONS); + assert(dev->exp.sriov_vf.pf); type = dev->exp.sriov_vf.pf->exp.sriov_pf.vf_bar_type[region_num]; - if (!is_power_of_2(size)) { - error_report("%s: PCI region size must be a power" - " of two - type=0x%x, size=0x%"FMT_PCIBUS, - __func__, type, size); - exit(1); - } - - r = &dev->io_regions[region_num]; - r->memory = memory; - r->address_space = - type & PCI_BASE_ADDRESS_SPACE_IO - ? bus->address_space_io - : bus->address_space_mem; - r->size = size; - r->type = type; - - r->addr = pci_bar_address(dev, region_num, r->type, r->size); - if (r->addr != PCI_BAR_UNMAPPED) { - memory_region_add_subregion_overlap(r->address_space, - r->addr, r->memory, 1); - } + return pci_register_bar(dev, region_num, type, memory); } -static void clear_ctrl_vfe(PCIDevice *dev) +static gint compare_vf_devfns(gconstpointer a, gconstpointer b) { - uint8_t *ctrl = dev->config + dev->exp.sriov_cap + PCI_SRIOV_CTRL; - pci_set_word(ctrl, pci_get_word(ctrl) & ~PCI_SRIOV_CTRL_VFE); + return (*(PCIDevice **)a)->devfn - (*(PCIDevice **)b)->devfn; } -static void register_vfs(PCIDevice *dev) +int16_t pcie_sriov_pf_init_from_user_created_vfs(PCIDevice *dev, + uint16_t offset, + Error **errp) { - uint16_t num_vfs; + GPtrArray *pf; + PCIDevice **vfs; + BusState *bus = qdev_get_parent_bus(DEVICE(dev)); + uint16_t ven_id = pci_get_word(dev->config + PCI_VENDOR_ID); + uint16_t vf_dev_id; + uint16_t vf_offset; + uint16_t vf_stride; uint16_t i; - uint16_t sriov_cap = dev->exp.sriov_cap; - assert(sriov_cap > 0); - num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF); - if (num_vfs > pci_get_word(dev->config + sriov_cap + PCI_SRIOV_TOTAL_VF)) { - clear_ctrl_vfe(dev); - return; + if (!pfs || !dev->qdev.id) { + return 0; } - trace_sriov_register_vfs(dev->name, PCI_SLOT(dev->devfn), - PCI_FUNC(dev->devfn), num_vfs); - for (i = 0; i < num_vfs; i++) { - pci_set_enabled(dev->exp.sriov_pf.vf[i], true); + pf = g_hash_table_lookup(pfs, dev->qdev.id); + if (!pf) { + return 0; } + + if (pf->len > UINT16_MAX) { + error_setg(errp, "too many VFs"); + return -1; + } + + g_ptr_array_sort(pf, compare_vf_devfns); + vfs = (void *)pf->pdata; + + if (vfs[0]->devfn <= dev->devfn) { + error_setg(errp, "a VF function number is less than the PF function number"); + return -1; + } + + vf_dev_id = pci_get_word(vfs[0]->config + PCI_DEVICE_ID); + vf_offset = vfs[0]->devfn - dev->devfn; + vf_stride = pf->len < 2 ? 0 : vfs[1]->devfn - vfs[0]->devfn; + + for (i = 0; i < pf->len; i++) { + if (bus != qdev_get_parent_bus(&vfs[i]->qdev)) { + error_setg(errp, "SR-IOV VF parent bus mismatches with PF"); + return -1; + } + + if (ven_id != pci_get_word(vfs[i]->config + PCI_VENDOR_ID)) { + error_setg(errp, "SR-IOV VF vendor ID mismatches with PF"); + return -1; + } + + if (vf_dev_id != pci_get_word(vfs[i]->config + PCI_DEVICE_ID)) { + error_setg(errp, "inconsistent SR-IOV VF device IDs"); + return -1; + } + + for (size_t j = 0; j < PCI_NUM_REGIONS; j++) { + if (vfs[i]->io_regions[j].size != vfs[0]->io_regions[j].size || + vfs[i]->io_regions[j].type != vfs[0]->io_regions[j].type) { + error_setg(errp, "inconsistent SR-IOV BARs"); + return -1; + } + } + + if (vfs[i]->devfn - vfs[0]->devfn != vf_stride * i) { + error_setg(errp, "inconsistent SR-IOV stride"); + return -1; + } + } + + if (!pcie_sriov_pf_init_common(dev, offset, vf_dev_id, pf->len, + pf->len, vf_offset, vf_stride, errp)) { + return -1; + } + + for (i = 0; i < pf->len; i++) { + vfs[i]->exp.sriov_vf.pf = dev; + vfs[i]->exp.sriov_vf.vf_number = i; + + /* set vid/did according to sr/iov spec - they are not used */ + pci_config_set_vendor_id(vfs[i]->config, 0xffff); + pci_config_set_device_id(vfs[i]->config, 0xffff); + } + + dev->exp.sriov_pf.vf = vfs; + dev->exp.sriov_pf.vf_user_created = true; + + for (i = 0; i < PCI_NUM_REGIONS; i++) { + PCIIORegion *region = &vfs[0]->io_regions[i]; + + if (region->size) { + pcie_sriov_pf_init_vf_bar(dev, i, region->type, region->size); + } + } + + return PCI_EXT_CAP_SRIOV_SIZEOF; } -static void unregister_vfs(PCIDevice *dev) +bool pcie_sriov_register_device(PCIDevice *dev, Error **errp) { - uint16_t i; - uint8_t *cfg = dev->config + dev->exp.sriov_cap; + if (!dev->exp.sriov_pf.vf && dev->qdev.id && + pfs && g_hash_table_contains(pfs, dev->qdev.id)) { + error_setg(errp, "attaching user-created SR-IOV VF unsupported"); + return false; + } - trace_sriov_unregister_vfs(dev->name, PCI_SLOT(dev->devfn), - PCI_FUNC(dev->devfn)); - for (i = 0; i < pci_get_word(cfg + PCI_SRIOV_TOTAL_VF); i++) { - pci_set_enabled(dev->exp.sriov_pf.vf[i], false); + if (dev->sriov_pf) { + PCIDevice *pci_pf; + GPtrArray *pf; + + if (!PCI_DEVICE_GET_CLASS(dev)->sriov_vf_user_creatable) { + error_setg(errp, "user cannot create SR-IOV VF with this device type"); + return false; + } + + if (!pci_is_express(dev)) { + error_setg(errp, "PCI Express is required for SR-IOV VF"); + return false; + } + + if (!pci_qdev_find_device(dev->sriov_pf, &pci_pf)) { + error_setg(errp, "PCI device specified as SR-IOV PF already exists"); + return false; + } + + if (!pfs) { + pfs = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, NULL); + } + + pf = g_hash_table_lookup(pfs, dev->sriov_pf); + if (!pf) { + pf = g_ptr_array_new(); + g_hash_table_insert(pfs, g_strdup(dev->sriov_pf), pf); + } + + g_ptr_array_add(pf, dev); + } + + return true; +} + +void pcie_sriov_unregister_device(PCIDevice *dev) +{ + if (dev->sriov_pf && pfs) { + GPtrArray *pf = g_hash_table_lookup(pfs, dev->sriov_pf); + + if (pf) { + g_ptr_array_remove_fast(pf, dev); + + if (!pf->len) { + g_hash_table_remove(pfs, dev->sriov_pf); + g_ptr_array_free(pf, FALSE); + } + } } } @@ -316,7 +490,7 @@ void pcie_sriov_pf_add_sup_pgsize(PCIDevice *dev, uint16_t opt_sup_pgsize) uint16_t pcie_sriov_vf_number(PCIDevice *dev) { - assert(pci_is_vf(dev)); + assert(dev->exp.sriov_vf.pf); return dev->exp.sriov_vf.vf_number; } diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h index ca15132508..cefd6f7640 100644 --- a/include/hw/pci/pci_device.h +++ b/include/hw/pci/pci_device.h @@ -37,6 +37,8 @@ struct PCIDeviceClass { uint16_t subsystem_id; /* only for header type = 0 */ const char *romfile; /* rom bar */ + + bool sriov_vf_user_creatable; }; enum PCIReqIDType { @@ -160,6 +162,8 @@ struct PCIDevice { /* ID of standby device in net_failover pair */ char *failover_pair_id; uint32_t acpi_index; + + char *sriov_pf; }; static inline int pci_intx(PCIDevice *pci_dev) @@ -192,7 +196,7 @@ static inline int pci_is_express_downstream_port(const PCIDevice *d) static inline int pci_is_vf(const PCIDevice *d) { - return d->exp.sriov_vf.pf != NULL; + return d->sriov_pf || d->exp.sriov_vf.pf != NULL; } static inline uint32_t pci_config_size(const PCIDevice *d) diff --git a/include/hw/pci/pcie_sriov.h b/include/hw/pci/pcie_sriov.h index c5d2d318d3..f75b8f22ee 100644 --- a/include/hw/pci/pcie_sriov.h +++ b/include/hw/pci/pcie_sriov.h @@ -18,6 +18,7 @@ typedef struct PCIESriovPF { uint8_t vf_bar_type[PCI_NUM_REGIONS]; /* Store type for each VF bar */ PCIDevice **vf; /* Pointer to an array of num_vfs VF devices */ + bool vf_user_created; /* If VFs are created by user */ } PCIESriovPF; typedef struct PCIESriovVF { @@ -40,6 +41,23 @@ void pcie_sriov_pf_init_vf_bar(PCIDevice *dev, int region_num, void pcie_sriov_vf_register_bar(PCIDevice *dev, int region_num, MemoryRegion *memory); +/** + * pcie_sriov_pf_init_from_user_created_vfs() - Initialize PF with user-created + * VFs. + * @dev: A PCIe device being realized. + * @offset: The offset of the SR-IOV capability. + * @errp: pointer to Error*, to store an error if it happens. + * + * Return: The size of added capability. 0 if the user did not create VFs. + * -1 if failed. + */ +int16_t pcie_sriov_pf_init_from_user_created_vfs(PCIDevice *dev, + uint16_t offset, + Error **errp); + +bool pcie_sriov_register_device(PCIDevice *dev, Error **errp); +void pcie_sriov_unregister_device(PCIDevice *dev); + /* * Default (minimal) page size support values * as required by the SR/IOV standard: From 3f868ffb0bae0c4feafabe34a371cded57fe3806 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Mon, 15 Jul 2024 14:19:12 +0900 Subject: [PATCH 33/61] virtio-pci: Implement SR-IOV PF Allow user to attach SR-IOV VF to a virtio-pci PF. Signed-off-by: Akihiko Odaki Message-Id: <20240715-sriov-v5-6-3f5539093ffc@daynix.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio-pci.c | 20 +++++++++++++++----- include/hw/virtio/virtio-pci.h | 1 + 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index 9534730bba..0c8fcc5627 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -1955,6 +1955,7 @@ static void virtio_pci_device_plugged(DeviceState *d, Error **errp) uint8_t *config; uint32_t size; VirtIODevice *vdev = virtio_bus_get_device(bus); + int16_t res; /* * Virtio capabilities present without @@ -2100,6 +2101,14 @@ static void virtio_pci_device_plugged(DeviceState *d, Error **errp) pci_register_bar(&proxy->pci_dev, proxy->legacy_io_bar_idx, PCI_BASE_ADDRESS_SPACE_IO, &proxy->bar); } + + res = pcie_sriov_pf_init_from_user_created_vfs(&proxy->pci_dev, + proxy->last_pcie_cap_offset, + errp); + if (res > 0) { + proxy->last_pcie_cap_offset += res; + virtio_add_feature(&vdev->host_features, VIRTIO_F_SR_IOV); + } } static void virtio_pci_device_unplugged(DeviceState *d) @@ -2187,7 +2196,7 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp) if (pcie_port && pci_is_express(pci_dev)) { int pos; - uint16_t last_pcie_cap_offset = PCI_CONFIG_SPACE_SIZE; + proxy->last_pcie_cap_offset = PCI_CONFIG_SPACE_SIZE; pos = pcie_endpoint_cap_init(pci_dev, 0); assert(pos > 0); @@ -2207,9 +2216,9 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp) pci_set_word(pci_dev->config + pos + PCI_PM_PMC, 0x3); if (proxy->flags & VIRTIO_PCI_FLAG_AER) { - pcie_aer_init(pci_dev, PCI_ERR_VER, last_pcie_cap_offset, + pcie_aer_init(pci_dev, PCI_ERR_VER, proxy->last_pcie_cap_offset, PCI_ERR_SIZEOF, NULL); - last_pcie_cap_offset += PCI_ERR_SIZEOF; + proxy->last_pcie_cap_offset += PCI_ERR_SIZEOF; } if (proxy->flags & VIRTIO_PCI_FLAG_INIT_DEVERR) { @@ -2234,9 +2243,9 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp) } if (proxy->flags & VIRTIO_PCI_FLAG_ATS) { - pcie_ats_init(pci_dev, last_pcie_cap_offset, + pcie_ats_init(pci_dev, proxy->last_pcie_cap_offset, proxy->flags & VIRTIO_PCI_FLAG_ATS_PAGE_ALIGNED); - last_pcie_cap_offset += PCI_EXT_CAP_ATS_SIZEOF; + proxy->last_pcie_cap_offset += PCI_EXT_CAP_ATS_SIZEOF; } if (proxy->flags & VIRTIO_PCI_FLAG_INIT_FLR) { @@ -2263,6 +2272,7 @@ static void virtio_pci_exit(PCIDevice *pci_dev) bool pcie_port = pci_bus_is_express(pci_get_bus(pci_dev)) && !pci_bus_is_root(pci_get_bus(pci_dev)); + pcie_sriov_pf_exit(&proxy->pci_dev); msix_uninit_exclusive_bar(pci_dev); if (proxy->flags & VIRTIO_PCI_FLAG_AER && pcie_port && pci_is_express(pci_dev)) { diff --git a/include/hw/virtio/virtio-pci.h b/include/hw/virtio/virtio-pci.h index 9e67ba38c7..34539f2f67 100644 --- a/include/hw/virtio/virtio-pci.h +++ b/include/hw/virtio/virtio-pci.h @@ -152,6 +152,7 @@ struct VirtIOPCIProxy { uint32_t modern_io_bar_idx; uint32_t modern_mem_bar_idx; int config_cap; + uint16_t last_pcie_cap_offset; uint32_t flags; bool disable_modern; bool ignore_backend_features; From c2d6db6a1f39780b24538440091893f9fbe060a7 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Mon, 15 Jul 2024 14:19:13 +0900 Subject: [PATCH 34/61] virtio-net: Implement SR-IOV VF A virtio-net device can be added as a SR-IOV VF to another virtio-pci device that will be the PF. Signed-off-by: Akihiko Odaki Message-Id: <20240715-sriov-v5-7-3f5539093ffc@daynix.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio-net-pci.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/virtio/virtio-net-pci.c b/hw/virtio/virtio-net-pci.c index e03543a70a..dba4987d6e 100644 --- a/hw/virtio/virtio-net-pci.c +++ b/hw/virtio/virtio-net-pci.c @@ -75,6 +75,7 @@ static void virtio_net_pci_class_init(ObjectClass *klass, void *data) k->device_id = PCI_DEVICE_ID_VIRTIO_NET; k->revision = VIRTIO_PCI_ABI_VERSION; k->class_id = PCI_CLASS_NETWORK_ETHERNET; + k->sriov_vf_user_creatable = true; set_bit(DEVICE_CATEGORY_NETWORK, dc->categories); device_class_set_props(dc, virtio_net_properties); vpciklass->realize = virtio_net_pci_realize; From d6f40c95b35bd380340b698e4306704fe22a5d68 Mon Sep 17 00:00:00 2001 From: Akihiko Odaki Date: Mon, 15 Jul 2024 14:19:14 +0900 Subject: [PATCH 35/61] docs: Document composable SR-IOV device Signed-off-by: Akihiko Odaki Message-Id: <20240715-sriov-v5-8-3f5539093ffc@daynix.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- MAINTAINERS | 1 + docs/system/index.rst | 1 + docs/system/sriov.rst | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+) create mode 100644 docs/system/sriov.rst diff --git a/MAINTAINERS b/MAINTAINERS index 8ad64ff76b..93546cfb14 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2010,6 +2010,7 @@ F: hw/pci-bridge/* F: qapi/pci.json F: docs/pci* F: docs/specs/*pci* +F: docs/system/sriov.rst PCIE DOE M: Huai-Cheng Kuo diff --git a/docs/system/index.rst b/docs/system/index.rst index c21065e519..718e9d3c56 100644 --- a/docs/system/index.rst +++ b/docs/system/index.rst @@ -39,3 +39,4 @@ or Hypervisor.Framework. multi-process confidential-guest-support vm-templating + sriov diff --git a/docs/system/sriov.rst b/docs/system/sriov.rst new file mode 100644 index 0000000000..a851a66a4b --- /dev/null +++ b/docs/system/sriov.rst @@ -0,0 +1,36 @@ +.. SPDX-License-Identifier: GPL-2.0-or-later + +Compsable SR-IOV device +======================= + +SR-IOV (Single Root I/O Virtualization) is an optional extended capability of a +PCI Express device. It allows a single physical function (PF) to appear as +multiple virtual functions (VFs) for the main purpose of eliminating software +overhead in I/O from virtual machines. + +There are devices with predefined SR-IOV configurations, but it is also possible +to compose an SR-IOV device yourself. Composing an SR-IOV device is currently +only supported by virtio-net-pci. + +Users can configure an SR-IOV-capable virtio-net device by adding +virtio-net-pci functions to a bus. Below is a command line example: + +.. code-block:: shell + + -netdev user,id=n -netdev user,id=o + -netdev user,id=p -netdev user,id=q + -device pcie-root-port,id=b + -device virtio-net-pci,bus=b,addr=0x0.0x3,netdev=q,sriov-pf=f + -device virtio-net-pci,bus=b,addr=0x0.0x2,netdev=p,sriov-pf=f + -device virtio-net-pci,bus=b,addr=0x0.0x1,netdev=o,sriov-pf=f + -device virtio-net-pci,bus=b,addr=0x0.0x0,netdev=n,id=f + +The VFs specify the paired PF with ``sriov-pf`` property. The PF must be +added after all VFs. It is the user's responsibility to ensure that VFs have +function numbers larger than one of the PF, and that the function numbers +have a consistent stride. + +You may also need to perform additional steps to activate the SR-IOV feature on +your guest. For Linux, refer to [1]_. + +.. [1] https://docs.kernel.org/PCI/pci-iov-howto.html From 62f182c97b31445012d37181005a83ff8453edaa Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 15 Jul 2024 14:24:17 +0200 Subject: [PATCH 36/61] smbios: make memory device size configurable per Machine MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently QEMU describes initial[1] RAM* in SMBIOS as a series of virtual DIMMs (capped at 16Gb max) using type 17 structure entries. Which is fine for the most cases. However when starting guest with terabytes of RAM this leads to too many memory device structures, which eventually upsets linux kernel as it reserves only 64K for these entries and when that border is crossed out it runs out of reserved memory. Instead of partitioning initial RAM on 16Gb DIMMs, use maximum possible chunk size that SMBIOS spec allows[2]. Which lets encode RAM in lower 31 bits of 32bit field (which amounts upto 2047Tb per DIMM). As result initial RAM will generate only one type 17 structure until host/guest reach ability to use more RAM in the future. Compat changes: We can't unconditionally change chunk size as it will break QEMU<->guest ABI (and migration). Thus introduce a new machine class field that would let older versioned machines to use legacy 16Gb chunks, while new(er) machine type[s] use maximum possible chunk size. PS: While it might seem to be risky to rise max entry size this large (much beyond of what current physical RAM modules support), I'd not expect it causing much issues, modulo uncovering bugs in software running within guest. And those should be fixed on guest side to handle SMBIOS spec properly, especially if guest is expected to support so huge RAM configs. In worst case, QEMU can reduce chunk size later if we would care enough about introducing a workaround for some 'unfixable' guest OS, either by fixing up the next machine type or giving users a CLI option to customize it. 1) Initial RAM - is RAM configured with help '-m SIZE' CLI option/ implicitly defined by machine. It doesn't include memory configured with help of '-device' option[s] (pcdimm,nvdimm,...) 2) SMBIOS 3.1.0 7.18.5 Memory Device — Extended Size PS: * tested on 8Tb host with RHEL6 guest, which seems to parse type 17 SMBIOS table entries correctly (according to 'dmidecode'). Signed-off-by: Igor Mammedov Message-Id: <20240715122417.4059293-1-imammedo@redhat.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/arm/virt.c | 1 + hw/core/machine.c | 6 ++++++ hw/i386/pc_piix.c | 1 + hw/i386/pc_q35.c | 1 + hw/smbios/smbios.c | 11 ++++++----- include/hw/boards.h | 4 ++++ 6 files changed, 19 insertions(+), 5 deletions(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index b0c68d66a3..719e83e6a1 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -3308,6 +3308,7 @@ DEFINE_VIRT_MACHINE_AS_LATEST(9, 1) static void virt_machine_9_0_options(MachineClass *mc) { virt_machine_9_1_options(mc); + mc->smbios_memory_device_size = 16 * GiB; compat_props_add(mc->compat_props, hw_compat_9_0, hw_compat_9_0_len); } DEFINE_VIRT_MACHINE(9, 0) diff --git a/hw/core/machine.c b/hw/core/machine.c index bc38cad7f2..ac30544e7f 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -1004,6 +1004,12 @@ static void machine_class_init(ObjectClass *oc, void *data) /* Default 128 MB as guest ram size */ mc->default_ram_size = 128 * MiB; mc->rom_file_has_mr = true; + /* + * SMBIOS 3.1.0 7.18.5 Memory Device — Extended Size + * use max possible value that could be encoded into + * 'Extended Size' field (2047Tb). + */ + mc->smbios_memory_device_size = 2047 * TiB; /* numa node memory size aligned on 8MB by default. * On Linux, each node's border has to be 8MB aligned diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 9445b07b4f..d9e69243b4 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -495,6 +495,7 @@ static void pc_i440fx_machine_9_0_options(MachineClass *m) pc_i440fx_machine_9_1_options(m); m->alias = NULL; m->is_default = false; + m->smbios_memory_device_size = 16 * GiB; compat_props_add(m->compat_props, hw_compat_9_0, hw_compat_9_0_len); compat_props_add(m->compat_props, pc_compat_9_0, pc_compat_9_0_len); diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 71d3c6d122..9d108b194e 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -374,6 +374,7 @@ static void pc_q35_machine_9_0_options(MachineClass *m) PCMachineClass *pcmc = PC_MACHINE_CLASS(m); pc_q35_machine_9_1_options(m); m->alias = NULL; + m->smbios_memory_device_size = 16 * GiB; compat_props_add(m->compat_props, hw_compat_9_0, hw_compat_9_0_len); compat_props_add(m->compat_props, pc_compat_9_0, pc_compat_9_0_len); pcmc->isa_bios_alias = false; diff --git a/hw/smbios/smbios.c b/hw/smbios/smbios.c index 3b7703489d..a394514264 100644 --- a/hw/smbios/smbios.c +++ b/hw/smbios/smbios.c @@ -1093,6 +1093,7 @@ static bool smbios_get_tables_ep(MachineState *ms, Error **errp) { unsigned i, dimm_cnt, offset; + MachineClass *mc = MACHINE_GET_CLASS(ms); ERRP_GUARD(); assert(ep_type == SMBIOS_ENTRY_POINT_TYPE_32 || @@ -1123,12 +1124,12 @@ static bool smbios_get_tables_ep(MachineState *ms, smbios_build_type_9_table(errp); smbios_build_type_11_table(); -#define MAX_DIMM_SZ (16 * GiB) -#define GET_DIMM_SZ ((i < dimm_cnt - 1) ? MAX_DIMM_SZ \ - : ((current_machine->ram_size - 1) % MAX_DIMM_SZ) + 1) +#define GET_DIMM_SZ ((i < dimm_cnt - 1) ? mc->smbios_memory_device_size \ + : ((current_machine->ram_size - 1) % mc->smbios_memory_device_size) + 1) - dimm_cnt = QEMU_ALIGN_UP(current_machine->ram_size, MAX_DIMM_SZ) / - MAX_DIMM_SZ; + dimm_cnt = QEMU_ALIGN_UP(current_machine->ram_size, + mc->smbios_memory_device_size) / + mc->smbios_memory_device_size; /* * The offset determines if we need to keep additional space between diff --git a/include/hw/boards.h b/include/hw/boards.h index ef6f18f2c1..48ff6d8b93 100644 --- a/include/hw/boards.h +++ b/include/hw/boards.h @@ -237,6 +237,9 @@ typedef struct { * purposes only. * Applies only to default memory backend, i.e., explicit memory backend * wasn't used. + * @smbios_memory_device_size: + * Default size of memory device, + * SMBIOS 3.1.0 "7.18 Memory Device (Type 17)" */ struct MachineClass { /*< private >*/ @@ -304,6 +307,7 @@ struct MachineClass { const CPUArchIdList *(*possible_cpu_arch_ids)(MachineState *machine); int64_t (*get_default_cpu_node_id)(const MachineState *ms, int idx); ram_addr_t (*fixup_ram_size)(ram_addr_t size); + uint64_t smbios_memory_device_size; }; /** From 08c328682231b64878fc052a11091bea39577a6f Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Tue, 16 Jul 2024 12:14:56 +0100 Subject: [PATCH 37/61] accel/kvm: Extract common KVM vCPU {creation,parking} code KVM vCPU creation is done once during the vCPU realization when Qemu vCPU thread is spawned. This is common to all the architectures as of now. Hot-unplug of vCPU results in destruction of the vCPU object in QOM but the corresponding KVM vCPU object in the Host KVM is not destroyed as KVM doesn't support vCPU removal. Therefore, its representative KVM vCPU object/context in Qemu is parked. Refactor architecture common logic so that some APIs could be reused by vCPU Hotplug code of some architectures likes ARM, Loongson etc. Update new/old APIs with trace events. New APIs qemu_{create,park,unpark}_vcpu() can be externally called. No functional change is intended here. Signed-off-by: Salil Mehta Reviewed-by: Gavin Shan Tested-by: Vishnu Pajjuri Reviewed-by: Jonathan Cameron Tested-by: Xianglai Li Tested-by: Miguel Luis Reviewed-by: Shaoqin Huang Reviewed-by: Vishnu Pajjuri Reviewed-by: Nicholas Piggin Tested-by: Zhao Liu Reviewed-by: Zhao Liu Reviewed-by: Harsh Prateek Bora Reviewed-by: Igor Mammedov Message-Id: <20240716111502.202344-2-salil.mehta@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- accel/kvm/kvm-all.c | 95 ++++++++++++++++++++++++++++-------------- accel/kvm/kvm-cpus.h | 1 - accel/kvm/trace-events | 5 ++- include/sysemu/kvm.h | 25 +++++++++++ 4 files changed, 92 insertions(+), 34 deletions(-) diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 64bf47a033..0f110cce3e 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -340,14 +340,71 @@ err: return ret; } +void kvm_park_vcpu(CPUState *cpu) +{ + struct KVMParkedVcpu *vcpu; + + trace_kvm_park_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu)); + + vcpu = g_malloc0(sizeof(*vcpu)); + vcpu->vcpu_id = kvm_arch_vcpu_id(cpu); + vcpu->kvm_fd = cpu->kvm_fd; + QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node); +} + +int kvm_unpark_vcpu(KVMState *s, unsigned long vcpu_id) +{ + struct KVMParkedVcpu *cpu; + int kvm_fd = -ENOENT; + + QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) { + if (cpu->vcpu_id == vcpu_id) { + QLIST_REMOVE(cpu, node); + kvm_fd = cpu->kvm_fd; + g_free(cpu); + } + } + + trace_kvm_unpark_vcpu(vcpu_id, kvm_fd > 0 ? "unparked" : "!found parked"); + + return kvm_fd; +} + +int kvm_create_vcpu(CPUState *cpu) +{ + unsigned long vcpu_id = kvm_arch_vcpu_id(cpu); + KVMState *s = kvm_state; + int kvm_fd; + + /* check if the KVM vCPU already exist but is parked */ + kvm_fd = kvm_unpark_vcpu(s, vcpu_id); + if (kvm_fd < 0) { + /* vCPU not parked: create a new KVM vCPU */ + kvm_fd = kvm_vm_ioctl(s, KVM_CREATE_VCPU, vcpu_id); + if (kvm_fd < 0) { + error_report("KVM_CREATE_VCPU IOCTL failed for vCPU %lu", vcpu_id); + return kvm_fd; + } + } + + cpu->kvm_fd = kvm_fd; + cpu->kvm_state = s; + cpu->vcpu_dirty = true; + cpu->dirty_pages = 0; + cpu->throttle_us_per_full = 0; + + trace_kvm_create_vcpu(cpu->cpu_index, vcpu_id, kvm_fd); + + return 0; +} + static int do_kvm_destroy_vcpu(CPUState *cpu) { KVMState *s = kvm_state; long mmap_size; - struct KVMParkedVcpu *vcpu = NULL; int ret = 0; - trace_kvm_destroy_vcpu(); + trace_kvm_destroy_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu)); ret = kvm_arch_destroy_vcpu(cpu); if (ret < 0) { @@ -373,10 +430,7 @@ static int do_kvm_destroy_vcpu(CPUState *cpu) } } - vcpu = g_malloc0(sizeof(*vcpu)); - vcpu->vcpu_id = kvm_arch_vcpu_id(cpu); - vcpu->kvm_fd = cpu->kvm_fd; - QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node); + kvm_park_vcpu(cpu); err: return ret; } @@ -389,24 +443,6 @@ void kvm_destroy_vcpu(CPUState *cpu) } } -static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id) -{ - struct KVMParkedVcpu *cpu; - - QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) { - if (cpu->vcpu_id == vcpu_id) { - int kvm_fd; - - QLIST_REMOVE(cpu, node); - kvm_fd = cpu->kvm_fd; - g_free(cpu); - return kvm_fd; - } - } - - return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id); -} - int kvm_init_vcpu(CPUState *cpu, Error **errp) { KVMState *s = kvm_state; @@ -415,19 +451,14 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp) trace_kvm_init_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu)); - ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu)); + ret = kvm_create_vcpu(cpu); if (ret < 0) { - error_setg_errno(errp, -ret, "kvm_init_vcpu: kvm_get_vcpu failed (%lu)", + error_setg_errno(errp, -ret, + "kvm_init_vcpu: kvm_create_vcpu failed (%lu)", kvm_arch_vcpu_id(cpu)); goto err; } - cpu->kvm_fd = ret; - cpu->kvm_state = s; - cpu->vcpu_dirty = true; - cpu->dirty_pages = 0; - cpu->throttle_us_per_full = 0; - mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0); if (mmap_size < 0) { ret = mmap_size; diff --git a/accel/kvm/kvm-cpus.h b/accel/kvm/kvm-cpus.h index ca40add32c..171b22fd29 100644 --- a/accel/kvm/kvm-cpus.h +++ b/accel/kvm/kvm-cpus.h @@ -22,5 +22,4 @@ bool kvm_supports_guest_debug(void); int kvm_insert_breakpoint(CPUState *cpu, int type, vaddr addr, vaddr len); int kvm_remove_breakpoint(CPUState *cpu, int type, vaddr addr, vaddr len); void kvm_remove_all_breakpoints(CPUState *cpu); - #endif /* KVM_CPUS_H */ diff --git a/accel/kvm/trace-events b/accel/kvm/trace-events index 681ccb667d..37626c1ac5 100644 --- a/accel/kvm/trace-events +++ b/accel/kvm/trace-events @@ -9,6 +9,10 @@ kvm_device_ioctl(int fd, int type, void *arg) "dev fd %d, type 0x%x, arg %p" kvm_failed_reg_get(uint64_t id, const char *msg) "Warning: Unable to retrieve ONEREG %" PRIu64 " from KVM: %s" kvm_failed_reg_set(uint64_t id, const char *msg) "Warning: Unable to set ONEREG %" PRIu64 " to KVM: %s" kvm_init_vcpu(int cpu_index, unsigned long arch_cpu_id) "index: %d id: %lu" +kvm_create_vcpu(int cpu_index, unsigned long arch_cpu_id, int kvm_fd) "index: %d, id: %lu, kvm fd: %d" +kvm_destroy_vcpu(int cpu_index, unsigned long arch_cpu_id) "index: %d id: %lu" +kvm_park_vcpu(int cpu_index, unsigned long arch_cpu_id) "index: %d id: %lu" +kvm_unpark_vcpu(unsigned long arch_cpu_id, const char *msg) "id: %lu %s" kvm_irqchip_commit_routes(void) "" kvm_irqchip_add_msi_route(char *name, int vector, int virq) "dev %s vector %d virq %d" kvm_irqchip_update_msi_route(int virq) "Updating MSI route virq=%d" @@ -25,7 +29,6 @@ kvm_dirty_ring_reaper(const char *s) "%s" kvm_dirty_ring_reap(uint64_t count, int64_t t) "reaped %"PRIu64" pages (took %"PRIi64" us)" kvm_dirty_ring_reaper_kick(const char *reason) "%s" kvm_dirty_ring_flush(int finished) "%d" -kvm_destroy_vcpu(void) "" kvm_failed_get_vcpu_mmap_size(void) "" kvm_cpu_exec(void) "" kvm_interrupt_exit_request(void) "" diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index c31d9c7356..c4a914b3d8 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -313,6 +313,31 @@ int kvm_create_device(KVMState *s, uint64_t type, bool test); */ bool kvm_device_supported(int vmfd, uint64_t type); +/** + * kvm_create_vcpu - Gets a parked KVM vCPU or creates a KVM vCPU + * @cpu: QOM CPUState object for which KVM vCPU has to be fetched/created. + * + * @returns: 0 when success, errno (<0) when failed. + */ +int kvm_create_vcpu(CPUState *cpu); + +/** + * kvm_park_vcpu - Park QEMU KVM vCPU context + * @cpu: QOM CPUState object for which QEMU KVM vCPU context has to be parked. + * + * @returns: none + */ +void kvm_park_vcpu(CPUState *cpu); + +/** + * kvm_unpark_vcpu - unpark QEMU KVM vCPU context + * @s: KVM State + * @vcpu_id: Architecture vCPU ID of the parked vCPU + * + * @returns: KVM fd + */ +int kvm_unpark_vcpu(KVMState *s, unsigned long vcpu_id); + /* Arch specific hooks */ extern const KVMCapabilityInfo kvm_arch_required_capabilities[]; From 2f1a85daf3d8d06e4b204c29c1d47c76a2bc81e6 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Tue, 16 Jul 2024 12:14:57 +0100 Subject: [PATCH 38/61] hw/acpi: Move CPU ctrl-dev MMIO region len macro to common header file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CPU ctrl-dev MMIO region length could be used in ACPI GED and various other architecture specific places. Move ACPI_CPU_HOTPLUG_REG_LEN macro to more appropriate common header file. Signed-off-by: Salil Mehta Reviewed-by: Alex Bennée Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Reviewed-by: David Hildenbrand Reviewed-by: Shaoqin Huang Tested-by: Vishnu Pajjuri Tested-by: Xianglai Li Tested-by: Miguel Luis Tested-by: Zhao Liu Reviewed-by: Zhao Liu Reviewed-by: Igor Mammedov Message-Id: <20240716111502.202344-3-salil.mehta@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/acpi/cpu.c | 1 - include/hw/acpi/cpu.h | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/acpi/cpu.c b/hw/acpi/cpu.c index 2d81c1e790..cf5e9183e4 100644 --- a/hw/acpi/cpu.c +++ b/hw/acpi/cpu.c @@ -7,7 +7,6 @@ #include "trace.h" #include "sysemu/numa.h" -#define ACPI_CPU_HOTPLUG_REG_LEN 12 #define ACPI_CPU_SELECTOR_OFFSET_WR 0 #define ACPI_CPU_FLAGS_OFFSET_RW 4 #define ACPI_CPU_CMD_OFFSET_WR 5 diff --git a/include/hw/acpi/cpu.h b/include/hw/acpi/cpu.h index e6e1a9ef59..df87b15997 100644 --- a/include/hw/acpi/cpu.h +++ b/include/hw/acpi/cpu.h @@ -19,6 +19,8 @@ #include "hw/boards.h" #include "hw/hotplug.h" +#define ACPI_CPU_HOTPLUG_REG_LEN 12 + typedef struct AcpiCpuStatus { CPUState *cpu; uint64_t arch_id; From 06f1f4958be70f14f527d1f03d1e9a141650bbc5 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Tue, 16 Jul 2024 12:14:58 +0100 Subject: [PATCH 39/61] hw/acpi: Update ACPI GED framework to support vCPU Hotplug ACPI GED (as described in the ACPI 6.4 spec) uses an interrupt listed in the _CRS object of GED to intimate OSPM about an event. Later then demultiplexes the notified event by evaluating ACPI _EVT method to know the type of event. Use ACPI GED to also notify the guest kernel about any CPU hot(un)plug events. Note, GED interface is used by many hotplug events like memory hotplug, NVDIMM hotplug and non-hotplug events like system power down event. Each of these can be selected using a bit in the 32 bit GED IO interface. A bit has been reserved for the CPU hotplug event. ACPI CPU hotplug related initialization should only happen if ACPI_CPU_HOTPLUG support has been enabled for particular architecture. Add cpu_hotplug_hw_init() stub to avoid compilation break. Co-developed-by: Keqian Zhu Signed-off-by: Keqian Zhu Signed-off-by: Salil Mehta Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Reviewed-by: David Hildenbrand Reviewed-by: Shaoqin Huang Tested-by: Vishnu Pajjuri Tested-by: Xianglai Li Tested-by: Miguel Luis Reviewed-by: Vishnu Pajjuri Tested-by: Zhao Liu Reviewed-by: Zhao Liu Message-Id: <20240716111502.202344-4-salil.mehta@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Acked-by: Igor Mammedov --- docs/specs/acpi_hw_reduced_hotplug.rst | 3 +- hw/acpi/acpi-cpu-hotplug-stub.c | 6 ++++ hw/acpi/generic_event_device.c | 47 ++++++++++++++++++++++++++ include/hw/acpi/generic_event_device.h | 4 +++ 4 files changed, 59 insertions(+), 1 deletion(-) diff --git a/docs/specs/acpi_hw_reduced_hotplug.rst b/docs/specs/acpi_hw_reduced_hotplug.rst index 0bd3f9399f..3acd6fcd8b 100644 --- a/docs/specs/acpi_hw_reduced_hotplug.rst +++ b/docs/specs/acpi_hw_reduced_hotplug.rst @@ -64,7 +64,8 @@ GED IO interface (4 byte access) 0: Memory hotplug event 1: System power down event 2: NVDIMM hotplug event - 3-31: Reserved + 3: CPU hotplug event + 4-31: Reserved **write_access:** diff --git a/hw/acpi/acpi-cpu-hotplug-stub.c b/hw/acpi/acpi-cpu-hotplug-stub.c index 3fc4b14c26..c6c61bb9cd 100644 --- a/hw/acpi/acpi-cpu-hotplug-stub.c +++ b/hw/acpi/acpi-cpu-hotplug-stub.c @@ -19,6 +19,12 @@ void legacy_acpi_cpu_hotplug_init(MemoryRegion *parent, Object *owner, return; } +void cpu_hotplug_hw_init(MemoryRegion *as, Object *owner, + CPUHotplugState *state, hwaddr base_addr) +{ + return; +} + void acpi_cpu_ospm_status(CPUHotplugState *cpu_st, ACPIOSTInfoList ***list) { return; diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c index 2d6e91b124..4641933a0f 100644 --- a/hw/acpi/generic_event_device.c +++ b/hw/acpi/generic_event_device.c @@ -25,6 +25,7 @@ static const uint32_t ged_supported_events[] = { ACPI_GED_MEM_HOTPLUG_EVT, ACPI_GED_PWR_DOWN_EVT, ACPI_GED_NVDIMM_HOTPLUG_EVT, + ACPI_GED_CPU_HOTPLUG_EVT, }; /* @@ -234,6 +235,8 @@ static void acpi_ged_device_plug_cb(HotplugHandler *hotplug_dev, } else { acpi_memory_plug_cb(hotplug_dev, &s->memhp_state, dev, errp); } + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + acpi_cpu_plug_cb(hotplug_dev, &s->cpuhp_state, dev, errp); } else { error_setg(errp, "virt: device plug request for unsupported device" " type: %s", object_get_typename(OBJECT(dev))); @@ -248,6 +251,8 @@ static void acpi_ged_unplug_request_cb(HotplugHandler *hotplug_dev, if ((object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM) && !(object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM)))) { acpi_memory_unplug_request_cb(hotplug_dev, &s->memhp_state, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + acpi_cpu_unplug_request_cb(hotplug_dev, &s->cpuhp_state, dev, errp); } else { error_setg(errp, "acpi: device unplug request for unsupported device" " type: %s", object_get_typename(OBJECT(dev))); @@ -261,6 +266,8 @@ static void acpi_ged_unplug_cb(HotplugHandler *hotplug_dev, if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) { acpi_memory_unplug_cb(&s->memhp_state, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { + acpi_cpu_unplug_cb(&s->cpuhp_state, dev, errp); } else { error_setg(errp, "acpi: device unplug for unsupported device" " type: %s", object_get_typename(OBJECT(dev))); @@ -272,6 +279,7 @@ static void acpi_ged_ospm_status(AcpiDeviceIf *adev, ACPIOSTInfoList ***list) AcpiGedState *s = ACPI_GED(adev); acpi_memory_ospm_status(&s->memhp_state, list); + acpi_cpu_ospm_status(&s->cpuhp_state, list); } static void acpi_ged_send_event(AcpiDeviceIf *adev, AcpiEventStatusBits ev) @@ -286,6 +294,8 @@ static void acpi_ged_send_event(AcpiDeviceIf *adev, AcpiEventStatusBits ev) sel = ACPI_GED_PWR_DOWN_EVT; } else if (ev & ACPI_NVDIMM_HOTPLUG_STATUS) { sel = ACPI_GED_NVDIMM_HOTPLUG_EVT; + } else if (ev & ACPI_CPU_HOTPLUG_STATUS) { + sel = ACPI_GED_CPU_HOTPLUG_EVT; } else { /* Unknown event. Return without generating interrupt. */ warn_report("GED: Unsupported event %d. No irq injected", ev); @@ -371,6 +381,42 @@ static const VMStateDescription vmstate_acpi_ged = { } }; +static void acpi_ged_realize(DeviceState *dev, Error **errp) +{ + SysBusDevice *sbd = SYS_BUS_DEVICE(dev); + AcpiGedState *s = ACPI_GED(dev); + uint32_t ged_events; + int i; + + ged_events = ctpop32(s->ged_event_bitmap); + + for (i = 0; i < ARRAY_SIZE(ged_supported_events) && ged_events; i++) { + uint32_t event = s->ged_event_bitmap & ged_supported_events[i]; + + if (!event) { + continue; + } + + switch (event) { + case ACPI_GED_CPU_HOTPLUG_EVT: + /* initialize CPU Hotplug related regions */ + memory_region_init(&s->container_cpuhp, OBJECT(dev), + "cpuhp container", + ACPI_CPU_HOTPLUG_REG_LEN); + sysbus_init_mmio(sbd, &s->container_cpuhp); + cpu_hotplug_hw_init(&s->container_cpuhp, OBJECT(dev), + &s->cpuhp_state, 0); + break; + } + ged_events--; + } + + if (ged_events) { + error_report("Unsupported events specified"); + abort(); + } +} + static void acpi_ged_initfn(Object *obj) { DeviceState *dev = DEVICE(obj); @@ -411,6 +457,7 @@ static void acpi_ged_class_init(ObjectClass *class, void *data) dc->desc = "ACPI Generic Event Device"; device_class_set_props(dc, acpi_ged_properties); dc->vmsd = &vmstate_acpi_ged; + dc->realize = acpi_ged_realize; hc->plug = acpi_ged_device_plug_cb; hc->unplug_request = acpi_ged_unplug_request_cb; diff --git a/include/hw/acpi/generic_event_device.h b/include/hw/acpi/generic_event_device.h index ba84ce0214..e091ac2108 100644 --- a/include/hw/acpi/generic_event_device.h +++ b/include/hw/acpi/generic_event_device.h @@ -62,6 +62,7 @@ #include "hw/sysbus.h" #include "hw/acpi/memory_hotplug.h" #include "hw/acpi/ghes.h" +#include "hw/acpi/cpu.h" #include "qom/object.h" #define ACPI_POWER_BUTTON_DEVICE "PWRB" @@ -95,6 +96,7 @@ OBJECT_DECLARE_SIMPLE_TYPE(AcpiGedState, ACPI_GED) #define ACPI_GED_MEM_HOTPLUG_EVT 0x1 #define ACPI_GED_PWR_DOWN_EVT 0x2 #define ACPI_GED_NVDIMM_HOTPLUG_EVT 0x4 +#define ACPI_GED_CPU_HOTPLUG_EVT 0x8 typedef struct GEDState { MemoryRegion evt; @@ -106,6 +108,8 @@ struct AcpiGedState { SysBusDevice parent_obj; MemHotplugState memhp_state; MemoryRegion container_memhp; + CPUHotplugState cpuhp_state; + MemoryRegion container_cpuhp; GEDState ged_state; uint32_t ged_event_bitmap; qemu_irq irq; From 549c9a9dcbc1592ea79496f7b3ab234f366adeba Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Tue, 16 Jul 2024 12:14:59 +0100 Subject: [PATCH 40/61] hw/acpi: Update GED _EVT method AML with CPU scan OSPM evaluates _EVT method to map the event. The CPU hotplug event eventually results in start of the CPU scan. Scan figures out the CPU and the kind of event(plug/unplug) and notifies it back to the guest. Update the GED AML _EVT method with the call to method \\_SB.CPUS.CSCN (via \\_SB.GED.CSCN) Architecture specific code [1] might initialize its CPUs AML code by calling common function build_cpus_aml() like below for ARM: build_cpus_aml(scope, ms, opts, xx_madt_cpu_entry, memmap[VIRT_CPUHP_ACPI].base, "\\_SB", "\\_SB.GED.CSCN", AML_SYSTEM_MEMORY); [1] https://lore.kernel.org/qemu-devel/20240613233639.202896-13-salil.mehta@huawei.com/ Co-developed-by: Keqian Zhu Signed-off-by: Keqian Zhu Signed-off-by: Salil Mehta Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Tested-by: Vishnu Pajjuri Tested-by: Xianglai Li Tested-by: Miguel Luis Reviewed-by: Shaoqin Huang Tested-by: Zhao Liu Reviewed-by: Igor Mammedov Message-Id: <20240716111502.202344-5-salil.mehta@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/acpi/generic_event_device.c | 3 +++ include/hw/acpi/generic_event_device.h | 1 + 2 files changed, 4 insertions(+) diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c index 4641933a0f..15b4c3ebbf 100644 --- a/hw/acpi/generic_event_device.c +++ b/hw/acpi/generic_event_device.c @@ -108,6 +108,9 @@ void build_ged_aml(Aml *table, const char *name, HotplugHandler *hotplug_dev, aml_append(if_ctx, aml_call0(MEMORY_DEVICES_CONTAINER "." MEMORY_SLOT_SCAN_METHOD)); break; + case ACPI_GED_CPU_HOTPLUG_EVT: + aml_append(if_ctx, aml_call0(AML_GED_EVT_CPU_SCAN_METHOD)); + break; case ACPI_GED_PWR_DOWN_EVT: aml_append(if_ctx, aml_notify(aml_name(ACPI_POWER_BUTTON_DEVICE), diff --git a/include/hw/acpi/generic_event_device.h b/include/hw/acpi/generic_event_device.h index e091ac2108..40af3550b5 100644 --- a/include/hw/acpi/generic_event_device.h +++ b/include/hw/acpi/generic_event_device.h @@ -87,6 +87,7 @@ OBJECT_DECLARE_SIMPLE_TYPE(AcpiGedState, ACPI_GED) #define GED_DEVICE "GED" #define AML_GED_EVT_REG "EREG" #define AML_GED_EVT_SEL "ESEL" +#define AML_GED_EVT_CPU_SCAN_METHOD "\\_SB.GED.CSCN" /* * Platforms need to specify the GED event bitmap From efdb43b831efa5e0c8da3d062a1a37325b4a7708 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Tue, 16 Jul 2024 12:15:00 +0100 Subject: [PATCH 41/61] hw/acpi: Update CPUs AML with cpu-(ctrl)dev change CPUs Control device(\\_SB.PCI0) register interface for the x86 arch is IO port based and existing CPUs AML code assumes _CRS objects would evaluate to a system resource which describes IO Port address. But on ARM arch CPUs control device(\\_SB.PRES) register interface is memory-mapped hence _CRS object should evaluate to system resource which describes memory-mapped base address. Update build CPUs AML function to accept both IO/MEMORY region spaces and accordingly update the _CRS object. Co-developed-by: Keqian Zhu Signed-off-by: Keqian Zhu Signed-off-by: Salil Mehta Reviewed-by: Gavin Shan Tested-by: Vishnu Pajjuri Reviewed-by: Jonathan Cameron Tested-by: Xianglai Li Tested-by: Miguel Luis Reviewed-by: Shaoqin Huang Tested-by: Zhao Liu Reviewed-by: Igor Mammedov Message-Id: <20240716111502.202344-6-salil.mehta@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/acpi/cpu.c | 17 +++++++++++++---- hw/i386/acpi-build.c | 3 ++- include/hw/acpi/cpu.h | 5 +++-- 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/hw/acpi/cpu.c b/hw/acpi/cpu.c index cf5e9183e4..5cb60ca8bc 100644 --- a/hw/acpi/cpu.c +++ b/hw/acpi/cpu.c @@ -338,9 +338,10 @@ const VMStateDescription vmstate_cpu_hotplug = { #define CPU_FW_EJECT_EVENT "CEJF" void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, - build_madt_cpu_fn build_madt_cpu, hwaddr io_base, + build_madt_cpu_fn build_madt_cpu, hwaddr base_addr, const char *res_root, - const char *event_handler_method) + const char *event_handler_method, + AmlRegionSpace rs) { Aml *ifctx; Aml *field; @@ -364,14 +365,22 @@ void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, aml_name_decl("_UID", aml_string("CPU Hotplug resources"))); aml_append(cpu_ctrl_dev, aml_mutex(CPU_LOCK, 0)); + assert((rs == AML_SYSTEM_IO) || (rs == AML_SYSTEM_MEMORY)); + crs = aml_resource_template(); - aml_append(crs, aml_io(AML_DECODE16, io_base, io_base, 1, + if (rs == AML_SYSTEM_IO) { + aml_append(crs, aml_io(AML_DECODE16, base_addr, base_addr, 1, ACPI_CPU_HOTPLUG_REG_LEN)); + } else if (rs == AML_SYSTEM_MEMORY) { + aml_append(crs, aml_memory32_fixed(base_addr, + ACPI_CPU_HOTPLUG_REG_LEN, AML_READ_WRITE)); + } + aml_append(cpu_ctrl_dev, aml_name_decl("_CRS", crs)); /* declare CPU hotplug MMIO region with related access fields */ aml_append(cpu_ctrl_dev, - aml_operation_region("PRST", AML_SYSTEM_IO, aml_int(io_base), + aml_operation_region("PRST", rs, aml_int(base_addr), ACPI_CPU_HOTPLUG_REG_LEN)); field = aml_field("PRST", AML_BYTE_ACC, AML_NOLOCK, diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index f4e366f64f..5d4bd2b710 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -1536,7 +1536,8 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, .fw_unplugs_cpu = pm->smi_on_cpu_unplug, }; build_cpus_aml(dsdt, machine, opts, pc_madt_cpu_entry, - pm->cpu_hp_io_base, "\\_SB.PCI0", "\\_GPE._E02"); + pm->cpu_hp_io_base, "\\_SB.PCI0", "\\_GPE._E02", + AML_SYSTEM_IO); } if (pcms->memhp_io_base && nr_mem) { diff --git a/include/hw/acpi/cpu.h b/include/hw/acpi/cpu.h index df87b15997..32654dc274 100644 --- a/include/hw/acpi/cpu.h +++ b/include/hw/acpi/cpu.h @@ -63,9 +63,10 @@ typedef void (*build_madt_cpu_fn)(int uid, const CPUArchIdList *apic_ids, GArray *entry, bool force_enabled); void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures opts, - build_madt_cpu_fn build_madt_cpu, hwaddr io_base, + build_madt_cpu_fn build_madt_cpu, hwaddr base_addr, const char *res_root, - const char *event_handler_method); + const char *event_handler_method, + AmlRegionSpace rs); void acpi_cpu_ospm_status(CPUHotplugState *cpu_st, ACPIOSTInfoList ***list); From 24bec42f3d6eae035d5df48c057157f83b260e17 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Tue, 16 Jul 2024 12:15:01 +0100 Subject: [PATCH 42/61] physmem: Add helper function to destroy CPU AddressSpace Virtual CPU Hot-unplug leads to unrealization of a CPU object. This also involves destruction of the CPU AddressSpace. Add common function to help destroy the CPU AddressSpace. Signed-off-by: Salil Mehta Tested-by: Vishnu Pajjuri Reviewed-by: Gavin Shan Tested-by: Xianglai Li Tested-by: Miguel Luis Reviewed-by: Shaoqin Huang Tested-by: Zhao Liu Acked-by: Igor Mammedov Message-Id: <20240716111502.202344-7-salil.mehta@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- include/exec/cpu-common.h | 8 ++++++++ include/hw/core/cpu.h | 1 + system/physmem.c | 29 +++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+) diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h index 815342d043..240ee04369 100644 --- a/include/exec/cpu-common.h +++ b/include/exec/cpu-common.h @@ -129,6 +129,14 @@ size_t qemu_ram_pagesize_largest(void); */ void cpu_address_space_init(CPUState *cpu, int asidx, const char *prefix, MemoryRegion *mr); +/** + * cpu_address_space_destroy: + * @cpu: CPU for which address space needs to be destroyed + * @asidx: integer index of this address space + * + * Note that with KVM only one address space is supported. + */ +void cpu_address_space_destroy(CPUState *cpu, int asidx); void cpu_physical_memory_rw(hwaddr addr, void *buf, hwaddr len, bool is_write); diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h index d946161717..1c9c775df6 100644 --- a/include/hw/core/cpu.h +++ b/include/hw/core/cpu.h @@ -496,6 +496,7 @@ struct CPUState { QSIMPLEQ_HEAD(, qemu_work_item) work_list; struct CPUAddressSpace *cpu_ases; + int cpu_ases_count; int num_ases; AddressSpace *as; MemoryRegion *memory; diff --git a/system/physmem.c b/system/physmem.c index 9a3b3a7636..0e19186e1b 100644 --- a/system/physmem.c +++ b/system/physmem.c @@ -763,6 +763,7 @@ void cpu_address_space_init(CPUState *cpu, int asidx, if (!cpu->cpu_ases) { cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases); + cpu->cpu_ases_count = cpu->num_ases; } newas = &cpu->cpu_ases[asidx]; @@ -776,6 +777,34 @@ void cpu_address_space_init(CPUState *cpu, int asidx, } } +void cpu_address_space_destroy(CPUState *cpu, int asidx) +{ + CPUAddressSpace *cpuas; + + assert(cpu->cpu_ases); + assert(asidx >= 0 && asidx < cpu->num_ases); + /* KVM cannot currently support multiple address spaces. */ + assert(asidx == 0 || !kvm_enabled()); + + cpuas = &cpu->cpu_ases[asidx]; + if (tcg_enabled()) { + memory_listener_unregister(&cpuas->tcg_as_listener); + } + + address_space_destroy(cpuas->as); + g_free_rcu(cpuas->as, rcu); + + if (asidx == 0) { + /* reset the convenience alias for address space 0 */ + cpu->as = NULL; + } + + if (--cpu->cpu_ases_count == 0) { + g_free(cpu->cpu_ases); + cpu->cpu_ases = NULL; + } +} + AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx) { /* Return the AddressSpace corresponding to the specified index */ From 242da18082abc9310ecd528c528501acbcf718c7 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Tue, 16 Jul 2024 12:15:02 +0100 Subject: [PATCH 43/61] gdbstub: Add helper function to unregister GDB register space Add common function to help unregister the GDB register space. This shall be done in context to the CPU unrealization. Note: These are common functions exported to arch specific code. For example, for ARM this code is being referred in associated arch specific patch-set: Link: https://lore.kernel.org/qemu-devel/20230926103654.34424-1-salil.mehta@huawei.com/ Signed-off-by: Salil Mehta Tested-by: Vishnu Pajjuri Reviewed-by: Gavin Shan Tested-by: Xianglai Li Tested-by: Miguel Luis Reviewed-by: Shaoqin Huang Reviewed-by: Vishnu Pajjuri Tested-by: Zhao Liu Acked-by: Igor Mammedov Message-Id: <20240716111502.202344-8-salil.mehta@huawei.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- gdbstub/gdbstub.c | 13 +++++++++++++ hw/core/cpu-common.c | 5 ++++- include/exec/gdbstub.h | 6 ++++++ 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/gdbstub/gdbstub.c b/gdbstub/gdbstub.c index b9ad0a063e..5da17d6530 100644 --- a/gdbstub/gdbstub.c +++ b/gdbstub/gdbstub.c @@ -618,6 +618,19 @@ void gdb_register_coprocessor(CPUState *cpu, } } +void gdb_unregister_coprocessor_all(CPUState *cpu) +{ + /* + * Safe to nuke everything. GDBRegisterState::xml is static const char so + * it won't be freed + */ + g_array_free(cpu->gdb_regs, true); + + cpu->gdb_regs = NULL; + cpu->gdb_num_regs = 0; + cpu->gdb_num_g_regs = 0; +} + static void gdb_process_breakpoint_remove_all(GDBProcess *p) { CPUState *cpu = gdb_get_first_cpu_in_process(p); diff --git a/hw/core/cpu-common.c b/hw/core/cpu-common.c index d2e3e4570a..7982ecd39a 100644 --- a/hw/core/cpu-common.c +++ b/hw/core/cpu-common.c @@ -282,7 +282,10 @@ static void cpu_common_finalize(Object *obj) } #endif free_queued_cpu_work(cpu); - g_array_free(cpu->gdb_regs, TRUE); + /* If cleanup didn't happen in context to gdb_unregister_coprocessor_all */ + if (cpu->gdb_regs) { + g_array_free(cpu->gdb_regs, TRUE); + } qemu_lockcnt_destroy(&cpu->in_ioctl_lock); qemu_mutex_destroy(&cpu->work_mutex); qemu_cond_destroy(cpu->halt_cond); diff --git a/include/exec/gdbstub.h b/include/exec/gdbstub.h index 1bd2c4ec2a..d73f424f56 100644 --- a/include/exec/gdbstub.h +++ b/include/exec/gdbstub.h @@ -40,6 +40,12 @@ void gdb_register_coprocessor(CPUState *cpu, gdb_get_reg_cb get_reg, gdb_set_reg_cb set_reg, const GDBFeature *feature, int g_pos); +/** + * gdb_unregister_coprocessor_all() - unregisters supplemental set of registers + * @cpu - the CPU associated with registers + */ +void gdb_unregister_coprocessor_all(CPUState *cpu); + /** * gdbserver_start: start the gdb server * @port_or_device: connection spec for gdb From 935c3914184c4ebb1a4c545fc77fe2f0b24645c2 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Tue, 16 Jul 2024 11:45:03 +0200 Subject: [PATCH 44/61] Revert "virtio-iommu: Clear IOMMUDevice when VFIO device is unplugged" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 1b889d6e39c32d709f1114699a014b381bcf1cb1. There are different problems with that tentative fix: - Some resources are left dangling (resv_regions, host_resv_ranges) and memory subregions are left attached to the root MR although freed as embedded in the sdev IOMMUDevice. Finally the sdev->as is not destroyed and associated listeners are left. - Even when fixing the above we observe a memory corruption associated with the deallocation of the IOMMUDevice. This can be observed when a VFIO device is hotplugged, hot-unplugged and a system reset is issued. At this stage we have not been able to identify the root cause (IOMMU MR or as structs beeing overwritten and used later on?). - Another issue is HostIOMMUDevice are indexed by non aliased BDF whereas the IOMMUDevice is indexed by aliased BDF - yes the current naming is really misleading -. Given the state of the code I don't think the virtio-iommu device works in non singleton group case though. So let's revert the patch for now. This means the IOMMU MR/as survive the hotunplug. This is what is done in the intel_iommu for instance. It does not sound very logical to keep those but currently there is no symetric function to pci_device_iommu_address_space(). probe_done issue will be handled in a subsequent patch. Also resv_regions and host_resv_regions will be deallocated separately. Signed-off-by: Eric Auger Message-Id: <20240716094619.1713905-2-eric.auger@redhat.com> Tested-by: Cédric Le Goater Reviewed-by: Cédric Le Goater Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio-iommu.c | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index 33ae61c4a6..4e34dacd6e 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -467,26 +467,6 @@ static AddressSpace *virtio_iommu_find_add_as(PCIBus *bus, void *opaque, return &sdev->as; } -static void virtio_iommu_device_clear(VirtIOIOMMU *s, PCIBus *bus, int devfn) -{ - IOMMUPciBus *sbus = g_hash_table_lookup(s->as_by_busptr, bus); - IOMMUDevice *sdev; - - if (!sbus) { - return; - } - - sdev = sbus->pbdev[devfn]; - if (!sdev) { - return; - } - - g_list_free_full(sdev->resv_regions, g_free); - sdev->resv_regions = NULL; - g_free(sdev); - sbus->pbdev[devfn] = NULL; -} - static gboolean hiod_equal(gconstpointer v1, gconstpointer v2) { const struct hiod_key *key1 = v1; @@ -728,7 +708,6 @@ virtio_iommu_unset_iommu_device(PCIBus *bus, void *opaque, int devfn) } g_hash_table_remove(viommu->host_iommu_devices, &key); - virtio_iommu_device_clear(viommu, bus, devfn); } static const PCIIOMMUOps virtio_iommu_ops = { From 3745768918c2528f2b00b63bb3a764c458364f70 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Tue, 16 Jul 2024 11:45:04 +0200 Subject: [PATCH 45/61] virtio-iommu: Remove probe_done MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now we have switched to PCIIOMMUOps to convey host IOMMU information, the host reserved regions are transmitted when the PCIe topology is built. This happens way before the virtio-iommu driver calls the probe request. So let's remove the probe_done flag that allowed to check the probe was not done before the IOMMU MR got enabled. Besides this probe_done flag had a flaw wrt migration since it was not saved/restored. The only case at risk is if 2 devices were plugged to a PCIe to PCI bridge and thus aliased. First of all we discovered in the past this case was not properly supported for neither SMMU nor virtio-iommu on guest kernel side: see [RFC] virtio-iommu: Take into account possible aliasing in virtio_iommu_mr() https://lore.kernel.org/all/20230116124709.793084-1-eric.auger@redhat.com/ If this were supported by the guest kernel, it is unclear what the call sequence would be from a virtio-iommu driver point of view. Signed-off-by: Eric Auger Message-Id: <20240716094619.1713905-3-eric.auger@redhat.com> Tested-by: Cédric Le Goater Reviewed-by: Cédric Le Goater Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio-iommu.c | 3 --- include/hw/virtio/virtio-iommu.h | 1 - 2 files changed, 4 deletions(-) diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index 4e34dacd6e..2c54c0d976 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -555,8 +555,6 @@ static int virtio_iommu_set_host_iova_ranges(VirtIOIOMMU *s, PCIBus *bus, current_ranges = sdev->host_resv_ranges; - g_assert(!sdev->probe_done); - /* check that each new resv region is included in an existing one */ if (sdev->host_resv_ranges) { range_inverse_array(iova_ranges, @@ -956,7 +954,6 @@ static int virtio_iommu_probe(VirtIOIOMMU *s, } buf += count; free -= count; - sdev->probe_done = true; return VIRTIO_IOMMU_S_OK; } diff --git a/include/hw/virtio/virtio-iommu.h b/include/hw/virtio/virtio-iommu.h index bdb3da72d0..7db4210b16 100644 --- a/include/hw/virtio/virtio-iommu.h +++ b/include/hw/virtio/virtio-iommu.h @@ -43,7 +43,6 @@ typedef struct IOMMUDevice { MemoryRegion bypass_mr; /* The alias of shared memory MR */ GList *resv_regions; GList *host_resv_ranges; - bool probe_done; } IOMMUDevice; typedef struct IOMMUPciBus { From 62ac01d1de5096022e08355f8df47bf1191a3ed1 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Tue, 16 Jul 2024 11:45:05 +0200 Subject: [PATCH 46/61] virtio-iommu: Free [host_]resv_ranges on unset_iommu_devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We are currently missing the deallocation of the [host_]resv_regions in case of hot unplug. Also to make things more simple let's rule out the case where multiple HostIOMMUDevices would be aliased and attached to the same IOMMUDevice. This allows to remove the handling of conflicting Host reserved regions. Anyway this is not properly supported at guest kernel level. On hotunplug the reserved regions are reset to the ones set by virtio-iommu property. Signed-off-by: Eric Auger Message-Id: <20240716094619.1713905-4-eric.auger@redhat.com> Tested-by: Cédric Le Goater Reviewed-by: Cédric Le Goater Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio-iommu.c | 62 ++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 34 deletions(-) diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index 2c54c0d976..2de41ab412 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -538,8 +538,6 @@ static int virtio_iommu_set_host_iova_ranges(VirtIOIOMMU *s, PCIBus *bus, { IOMMUPciBus *sbus = g_hash_table_lookup(s->as_by_busptr, bus); IOMMUDevice *sdev; - GList *current_ranges; - GList *l, *tmp, *new_ranges = NULL; int ret = -EINVAL; if (!sbus) { @@ -553,33 +551,10 @@ static int virtio_iommu_set_host_iova_ranges(VirtIOIOMMU *s, PCIBus *bus, return ret; } - current_ranges = sdev->host_resv_ranges; - - /* check that each new resv region is included in an existing one */ if (sdev->host_resv_ranges) { - range_inverse_array(iova_ranges, - &new_ranges, - 0, UINT64_MAX); - - for (tmp = new_ranges; tmp; tmp = tmp->next) { - Range *newr = (Range *)tmp->data; - bool included = false; - - for (l = current_ranges; l; l = l->next) { - Range * r = (Range *)l->data; - - if (range_contains_range(r, newr)) { - included = true; - break; - } - } - if (!included) { - goto error; - } - } - /* all new reserved ranges are included in existing ones */ - ret = 0; - goto out; + error_setg(errp, "%s virtio-iommu does not support aliased BDF", + __func__); + return ret; } range_inverse_array(iova_ranges, @@ -588,14 +563,31 @@ static int virtio_iommu_set_host_iova_ranges(VirtIOIOMMU *s, PCIBus *bus, rebuild_resv_regions(sdev); return 0; -error: - error_setg(errp, "%s Conflicting host reserved ranges set!", - __func__); -out: - g_list_free_full(new_ranges, g_free); - return ret; } +static void virtio_iommu_unset_host_iova_ranges(VirtIOIOMMU *s, PCIBus *bus, + int devfn) +{ + IOMMUPciBus *sbus = g_hash_table_lookup(s->as_by_busptr, bus); + IOMMUDevice *sdev; + + if (!sbus) { + return; + } + + sdev = sbus->pbdev[devfn]; + if (!sdev) { + return; + } + + g_list_free_full(g_steal_pointer(&sdev->host_resv_ranges), g_free); + g_list_free_full(sdev->resv_regions, g_free); + sdev->host_resv_ranges = NULL; + sdev->resv_regions = NULL; + add_prop_resv_regions(sdev); +} + + static bool check_page_size_mask(VirtIOIOMMU *viommu, uint64_t new_mask, Error **errp) { @@ -704,6 +696,8 @@ virtio_iommu_unset_iommu_device(PCIBus *bus, void *opaque, int devfn) if (!hiod) { return; } + virtio_iommu_unset_host_iova_ranges(viommu, hiod->aliased_bus, + hiod->aliased_devfn); g_hash_table_remove(viommu->host_iommu_devices, &key); } From 1993d634d57718d6eb107c973b7c8b80a6d91338 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Tue, 16 Jul 2024 11:45:06 +0200 Subject: [PATCH 47/61] virtio-iommu: Remove the end point on detach MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We currently miss the removal of the endpoint in case of detach. Signed-off-by: Eric Auger Message-Id: <20240716094619.1713905-5-eric.auger@redhat.com> Tested-by: Cédric Le Goater Reviewed-by: Cédric Le Goater Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio-iommu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index 2de41ab412..440dfa6e92 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -786,6 +786,7 @@ static int virtio_iommu_detach(VirtIOIOMMU *s, if (QLIST_EMPTY(&domain->endpoint_list)) { g_tree_remove(s->domains, GUINT_TO_POINTER(domain->id)); } + g_tree_remove(s->endpoints, GUINT_TO_POINTER(ep_id)); return VIRTIO_IOMMU_S_OK; } From a6586419a13ec9698428d9585ef8540c594f978f Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Tue, 16 Jul 2024 11:45:07 +0200 Subject: [PATCH 48/61] hw/vfio/common: Add vfio_listener_region_del_iommu trace event MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Trace when VFIO gets notified about the deletion of an IOMMU MR. Also trace the name of the region in the add_iommu trace message. Signed-off-by: Eric Auger Message-Id: <20240716094619.1713905-6-eric.auger@redhat.com> Tested-by: Cédric Le Goater Reviewed-by: Cédric Le Goater Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/vfio/common.c | 3 ++- hw/vfio/trace-events | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 6d15b36e0b..cfc44a4569 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -599,7 +599,7 @@ static void vfio_listener_region_add(MemoryListener *listener, IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr); int iommu_idx; - trace_vfio_listener_region_add_iommu(iova, end); + trace_vfio_listener_region_add_iommu(section->mr->name, iova, end); /* * FIXME: For VFIO iommu types which have KVM acceleration to * avoid bouncing all map/unmaps through qemu this way, this @@ -725,6 +725,7 @@ static void vfio_listener_region_del(MemoryListener *listener, if (memory_region_is_iommu(section->mr)) { VFIOGuestIOMMU *giommu; + trace_vfio_listener_region_del_iommu(section->mr->name); QLIST_FOREACH(giommu, &bcontainer->giommu_list, giommu_next) { if (MEMORY_REGION(giommu->iommu_mr) == section->mr && giommu->n.start == section->offset_within_region) { diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index e16179b507..98bd4dccea 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -95,7 +95,8 @@ vfio_region_read(char *name, int index, uint64_t addr, unsigned size, uint64_t d vfio_iommu_map_notify(const char *op, uint64_t iova_start, uint64_t iova_end) "iommu %s @ 0x%"PRIx64" - 0x%"PRIx64 vfio_listener_region_skip(const char *name, uint64_t start, uint64_t end) "SKIPPING %s 0x%"PRIx64" - 0x%"PRIx64 vfio_spapr_group_attach(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d" -vfio_listener_region_add_iommu(uint64_t start, uint64_t end) "region_add [iommu] 0x%"PRIx64" - 0x%"PRIx64 +vfio_listener_region_add_iommu(const char* name, uint64_t start, uint64_t end) "region_add [iommu] %s 0x%"PRIx64" - 0x%"PRIx64 +vfio_listener_region_del_iommu(const char *name) "region_del [iommu] %s" vfio_listener_region_add_ram(uint64_t iova_start, uint64_t iova_end, void *vaddr) "region_add [ram] 0x%"PRIx64" - 0x%"PRIx64" [%p]" vfio_known_safe_misalignment(const char *name, uint64_t iova, uint64_t offset_within_region, uintptr_t page_size) "Region \"%s\" iova=0x%"PRIx64" offset_within_region=0x%"PRIx64" qemu_real_host_page_size=0x%"PRIxPTR vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA" From 6c027a9de3fae3b8a3a4868e17c7a28ba5372463 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Tue, 16 Jul 2024 11:45:08 +0200 Subject: [PATCH 49/61] virtio-iommu: Add trace point on virtio_iommu_detach_endpoint_from_domain MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a trace point on virtio_iommu_detach_endpoint_from_domain(). Signed-off-by: Eric Auger Message-Id: <20240716094619.1713905-7-eric.auger@redhat.com> Tested-by: Cédric Le Goater Reviewed-by: Cédric Le Goater Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/trace-events | 1 + hw/virtio/virtio-iommu.c | 1 + 2 files changed, 2 insertions(+) diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index b7c04f0856..04e36ae047 100644 --- a/hw/virtio/trace-events +++ b/hw/virtio/trace-events @@ -116,6 +116,7 @@ virtio_iommu_get_config(uint64_t page_size_mask, uint64_t start, uint64_t end, u virtio_iommu_set_config(uint8_t bypass) "bypass=0x%x" virtio_iommu_attach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d" virtio_iommu_detach(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d" +virtio_iommu_detach_endpoint_from_domain(uint32_t domain_id, uint32_t ep_id) "domain=%d endpoint=%d" virtio_iommu_map(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end, uint64_t phys_start, uint32_t flags) "domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64 " phys_start=0x%"PRIx64" flags=%d" virtio_iommu_unmap(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end) "domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64 virtio_iommu_unmap_done(uint32_t domain_id, uint64_t virt_start, uint64_t virt_end) "domain=%d virt_start=0x%"PRIx64" virt_end=0x%"PRIx64 diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index 440dfa6e92..59ef4fb217 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -308,6 +308,7 @@ static void virtio_iommu_detach_endpoint_from_domain(VirtIOIOMMUEndpoint *ep) if (!ep->domain) { return; } + trace_virtio_iommu_detach_endpoint_from_domain(domain->id, ep->id); g_tree_foreach(domain->mappings, virtio_iommu_notify_unmap_cb, ep->iommu_mr); QLIST_REMOVE(ep, next); From a54dd0cd6b9119c44d52547f51a529122f0ec1f1 Mon Sep 17 00:00:00 2001 From: Sunil V L Date: Tue, 16 Jul 2024 20:12:58 +0530 Subject: [PATCH 50/61] hw/riscv/virt-acpi-build.c: Add namespace devices for PLIC and APLIC As per the requirement ACPI_080 in the RISC-V Boot and Runtime Services (BRS) specification [1], PLIC and APLIC should be in namespace as well. So, add them using the defined HID. [1] - https://github.com/riscv-non-isa/riscv-brs/releases/download/v0.0.2/riscv-brs-spec.pdf (Chapter 6) Signed-off-by: Sunil V L Acked-by: Alistair Francis Acked-by: Igor Mammedov Message-Id: <20240716144306.2432257-2-sunilvl@ventanamicro.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/riscv/virt-acpi-build.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/hw/riscv/virt-acpi-build.c b/hw/riscv/virt-acpi-build.c index 0925528160..5f5082a35b 100644 --- a/hw/riscv/virt-acpi-build.c +++ b/hw/riscv/virt-acpi-build.c @@ -141,6 +141,30 @@ static void acpi_dsdt_add_cpus(Aml *scope, RISCVVirtState *s) } } +static void acpi_dsdt_add_plic_aplic(Aml *scope, uint8_t socket_count, + uint64_t mmio_base, uint64_t mmio_size, + const char *hid) +{ + uint64_t plic_aplic_addr; + uint32_t gsi_base; + uint8_t socket; + + for (socket = 0; socket < socket_count; socket++) { + plic_aplic_addr = mmio_base + mmio_size * socket; + gsi_base = VIRT_IRQCHIP_NUM_SOURCES * socket; + Aml *dev = aml_device("IC%.02X", socket); + aml_append(dev, aml_name_decl("_HID", aml_string("%s", hid))); + aml_append(dev, aml_name_decl("_UID", aml_int(socket))); + aml_append(dev, aml_name_decl("_GSB", aml_int(gsi_base))); + + Aml *crs = aml_resource_template(); + aml_append(crs, aml_memory32_fixed(plic_aplic_addr, mmio_size, + AML_READ_WRITE)); + aml_append(dev, aml_name_decl("_CRS", crs)); + aml_append(scope, dev); + } +} + static void acpi_dsdt_add_uart(Aml *scope, const MemMapEntry *uart_memmap, uint32_t uart_irq) @@ -411,6 +435,14 @@ static void build_dsdt(GArray *table_data, socket_count = riscv_socket_count(ms); + if (s->aia_type == VIRT_AIA_TYPE_NONE) { + acpi_dsdt_add_plic_aplic(scope, socket_count, memmap[VIRT_PLIC].base, + memmap[VIRT_PLIC].size, "RSCV0001"); + } else { + acpi_dsdt_add_plic_aplic(scope, socket_count, memmap[VIRT_APLIC_S].base, + memmap[VIRT_APLIC_S].size, "RSCV0002"); + } + acpi_dsdt_add_uart(scope, &memmap[VIRT_UART0], UART0_IRQ); if (socket_count == 1) { From faacd2e6b6a85a5eee2472e5a7f50bf69c4ad44a Mon Sep 17 00:00:00 2001 From: Sunil V L Date: Tue, 16 Jul 2024 20:12:59 +0530 Subject: [PATCH 51/61] hw/riscv/virt-acpi-build.c: Update the HID of RISC-V UART The requirement ACPI_060 in the RISC-V BRS specification [1], requires NS16550 compatible UART to have the HID RSCV0003. So, update the HID for the UART. [1] - https://github.com/riscv-non-isa/riscv-brs/releases/download/v0.0.2/riscv-brs-spec.pdf (Chapter 6) Signed-off-by: Sunil V L Acked-by: Alistair Francis Reviewed-by: Igor Mammedov Message-Id: <20240716144306.2432257-3-sunilvl@ventanamicro.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/riscv/virt-acpi-build.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/riscv/virt-acpi-build.c b/hw/riscv/virt-acpi-build.c index 5f5082a35b..36d6a3a412 100644 --- a/hw/riscv/virt-acpi-build.c +++ b/hw/riscv/virt-acpi-build.c @@ -170,7 +170,7 @@ acpi_dsdt_add_uart(Aml *scope, const MemMapEntry *uart_memmap, uint32_t uart_irq) { Aml *dev = aml_device("COM0"); - aml_append(dev, aml_name_decl("_HID", aml_string("PNP0501"))); + aml_append(dev, aml_name_decl("_HID", aml_string("RSCV0003"))); aml_append(dev, aml_name_decl("_UID", aml_int(0))); Aml *crs = aml_resource_template(); From af09c25199a15925c34ae8c9635a79a81fd9b499 Mon Sep 17 00:00:00 2001 From: Sunil V L Date: Tue, 16 Jul 2024 20:13:00 +0530 Subject: [PATCH 52/61] tests/acpi: Allow DSDT acpi table changes for aarch64 so that CI tests don't fail when those ACPI tables are updated in the next patch. This is as per the documentation in bios-tables-tests.c. Signed-off-by: Sunil V L Reviewed-by: Igor Mammedov Message-Id: <20240716144306.2432257-4-sunilvl@ventanamicro.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- tests/qtest/bios-tables-test-allowed-diff.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/qtest/bios-tables-test-allowed-diff.h b/tests/qtest/bios-tables-test-allowed-diff.h index dfb8523c8b..9282ea0fb2 100644 --- a/tests/qtest/bios-tables-test-allowed-diff.h +++ b/tests/qtest/bios-tables-test-allowed-diff.h @@ -1 +1,7 @@ /* List of comma-separated changed AML files to ignore */ +"tests/data/acpi/aarch64/virt/DSDT", +"tests/data/acpi/aarch64/virt/DSDT.memhp", +"tests/data/acpi/aarch64/virt/DSDT.topology", +"tests/data/acpi/aarch64/virt/DSDT.acpihmatvirt", +"tests/data/acpi/aarch64/virt/DSDT.pxb", +"tests/data/acpi/x86/microvm/DSDT.pcie", From 35520bc702a8c3d7b434305ac403de437003d4bc Mon Sep 17 00:00:00 2001 From: Sunil V L Date: Tue, 16 Jul 2024 20:13:01 +0530 Subject: [PATCH 53/61] acpi/gpex: Create PCI link devices outside PCI root bridge Currently, PCI link devices (PNP0C0F) are always created within the scope of the PCI root bridge. However, RISC-V needs these link devices to be created outside to ensure the probing order in the OS. This matches the example given in the ACPI specification [1] as well. Hence, create these link devices directly under _SB instead of under the PCI root bridge. To keep these link device names unique for multiple PCI bridges, change the device name from GSIx to LXXY format where XX is the PCI bus number and Y is the INTx. GPEX is currently used by riscv, aarch64/virt and x86/microvm machines. So, this change will alter the DSDT for those systems. [1] - ACPI 5.1: 6.2.13.1 Example: Using _PRT to Describe PCI IRQ Routing Signed-off-by: Sunil V L Acked-by: Igor Mammedov Message-Id: <20240716144306.2432257-5-sunilvl@ventanamicro.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/pci-host/gpex-acpi.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/hw/pci-host/gpex-acpi.c b/hw/pci-host/gpex-acpi.c index f69413ea2c..391fabb8a8 100644 --- a/hw/pci-host/gpex-acpi.c +++ b/hw/pci-host/gpex-acpi.c @@ -7,7 +7,8 @@ #include "hw/pci/pcie_host.h" #include "hw/acpi/cxl.h" -static void acpi_dsdt_add_pci_route_table(Aml *dev, uint32_t irq) +static void acpi_dsdt_add_pci_route_table(Aml *dev, uint32_t irq, + Aml *scope, uint8_t bus_num) { Aml *method, *crs; int i, slot_no; @@ -20,7 +21,7 @@ static void acpi_dsdt_add_pci_route_table(Aml *dev, uint32_t irq) Aml *pkg = aml_package(4); aml_append(pkg, aml_int((slot_no << 16) | 0xFFFF)); aml_append(pkg, aml_int(i)); - aml_append(pkg, aml_name("GSI%d", gsi)); + aml_append(pkg, aml_name("L%.02X%X", bus_num, gsi)); aml_append(pkg, aml_int(0)); aml_append(rt_pkg, pkg); } @@ -30,7 +31,7 @@ static void acpi_dsdt_add_pci_route_table(Aml *dev, uint32_t irq) /* Create GSI link device */ for (i = 0; i < PCI_NUM_PINS; i++) { uint32_t irqs = irq + i; - Aml *dev_gsi = aml_device("GSI%d", i); + Aml *dev_gsi = aml_device("L%.02X%X", bus_num, i); aml_append(dev_gsi, aml_name_decl("_HID", aml_string("PNP0C0F"))); aml_append(dev_gsi, aml_name_decl("_UID", aml_int(i))); crs = aml_resource_template(); @@ -45,7 +46,7 @@ static void acpi_dsdt_add_pci_route_table(Aml *dev, uint32_t irq) aml_append(dev_gsi, aml_name_decl("_CRS", crs)); method = aml_method("_SRS", 1, AML_NOTSERIALIZED); aml_append(dev_gsi, method); - aml_append(dev, dev_gsi); + aml_append(scope, dev_gsi); } } @@ -174,7 +175,7 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) aml_append(dev, aml_name_decl("_PXM", aml_int(numa_node))); } - acpi_dsdt_add_pci_route_table(dev, cfg->irq); + acpi_dsdt_add_pci_route_table(dev, cfg->irq, scope, bus_num); /* * Resources defined for PXBs are composed of the following parts: @@ -205,7 +206,7 @@ void acpi_dsdt_add_gpex(Aml *scope, struct GPEXConfig *cfg) aml_append(dev, aml_name_decl("_STR", aml_unicode("PCIe 0 Device"))); aml_append(dev, aml_name_decl("_CCA", aml_int(1))); - acpi_dsdt_add_pci_route_table(dev, cfg->irq); + acpi_dsdt_add_pci_route_table(dev, cfg->irq, scope, 0); method = aml_method("_CBA", 0, AML_NOTSERIALIZED); aml_append(method, aml_return(aml_int(cfg->ecam.base))); From 0af3dfa5c5590ae21d1f58a389968d4b90112fd7 Mon Sep 17 00:00:00 2001 From: Sunil V L Date: Tue, 16 Jul 2024 20:13:02 +0530 Subject: [PATCH 54/61] tests/acpi: update expected DSDT blob for aarch64 and microvm After PCI link devices are moved out of the scope of PCI root complex, the DSDT files of machines which use GPEX, will change. So, update the expected AML files with these changes for these machines. Mainly, there are 2 changes. 1) Since the link devices are created now directly under _SB for all PCI root bridges in the system, they should have unique names. So, instead of GSIx, named those devices as LXXY where L means link, XX will have PCI bus number and Y will have the INTx number (ex: L000 or L001). The _PRT entries will also be updated to reflect this name change. 2) PCI link devices are moved from the scope of each PCI root bridge to directly under _SB. Below is the sample iASL difference for one such link device. Scope (\_SB) { Name (_HID, "LNRO0005") // _HID: Hardware ID Name (_UID, 0x1F) // _UID: Unique ID Name (_CCA, One) // _CCA: Cache Coherency Attribute Name (_CRS, ResourceTemplate () // _CRS: Current Resource Settings { Memory32Fixed (ReadWrite, 0x0A003E00, // Address Base 0x00000200, // Address Length ) Interrupt (ResourceConsumer, Level, ActiveHigh, Exclusive, ,, ) { 0x0000004F, } }) + Device (L000) + { + Name (_HID, "PNP0C0F" /* PCI Interrupt Link Device */) + Name (_UID, Zero) // _UID: Unique ID + Name (_PRS, ResourceTemplate () + { + Interrupt (ResourceConsumer, Level, ActiveHigh, Exclusive, ,, ) + { + 0x00000023, + } + }) + Name (_CRS, ResourceTemplate () + { + Interrupt (ResourceConsumer, Level, ActiveHigh, Exclusive, ,, ) + { + 0x00000023, + } + }) + Method (_SRS, 1, NotSerialized) // _SRS: Set Resource Settings + { + } + } + Device (PCI0) { Name (_HID, "PNP0A08" /* PCI Express Bus */) // _HID: Hardware ID Name (_CID, "PNP0A03" /* PCI Bus */) // _CID: Compatible ID Name (_SEG, Zero) // _SEG: PCI Segment Name (_BBN, Zero) // _BBN: BIOS Bus Number Name (_UID, Zero) // _UID: Unique ID Name (_STR, Unicode ("PCIe 0 Device")) // _STR: Description String Name (_CCA, One) // _CCA: Cache Coherency Attribute Name (_PRT, Package (0x80) // _PRT: PCI Routing Table { Package (0x04) { 0xFFFF, Zero, - GSI0, + L000, Zero }, ..... }) Device (GSI0) { Name (_HID, "PNP0C0F" /* PCI Interrupt Link Device */) Name (_UID, Zero) // _UID: Unique ID Name (_PRS, ResourceTemplate () { Interrupt (ResourceConsumer, Level, ActiveHigh, Exclusive, ,, ) { 0x00000023, } }) Name (_CRS, ResourceTemplate () { Interrupt (ResourceConsumer, Level, ActiveHigh, Exclusive, ,, ) { 0x00000023, } }) Method (_SRS, 1, NotSerialized) // _SRS: Set Resource Settings { } } } } Signed-off-by: Sunil V L Message-Id: <20240716144306.2432257-6-sunilvl@ventanamicro.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- tests/data/acpi/aarch64/virt/DSDT | Bin 5196 -> 5196 bytes .../data/acpi/aarch64/virt/DSDT.acpihmatvirt | Bin 5282 -> 5282 bytes tests/data/acpi/aarch64/virt/DSDT.memhp | Bin 6557 -> 6557 bytes tests/data/acpi/aarch64/virt/DSDT.pxb | Bin 7679 -> 7679 bytes tests/data/acpi/aarch64/virt/DSDT.topology | Bin 5398 -> 5398 bytes tests/data/acpi/x86/microvm/DSDT.pcie | Bin 3023 -> 3023 bytes tests/qtest/bios-tables-test-allowed-diff.h | 6 ------ 7 files changed, 6 deletions(-) diff --git a/tests/data/acpi/aarch64/virt/DSDT b/tests/data/acpi/aarch64/virt/DSDT index c47503990715d389914fdf9c8bccb510761741ac..36d3e5d5a5e47359b6dcb3706f98b4f225677591 100644 GIT binary patch literal 5196 zcmZvg%WoT16o>EFlh__VVmr?J;S@^6GqU5nTG|qO>?AIBVmwKMluE9IK$L7EQ6ZHI zDP++?b~J)@kn)Ehv0}%LMb~Wj2iRfGojc?Fj&erwc+Si{-`sC}Y@eCBSKn(Dl#0yM zcHM1nq4sIVU7*EMO6hI=o_$%f9`(Fh@9=cmEtN{~-gdK|uDYEj1#2qa+i%v@;prWB zw;dkqwzo^Aayd8_@3~zsH|y4lB#LLD4hHUEe%#Hx2ibMth&QOO)&F zRh=XyyH(2|a!$q|B8kZ$vuZ!=hErz3r$f$^nsKgisMgtci@6#K-_+4H@eqBQ7zrz*tAdZIO|X7ii7^k1kB}kPX;xw^ z0Am9ga-HTR#v8z>B15jzyu`Q(j2bfJIvtf5w}9aSV@a;lg2Z?m7%eFsL$1>~iE#%QZD3U7I-QpocY(2k47pB=65~EFb^}J!ZVeiu z&Q7P~mNo~?M~8ZzjFa^HoHzaPM7_4JFx>OHo^^QYqk`!$6g44;x+$Q{z5(iio>lPe zVJO{=!?R9fSXSSnU)l{FW1y!O~owmMFPt<4ht~A7N(>mR~(bm__Nj_;O`+cQ98)ddF z+AgbnO7C|f`tpQ9l!A)Nr|nd~Zz-KaG?lL;Xw&b!K>$sGk|mAgXj`I5cW9X{et`q0Xck`Dd~H%y0&_pBWCN1~ahqGbu#R zA?7*cXNI0bCNrEoh3GlVJgJ4J`GiBMAw^FL(KE(8W6U$gJt;)bIP;7%PikP%lS1@N zFi*PqFjvpNBnj?GA$mH@(_x;}@S-P$=$T}mN#>d4o)n^Iig~7(CpF0ENg;ZsnP-}L zrnx7D=$T=j8Rkh1HF{Ero+He2gn5o|PYThKZvIS-`&FD}p45P&Cxz&lW1czYnd6=m zqGz6Y=9woo?C41$dX6&BQRX?yJt;&_x~&G!0`sH>A3Z5V&oSmX#yrQkCxz%a&OFDN zC*2UBCxz%a!8|9J=LGkp5IyNu96TqPC*44xCxz%a#XP5&=M?v(5Iv`v=QQ)A8xHiO z5Itv@=M3|l;hq$t=PdJ_WuA0{f}Rwj=N$8#W1e%|lS1^IXP)!SlWu6xlS1??GS4FO zEOJi@jb|a-b8A&~NxupzyJ7xVsbkrWIZuB`s|fu-(bCr|>m}7oivBPCCEM@+r*F$> z=`Yd|D@FfmW}&}fTKaUAJdB;U9I%~XDaUPcRDKW?sZ$Lu$k!XdaiwT zq}=*J`C>bE{$*~|$sV>}SN<)|Zv1lawEEku~MDKBWuN><@bM!_)`Q4_Wb!{M|UtUItSgFb8>h(r8msUOhf1ft=j+5 C#r7@$ literal 5196 zcmZvg%WoT16o>EFlh__VVmr>uc{qhq@vO#n^Jr;H?6H%$#EJ2w4N@w(5&}`OsYHcT zDx{D_3)#^~Yza~%{tYBn?AWnj&4zz~9p>D*Gs*8LXQYhh%-r+M{l>@f@oo97-K~;R zv7eed-lo6U{J7^W(q<{8^s#=;zie6$2Yz#~e^mBd*G&#KJFRTP>vbqtQOUvmPD||{ z-ST$2(Y1be({-!W@LF=<_5DKGnR<~@8kkafrM@3kmUV@qXOz3TzUQqQ?nmwJed5+A z*WYb8X-f7QmO&JpoI%7=(_v=Ae$bDmw6)#eq12^|+n#4$+}u&I@a8Tes^;z-p>KN$ z5mOh4YKUm+S=1zi6O$M=FlxxCi;TF$7zIWh88<|REisb7xPgo%kuf4M9AGRVV_9U3 zN{kFJmXWa{G7=Ia2aFYDtcr}J#K;3<6&W{0MoMB7fpHTU)$qf?OU!X3MhO_VkRg^S z|BytbJ_(HXks;S@3L$1@j z#8?Ez8ZzWMEl7;lfYCsPT&G2eaTOR%WXN?oE-|hF!vjW5uG5mlcoP^cWXN?oAu-+t zMjIJ&olZ)OcY(2v47pCHB*t}MY#>9f(`kv}0%H>ya-Ggdj2bXH$dK!_EHQ2X;~p~P zI-Qjm%fR>$8FHP@NsLusY#~Fg(|L(;3m9Es)a5!|kQldtv5gG5PAd}QE--dN#`Z5g zuM^Irx7~9a?kY7O9<@g%s_QPMy+QkCbNjq4@pt=$iZjgv z$fQtb(u{(;*gUuO`byZ9A%!=l+t{nq10lb zCxz&lV4ex)nc$ujqGytMCYdL-WavpDdZw5swbSP51&<`fJt;&_hj}{8lUg|Rq!2yR z%rnhA)7+Cn^vp2N4D+Ox4?QVF&n)xIGS4jcq!2xG%rnP4sYOIj3ej_nd5$s9G44qr zdeYAsQ{zDu=b0z9l;}wzdKQ>xfq53VCxz%)WS&LlNi8UPQiz`8%yXQ1j&n~6(UX2C zf@g_&Qp<{-6r$$@^PFIw6Wo(R^qgd#lgyJ^T=b+6J*SxG6!Vq!2x4nddC?oaLSrqURj*oMWEUa-%1O=sC|k z=b7g`_oNU#7ntV)^Q0CXJt;)b3iGTm&kFaX(0J}2b!`1snxAj_OWAYR&%cO!v@DTx z(!o;1>%mt#eeYg6R~jAoecuXEVaLEwv`&Dis{+cLJ4fBqvkDtrhSKW=$a+IynRA>K zHoBZe>jucWCa}!8kX6bLyk33NbqkreL4fW1?NuRC^br->w z9}fT1Kg(zvUZ*QohI#MjIN$ezB-Ay^2Uc57ft+$_sgb>m6Pa#^|%@=cw`t+!0-VY^R# zDZQh~lwEFlh__VVmr?J;S@^6GqU5PNkdy=kDbINPK+mMkW$H&5QvgZBr2p* zA%!doRANCRSO=6p1c?>9?m%MAhJSz^=G?h6&hI2=q}X%j-t*1<#>erQseAR^UkXY^ z{;ch|o8DaQ!?rs|o28V}`}#fm{f70R>(#mkCzEceREl>uoAq?nZ8s@cR`Kp$v#!io zSsJqrPNpbtK^k)+X0Rd-Mh{L_-JMo=#!hRqR4SK)Gse4a$IDGRW6u&wFEgRCN_{t| zEbAIYPiRFe>-+Ay0e{eF?a^u%DA(JqI!Q`)tCm6JoQy$465~N;)xO^eMYpy)JCwR~ zaoT?7#mOBN2~J)@L~uPgLZ9{uBBCx3r9QVFP0U8Nt=P#E43aL10vn zv1l&jC5A0A;=ou$#*)YwkQfdymXL8%WDH7-6fkZgV_9UxBt`}p%g9&}8F7h`1;z?8 zZi$S9#K;5V7BI@em*djs;z*1FFjkQvmMQ;|grzvULR+yI6Pj0L$)3ligXVAPQz*XfAFcoP^6WXN?oDly&$#yT?OIvtZ3^T60ZhFquP z5~Bo+O=QS*Iw3I@fYC&TT&I%~V-XnlkRjLUl*G6RjQ5ct*XgvxSOLZsGUPg)kr=DM zXaS=l*XgXpxDAYLWXN?|lo)q`u@f+w_Ex{mYHznoZfU*We6*?O$v8>B&w16KPt+^N z3WHs*<5A<~qk`oy6g44;x-Ov1u|DdaURChzVJO{ zeb(BKZMPYGYMNCot1{{hqpd=j`K-7+3P9&*(kGb(SBLwQ~J0Q(U&I-q7+nAK5eIZJxl2v znioVmiza5wn4d>e!%$}u-=9l8rI{N(1M9&!Wj4{s`_oNU#1G`~3(S)meDtIcJx7@52=g4_o)n_zDDxa;o^(Tio)n_z81o!so@3mTLiD6taqt{x zo^%6&o)n_z1oNC=o)g@YLiC(uo|DXzZaC1BLiC(so>R>O9(-ql&)MK|1+SgQ(U{aGEFlh__VVmr?J;S@^6vl`n?la{u`9y^IkoET5iAUTpNArK{-N>oUt zLJCD*Gs*9?XQYhh%)RHE`;Cv|<7xWM-JeTJ z#SR)f-lo6Q_^|6O(Pk;7^s#=;f4^ZJ4E)BRe?05CuA3Zewwu|y*KJdS;>pWu{bKsqaRW zWnH4^F|BBIecxL*;161zJz8y*a{b-9lcr>^ZW%u zu9+M80>hRVNnl(<#0#5|#QSFy2LmT&Edn9n-+Lg$%h)vl3$# z7`Krj*J(~-%md>NGUPfPlNgJ@cn=wJo#rLR5-`?~A=ha^V!Q&3Ix^%sElP~bz-S;t zuG4XeaRnG2FsgE$mL$gOz-S^vuG0yL@g^`@$dK!FQewOfjCEwlbvh+6t^#8N8FHOY zOAHqno5+ysbVg!Sfzd{WT&J@V;~FsTB15jzIf-!{81ExPuG4vmu?mbWWXN^8ATe$N zqXUeZT&If?;}$Tsks;Sw=DW&MZS}EQQZD^9>F5jfKz&57N<)IiqubjZ-}>A+DyHr9aHux?wyVss zMaLCY%;@t@jDl(u#3`t$V%E+KhnCVgG%t*F7ER2Vu^^A8Mxo9melVB1Br`XRbY?V_ zS|EKzni^zMs57Ih#DmOe1#zV_qp5L|NmGMN3Uwx}D7cCpWJW8ngUo0uwM?O9kVzqW zjxf)WAT#nDF`3cgDMZgv=1EN{ttXmFEf#uGh@J`NnP8p??nxnfCYfiFc~VP;o)n^I zig{8yZSG!hOH$mELiBW)r^7s{g+osY(KF3F)66r?Jt;)b4D-w|PipzllS1^&GS4jY z%yLf((KE+9bIg-kMD(N(J;#{m81o$Co)n@d{hTp19#nCjc~VP>o)n^Ifq52~XMuZC zh@M5}S!ABnf}$sd=sC_j$C>9i_oNU#>E|bSmY65CtmsK0dQLFU3FbM$Jt;)bN#;4p zJgLP+PYTgx#fK=R#H~KkyoLbw>X~ zRep_j=5}l*oJZegRgC_f(>ByQ>l^xXWvIIdw)}ATpZ-}!+wdxlSQ+X8%tlQMZ9^Kh z)U&rBCm24`V|ojsi=96ISS9_vZdWC}-QJcet)~V%zGpu>R9kxA31(ML zC!e20^UUeI9(<@L>+@%aKjqAMeUZx9VdT?A)L9{JS$angx;l2R<+Ezk54v>C)g1Sw`xCqeba%>Y7q>PzVzO{N^X|8-i2UdwN7EtM7qt$Vwv hhdQ`_nm>7R-_iZv)9!w+;T-jkXY>Jno;-6c^*?7DCOrTE diff --git a/tests/data/acpi/aarch64/virt/DSDT.memhp b/tests/data/acpi/aarch64/virt/DSDT.memhp index bae36cdd397473afe3923c52f030641a5ab19d5d..33f011d6b635035a04c0b39ce9b4e219f7ae74b7 100644 GIT binary patch delta 1923 zcmY+^OOD!55CzZ&HXna5*!=%npg}#0ltdy@kTOeV*+cK3^`vYd1Jpwn(aI{ak}QMV zS9XcIBm0WFxQ-3Joo=V}{Qdmnwsfxj;XQp$I+ys{;c)#o`Wl7fv8<1;;NYq(&hGaX&YxVBhx4?JaMTcLCsz&P98=EKb4>~@3>Nig(!;36(8Tx4 zY+IU`ZBl4@oozd*Vpgh%p^9~6IFhQa5V=s*(Wq`>RGNsPsh5eRsVl_N)X~_bzHv$w z*-ASk?MPLno%*7>J))I%N!pdBj&=#^@5ol#BWaJMJ(eO`X`iHhlJ@JSZjabNz1-e} zqzOqAmLghdO45|1DM{~$7Me*jJbzgsBWcD`L@Uinnv*nVDPkj)s|*M_aINtB*Sj)c zDWa7YBrQlZs1JWzWGk&mT9LG3 zDWa8*NjfI!n4yTF>9LbMnF&cJB%QDn(MqQzosx9=kOnBCh0dI7p1-UxBk7E#h*ml$ z>71l6)Z# zmLgi|hNK&UZW^d>-rC4kx+UqBq+6CETIr6YyN49e9YYbLkLHH%NxCQLo~4Mj^iNU! G?)?X92B*>h delta 1922 zcmY+^IhNW`5CzcRk{C1~K+JR2v2BibfCrLJy0hLxdkOy>Z=eHWV#hw-fY;&8;P*?v z4)r2++*`^WUD&A6!#r_a9wTh4)2* z@JU08aCFTet1#tUI~P-EVX$t$lO9Gjh9-_Lvxzh@8&ha{olTrnF)LNXP{lejoJv(! zNL{GvXxcV0Dow=D)YD|r)D<#m>S*Q)ADvP~w$ctsJ5p6?r#-1|k7%V`l6Ix3qg{gf z8?u%5NZKQ5kEMuK+9zqBr2V$3+apG(m&==zG$(1!Qba2)NLrAzAn6U!LQ83e`!6e$ zBrRErXr&cNE0R_$MT}B8%YdK*_Yi*ndQ}E2MYPhIq%}!vmLgi|P?`!I64Vb_ifE-H zl8#6^Vkx4Pj!8O}rb5Ses1JWzWGih*+K{wiDWa85NID_ugrSI`>9LdCnJG!9B%QJp z(Mo3|oso2Qmj)=Jh0dLe_g_|+lXT8fL@Qm8bV1SuOA$j;p-Y1LB}tboMYK{pym26X zCay@jVkx4PuBE9ftnW~dtFmS(qLpq)x*_R?rHEF#CFz!++X(gXsf}!zn>+Fjs%K?a#p@VvND}_?kQXJ#wcT)Vn(eQ+~NzPIOn-8kw5kUV~!MB)YfO z8fNF<#MIcKbPqGoghEujXz2n6ik|&G2aZDNaQS34`A)m%C^5^b>C^W zZh`JRtPQqmUJ4fbwTP9-ds!hMMnY&@lo|_8jRBX2 zCo~qM#wDl5pi3hyG%iVvMW@D)OCu{Z7Ny3LQzPNh$O(-lsd3qL$AC|MVy+gb}Irl0(ZL(^~zX zBCQ2bBDKytwCu4{E&p-bU+RBqs8uU3!PW8sCT&%Ps$Spl_6aHRCD`iZC%v&f^Oo342ALB|nR8O*oK!g{bxtT{&MB31O67#F88Ro7GUv3)IjwR| z>zq)^oHHutjLHdLOJq(cWlp%Q0#_*fjX$e$&gz^{%A99Z&NC_}e2tMgp_DmGDrZUM zEa{w3+ME_kM!mx0jETapBeM_x4BV-CU=qQtcH1QUt&2mA9$pjhTHV6}b?{9Ur5@swSHkDx+>@hw zkuTs(g}N)pV^5C4K6^NaC-<-!kDu)RkhRBgNabX=_(r!8wBpBSt(#vKfBqv__`LP) z_@AxGpZwNw^5|o8l8;3q_yo6_fU{2TVJV;io{&8AY{&+8x$K<*CG}j$2KT%iQqPBM zP{;Wh`!&4*uv5(+}r`E8H;b$-4je2QilvajNL-0new36VBry#ts1B5qj zC3vG&T1oK6QxM+R0m2)%61=f_f;XOm@Wu`h-nfW55Z>4U!W*{|ys>$LH=cs<#tsnPxRv0I%@e%w6ofZ+ zfbhnx1aEAf;Ekssys-m>H*O_(WAg-WJO$y69U#1ME5RF^CwSv22yg5F;f-4f-q<|B z8&5%aV+ROt+)D7q<_X?-3c?#ZKzQR;f;To#@WxXR-q-=c8@Cd?v3Y_wo`Uek4iMhB zmEeud6TI;hgg178@W!nKZ)~36ji(^Iu>*uRZY6kQ^8{}^1>ubyAiQxa!5f<=c;hJu zZ|nf!jav!c*gU}-PeFKN2MBN6O7O0Hrx^$Y^5Lr6i(K`}PhB7S4B+aNmif})JNON1 z|Ese;bms#8XX=e+qicE3{!eNJ?|uP3W#B|#-`+ipMP9K>_nRFcE%=j@NP6*|uToh0 r&7iqHSDfuOZ*L{{u(*4iJ9>E6LxOqK-tN@B{hfnJd?Vq~x3kPYFGd+# literal 7679 zcmeI1%WoT16o;=LiS6+tw&OgUms2Pe&&rRcNlRN|kDbINPK+mQkW$GN2t>&y5*4CU z2$MyD>}Vu5Y=RVtKLaFI?AWnDEZOi6uw(e$xiif<%Gt6(o=EnbnR~vOZ*mf!d);)J zJMO$v;U62@_J*_Ac)M#aVhbsy^uB)29{mxz*LNEIy~7#1TrS7^8|`MgZg<-VmQ}oe zr`=S0hu7`xPH?W**(jGQ75`kkZ}*(sj5YNnp)@d|vPykDszRX)h#nz#Wo_49Gu*dY zojc%Gk?ZW#tt67|ddMJhR>mO0iD^Hx9=+QOZfSF?w~f@Lv&%lqJiEB1!v4j}6j3o( z=Lmh-bBM4yLsWc3L*GSJLRevm5haW&WzWPjh!e&Y%2*N^BND?R zj3vsrDl$eTMv5@5QpU2#h)IkLVJuU|ipYpdj4WZSP{uWpk&qa9!nj5l75|TemzZTq zi~?a?rwp-7`5#G8>XU@=I%UXpnv(8gnlNrqhFqs7>MXl`z&QL$1>)iE)`Q)+s}-(`kuOCX5Zrkn6N4F{*^o zrVP1GXC%fI!njQta-GgfjH`t4CS}NVIwvt!2xF5n z47pBA65|$OZ2OG*cU^nMzmM)#w_Go;p!u-T9+{|ysL|N6240h--<+LZu4qjjs87^8 zYEd=Sd+KBLK)s81rBl6jqubjV-~7lL#B99T~=m4g#C)jr}S|vjG!6>5d<}r zPe)U(izdsA^CO*wff;ko%frwh)S2LWW6>L#v3{g8gP~}A^c6AG%S5O%aW~%Bh?g1M zK}6}yU}(f-VyKsiP-o(fyv(SV8QekC%M6C1QSvRlOoY^Pgn5p5nStks$qa5DA@v+( zo@hmJKfzEmR@4(A^^7sk81syAPlVJn&OGDH6O9=4L`Xdo%o9ztd3xR>NpMeu)YD>~ z7V|{oMm-Ty&m{9qGS4LUL`XeT%rnJ2(dbc6gw!+5Jk!iG%{>uP&kXa-Fi$jw)Dt1~ z9Alni%yW!;BBY*}#F!fQsyNF$(MVEHgw!*~Jaf!5$2}2J&ph+YGfy<0)Dt1~9A}>6 z%yXQ3BBY)L=2>8#XjG{uLh3oeJSUjv1ouQpJtvvxB=baLOFa=%&nf0P#XP6DCqn9p zDJA)RPBTw5!qgKX^(->aBJ(VAPlVKShI!5~Pc+Wd6Cw4SWuCLlbC!D|q@HukbB=kU z(WagVsV5c+@PvZ@`18zjo_ivso(s%#fq9}ar=AF@XNh^1m}iN5A~c?RhwZ5TRhmhg z{wTJ~Yxs243l;Hx7mE=YE9osjJnwyw<8$}ivBGHI={X@E<@X5;h1CIyx+tK`v1h2; z`mX%8mZ9`20cE`)u*@;e^4nX^!fF9!T@qO4n9nly6ITAKvgWR~4-Rt~b`@1x^`t9z+`cF51d;e(Pief+& z?{U1>4s@V7dZ(sQkM)aH(yu4kr^gRMpW#f!8(le>e0m&=Gl#SKI9A3vzpOuTgjU1DqFSDsx7MWuz4FcZ+<^JY~J+5V%WU7?yz~&l`w4HTzA;K z=}H(jZ+@TopEYlVY~OCw)hQ;xD!Ymo)6H$ftcO86r_|>-K2Tp`YLvp#khf)B_aD8I zj?KM{Gocg~aLk4kCu}axhSbq3B*!~F(C75#MQ1Pi`2VtD>_mGlhfO8)cawU;P_xSC zo{#rCz|$$Mo;@19hs6T(Rc9l1@3Q_fb-mT>oB4_PCbgp13Nb-Rz;IXdkt%_cUaUrbTVX{ HYpH($%hpa? diff --git a/tests/data/acpi/aarch64/virt/DSDT.topology b/tests/data/acpi/aarch64/virt/DSDT.topology index 501314c91be01d927fd125e0c72e919fdd85592e..029d03eecc4efddc001e5377e85ac8e831294362 100644 GIT binary patch literal 5398 zcmZvg%WoT16o>EFlh__VVmr?J;S@^6GqU5nTG|qO>@+TMVmwKMluE9IK$L7EQ6ZHI zDb1olb~J)@kn)Ehv0}%LMb~Wj2iRfGojc?Fj(bMRc+T8=zL{_4*f}$Guf8`vrc`94 zw(D+r3$;(%?gA~AQc7>@&+PME>tWZcbq`Oc-BPI(?`}2g>8jgqQn04t-Th`=nX$4o zW*wf+P~MU>=0wb3MI4Np!DVsKHiIkT;E)+y6$gjS;F>rXJ3PJR?zTe9PHU@FDwhMx zc-QTCxoKzOMMCLIlTcZuejHYob&aB@bQRV&58O=y{-n{`rz^BXx!zvYNm8;~wG1NX zWDFvbm<%$j_JdA%akqCmyOg?eb=ZF9)xjMV2@YODL~su{LLc@uL_}R7$^oLL&my>0 z9N|PHh7F7*WCV9~fDx4#!@#H@Be?Mc3|nHvfw7E?pgsl|LlVOQ#tJfmG8$kEON4?Ob2gXOpkn1!n zF&2Qafeg7$a}whXU{sMI*J)m2Tn9!C8FHPDN{k!8aDlNT*J(jwybX*xGUPfPlNj#; zqk#;$PRAw2`@q;lhFqr;5@Qh<_mCmi>7>Lc0b>gpa-B{|j3r<+ks;UVw8U5j#(iYS zbvh$4R)O&`GUPg)l^E;5*hYq2r*jhH7BE`CsK|9XFEMTdV+R>>ofaj=U1016jHbQa zZ|&MU?UGyC>^C26>UlCw((iNL^yd@xwS9%*uGjIX+4E7s=`a*EA&9yzpv=BL>b^d! z;M>Dcx*0*%TLR1M_Qe2sgw zuGMz>*U{VSM1OVK`bIrgpV6n%5Wh`pclJiNzjjXYNjur=DSdC0 z*|uoAtnw+n--+nU69!QVDk`70Q@x(0bPml6BArDOGiJ=sqp4x2Gl}odrJmBv4I-Tx zPNjxXACac|nH1{G@F-C~GdzQ+(wX7ZsL75IyIa=REVI2O9LG z5Iu{`v&cM)+>=7%S;%(XT2)=ruY$^Mn18I)v24YhCqJZBgnpoC>FJgAJ=IH!{$Kh_ zw%7YlKbFzbTci;yMgP^zLjQ(o>CvdA&fbF@fA~a?>BWz(R(k(a75DbrZ54O-I@_wS zndt6%u6=f--1t3Wp32T9=2Xp{w>aK{BrQ5`rDnqzBoH_Y?`n{pK!6ZGiJX&Ncr-p@Iklk zwd{*)=4}34`{QqK<=fUz6aTcPes^0JiSw`2WiD!2dPOg~T6U-5K?~&iFS>MFSaC|P zT_67*(A6n!TwNNyMZcity;&{xa8bWwyw|9A@x5GGw^HEFlh__VVmr?J;S@^6vl`n?la>}@kDbINPK+mQkX*@?5QvgZB`Ty+ zA*ERq=#EBW9i&M78%V6!v17rS4gUZ;%(-)ClHYO9NEy$Wd(SuX%^YWrr|CEMr>B&P zoiz5mZGWZlN!MGU#ZpS?ZT*>lwrAZR_>DpTc;0heH#yjDH?wuG+ooVmB?ougO=ZR^ z(wNmhUZA|HH0H$2U`-s1o55@1plt?M#lbN%cwHPEH-l^9V4{C~)7$Grmc7ol>sBhE zWpd#4{KC95^E{>WrAev0Qa_9<%eq9-6S@lPn+M*e0e{@;+@&j2rCfi%?xZQ%t6K(9 zaB>C_OU;Ivb^Bf~y0|;Ly*)}@y*TW7=EcDs6$=mUA|kv89H9^U3L>U15S0+o&}R|e zDvoes62k^Y6&c|j9bv>J#yBu)$Ov!z2*Z{bNnl(zR@(?8Au)2mSVP9TDXbAjQexzRv5t%zA|oX+iom!5j7s?B7 |Vw8Y!6B%Ne z@-InL>eIk@9~p9;W~B3&1;#C8$aR{P81ulmjSRU?a}r|_7#|=*uG0yLu?&n4ks;S< zUSg~OV*?p-ofagz}P{CT&GJC<0ddVz^KV}x-2nn0b>^#a-EhX#s|RI3mLn= zbiH z3nQIH6LV%P$fK!Is56Nl%%v{L%nc)*8BL`YNFR}=2ALG<%;+fbATv6HxYC)?)VRr{ zsX-=%I+M;QIEo!)MrU9LnbA~gnL^7TlS1?yW1eF{X5=|$GNY5H5Ix74CpD#XKG9Ta zvCxx3^h_|%1oKRAPYTg9$vl(HlUg$Lq!2w*%#+$_bM=BtlH#5eqNl?=9p*_b9C}iS zo@wTpW}a#8Ng;Y>m}iE0Qp<;)6ryLAd1jesmU~i&o;l{3W1iF^q9=vuIl(+9nCArd zq!2yn=ZvZGpo;U%lUhpjq!2v|%(K8e3*3`J^ei&ZBJ-pc6g??K&q?Mv$vh{yCxz%) zVxA@DNi8dSQiz`PW0|f{^dDl1c}{Up3ej_#c}_D=YH`t%LiC(ro-@pIhI>+op7i4q z?&mD?q?Q;xDMZgX<~hea=eQ?@=sC|k=b0z9(CA4adM+@}1?IWHJt;)bMdrE4JgMbI zPYTgLGEjoHqh@NHUS!SMP?n$BX>>syneJjn+H~mod+|Ba` zahG08=+n|-V-3|vVb!C&QW*tS%nQQL+SSg z$a+IynSGoUHoBZe?+uW3MPQkIA*+-hc#XO`qyM2Qzd-$9U!Fws%AgYI@7Sl8%-Q_0_WR%d>NlMqXa4ET{pNK}Qzu`lvqIdm^om||b?jctXVs`* zbm^L_IqoahC%6Z6b;=tTmqu^V^Txb4Yb5Sp)$bU$TFrqear1()q8m?o!I-6ikZ@GArKLgZ-hS$l4j!E5Po6$-bien!d(dk*NB!eD My@B5+&m2qr5A>`*Jpcdz diff --git a/tests/data/acpi/x86/microvm/DSDT.pcie b/tests/data/acpi/x86/microvm/DSDT.pcie index 765f14ef3d1e54d3cadccbf0a880f8adb73b3f1f..8eacd21d6ecdf9a3cd3e4f03cf1b40748dcbf53e 100644 GIT binary patch literal 3023 zcmZvezi%T&6vyA%>v%o>k+tK*aeg^mi$LpG(a|8Cy>=2KJ6XKW1>uwSBn09}6A6R_ zC!~<#fOMQCY9r+jprt#Bii+zfD7c1(hLWq!eBVwsnkTZ7XXiWf-rM(iv$LLV*d70t zCrb4?NB%*$-FYwYw{clS#C^WI{hy<-6HPAO+&l62oamt6Exy|u9($g*LrxTq+bZuw z{wORLmD;!5jGM6vU(n=Y$3Gf2WBWq;dB$?0<9-({J1lAkR~7fIm+6;ja_+dqK7)n* zUN_PYY5PIn^O`L@1fE4qpvTWCsr+{fl(urn4o>I2QE#6LM6*}Qo_w8W>G#6kV7s~* z_3n>zXDlAh-09=*;4tuf&)3gFA|3nw$h)iO28W{~(C+F< zl|>eQ8zh{|nyNj?kwvASUQvn&d0AFX=XKGy2XHgv?_I}tqTq(ZvXDABOtxgKx)6n< z(3UAijVTLN+5=xKVgnEaw+PNhwm6EYax5Uz8GFI!hSZNt+@s?P6>?T%c z8u)#qT5azwt_fD2f!iLx2O$>Pi8`G_w0I|YWIQT2)J8x)_31;}CJ(9xT5G|?{?ZSMH&7keB6??(3q~~Hk zm?n}v5kI!Yi=l0=6?-P8{6cEe*b|aHkxguyjy;nM(!`#K8k?rICrlH`p2#Tnv}4aC z13UIiOl|F%BoAqFW&~#@_DoDO+A~QW(&StaoY)G;Cox@+oRB7GR&ZtoXI64Tnw*Z{ zbOfg(IU!9>d_IgyV-xG@#l_4?PDqne2~H(AmE?ppIq|L4HHp_KFF5m(6Vl|wM*^P7 zdKLs{L2^QxoY?;cXHjq#B`2iGiQTSi5@%ZyoF&N#X>ww38=Q-Rb5U|anw+lSbOom? zIU!BXvfwNW&a&i$G&w7Rvm!Vvk`vP8ToRm1f^$i7LYkaa!C4iYRmllyaxM$bWx=^D zIU!BX6~Vb8I9DVmq{)fDG-Ew$g0m($Ax+Lz!MQ3pS0yK;$ypbib-`JeoRB8xn&4a$ zoNJO3(&Sthoa=&fU2;O2oEw63LvU_LPDqn;Q*dqy&P~Y)X>x7}&Mm>YB{?BY&W7M@ z2+oG&gw&kXLgaVabj8P&7y7hvm{IS4Tci{Z5zV==zQMav;DaMy&*t;_pSnxim#-T$ zdKxV(@NX=ydyqI0g~vfajz1m^sJfSnjzZtQJSD$-(wrS+ryrM>lsg&z(fYTsaqowd z&)U!4{{5rNQ)R%aKJS#J6XU6s?8#SM;? wLwnQhprDg6jOkcjLl0lI!ET5BLPM8&$Hl!oN2C2j(D$9{>OV literal 3023 zcmZveJ!~UI6vyA%>v%o>$oeDSAI{O94y|oPM}ri5?IcEVvUr^f!bp1(0&%2?1VVxn zQb=(?IxZ5mks@kZx|67=xDF%=uA!l!R7|2Yaf(fI83?IVBNiFW(l;@iE!q33x`a-wk5R(U7# zhhe#>)Q;t5+>BNDoW^HO|6tIF?F;Rv8OwEG&!+J8}IFc;9QWLkS6D%;9L}( zi;@%470C%{a;^%_Rl&I`IU!BXHNm+i zIM*a6q{+E1IM)T|y5xj3IqQP6E;#Fw6H;@I&-ylRK`%eizPEk-Ar7vO8kU8<3P%n7 z$=An^WUMw5g~QOg3Z>6G@%Q~Y6iy0eC~RIgLg}&VP;YTo%~xclQZg2{vm3GWn2gnW zRjjm(g$?dTEIoD=t5%5oPMa>d7SD7k_A~0;Z;F)SA)+Za*4OxF75L!D*OTdV`j_s~ z#`$eyMo*)K1)R^pMFFq<)l{+5%+4`@( zy7m3hr|l(&$ue6&cA#&E__kAKCb&7oWsqilC>4iIP^boV5A=a diff --git a/tests/qtest/bios-tables-test-allowed-diff.h b/tests/qtest/bios-tables-test-allowed-diff.h index 9282ea0fb2..dfb8523c8b 100644 --- a/tests/qtest/bios-tables-test-allowed-diff.h +++ b/tests/qtest/bios-tables-test-allowed-diff.h @@ -1,7 +1 @@ /* List of comma-separated changed AML files to ignore */ -"tests/data/acpi/aarch64/virt/DSDT", -"tests/data/acpi/aarch64/virt/DSDT.memhp", -"tests/data/acpi/aarch64/virt/DSDT.topology", -"tests/data/acpi/aarch64/virt/DSDT.acpihmatvirt", -"tests/data/acpi/aarch64/virt/DSDT.pxb", -"tests/data/acpi/x86/microvm/DSDT.pcie", From 329b327924f7e1a7a735c91b31f722708599e10f Mon Sep 17 00:00:00 2001 From: Sunil V L Date: Tue, 16 Jul 2024 20:13:03 +0530 Subject: [PATCH 55/61] tests/qtest/bios-tables-test.c: Remove the fall back path The expected ACPI AML files are moved now under ${arch}/{machine} path. Hence, there is no need to search in old path which didn't have ${arch}. Remove the code which searches for the expected AML files under old path as well. Suggested-by: Igor Mammedov Signed-off-by: Sunil V L Acked-by: Alistair Francis Reviewed-by: Igor Mammedov Message-Id: <20240716144306.2432257-7-sunilvl@ventanamicro.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- tests/qtest/bios-tables-test.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/tests/qtest/bios-tables-test.c b/tests/qtest/bios-tables-test.c index f4c4704bab..498e0e35d9 100644 --- a/tests/qtest/bios-tables-test.c +++ b/tests/qtest/bios-tables-test.c @@ -267,15 +267,6 @@ static void dump_aml_files(test_data *data, bool rebuild) data->arch, data->machine, sdt->aml, ext); - /* - * To keep test cases not failing before the DATA files are moved to - * ${arch}/${machine} folder, add this check as well. - */ - if (!g_file_test(aml_file, G_FILE_TEST_EXISTS)) { - aml_file = g_strdup_printf("%s/%s/%.4s%s", data_dir, - data->machine, sdt->aml, ext); - } - if (!g_file_test(aml_file, G_FILE_TEST_EXISTS) && sdt->aml_len == exp_sdt->aml_len && !memcmp(sdt->aml, exp_sdt->aml, sdt->aml_len)) { @@ -412,11 +403,6 @@ static GArray *load_expected_aml(test_data *data) try_again: aml_file = g_strdup_printf("%s/%s/%s/%.4s%s", data_dir, data->arch, data->machine, sdt->aml, ext); - if (!g_file_test(aml_file, G_FILE_TEST_EXISTS)) { - aml_file = g_strdup_printf("%s/%s/%.4s%s", data_dir, data->machine, - sdt->aml, ext); - } - if (verbosity_level >= 2) { fprintf(stderr, "Looking for expected file '%s'\n", aml_file); } From cc3ba2422554b63f30b697eb67143b5d48c5b7a3 Mon Sep 17 00:00:00 2001 From: Sunil V L Date: Tue, 16 Jul 2024 20:13:04 +0530 Subject: [PATCH 56/61] tests/acpi: Add empty ACPI data files for RISC-V As per process documented (steps 1-3) in bios-tables-test.c, add empty AML data files for RISC-V ACPI tables and add the entries in bios-tables-test-allowed-diff.h. Signed-off-by: Sunil V L Acked-by: Alistair Francis Reviewed-by: Igor Mammedov Message-Id: <20240716144306.2432257-8-sunilvl@ventanamicro.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- tests/data/acpi/riscv64/virt/APIC | 0 tests/data/acpi/riscv64/virt/DSDT | 0 tests/data/acpi/riscv64/virt/FACP | 0 tests/data/acpi/riscv64/virt/MCFG | 0 tests/data/acpi/riscv64/virt/RHCT | 0 tests/data/acpi/riscv64/virt/SPCR | 0 tests/qtest/bios-tables-test-allowed-diff.h | 6 ++++++ 7 files changed, 6 insertions(+) create mode 100644 tests/data/acpi/riscv64/virt/APIC create mode 100644 tests/data/acpi/riscv64/virt/DSDT create mode 100644 tests/data/acpi/riscv64/virt/FACP create mode 100644 tests/data/acpi/riscv64/virt/MCFG create mode 100644 tests/data/acpi/riscv64/virt/RHCT create mode 100644 tests/data/acpi/riscv64/virt/SPCR diff --git a/tests/data/acpi/riscv64/virt/APIC b/tests/data/acpi/riscv64/virt/APIC new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/data/acpi/riscv64/virt/DSDT b/tests/data/acpi/riscv64/virt/DSDT new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/data/acpi/riscv64/virt/FACP b/tests/data/acpi/riscv64/virt/FACP new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/data/acpi/riscv64/virt/MCFG b/tests/data/acpi/riscv64/virt/MCFG new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/data/acpi/riscv64/virt/RHCT b/tests/data/acpi/riscv64/virt/RHCT new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/data/acpi/riscv64/virt/SPCR b/tests/data/acpi/riscv64/virt/SPCR new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/qtest/bios-tables-test-allowed-diff.h b/tests/qtest/bios-tables-test-allowed-diff.h index dfb8523c8b..70474a097f 100644 --- a/tests/qtest/bios-tables-test-allowed-diff.h +++ b/tests/qtest/bios-tables-test-allowed-diff.h @@ -1 +1,7 @@ /* List of comma-separated changed AML files to ignore */ +"tests/data/acpi/riscv64/virt/APIC", +"tests/data/acpi/riscv64/virt/DSDT", +"tests/data/acpi/riscv64/virt/FACP", +"tests/data/acpi/riscv64/virt/MCFG", +"tests/data/acpi/riscv64/virt/RHCT", +"tests/data/acpi/riscv64/virt/SPCR", From 5b966e548fb7a530431379bab053eb1d6fb25867 Mon Sep 17 00:00:00 2001 From: Sunil V L Date: Tue, 16 Jul 2024 20:13:05 +0530 Subject: [PATCH 57/61] tests/qtest/bios-tables-test.c: Enable basic testing for RISC-V Add basic ACPI table test case for RISC-V. Signed-off-by: Sunil V L Reviewed-by: Alistair Francis Reviewed-by: Igor Mammedov Message-Id: <20240716144306.2432257-9-sunilvl@ventanamicro.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- tests/qtest/bios-tables-test.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/qtest/bios-tables-test.c b/tests/qtest/bios-tables-test.c index 498e0e35d9..36e5c0adde 100644 --- a/tests/qtest/bios-tables-test.c +++ b/tests/qtest/bios-tables-test.c @@ -1963,6 +1963,28 @@ static void test_acpi_microvm_acpi_erst(void) } #endif /* CONFIG_POSIX */ +static void test_acpi_riscv64_virt_tcg(void) +{ + test_data data = { + .machine = "virt", + .arch = "riscv64", + .tcg_only = true, + .uefi_fl1 = "pc-bios/edk2-riscv-code.fd", + .uefi_fl2 = "pc-bios/edk2-riscv-vars.fd", + .cd = "tests/data/uefi-boot-images/bios-tables-test.riscv64.iso.qcow2", + .ram_start = 0x80000000ULL, + .scan_len = 128ULL * 1024 * 1024, + }; + + /* + * RHCT will have ISA string encoded. To reduce the effort + * of updating expected AML file for any new default ISA extension, + * use the profile rva22s64. + */ + test_acpi_one("-cpu rva22s64 ", &data); + free_test_data(&data); +} + static void test_acpi_aarch64_virt_tcg(void) { test_data data = { @@ -2441,6 +2463,10 @@ int main(int argc, char *argv[]) qtest_add_func("acpi/virt/viot", test_acpi_aarch64_virt_viot); } } + } else if (strcmp(arch, "riscv64") == 0) { + if (has_tcg && qtest_has_device("virtio-blk-pci")) { + qtest_add_func("acpi/virt", test_acpi_riscv64_virt_tcg); + } } ret = g_test_run(); boot_sector_cleanup(disk); From e9c0d54f4aebeeeebf1124d6acd607e3015e0a51 Mon Sep 17 00:00:00 2001 From: Sunil V L Date: Tue, 16 Jul 2024 20:13:06 +0530 Subject: [PATCH 58/61] tests/acpi: Add expected ACPI AML files for RISC-V As per the step 5 in the process documented in bios-tables-test.c, generate the expected ACPI AML data files for RISC-V using the rebuild-expected-aml.sh script and update the bios-tables-test-allowed-diff.h. These are all new files being added for the first time. Hence, iASL diff output is not added. Signed-off-by: Sunil V L Acked-by: Alistair Francis Acked-by: Igor Mammedov Message-Id: <20240716144306.2432257-10-sunilvl@ventanamicro.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- tests/data/acpi/riscv64/virt/APIC | Bin 0 -> 116 bytes tests/data/acpi/riscv64/virt/DSDT | Bin 0 -> 3576 bytes tests/data/acpi/riscv64/virt/FACP | Bin 0 -> 276 bytes tests/data/acpi/riscv64/virt/MCFG | Bin 0 -> 60 bytes tests/data/acpi/riscv64/virt/RHCT | Bin 0 -> 332 bytes tests/data/acpi/riscv64/virt/SPCR | Bin 0 -> 80 bytes tests/qtest/bios-tables-test-allowed-diff.h | 6 ------ 7 files changed, 6 deletions(-) diff --git a/tests/data/acpi/riscv64/virt/APIC b/tests/data/acpi/riscv64/virt/APIC index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..66a25dfd2d6ea2b607c024722b2eab95873a01e9 100644 GIT binary patch literal 116 zcmZ<^@N_O=U|?X|;^gn_5v<@85#X!<1dKp25F13pfP@Mo12P{Zj?R|`s)2!c7=s}J I#NvT*0o0BN0RR91 literal 0 HcmV?d00001 diff --git a/tests/data/acpi/riscv64/virt/DSDT b/tests/data/acpi/riscv64/virt/DSDT index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..6a33f5647ddd6de3a0f000f718b58f6fff44f0fd 100644 GIT binary patch literal 3576 zcmZvfO>Y}j6o&8Elh_%5#CDu-Cr+Uf3sh+w6Os)g_Bd%=*~#E>8YCmRQk04kR6;38 z1tF>|_=woFL2TFrkzaw3Kw`y?9SFpd4SxW3<~{d%%sFI6^31%~=e+kk*Z0mHH-bj@ zpNvwm2lYK~Cs?V!>U%3VETt6P(>3S)@mfEq_j{*1w&%KTvcJ=8WNMz@gjiR}{(iH8 zbGz2fKj&PZyKX7U;>Z7W?{s7Pz}q%PuWYsVVYCX1pj&fN$-d{+ESx(*KJR2do*=ti zZVrZzRPS`Xi5g61C-80~vob2-W>CkyNK|R1?&w4>;qA3$W_6TFISbCL=}hIQ%g@G@ zWjVUnWWNzK3ahdFl#?s|I{59|`=7VZo_zP!_qV>W3X4`@E|xib^R2_<8+RWZz5VXd z{liD-c5?&O*6iDzu-w*eXAkAB{nzP;GweEtu5-h#Y0wB*TT~3Ow4gz{VzI(3Vnd5M zRk0dn;l!dmT;>ty9@R*Xc$CK`^RT(c3y~gnl!wne#<<5?q{mp4#~tP|&OKg?^cauw zIAIK0WsoGrqNCJXm^Cc?|KNw49okzSZI-0Rs0FVi;iYJK5* zO*UFJcY{_to<-zbb7?gPTQ_m8*LO$b7<4=NvQzSvr<&?Wttc0t2JYTd_tYV-icIt} z@;lwB);rGQLds4J28wM<p$GlFwQaL!0hNX|JcIA;attmK5`oOrQgzvrCboRgf8oO51q&I`_Y z$qC6h7X;^m;9QWLkeqW-a4rhYMac=tIZJ}GBsfcw6OwZ-3C<4`(C}KF7YH%*r(}jWhdc0{4}Ft)TGpaPSz)wD{1VN`q&%{ z1|Nm@_{K`p3#UG?1=9g-sk7%$sG9DR90!PI_v{B_o}QzwM|dBbPf<_=zP8dH)RXFi+dCw;Qyc2X#oHL literal 0 HcmV?d00001 diff --git a/tests/data/acpi/riscv64/virt/SPCR b/tests/data/acpi/riscv64/virt/SPCR index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..4da9daf65f71a13ac2b488d4e9728f194b569a43 100644 GIT binary patch literal 80 zcmWFza1IJ!U|?X{>E!S15v<@85#X!<1dKp25F12;fdT`FDF9*%FmM4$c8~z`e;@#f G!2kgKJqrN< literal 0 HcmV?d00001 diff --git a/tests/qtest/bios-tables-test-allowed-diff.h b/tests/qtest/bios-tables-test-allowed-diff.h index 70474a097f..dfb8523c8b 100644 --- a/tests/qtest/bios-tables-test-allowed-diff.h +++ b/tests/qtest/bios-tables-test-allowed-diff.h @@ -1,7 +1 @@ /* List of comma-separated changed AML files to ignore */ -"tests/data/acpi/riscv64/virt/APIC", -"tests/data/acpi/riscv64/virt/DSDT", -"tests/data/acpi/riscv64/virt/FACP", -"tests/data/acpi/riscv64/virt/MCFG", -"tests/data/acpi/riscv64/virt/RHCT", -"tests/data/acpi/riscv64/virt/SPCR", From 78cc8c69475042ce6b6f720f8c81920fead0d86e Mon Sep 17 00:00:00 2001 From: Alistair Francis Date: Wed, 3 Jul 2024 19:20:25 +1000 Subject: [PATCH 59/61] hw/pci: Add all Data Object Types defined in PCIe r6.0 Add all of the defined protocols/features from the PCIe-SIG r6.0 "Table 6-32 PCI-SIG defined Data Object Types (Vendor ID = 0001h)" table. Signed-off-by: Alistair Francis Reviewed-by: Jonathan Cameron Reviewed-by: Wilfred Mallawa Message-Id: <20240703092027.644758-2-alistair.francis@wdc.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- include/hw/pci/pcie_doe.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/hw/pci/pcie_doe.h b/include/hw/pci/pcie_doe.h index 87dc17dcef..15d94661f9 100644 --- a/include/hw/pci/pcie_doe.h +++ b/include/hw/pci/pcie_doe.h @@ -46,6 +46,8 @@ REG32(PCI_DOE_CAP_STATUS, 0) /* PCI-SIG defined Data Object Types - r6.0 Table 6-32 */ #define PCI_SIG_DOE_DISCOVERY 0x00 +#define PCI_SIG_DOE_CMA 0x01 +#define PCI_SIG_DOE_SECURED_CMA 0x02 #define PCI_DOE_DW_SIZE_MAX (1 << 18) #define PCI_DOE_PROTOCOL_NUM_MAX 256 From bc419a1cc5b15deec9cf7cb7a382392c112810e2 Mon Sep 17 00:00:00 2001 From: Huai-Cheng Kuo Date: Wed, 3 Jul 2024 19:20:26 +1000 Subject: [PATCH 60/61] backends: Initial support for SPDM socket support SPDM enables authentication, attestation and key exchange to assist in providing infrastructure security enablement. It's a standard published by the DMTF [1]. SPDM supports multiple transports, including PCIe DOE and MCTP. This patch adds support to QEMU to connect to an external SPDM instance. SPDM support can be added to any QEMU device by exposing a TCP socket to a SPDM server. The server can then implement the SPDM decoding/encoding support, generally using libspdm [2]. This is similar to how the current TPM implementation works and means that the heavy lifting of setting up certificate chains, capabilities, measurements and complex crypto can be done outside QEMU by a well supported and tested library. 1: https://www.dmtf.org/standards/SPDM 2: https://github.com/DMTF/libspdm Signed-off-by: Huai-Cheng Kuo Signed-off-by: Chris Browy Co-developed-by: Jonathan Cameron Signed-off-by: Jonathan Cameron [ Changes by WM - Bug fixes from testing ] Signed-off-by: Wilfred Mallawa [ Changes by AF: - Convert to be more QEMU-ified - Move to backends as it isn't PCIe specific ] Signed-off-by: Alistair Francis Message-Id: <20240703092027.644758-3-alistair.francis@wdc.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- MAINTAINERS | 6 + backends/Kconfig | 4 + backends/meson.build | 2 + backends/spdm-socket.c | 216 +++++++++++++++++++++++++++++++++++ include/sysemu/spdm-socket.h | 74 ++++++++++++ 5 files changed, 302 insertions(+) create mode 100644 backends/spdm-socket.c create mode 100644 include/sysemu/spdm-socket.h diff --git a/MAINTAINERS b/MAINTAINERS index 93546cfb14..d76b49a597 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3401,6 +3401,12 @@ F: tests/qtest/*tpm* F: docs/specs/tpm.rst T: git https://github.com/stefanberger/qemu-tpm.git tpm-next +SPDM +M: Alistair Francis +S: Maintained +F: backends/spdm-socket.c +F: include/sysemu/spdm-socket.h + Checkpatch S: Odd Fixes F: scripts/checkpatch.pl diff --git a/backends/Kconfig b/backends/Kconfig index 2cb23f62fa..d3dbe19868 100644 --- a/backends/Kconfig +++ b/backends/Kconfig @@ -3,3 +3,7 @@ source tpm/Kconfig config IOMMUFD bool depends on VFIO + +config SPDM_SOCKET + bool + default y diff --git a/backends/meson.build b/backends/meson.build index 749b491f12..da714b93d1 100644 --- a/backends/meson.build +++ b/backends/meson.build @@ -33,4 +33,6 @@ endif system_ss.add(when: gio, if_true: files('dbus-vmstate.c')) system_ss.add(when: 'CONFIG_SGX', if_true: files('hostmem-epc.c')) +system_ss.add(when: 'CONFIG_SPDM_SOCKET', if_true: files('spdm-socket.c')) + subdir('tpm') diff --git a/backends/spdm-socket.c b/backends/spdm-socket.c new file mode 100644 index 0000000000..d0663d696c --- /dev/null +++ b/backends/spdm-socket.c @@ -0,0 +1,216 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* + * QEMU SPDM socket support + * + * This is based on: + * https://github.com/DMTF/spdm-emu/blob/07c0a838bcc1c6207c656ac75885c0603e344b6f/spdm_emu/spdm_emu_common/command.c + * but has been re-written to match QEMU style + * + * Copyright (c) 2021, DMTF. All rights reserved. + * Copyright (c) 2023. Western Digital Corporation or its affiliates. + */ + +#include "qemu/osdep.h" +#include "sysemu/spdm-socket.h" +#include "qapi/error.h" + +static bool read_bytes(const int socket, uint8_t *buffer, + size_t number_of_bytes) +{ + ssize_t number_received = 0; + ssize_t result; + + while (number_received < number_of_bytes) { + result = recv(socket, buffer + number_received, + number_of_bytes - number_received, 0); + if (result <= 0) { + return false; + } + number_received += result; + } + return true; +} + +static bool read_data32(const int socket, uint32_t *data) +{ + bool result; + + result = read_bytes(socket, (uint8_t *)data, sizeof(uint32_t)); + if (!result) { + return result; + } + *data = ntohl(*data); + return true; +} + +static bool read_multiple_bytes(const int socket, uint8_t *buffer, + uint32_t *bytes_received, + uint32_t max_buffer_length) +{ + uint32_t length; + bool result; + + result = read_data32(socket, &length); + if (!result) { + return result; + } + + if (length > max_buffer_length) { + return false; + } + + if (bytes_received) { + *bytes_received = length; + } + + if (length == 0) { + return true; + } + + return read_bytes(socket, buffer, length); +} + +static bool receive_platform_data(const int socket, + uint32_t transport_type, + uint32_t *command, + uint8_t *receive_buffer, + uint32_t *bytes_to_receive) +{ + bool result; + uint32_t response; + uint32_t bytes_received; + + result = read_data32(socket, &response); + if (!result) { + return result; + } + *command = response; + + result = read_data32(socket, &transport_type); + if (!result) { + return result; + } + + bytes_received = 0; + result = read_multiple_bytes(socket, receive_buffer, &bytes_received, + *bytes_to_receive); + if (!result) { + return result; + } + *bytes_to_receive = bytes_received; + + return result; +} + +static bool write_bytes(const int socket, const uint8_t *buffer, + uint32_t number_of_bytes) +{ + ssize_t number_sent = 0; + ssize_t result; + + while (number_sent < number_of_bytes) { + result = send(socket, buffer + number_sent, + number_of_bytes - number_sent, 0); + if (result == -1) { + return false; + } + number_sent += result; + } + return true; +} + +static bool write_data32(const int socket, uint32_t data) +{ + data = htonl(data); + return write_bytes(socket, (uint8_t *)&data, sizeof(uint32_t)); +} + +static bool write_multiple_bytes(const int socket, const uint8_t *buffer, + uint32_t bytes_to_send) +{ + bool result; + + result = write_data32(socket, bytes_to_send); + if (!result) { + return result; + } + + return write_bytes(socket, buffer, bytes_to_send); +} + +static bool send_platform_data(const int socket, + uint32_t transport_type, uint32_t command, + const uint8_t *send_buffer, size_t bytes_to_send) +{ + bool result; + + result = write_data32(socket, command); + if (!result) { + return result; + } + + result = write_data32(socket, transport_type); + if (!result) { + return result; + } + + return write_multiple_bytes(socket, send_buffer, bytes_to_send); +} + +int spdm_socket_connect(uint16_t port, Error **errp) +{ + int client_socket; + struct sockaddr_in server_addr; + + client_socket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (client_socket < 0) { + error_setg(errp, "cannot create socket: %s", strerror(errno)); + return -1; + } + + memset((char *)&server_addr, 0, sizeof(server_addr)); + server_addr.sin_family = AF_INET; + server_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + server_addr.sin_port = htons(port); + + + if (connect(client_socket, (struct sockaddr *)&server_addr, + sizeof(server_addr)) < 0) { + error_setg(errp, "cannot connect: %s", strerror(errno)); + close(client_socket); + return -1; + } + + return client_socket; +} + +uint32_t spdm_socket_rsp(const int socket, uint32_t transport_type, + void *req, uint32_t req_len, + void *rsp, uint32_t rsp_len) +{ + uint32_t command; + bool result; + + result = send_platform_data(socket, transport_type, + SPDM_SOCKET_COMMAND_NORMAL, + req, req_len); + if (!result) { + return 0; + } + + result = receive_platform_data(socket, transport_type, &command, + (uint8_t *)rsp, &rsp_len); + if (!result) { + return 0; + } + + assert(command != 0); + + return rsp_len; +} + +void spdm_socket_close(const int socket, uint32_t transport_type) +{ + send_platform_data(socket, transport_type, + SPDM_SOCKET_COMMAND_SHUTDOWN, NULL, 0); +} diff --git a/include/sysemu/spdm-socket.h b/include/sysemu/spdm-socket.h new file mode 100644 index 0000000000..5d8bd9aa4e --- /dev/null +++ b/include/sysemu/spdm-socket.h @@ -0,0 +1,74 @@ +/* + * QEMU SPDM socket support + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef SPDM_REQUESTER_H +#define SPDM_REQUESTER_H + +/** + * spdm_socket_connect: connect to an external SPDM socket + * @port: port to connect to + * @errp: error object handle + * + * This will connect to an external SPDM socket server. On error + * it will return -1 and errp will be set. On success this function + * will return the socket number. + */ +int spdm_socket_connect(uint16_t port, Error **errp); + +/** + * spdm_socket_rsp: send and receive a message to a SPDM server + * @socket: socket returned from spdm_socket_connect() + * @transport_type: SPDM_SOCKET_TRANSPORT_TYPE_* macro + * @req: request buffer + * @req_len: request buffer length + * @rsp: response buffer + * @rsp_len: response buffer length + * + * Send platform data to a SPDM server on socket and then receive + * a response. + */ +uint32_t spdm_socket_rsp(const int socket, uint32_t transport_type, + void *req, uint32_t req_len, + void *rsp, uint32_t rsp_len); + +/** + * spdm_socket_close: send a shutdown command to the server + * @socket: socket returned from spdm_socket_connect() + * @transport_type: SPDM_SOCKET_TRANSPORT_TYPE_* macro + * + * This will issue a shutdown command to the server. + */ +void spdm_socket_close(const int socket, uint32_t transport_type); + +#define SPDM_SOCKET_COMMAND_NORMAL 0x0001 +#define SPDM_SOCKET_COMMAND_OOB_ENCAP_KEY_UPDATE 0x8001 +#define SPDM_SOCKET_COMMAND_CONTINUE 0xFFFD +#define SPDM_SOCKET_COMMAND_SHUTDOWN 0xFFFE +#define SPDM_SOCKET_COMMAND_UNKOWN 0xFFFF +#define SPDM_SOCKET_COMMAND_TEST 0xDEAD + +#define SPDM_SOCKET_TRANSPORT_TYPE_MCTP 0x01 +#define SPDM_SOCKET_TRANSPORT_TYPE_PCI_DOE 0x02 + +#define SPDM_SOCKET_MAX_MESSAGE_BUFFER_SIZE 0x1200 + +#endif From 4f947b10d525958578002848a92eeb6152ffbf0d Mon Sep 17 00:00:00 2001 From: Wilfred Mallawa Date: Wed, 3 Jul 2024 19:20:27 +1000 Subject: [PATCH 61/61] hw/nvme: Add SPDM over DOE support Setup Data Object Exchange (DOE) as an extended capability for the NVME controller and connect SPDM to it (CMA) to it. Signed-off-by: Wilfred Mallawa Signed-off-by: Alistair Francis Reviewed-by: Jonathan Cameron Acked-by: Klaus Jensen Message-Id: <20240703092027.644758-4-alistair.francis@wdc.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- docs/specs/index.rst | 1 + docs/specs/spdm.rst | 134 ++++++++++++++++++++++++++++++++++++ hw/nvme/ctrl.c | 62 +++++++++++++++++ include/hw/pci/pci_device.h | 7 ++ include/hw/pci/pcie_doe.h | 3 + 5 files changed, 207 insertions(+) create mode 100644 docs/specs/spdm.rst diff --git a/docs/specs/index.rst b/docs/specs/index.rst index 1484e3e760..e2d907959a 100644 --- a/docs/specs/index.rst +++ b/docs/specs/index.rst @@ -29,6 +29,7 @@ guest hardware that is specific to QEMU. edu ivshmem-spec pvpanic + spdm standard-vga virt-ctlr vmcoreinfo diff --git a/docs/specs/spdm.rst b/docs/specs/spdm.rst new file mode 100644 index 0000000000..f7de080ff0 --- /dev/null +++ b/docs/specs/spdm.rst @@ -0,0 +1,134 @@ +====================================================== +QEMU Security Protocols and Data Models (SPDM) Support +====================================================== + +SPDM enables authentication, attestation and key exchange to assist in +providing infrastructure security enablement. It's a standard published +by the `DMTF`_. + +QEMU supports connecting to a SPDM responder implementation. This allows an +external application to emulate the SPDM responder logic for an SPDM device. + +Setting up a SPDM server +======================== + +When using QEMU with SPDM devices QEMU will connect to a server which +implements the SPDM functionality. + +SPDM-Utils +---------- + +You can use `SPDM Utils`_ to emulate a responder. This is the simplest method. + +SPDM-Utils is a Linux applications to manage, test and develop devices +supporting DMTF Security Protocol and Data Model (SPDM). It is written in Rust +and utilises libspdm. + +To use SPDM-Utils you will need to do the following steps. Details are included +in the SPDM-Utils README. + + 1. `Build libspdm`_ + 2. `Build SPDM Utils`_ + 3. `Run it as a server`_ + +spdm-emu +-------- + +You can use `spdm emu`_ to model the +SPDM responder. + +.. code-block:: shell + + $ cd spdm-emu + $ git submodule init; git submodule update --recursive + $ mkdir build; cd build + $ cmake -DARCH=x64 -DTOOLCHAIN=GCC -DTARGET=Debug -DCRYPTO=openssl .. + $ make -j32 + $ make copy_sample_key # Build certificates, required for SPDM authentication. + +It is worth noting that the certificates should be in compliance with +PCIe r6.1 sec 6.31.3. This means you will need to add the following to +openssl.cnf + +.. code-block:: + + subjectAltName = otherName:2.23.147;UTF8:Vendor=1b36:Device=0010:CC=010802:REV=02:SSVID=1af4:SSID=1100 + 2.23.147 = ASN1:OID:2.23.147 + +and then manually regenerate some certificates with: + +.. code-block:: shell + + $ openssl req -nodes -newkey ec:param.pem -keyout end_responder.key \ + -out end_responder.req -sha384 -batch \ + -subj "/CN=DMTF libspdm ECP384 responder cert" + + $ openssl x509 -req -in end_responder.req -out end_responder.cert \ + -CA inter.cert -CAkey inter.key -sha384 -days 3650 -set_serial 3 \ + -extensions v3_end -extfile ../openssl.cnf + + $ openssl asn1parse -in end_responder.cert -out end_responder.cert.der + + $ cat ca.cert.der inter.cert.der end_responder.cert.der > bundle_responder.certchain.der + +You can use SPDM-Utils instead as it will generate the correct certificates +automatically. + +The responder can then be launched with + +.. code-block:: shell + + $ cd bin + $ ./spdm_responder_emu --trans PCI_DOE + +Connecting an SPDM NVMe device +============================== + +Once a SPDM server is running we can start QEMU and connect to the server. + +For an NVMe device first let's setup a block we can use + +.. code-block:: shell + + $ cd qemu-spdm/linux/image + $ dd if=/dev/zero of=blknvme bs=1M count=2096 # 2GB NNMe Drive + +Then you can add this to your QEMU command line: + +.. code-block:: shell + + -drive file=blknvme,if=none,id=mynvme,format=raw \ + -device nvme,drive=mynvme,serial=deadbeef,spdm_port=2323 + +At which point QEMU will try to connect to the SPDM server. + +Note that if using x64-64 you will want to use the q35 machine instead +of the default. So the entire QEMU command might look like this + +.. code-block:: shell + + qemu-system-x86_64 -M q35 \ + --kernel bzImage \ + -drive file=rootfs.ext2,if=virtio,format=raw \ + -append "root=/dev/vda console=ttyS0" \ + -net none -nographic \ + -drive file=blknvme,if=none,id=mynvme,format=raw \ + -device nvme,drive=mynvme,serial=deadbeef,spdm_port=2323 + +.. _DMTF: + https://www.dmtf.org/standards/SPDM + +.. _SPDM Utils: + https://github.com/westerndigitalcorporation/spdm-utils + +.. _spdm emu: + https://github.com/dmtf/spdm-emu + +.. _Build libspdm: + https://github.com/westerndigitalcorporation/spdm-utils?tab=readme-ov-file#build-libspdm + +.. _Build SPDM Utils: + https://github.com/westerndigitalcorporation/spdm-utils?tab=readme-ov-file#build-the-binary + +.. _Run it as a server: + https://github.com/westerndigitalcorporation/spdm-utils#qemu-spdm-device-emulation diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 5b1b0cabcf..6ee72014cf 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -203,6 +203,7 @@ #include "sysemu/hostmem.h" #include "hw/pci/msix.h" #include "hw/pci/pcie_sriov.h" +#include "sysemu/spdm-socket.h" #include "migration/vmstate.h" #include "nvme.h" @@ -8113,6 +8114,27 @@ static int nvme_add_pm_capability(PCIDevice *pci_dev, uint8_t offset) return 0; } +static bool pcie_doe_spdm_rsp(DOECap *doe_cap) +{ + void *req = pcie_doe_get_write_mbox_ptr(doe_cap); + uint32_t req_len = pcie_doe_get_obj_len(req) * 4; + void *rsp = doe_cap->read_mbox; + uint32_t rsp_len = SPDM_SOCKET_MAX_MESSAGE_BUFFER_SIZE; + + uint32_t recvd = spdm_socket_rsp(doe_cap->spdm_socket, + SPDM_SOCKET_TRANSPORT_TYPE_PCI_DOE, + req, req_len, rsp, rsp_len); + doe_cap->read_mbox_len += DIV_ROUND_UP(recvd, 4); + + return recvd != 0; +} + +static DOEProtocol doe_spdm_prot[] = { + { PCI_VENDOR_ID_PCI_SIG, PCI_SIG_DOE_CMA, pcie_doe_spdm_rsp }, + { PCI_VENDOR_ID_PCI_SIG, PCI_SIG_DOE_SECURED_CMA, pcie_doe_spdm_rsp }, + { } +}; + static bool nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) { ERRP_GUARD(); @@ -8200,6 +8222,25 @@ static bool nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize); + pcie_cap_deverr_init(pci_dev); + + /* DOE Initialisation */ + if (pci_dev->spdm_port) { + uint16_t doe_offset = n->params.sriov_max_vfs ? + PCI_CONFIG_SPACE_SIZE + PCI_ARI_SIZEOF + : PCI_CONFIG_SPACE_SIZE; + + pcie_doe_init(pci_dev, &pci_dev->doe_spdm, doe_offset, + doe_spdm_prot, true, 0); + + pci_dev->doe_spdm.spdm_socket = spdm_socket_connect(pci_dev->spdm_port, + errp); + + if (pci_dev->doe_spdm.spdm_socket < 0) { + return false; + } + } + if (n->params.cmb_size_mb) { nvme_init_cmb(n, pci_dev); } @@ -8446,6 +8487,11 @@ static void nvme_exit(PCIDevice *pci_dev) g_free(n->cmb.buf); } + if (pci_dev->doe_spdm.spdm_socket > 0) { + spdm_socket_close(pci_dev->doe_spdm.spdm_socket, + SPDM_SOCKET_TRANSPORT_TYPE_PCI_DOE); + } + if (n->pmr.dev) { host_memory_backend_set_mapped(n->pmr.dev, false); } @@ -8491,6 +8537,7 @@ static Property nvme_props[] = { DEFINE_PROP_BOOL("msix-exclusive-bar", NvmeCtrl, params.msix_exclusive_bar, false), DEFINE_PROP_UINT16("mqes", NvmeCtrl, params.mqes, 0x7ff), + DEFINE_PROP_UINT16("spdm_port", PCIDevice, spdm_port, 0), DEFINE_PROP_END_OF_LIST(), }; @@ -8562,11 +8609,25 @@ static void nvme_pci_write_config(PCIDevice *dev, uint32_t address, { uint16_t old_num_vfs = pcie_sriov_num_vfs(dev); + if (pcie_find_capability(dev, PCI_EXT_CAP_ID_DOE)) { + pcie_doe_write_config(&dev->doe_spdm, address, val, len); + } pci_default_write_config(dev, address, val, len); pcie_cap_flr_write_config(dev, address, val, len); nvme_sriov_post_write_config(dev, old_num_vfs); } +static uint32_t nvme_pci_read_config(PCIDevice *dev, uint32_t address, int len) +{ + uint32_t val; + if (dev->spdm_port && pcie_find_capability(dev, PCI_EXT_CAP_ID_DOE)) { + if (pcie_doe_read_config(&dev->doe_spdm, address, len, &val)) { + return val; + } + } + return pci_default_read_config(dev, address, len); +} + static const VMStateDescription nvme_vmstate = { .name = "nvme", .unmigratable = 1, @@ -8579,6 +8640,7 @@ static void nvme_class_init(ObjectClass *oc, void *data) pc->realize = nvme_realize; pc->config_write = nvme_pci_write_config; + pc->config_read = nvme_pci_read_config; pc->exit = nvme_exit; pc->class_id = PCI_CLASS_STORAGE_EXPRESS; pc->revision = 2; diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h index cefd6f7640..e7e41cb939 100644 --- a/include/hw/pci/pci_device.h +++ b/include/hw/pci/pci_device.h @@ -3,6 +3,7 @@ #include "hw/pci/pci.h" #include "hw/pci/pcie.h" +#include "hw/pci/pcie_doe.h" #define TYPE_PCI_DEVICE "pci-device" typedef struct PCIDeviceClass PCIDeviceClass; @@ -159,6 +160,12 @@ struct PCIDevice { MSIVectorReleaseNotifier msix_vector_release_notifier; MSIVectorPollNotifier msix_vector_poll_notifier; + /* SPDM */ + uint16_t spdm_port; + + /* DOE */ + DOECap doe_spdm; + /* ID of standby device in net_failover pair */ char *failover_pair_id; uint32_t acpi_index; diff --git a/include/hw/pci/pcie_doe.h b/include/hw/pci/pcie_doe.h index 15d94661f9..9e1275db8a 100644 --- a/include/hw/pci/pcie_doe.h +++ b/include/hw/pci/pcie_doe.h @@ -108,6 +108,9 @@ struct DOECap { /* Protocols and its callback response */ DOEProtocol *protocols; uint16_t protocol_num; + + /* Used for spdm-socket */ + int spdm_socket; }; void pcie_doe_init(PCIDevice *pdev, DOECap *doe_cap, uint16_t offset,