From eb2e89747eca57fb0028001b28b3c4e0c1540e3a Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sun, 24 Jan 2021 11:54:45 +0900 Subject: [PATCH 01/38] hw/block/nvme: introduce nvme-subsys device To support multi-path in QEMU NVMe device model, We need to have NVMe subsystem hierarchy to map controllers and namespaces to a NVMe subsystem. This patch introduced a simple nvme-subsys device model. The subsystem will be prepared with subsystem NQN with provided in nvme-subsys device: ex) -device nvme-subsys,id=subsys0: nqn.2019-08.org.qemu:subsys0 Signed-off-by: Minwoo Im Tested-by: Klaus Jensen Reviewed-by: Klaus Jensen Reviewed-by: Keith Busch [k.jensen: added 'nqn' device parameter per request] Signed-off-by: Klaus Jensen --- hw/block/meson.build | 2 +- hw/block/nvme-subsys.c | 70 ++++++++++++++++++++++++++++++++++++++++++ hw/block/nvme-subsys.h | 29 +++++++++++++++++ hw/block/nvme.c | 12 ++++++++ 4 files changed, 112 insertions(+), 1 deletion(-) create mode 100644 hw/block/nvme-subsys.c create mode 100644 hw/block/nvme-subsys.h diff --git a/hw/block/meson.build b/hw/block/meson.build index 4bf994c64f..5492829155 100644 --- a/hw/block/meson.build +++ b/hw/block/meson.build @@ -13,7 +13,7 @@ softmmu_ss.add(when: 'CONFIG_SSI_M25P80', if_true: files('m25p80.c')) softmmu_ss.add(when: 'CONFIG_SWIM', if_true: files('swim.c')) softmmu_ss.add(when: 'CONFIG_XEN', if_true: files('xen-block.c')) softmmu_ss.add(when: 'CONFIG_TC58128', if_true: files('tc58128.c')) -softmmu_ss.add(when: 'CONFIG_NVME_PCI', if_true: files('nvme.c', 'nvme-ns.c')) +softmmu_ss.add(when: 'CONFIG_NVME_PCI', if_true: files('nvme.c', 'nvme-ns.c', 'nvme-subsys.c')) specific_ss.add(when: 'CONFIG_VIRTIO_BLK', if_true: files('virtio-blk.c')) specific_ss.add(when: 'CONFIG_VHOST_USER_BLK', if_true: files('vhost-user-blk.c')) diff --git a/hw/block/nvme-subsys.c b/hw/block/nvme-subsys.c new file mode 100644 index 0000000000..840448bb13 --- /dev/null +++ b/hw/block/nvme-subsys.c @@ -0,0 +1,70 @@ +/* + * QEMU NVM Express Subsystem: nvme-subsys + * + * Copyright (c) 2021 Minwoo Im + * + * This code is licensed under the GNU GPL v2. Refer COPYING. + */ + +#include "qemu/units.h" +#include "qemu/osdep.h" +#include "qemu/uuid.h" +#include "qemu/iov.h" +#include "qemu/cutils.h" +#include "qapi/error.h" +#include "hw/qdev-properties.h" +#include "hw/qdev-core.h" +#include "hw/block/block.h" +#include "block/aio.h" +#include "block/accounting.h" +#include "sysemu/sysemu.h" +#include "hw/pci/pci.h" +#include "nvme.h" +#include "nvme-subsys.h" + +static void nvme_subsys_setup(NvmeSubsystem *subsys) +{ + const char *nqn = subsys->params.nqn ? + subsys->params.nqn : subsys->parent_obj.id; + + snprintf((char *)subsys->subnqn, sizeof(subsys->subnqn), + "nqn.2019-08.org.qemu:%s", nqn); +} + +static void nvme_subsys_realize(DeviceState *dev, Error **errp) +{ + NvmeSubsystem *subsys = NVME_SUBSYS(dev); + + nvme_subsys_setup(subsys); +} + +static Property nvme_subsystem_props[] = { + DEFINE_PROP_STRING("nqn", NvmeSubsystem, params.nqn), + DEFINE_PROP_END_OF_LIST(), +}; + +static void nvme_subsys_class_init(ObjectClass *oc, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(oc); + + set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); + + dc->realize = nvme_subsys_realize; + dc->desc = "Virtual NVMe subsystem"; + + device_class_set_props(dc, nvme_subsystem_props); +} + +static const TypeInfo nvme_subsys_info = { + .name = TYPE_NVME_SUBSYS, + .parent = TYPE_DEVICE, + .class_init = nvme_subsys_class_init, + .instance_size = sizeof(NvmeSubsystem), +}; + +static void nvme_subsys_register_types(void) +{ + type_register_static(&nvme_subsys_info); +} + +type_init(nvme_subsys_register_types) diff --git a/hw/block/nvme-subsys.h b/hw/block/nvme-subsys.h new file mode 100644 index 0000000000..fc86d01ff3 --- /dev/null +++ b/hw/block/nvme-subsys.h @@ -0,0 +1,29 @@ +/* + * QEMU NVM Express Subsystem: nvme-subsys + * + * Copyright (c) 2021 Minwoo Im + * + * This code is licensed under the GNU GPL v2. Refer COPYING. + */ + +#ifndef NVME_SUBSYS_H +#define NVME_SUBSYS_H + +#define TYPE_NVME_SUBSYS "nvme-subsys" +#define NVME_SUBSYS(obj) \ + OBJECT_CHECK(NvmeSubsystem, (obj), TYPE_NVME_SUBSYS) + +#define NVME_SUBSYS_MAX_CTRLS 32 + +typedef struct NvmeCtrl NvmeCtrl; +typedef struct NvmeNamespace NvmeNamespace; +typedef struct NvmeSubsystem { + DeviceState parent_obj; + uint8_t subnqn[256]; + + struct { + char *nqn; + } params; +} NvmeSubsystem; + +#endif /* NVME_SUBSYS_H */ diff --git a/hw/block/nvme.c b/hw/block/nvme.c index fb83636abd..98bd987abc 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -17,6 +17,7 @@ /** * Usage: add options: * -drive file=,if=none,id= + * -device nvme-subsys,id=,nqn= * -device nvme,serial=,id=, \ * cmb_size_mb=, \ * [pmrdev=,] \ @@ -38,6 +39,17 @@ * * The PMR will use BAR 4/5 exclusively. * + * To place controller(s) and namespace(s) to a subsystem, then provide + * nvme-subsys device as above. + * + * nvme subsystem device parameters + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * - `nqn` + * This parameter provides the `` part of the string + * `nqn.2019-08.org.qemu:` which will be reported in the SUBNQN field + * of subsystem controllers. Note that `` should be unique per + * subsystem, but this is not enforced by QEMU. If not specified, it will + * default to the value of the `id` parameter (``). * * nvme device parameters * ~~~~~~~~~~~~~~~~~~~~~~ From 982ed66bb2e89bdb029b186232946fe2e7c217e1 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sun, 24 Jan 2021 11:54:46 +0900 Subject: [PATCH 02/38] hw/block/nvme: support to map controller to a subsystem nvme controller(nvme) can be mapped to a NVMe subsystem(nvme-subsys). This patch maps a controller to a subsystem by adding a parameter 'subsys' to the nvme device. To map a controller to a subsystem, we need to put nvme-subsys first and then maps the subsystem to the controller: -device nvme-subsys,id=subsys0 -device nvme,serial=foo,id=nvme0,subsys=subsys0 If 'subsys' property is not given to the nvme controller, then subsystem NQN will be created with serial (e.g., 'foo' in above example), Otherwise, it will be based on subsys id (e.g., 'subsys0' in above example). Signed-off-by: Minwoo Im Tested-by: Klaus Jensen Reviewed-by: Klaus Jensen Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 30 +++++++++++++++++++++++++----- hw/block/nvme.h | 3 +++ 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 98bd987abc..f5a19fd546 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -23,7 +23,8 @@ * [pmrdev=,] \ * max_ioqpairs=, \ * aerl=, aer_max_queued=, \ - * mdts=,zoned.append_size_limit= \ + * mdts=,zoned.append_size_limit=, \ + * subsys= * -device nvme-ns,drive=,bus=,nsid=,\ * zoned= * @@ -53,6 +54,13 @@ * * nvme device parameters * ~~~~~~~~~~~~~~~~~~~~~~ + * - `subsys` + * Specifying this parameter attaches the controller to the subsystem and + * the SUBNQN field in the controller will report the NQN of the subsystem + * device. This also enables multi controller capability represented in + * Identify Controller data structure in CMIC (Controller Multi-path I/O and + * Namesapce Sharing Capabilities). + * * - `aerl` * The Asynchronous Event Request Limit (AERL). Indicates the maximum number * of concurrently outstanding Asynchronous Event Request commands support @@ -4417,11 +4425,23 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) return 0; } +static void nvme_init_subnqn(NvmeCtrl *n) +{ + NvmeSubsystem *subsys = n->subsys; + NvmeIdCtrl *id = &n->id_ctrl; + + if (!subsys) { + snprintf((char *)id->subnqn, sizeof(id->subnqn), + "nqn.2019-08.org.qemu:%s", n->params.serial); + } else { + pstrcpy((char *)id->subnqn, sizeof(id->subnqn), (char*)subsys->subnqn); + } +} + static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) { NvmeIdCtrl *id = &n->id_ctrl; uint8_t *pci_conf = pci_dev->config; - char *subnqn; id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID)); id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID)); @@ -4468,9 +4488,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN | NVME_CTRL_SGLS_BITBUCKET); - subnqn = g_strdup_printf("nqn.2019-08.org.qemu:%s", n->params.serial); - strpadcpy((char *)id->subnqn, sizeof(id->subnqn), subnqn, '\0'); - g_free(subnqn); + nvme_init_subnqn(n); id->psd[0].mp = cpu_to_le16(0x9c4); id->psd[0].enlat = cpu_to_le32(0x10); @@ -4562,6 +4580,8 @@ static Property nvme_props[] = { DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf), DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND, HostMemoryBackend *), + DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS, + NvmeSubsystem *), DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial), DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0), DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0), diff --git a/hw/block/nvme.h b/hw/block/nvme.h index dee6092bd4..04d4684601 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -2,6 +2,7 @@ #define HW_NVME_H #include "block/nvme.h" +#include "nvme-subsys.h" #include "nvme-ns.h" #define NVME_MAX_NAMESPACES 256 @@ -170,6 +171,8 @@ typedef struct NvmeCtrl { uint8_t zasl; + NvmeSubsystem *subsys; + NvmeNamespace namespace; NvmeNamespace *namespaces[NVME_MAX_NAMESPACES]; NvmeSQueue **sq; From 66b7e9bed0aee4342aa7cb824b8c46a42cacf7e2 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sun, 24 Jan 2021 11:54:47 +0900 Subject: [PATCH 03/38] hw/block/nvme: add CMIC enum value for Identify Controller Added Controller Multi-path I/O and Namespace Sharing Capabilities (CMIC) field to support multi-controller in the following patches. This field is in Identify Controller data structure in [76]. Signed-off-by: Minwoo Im Tested-by: Klaus Jensen Reviewed-by: Klaus Jensen Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- include/block/nvme.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/block/nvme.h b/include/block/nvme.h index 07cfc92936..f1d3a78658 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -1034,6 +1034,10 @@ enum NvmeIdCtrlLpa { NVME_LPA_EXTENDED = 1 << 2, }; +enum NvmeIdCtrlCmic { + NVME_CMIC_MULTI_CTRL = 1 << 1, +}; + #define NVME_CTRL_SQES_MIN(sqes) ((sqes) & 0xf) #define NVME_CTRL_SQES_MAX(sqes) (((sqes) >> 4) & 0xf) #define NVME_CTRL_CQES_MIN(cqes) ((cqes) & 0xf) From e36a261d4bf7057a8ffee336422210b58c661a21 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sun, 24 Jan 2021 11:54:48 +0900 Subject: [PATCH 04/38] hw/block/nvme: support for multi-controller in subsystem We have nvme-subsys and nvme devices mapped together. To support multi-controller scheme to this setup, controller identifier(id) has to be managed. Earlier, cntlid(controller id) used to be always 0 because we didn't have any subsystem scheme that controller id matters. This patch introduced 'cntlid' attribute to the nvme controller instance(NvmeCtrl) and make it allocated by the nvme-subsys device mapped to the controller. If nvme-subsys is not given to the controller, then it will always be 0 as it was. Added 'ctrls' array in the nvme-subsys instance to manage attached controllers to the subsystem with a limit(32). This patch didn't take list for the controllers to make it seamless with nvme-ns device. Signed-off-by: Minwoo Im Tested-by: Klaus Jensen Reviewed-by: Klaus Jensen Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme-subsys.c | 21 +++++++++++++++++++++ hw/block/nvme-subsys.h | 4 ++++ hw/block/nvme.c | 29 +++++++++++++++++++++++++++++ hw/block/nvme.h | 1 + 4 files changed, 55 insertions(+) diff --git a/hw/block/nvme-subsys.c b/hw/block/nvme-subsys.c index 840448bb13..283a97b79d 100644 --- a/hw/block/nvme-subsys.c +++ b/hw/block/nvme-subsys.c @@ -22,6 +22,27 @@ #include "nvme.h" #include "nvme-subsys.h" +int nvme_subsys_register_ctrl(NvmeCtrl *n, Error **errp) +{ + NvmeSubsystem *subsys = n->subsys; + int cntlid; + + for (cntlid = 0; cntlid < ARRAY_SIZE(subsys->ctrls); cntlid++) { + if (!subsys->ctrls[cntlid]) { + break; + } + } + + if (cntlid == ARRAY_SIZE(subsys->ctrls)) { + error_setg(errp, "no more free controller id"); + return -1; + } + + subsys->ctrls[cntlid] = n; + + return cntlid; +} + static void nvme_subsys_setup(NvmeSubsystem *subsys) { const char *nqn = subsys->params.nqn ? diff --git a/hw/block/nvme-subsys.h b/hw/block/nvme-subsys.h index fc86d01ff3..55411eccfb 100644 --- a/hw/block/nvme-subsys.h +++ b/hw/block/nvme-subsys.h @@ -21,9 +21,13 @@ typedef struct NvmeSubsystem { DeviceState parent_obj; uint8_t subnqn[256]; + NvmeCtrl *ctrls[NVME_SUBSYS_MAX_CTRLS]; + struct { char *nqn; } params; } NvmeSubsystem; +int nvme_subsys_register_ctrl(NvmeCtrl *n, Error **errp); + #endif /* NVME_SUBSYS_H */ diff --git a/hw/block/nvme.c b/hw/block/nvme.c index f5a19fd546..5a63c356ba 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -4448,6 +4448,9 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' '); strpadcpy((char *)id->fr, sizeof(id->fr), "1.0", ' '); strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' '); + + id->cntlid = cpu_to_le16(n->cntlid); + id->rab = 6; id->ieee[0] = 0x00; id->ieee[1] = 0x02; @@ -4494,6 +4497,10 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->psd[0].enlat = cpu_to_le32(0x10); id->psd[0].exlat = cpu_to_le32(0x4); + if (n->subsys) { + id->cmic |= NVME_CMIC_MULTI_CTRL; + } + NVME_CAP_SET_MQES(n->bar.cap, 0x7ff); NVME_CAP_SET_CQR(n->bar.cap, 1); NVME_CAP_SET_TO(n->bar.cap, 0xf); @@ -4508,6 +4515,24 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) n->bar.intmc = n->bar.intms = 0; } +static int nvme_init_subsys(NvmeCtrl *n, Error **errp) +{ + int cntlid; + + if (!n->subsys) { + return 0; + } + + cntlid = nvme_subsys_register_ctrl(n, errp); + if (cntlid < 0) { + return -1; + } + + n->cntlid = cntlid; + + return 0; +} + static void nvme_realize(PCIDevice *pci_dev, Error **errp) { NvmeCtrl *n = NVME(pci_dev); @@ -4528,6 +4553,10 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp) return; } + if (nvme_init_subsys(n, errp)) { + error_propagate(errp, local_err); + return; + } nvme_init_ctrl(n, pci_dev); /* setup a namespace if the controller drive property was given */ diff --git a/hw/block/nvme.h b/hw/block/nvme.h index 04d4684601..b8f5f2d6ff 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -134,6 +134,7 @@ typedef struct NvmeCtrl { NvmeBus bus; BlockConf conf; + uint16_t cntlid; bool qs_created; uint32_t page_size; uint16_t page_bits; From adc36b8d21204c00643016d8766a5214e3d54b5b Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sun, 24 Jan 2021 11:54:49 +0900 Subject: [PATCH 05/38] hw/block/nvme: add NMIC enum value for Identify Namespace Added Namespace Multi-path I/O and Namespace Sharing Capabilities (NMIC) field to support shared namespace from controller(s). This field is in Identify Namespace data structure in [30]. Signed-off-by: Minwoo Im Tested-by: Klaus Jensen Reviewed-by: Klaus Jensen Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- include/block/nvme.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/block/nvme.h b/include/block/nvme.h index f1d3a78658..3db2b9b4cb 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -1203,6 +1203,10 @@ enum NvmeNsIdentifierType { NVME_NIDT_CSI = 0x04, }; +enum NvmeIdNsNmic { + NVME_NMIC_NS_SHARED = 1 << 0, +}; + enum NvmeCsi { NVME_CSI_NVM = 0x00, NVME_CSI_ZONED = 0x02, From e570768566b36f0a0471f65a40b47a6471ef0e24 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sun, 24 Jan 2021 11:54:50 +0900 Subject: [PATCH 06/38] hw/block/nvme: support for shared namespace in subsystem nvme-ns device is registered to a nvme controller device during the initialization in nvme_register_namespace() in case that 'bus' property is given which means it's mapped to a single controller. This patch introduced a new property 'subsys' just like the controller device instance did to map a namespace to a NVMe subsystem. If 'subsys' property is given to the nvme-ns device, it will belong to the specified subsystem and will be attached to all controllers in that subsystem by enabling shared namespace capability in NMIC(Namespace Multi-path I/O and Namespace Capabilities) in Identify Namespace. Usage: -device nvme-subsys,id=subsys0 -device nvme,serial=foo,id=nvme0,subsys=subsys0 -device nvme,serial=bar,id=nvme1,subsys=subsys0 -device nvme,serial=baz,id=nvme2,subsys=subsys0 -device nvme-ns,id=ns1,drive=,nsid=1,subsys=subsys0 # Shared -device nvme-ns,id=ns2,drive=,nsid=2,bus=nvme2 # Non-shared In the above example, 'ns1' will be shared to 'nvme0' and 'nvme1' in the same subsystem. On the other hand, 'ns2' will be attached to the 'nvme2' only as a private namespace in that subsystem. All the namespace with 'subsys' parameter will attach all controllers in the subsystem to the namespace by default. Signed-off-by: Minwoo Im Tested-by: Klaus Jensen Reviewed-by: Klaus Jensen Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme-ns.c | 17 ++++++++++++++--- hw/block/nvme-ns.h | 7 +++++++ hw/block/nvme-subsys.c | 25 +++++++++++++++++++++++++ hw/block/nvme-subsys.h | 3 +++ hw/block/nvme.c | 10 +++++++++- 5 files changed, 58 insertions(+), 4 deletions(-) diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index 93ac6e107a..64b6a491ad 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -63,6 +63,10 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp) id_ns->npda = id_ns->npdg = npdg - 1; + if (nvme_ns_shared(ns)) { + id_ns->nmic |= NVME_NMIC_NS_SHARED; + } + return 0; } @@ -363,14 +367,21 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp) return; } - if (nvme_register_namespace(n, ns, errp)) { - return; + if (ns->subsys) { + if (nvme_subsys_register_ns(ns, errp)) { + return; + } + } else { + if (nvme_register_namespace(n, ns, errp)) { + return; + } } - } static Property nvme_ns_props[] = { DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf), + DEFINE_PROP_LINK("subsys", NvmeNamespace, subsys, TYPE_NVME_SUBSYS, + NvmeSubsystem *), DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0), DEFINE_PROP_UUID("uuid", NvmeNamespace, params.uuid), DEFINE_PROP_BOOL("zoned", NvmeNamespace, params.zoned, false), diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h index 293ac990e3..929e788619 100644 --- a/hw/block/nvme-ns.h +++ b/hw/block/nvme-ns.h @@ -47,6 +47,8 @@ typedef struct NvmeNamespace { const uint32_t *iocs; uint8_t csi; + NvmeSubsystem *subsys; + NvmeIdNsZoned *id_ns_zoned; NvmeZone *zone_array; QTAILQ_HEAD(, NvmeZone) exp_open_zones; @@ -77,6 +79,11 @@ static inline uint32_t nvme_nsid(NvmeNamespace *ns) return -1; } +static inline bool nvme_ns_shared(NvmeNamespace *ns) +{ + return !!ns->subsys; +} + static inline NvmeLBAF *nvme_ns_lbaf(NvmeNamespace *ns) { NvmeIdNs *id_ns = &ns->id_ns; diff --git a/hw/block/nvme-subsys.c b/hw/block/nvme-subsys.c index 283a97b79d..af4804a819 100644 --- a/hw/block/nvme-subsys.c +++ b/hw/block/nvme-subsys.c @@ -43,6 +43,31 @@ int nvme_subsys_register_ctrl(NvmeCtrl *n, Error **errp) return cntlid; } +int nvme_subsys_register_ns(NvmeNamespace *ns, Error **errp) +{ + NvmeSubsystem *subsys = ns->subsys; + NvmeCtrl *n; + int i; + + if (subsys->namespaces[nvme_nsid(ns)]) { + error_setg(errp, "namespace %d already registerd to subsy %s", + nvme_nsid(ns), subsys->parent_obj.id); + return -1; + } + + subsys->namespaces[nvme_nsid(ns)] = ns; + + for (i = 0; i < ARRAY_SIZE(subsys->ctrls); i++) { + n = subsys->ctrls[i]; + + if (n && nvme_register_namespace(n, ns, errp)) { + return -1; + } + } + + return 0; +} + static void nvme_subsys_setup(NvmeSubsystem *subsys) { const char *nqn = subsys->params.nqn ? diff --git a/hw/block/nvme-subsys.h b/hw/block/nvme-subsys.h index 55411eccfb..7a54a85120 100644 --- a/hw/block/nvme-subsys.h +++ b/hw/block/nvme-subsys.h @@ -14,6 +14,7 @@ OBJECT_CHECK(NvmeSubsystem, (obj), TYPE_NVME_SUBSYS) #define NVME_SUBSYS_MAX_CTRLS 32 +#define NVME_SUBSYS_MAX_NAMESPACES 32 typedef struct NvmeCtrl NvmeCtrl; typedef struct NvmeNamespace NvmeNamespace; @@ -22,6 +23,7 @@ typedef struct NvmeSubsystem { uint8_t subnqn[256]; NvmeCtrl *ctrls[NVME_SUBSYS_MAX_CTRLS]; + NvmeNamespace *namespaces[NVME_SUBSYS_MAX_NAMESPACES]; struct { char *nqn; @@ -29,5 +31,6 @@ typedef struct NvmeSubsystem { } NvmeSubsystem; int nvme_subsys_register_ctrl(NvmeCtrl *n, Error **errp); +int nvme_subsys_register_ns(NvmeNamespace *ns, Error **errp); #endif /* NVME_SUBSYS_H */ diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 5a63c356ba..5e0819fd9e 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -26,7 +26,8 @@ * mdts=,zoned.append_size_limit=, \ * subsys= * -device nvme-ns,drive=,bus=,nsid=,\ - * zoned= + * zoned=, \ + * subsys= * * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By default, the @@ -79,6 +80,13 @@ * data size being in effect. By setting this property to 0, users can make * ZASL to be equal to MDTS. This property only affects zoned namespaces. * + * nvme namespace device parameters + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * - `subsys` + * If given, the namespace will be attached to all controllers in the + * subsystem. Otherwise, `bus` must be given to attach this namespace to a + * specific controller as a non-shared namespace. + * * Setting `zoned` to true selects Zoned Command Set at the namespace. * In this case, the following namespace properties are available to configure * zoned operation: From eda688ee2403c1efc48f420590623c885aec3393 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Tue, 19 Jan 2021 22:56:14 +0100 Subject: [PATCH 07/38] hw/block/nvme: remove unused parameter in check zone write Remove the unused NvmeCtrl parameter in nvme_check_zone_write. Signed-off-by: Klaus Jensen Reviewed-by: Keith Busch --- hw/block/nvme.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 5e0819fd9e..f39be1961e 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1213,9 +1213,8 @@ static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone) return NVME_INTERNAL_DEV_ERROR; } -static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns, - NvmeZone *zone, uint64_t slba, - uint32_t nlb) +static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone, + uint64_t slba, uint32_t nlb) { uint64_t zcap = nvme_zone_wr_boundary(zone); uint16_t status; @@ -1778,7 +1777,7 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append, res->slba = cpu_to_le64(slba); } - status = nvme_check_zone_write(n, ns, zone, slba, nlb); + status = nvme_check_zone_write(ns, zone, slba, nlb); if (status) { goto invalid; } From 975b64665048e8e283a3c9cad9808da0a014e283 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Tue, 19 Jan 2021 21:01:15 +0100 Subject: [PATCH 08/38] hw/block/nvme: refactor zone resource management Zone transition handling and resource management is open coded (and semi-duplicated in the case of open, close and finish). In preparation for Simple Copy command support (which also needs to open zones for writing), consolidate into a set of 'nvme_zrm' functions and in the process fix a bug with the controller not closing an open zone to allow another zone to be explicitly opened. Signed-off-by: Klaus Jensen Reviewed-by: Keith Busch --- hw/block/nvme.c | 220 +++++++++++++++++++++++------------------------- 1 file changed, 103 insertions(+), 117 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index f39be1961e..7897390b6d 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1292,7 +1292,46 @@ static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba, return status; } -static void nvme_auto_transition_zone(NvmeNamespace *ns) +static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone) +{ + switch (nvme_get_zone_state(zone)) { + case NVME_ZONE_STATE_FULL: + return NVME_SUCCESS; + + case NVME_ZONE_STATE_IMPLICITLY_OPEN: + case NVME_ZONE_STATE_EXPLICITLY_OPEN: + nvme_aor_dec_open(ns); + /* fallthrough */ + case NVME_ZONE_STATE_CLOSED: + nvme_aor_dec_active(ns); + /* fallthrough */ + case NVME_ZONE_STATE_EMPTY: + nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL); + return NVME_SUCCESS; + + default: + return NVME_ZONE_INVAL_TRANSITION; + } +} + +static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone) +{ + switch (nvme_get_zone_state(zone)) { + case NVME_ZONE_STATE_CLOSED: + return NVME_SUCCESS; + + case NVME_ZONE_STATE_EXPLICITLY_OPEN: + case NVME_ZONE_STATE_IMPLICITLY_OPEN: + nvme_aor_dec_open(ns); + nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED); + /* fall through */ + + default: + return NVME_ZONE_INVAL_TRANSITION; + } +} + +static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns) { NvmeZone *zone; @@ -1304,34 +1343,74 @@ static void nvme_auto_transition_zone(NvmeNamespace *ns) * Automatically close this implicitly open zone. */ QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry); - nvme_aor_dec_open(ns); - nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED); + nvme_zrm_close(ns, zone); } } } -static uint16_t nvme_auto_open_zone(NvmeNamespace *ns, NvmeZone *zone) +static uint16_t __nvme_zrm_open(NvmeNamespace *ns, NvmeZone *zone, + bool implicit) { - uint16_t status = NVME_SUCCESS; - uint8_t zs = nvme_get_zone_state(zone); + int act = 0; + uint16_t status; - if (zs == NVME_ZONE_STATE_EMPTY) { - nvme_auto_transition_zone(ns); - status = nvme_aor_check(ns, 1, 1); - } else if (zs == NVME_ZONE_STATE_CLOSED) { - nvme_auto_transition_zone(ns); - status = nvme_aor_check(ns, 0, 1); + switch (nvme_get_zone_state(zone)) { + case NVME_ZONE_STATE_EMPTY: + act = 1; + + /* fallthrough */ + + case NVME_ZONE_STATE_CLOSED: + nvme_zrm_auto_transition_zone(ns); + status = nvme_aor_check(ns, act, 1); + if (status) { + return status; + } + + if (act) { + nvme_aor_inc_active(ns); + } + + nvme_aor_inc_open(ns); + + if (implicit) { + nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN); + return NVME_SUCCESS; + } + + /* fallthrough */ + + case NVME_ZONE_STATE_IMPLICITLY_OPEN: + if (implicit) { + return NVME_SUCCESS; + } + + nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN); + + /* fallthrough */ + + case NVME_ZONE_STATE_EXPLICITLY_OPEN: + return NVME_SUCCESS; + + default: + return NVME_ZONE_INVAL_TRANSITION; } - - return status; } -static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req, - bool failed) +static inline uint16_t nvme_zrm_auto(NvmeNamespace *ns, NvmeZone *zone) +{ + return __nvme_zrm_open(ns, zone, true); +} + +static inline uint16_t nvme_zrm_open(NvmeNamespace *ns, NvmeZone *zone) +{ + return __nvme_zrm_open(ns, zone, false); +} + +static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req) { NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; NvmeZone *zone; - NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe; uint64_t slba; uint32_t nlb; @@ -1341,47 +1420,8 @@ static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req, zone->d.wp += nlb; - if (failed) { - res->slba = 0; - } - if (zone->d.wp == nvme_zone_wr_boundary(zone)) { - switch (nvme_get_zone_state(zone)) { - case NVME_ZONE_STATE_IMPLICITLY_OPEN: - case NVME_ZONE_STATE_EXPLICITLY_OPEN: - nvme_aor_dec_open(ns); - /* fall through */ - case NVME_ZONE_STATE_CLOSED: - nvme_aor_dec_active(ns); - /* fall through */ - case NVME_ZONE_STATE_EMPTY: - nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL); - /* fall through */ - case NVME_ZONE_STATE_FULL: - break; - default: - assert(false); - } - } -} - -static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone, - uint32_t nlb) -{ - uint8_t zs; - - zone->w_ptr += nlb; - - if (zone->w_ptr < nvme_zone_wr_boundary(zone)) { - zs = nvme_get_zone_state(zone); - switch (zs) { - case NVME_ZONE_STATE_EMPTY: - nvme_aor_inc_active(ns); - /* fall through */ - case NVME_ZONE_STATE_CLOSED: - nvme_aor_inc_open(ns); - nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN); - } + nvme_zrm_finish(ns, zone); } } @@ -1406,7 +1446,7 @@ static void nvme_rw_cb(void *opaque, int ret) trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk)); if (ns->params.zoned && nvme_is_write(req)) { - nvme_finalize_zoned_write(ns, req, ret != 0); + nvme_finalize_zoned_write(ns, req); } if (!ret) { @@ -1782,12 +1822,12 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append, goto invalid; } - status = nvme_auto_open_zone(ns, zone); + status = nvme_zrm_auto(ns, zone); if (status) { goto invalid; } - nvme_advance_zone_wp(ns, zone, nlb); + zone->w_ptr += nlb; } data_offset = nvme_l2b(ns, slba); @@ -1873,73 +1913,19 @@ enum NvmeZoneProcessingMask { static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone, NvmeZoneState state, NvmeRequest *req) { - uint16_t status; - - switch (state) { - case NVME_ZONE_STATE_EMPTY: - status = nvme_aor_check(ns, 1, 0); - if (status) { - return status; - } - nvme_aor_inc_active(ns); - /* fall through */ - case NVME_ZONE_STATE_CLOSED: - status = nvme_aor_check(ns, 0, 1); - if (status) { - if (state == NVME_ZONE_STATE_EMPTY) { - nvme_aor_dec_active(ns); - } - return status; - } - nvme_aor_inc_open(ns); - /* fall through */ - case NVME_ZONE_STATE_IMPLICITLY_OPEN: - nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN); - /* fall through */ - case NVME_ZONE_STATE_EXPLICITLY_OPEN: - return NVME_SUCCESS; - default: - return NVME_ZONE_INVAL_TRANSITION; - } + return nvme_zrm_open(ns, zone); } static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone, NvmeZoneState state, NvmeRequest *req) { - switch (state) { - case NVME_ZONE_STATE_EXPLICITLY_OPEN: - case NVME_ZONE_STATE_IMPLICITLY_OPEN: - nvme_aor_dec_open(ns); - nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED); - /* fall through */ - case NVME_ZONE_STATE_CLOSED: - return NVME_SUCCESS; - default: - return NVME_ZONE_INVAL_TRANSITION; - } + return nvme_zrm_close(ns, zone); } static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone, NvmeZoneState state, NvmeRequest *req) { - switch (state) { - case NVME_ZONE_STATE_EXPLICITLY_OPEN: - case NVME_ZONE_STATE_IMPLICITLY_OPEN: - nvme_aor_dec_open(ns); - /* fall through */ - case NVME_ZONE_STATE_CLOSED: - nvme_aor_dec_active(ns); - /* fall through */ - case NVME_ZONE_STATE_EMPTY: - zone->w_ptr = nvme_zone_wr_boundary(zone); - zone->d.wp = zone->w_ptr; - nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL); - /* fall through */ - case NVME_ZONE_STATE_FULL: - return NVME_SUCCESS; - default: - return NVME_ZONE_INVAL_TRANSITION; - } + return nvme_zrm_finish(ns, zone); } static uint16_t nvme_reset_zone(NvmeNamespace *ns, NvmeZone *zone, From b0a79429d964ad2d8c1c41fdc18b3ae5def41ff8 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Tue, 19 Jan 2021 23:01:02 +0100 Subject: [PATCH 09/38] hw/block/nvme: pull write pointer advancement to separate function In preparation for Simple Copy, pull write pointer advancement into a separate function that is independent off an NvmeRequest. Signed-off-by: Klaus Jensen Reviewed-by: Keith Busch --- hw/block/nvme.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 7897390b6d..44129f8e8b 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1407,6 +1407,16 @@ static inline uint16_t nvme_zrm_open(NvmeNamespace *ns, NvmeZone *zone) return __nvme_zrm_open(ns, zone, false); } +static void __nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone, + uint32_t nlb) +{ + zone->d.wp += nlb; + + if (zone->d.wp == nvme_zone_wr_boundary(zone)) { + nvme_zrm_finish(ns, zone); + } +} + static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req) { NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; @@ -1418,11 +1428,7 @@ static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req) nlb = le16_to_cpu(rw->nlb) + 1; zone = nvme_get_zone_by_slba(ns, slba); - zone->d.wp += nlb; - - if (zone->d.wp == nvme_zone_wr_boundary(zone)) { - nvme_zrm_finish(ns, zone); - } + __nvme_advance_zone_wp(ns, zone, nlb); } static inline bool nvme_is_write(NvmeRequest *req) From 3862efff316c1d02b41d1362f97dfba812050e53 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 16 Nov 2020 13:40:02 +0100 Subject: [PATCH 10/38] nvme: updated shared header for copy command Add new data structures and types for the Simple Copy command. Signed-off-by: Klaus Jensen Reviewed-by: Minwoo Im Acked-by: Stefan Hajnoczi Reviewed-by: Keith Busch --- include/block/nvme.h | 47 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/include/block/nvme.h b/include/block/nvme.h index 3db2b9b4cb..9f8eb3988c 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -579,6 +579,7 @@ enum NvmeIoCommands { NVME_CMD_COMPARE = 0x05, NVME_CMD_WRITE_ZEROES = 0x08, NVME_CMD_DSM = 0x09, + NVME_CMD_COPY = 0x19, NVME_CMD_ZONE_MGMT_SEND = 0x79, NVME_CMD_ZONE_MGMT_RECV = 0x7a, NVME_CMD_ZONE_APPEND = 0x7d, @@ -724,6 +725,37 @@ typedef struct QEMU_PACKED NvmeDsmRange { uint64_t slba; } NvmeDsmRange; +enum { + NVME_COPY_FORMAT_0 = 0x0, +}; + +typedef struct QEMU_PACKED NvmeCopyCmd { + uint8_t opcode; + uint8_t flags; + uint16_t cid; + uint32_t nsid; + uint32_t rsvd2[4]; + NvmeCmdDptr dptr; + uint64_t sdlba; + uint8_t nr; + uint8_t control[3]; + uint16_t rsvd13; + uint16_t dspec; + uint32_t reftag; + uint16_t apptag; + uint16_t appmask; +} NvmeCopyCmd; + +typedef struct QEMU_PACKED NvmeCopySourceRange { + uint8_t rsvd0[8]; + uint64_t slba; + uint16_t nlb; + uint8_t rsvd18[6]; + uint32_t reftag; + uint16_t apptag; + uint16_t appmask; +} NvmeCopySourceRange; + enum NvmeAsyncEventRequest { NVME_AER_TYPE_ERROR = 0, NVME_AER_TYPE_SMART = 1, @@ -807,6 +839,7 @@ enum NvmeStatusCodes { NVME_CONFLICTING_ATTRS = 0x0180, NVME_INVALID_PROT_INFO = 0x0181, NVME_WRITE_TO_RO = 0x0182, + NVME_CMD_SIZE_LIMIT = 0x0183, NVME_ZONE_BOUNDARY_ERROR = 0x01b8, NVME_ZONE_FULL = 0x01b9, NVME_ZONE_READ_ONLY = 0x01ba, @@ -994,7 +1027,7 @@ typedef struct QEMU_PACKED NvmeIdCtrl { uint8_t nvscc; uint8_t rsvd531; uint16_t acwu; - uint8_t rsvd534[2]; + uint16_t ocfs; uint32_t sgls; uint8_t rsvd540[228]; uint8_t subnqn[256]; @@ -1022,6 +1055,11 @@ enum NvmeIdCtrlOncs { NVME_ONCS_FEATURES = 1 << 4, NVME_ONCS_RESRVATIONS = 1 << 5, NVME_ONCS_TIMESTAMP = 1 << 6, + NVME_ONCS_COPY = 1 << 8, +}; + +enum NvmeIdCtrlOcfs { + NVME_OCFS_COPY_FORMAT_0 = 1 << 0, }; enum NvmeIdCtrlFrmw { @@ -1175,7 +1213,10 @@ typedef struct QEMU_PACKED NvmeIdNs { uint16_t npdg; uint16_t npda; uint16_t nows; - uint8_t rsvd74[30]; + uint16_t mssrl; + uint32_t mcl; + uint8_t msrc; + uint8_t rsvd81[23]; uint8_t nguid[16]; uint64_t eui64; NvmeLBAF lbaf[16]; @@ -1331,6 +1372,7 @@ static inline void _nvme_check_size(void) QEMU_BUILD_BUG_ON(sizeof(NvmeZonedResult) != 8); QEMU_BUILD_BUG_ON(sizeof(NvmeCqe) != 16); QEMU_BUILD_BUG_ON(sizeof(NvmeDsmRange) != 16); + QEMU_BUILD_BUG_ON(sizeof(NvmeCopySourceRange) != 32); QEMU_BUILD_BUG_ON(sizeof(NvmeCmd) != 64); QEMU_BUILD_BUG_ON(sizeof(NvmeDeleteQ) != 64); QEMU_BUILD_BUG_ON(sizeof(NvmeCreateCq) != 64); @@ -1338,6 +1380,7 @@ static inline void _nvme_check_size(void) QEMU_BUILD_BUG_ON(sizeof(NvmeIdentify) != 64); QEMU_BUILD_BUG_ON(sizeof(NvmeRwCmd) != 64); QEMU_BUILD_BUG_ON(sizeof(NvmeDsmCmd) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeCopyCmd) != 64); QEMU_BUILD_BUG_ON(sizeof(NvmeRangeType) != 64); QEMU_BUILD_BUG_ON(sizeof(NvmeErrorLog) != 64); QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) != 512); From e4e430b3d6baa1c908ba71ca37aad87edac98804 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Fri, 6 Nov 2020 10:46:01 +0100 Subject: [PATCH 11/38] hw/block/nvme: add simple copy command Add support for TP 4065a ("Simple Copy Command"), v2020.05.04 ("Ratified"). The implementation uses a bounce buffer to first read in the source logical blocks, then issue a write of that bounce buffer. The default maximum number of source logical blocks is 128, translating to 512 KiB for 4k logical blocks which aligns with the default value of MDTS. Signed-off-by: Klaus Jensen Reviewed-by: Keith Busch --- hw/block/nvme-ns.c | 8 ++ hw/block/nvme-ns.h | 4 + hw/block/nvme.c | 252 +++++++++++++++++++++++++++++++++++++++++- hw/block/nvme.h | 1 + hw/block/trace-events | 6 + 5 files changed, 270 insertions(+), 1 deletion(-) diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index 64b6a491ad..fd73d03211 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -67,6 +67,11 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp) id_ns->nmic |= NVME_NMIC_NS_SHARED; } + /* simple copy */ + id_ns->mssrl = cpu_to_le16(ns->params.mssrl); + id_ns->mcl = cpu_to_le32(ns->params.mcl); + id_ns->msrc = ns->params.msrc; + return 0; } @@ -384,6 +389,9 @@ static Property nvme_ns_props[] = { NvmeSubsystem *), DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0), DEFINE_PROP_UUID("uuid", NvmeNamespace, params.uuid), + DEFINE_PROP_UINT16("mssrl", NvmeNamespace, params.mssrl, 128), + DEFINE_PROP_UINT32("mcl", NvmeNamespace, params.mcl, 128), + DEFINE_PROP_UINT8("msrc", NvmeNamespace, params.msrc, 127), DEFINE_PROP_BOOL("zoned", NvmeNamespace, params.zoned, false), DEFINE_PROP_SIZE("zoned.zone_size", NvmeNamespace, params.zone_size_bs, NVME_DEFAULT_ZONE_SIZE), diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h index 929e788619..7af6884862 100644 --- a/hw/block/nvme-ns.h +++ b/hw/block/nvme-ns.h @@ -29,6 +29,10 @@ typedef struct NvmeNamespaceParams { uint32_t nsid; QemuUUID uuid; + uint16_t mssrl; + uint32_t mcl; + uint8_t msrc; + bool zoned; bool cross_zone_read; uint64_t zone_size_bs; diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 44129f8e8b..ab4723ff31 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -195,6 +195,7 @@ static const uint32_t nvme_cse_iocs_nvm[256] = { [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP, [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, + [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP, }; @@ -204,6 +205,7 @@ static const uint32_t nvme_cse_iocs_zoned[256] = { [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP, [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, + [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP, [NVME_CMD_ZONE_APPEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, [NVME_CMD_ZONE_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, @@ -1532,6 +1534,136 @@ static void nvme_aio_zone_reset_cb(void *opaque, int ret) nvme_enqueue_req_completion(nvme_cq(req), req); } +struct nvme_copy_ctx { + int copies; + uint8_t *bounce; + uint32_t nlb; +}; + +struct nvme_copy_in_ctx { + NvmeRequest *req; + QEMUIOVector iov; +}; + +static void nvme_copy_cb(void *opaque, int ret) +{ + NvmeRequest *req = opaque; + NvmeNamespace *ns = req->ns; + struct nvme_copy_ctx *ctx = req->opaque; + + trace_pci_nvme_copy_cb(nvme_cid(req)); + + if (ns->params.zoned) { + NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd; + uint64_t sdlba = le64_to_cpu(copy->sdlba); + NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba); + + __nvme_advance_zone_wp(ns, zone, ctx->nlb); + } + + if (!ret) { + block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct); + } else { + block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct); + nvme_aio_err(req, ret); + } + + g_free(ctx->bounce); + g_free(ctx); + + nvme_enqueue_req_completion(nvme_cq(req), req); +} + +static void nvme_copy_in_complete(NvmeRequest *req) +{ + NvmeNamespace *ns = req->ns; + NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd; + struct nvme_copy_ctx *ctx = req->opaque; + uint64_t sdlba = le64_to_cpu(copy->sdlba); + uint16_t status; + + trace_pci_nvme_copy_in_complete(nvme_cid(req)); + + block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct); + + status = nvme_check_bounds(ns, sdlba, ctx->nlb); + if (status) { + trace_pci_nvme_err_invalid_lba_range(sdlba, ctx->nlb, ns->id_ns.nsze); + goto invalid; + } + + if (ns->params.zoned) { + NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba); + + status = nvme_check_zone_write(ns, zone, sdlba, ctx->nlb); + if (status) { + goto invalid; + } + + status = nvme_zrm_auto(ns, zone); + if (status) { + goto invalid; + } + + zone->w_ptr += ctx->nlb; + } + + qemu_iovec_init(&req->iov, 1); + qemu_iovec_add(&req->iov, ctx->bounce, nvme_l2b(ns, ctx->nlb)); + + block_acct_start(blk_get_stats(ns->blkconf.blk), &req->acct, 0, + BLOCK_ACCT_WRITE); + + req->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, sdlba), + &req->iov, 0, nvme_copy_cb, req); + + return; + +invalid: + req->status = status; + + g_free(ctx->bounce); + g_free(ctx); + + nvme_enqueue_req_completion(nvme_cq(req), req); +} + +static void nvme_aio_copy_in_cb(void *opaque, int ret) +{ + struct nvme_copy_in_ctx *in_ctx = opaque; + NvmeRequest *req = in_ctx->req; + NvmeNamespace *ns = req->ns; + struct nvme_copy_ctx *ctx = req->opaque; + + qemu_iovec_destroy(&in_ctx->iov); + g_free(in_ctx); + + trace_pci_nvme_aio_copy_in_cb(nvme_cid(req)); + + if (ret) { + nvme_aio_err(req, ret); + } + + ctx->copies--; + + if (ctx->copies) { + return; + } + + if (req->status) { + block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct); + + g_free(ctx->bounce); + g_free(ctx); + + nvme_enqueue_req_completion(nvme_cq(req), req); + + return; + } + + nvme_copy_in_complete(req); +} + struct nvme_compare_ctx { QEMUIOVector iov; uint8_t *bounce; @@ -1650,6 +1782,121 @@ static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req) return status; } +static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req) +{ + NvmeNamespace *ns = req->ns; + NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd; + g_autofree NvmeCopySourceRange *range = NULL; + + uint16_t nr = copy->nr + 1; + uint8_t format = copy->control[0] & 0xf; + uint32_t nlb = 0; + + uint8_t *bounce = NULL, *bouncep = NULL; + struct nvme_copy_ctx *ctx; + uint16_t status; + int i; + + trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format); + + if (!(n->id_ctrl.ocfs & (1 << format))) { + trace_pci_nvme_err_copy_invalid_format(format); + return NVME_INVALID_FIELD | NVME_DNR; + } + + if (nr > ns->id_ns.msrc + 1) { + return NVME_CMD_SIZE_LIMIT | NVME_DNR; + } + + range = g_new(NvmeCopySourceRange, nr); + + status = nvme_dma(n, (uint8_t *)range, nr * sizeof(NvmeCopySourceRange), + DMA_DIRECTION_TO_DEVICE, req); + if (status) { + return status; + } + + for (i = 0; i < nr; i++) { + uint64_t slba = le64_to_cpu(range[i].slba); + uint32_t _nlb = le16_to_cpu(range[i].nlb) + 1; + + if (_nlb > le16_to_cpu(ns->id_ns.mssrl)) { + return NVME_CMD_SIZE_LIMIT | NVME_DNR; + } + + status = nvme_check_bounds(ns, slba, _nlb); + if (status) { + trace_pci_nvme_err_invalid_lba_range(slba, _nlb, ns->id_ns.nsze); + return status; + } + + if (NVME_ERR_REC_DULBE(ns->features.err_rec)) { + status = nvme_check_dulbe(ns, slba, _nlb); + if (status) { + return status; + } + } + + if (ns->params.zoned) { + status = nvme_check_zone_read(ns, slba, _nlb); + if (status) { + return status; + } + } + + nlb += _nlb; + } + + if (nlb > le32_to_cpu(ns->id_ns.mcl)) { + return NVME_CMD_SIZE_LIMIT | NVME_DNR; + } + + bounce = bouncep = g_malloc(nvme_l2b(ns, nlb)); + + block_acct_start(blk_get_stats(ns->blkconf.blk), &req->acct, 0, + BLOCK_ACCT_READ); + + ctx = g_new(struct nvme_copy_ctx, 1); + + ctx->bounce = bounce; + ctx->nlb = nlb; + ctx->copies = 1; + + req->opaque = ctx; + + for (i = 0; i < nr; i++) { + uint64_t slba = le64_to_cpu(range[i].slba); + uint32_t nlb = le16_to_cpu(range[i].nlb) + 1; + + size_t len = nvme_l2b(ns, nlb); + int64_t offset = nvme_l2b(ns, slba); + + trace_pci_nvme_copy_source_range(slba, nlb); + + struct nvme_copy_in_ctx *in_ctx = g_new(struct nvme_copy_in_ctx, 1); + in_ctx->req = req; + + qemu_iovec_init(&in_ctx->iov, 1); + qemu_iovec_add(&in_ctx->iov, bouncep, len); + + ctx->copies++; + + blk_aio_preadv(ns->blkconf.blk, offset, &in_ctx->iov, 0, + nvme_aio_copy_in_cb, in_ctx); + + bouncep += len; + } + + /* account for the 1-initialization */ + ctx->copies--; + + if (!ctx->copies) { + nvme_copy_in_complete(req); + } + + return NVME_NO_COMPLETE; +} + static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req) { NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; @@ -2387,6 +2634,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) return nvme_compare(n, req); case NVME_CMD_DSM: return nvme_dsm(n, req); + case NVME_CMD_COPY: + return nvme_copy(n, req); case NVME_CMD_ZONE_MGMT_SEND: return nvme_zone_mgmt_send(n, req); case NVME_CMD_ZONE_MGMT_RECV: @@ -4484,9 +4733,10 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->nn = cpu_to_le32(n->num_namespaces); id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP | NVME_ONCS_FEATURES | NVME_ONCS_DSM | - NVME_ONCS_COMPARE); + NVME_ONCS_COMPARE | NVME_ONCS_COPY); id->vwc = (0x2 << 1) | 0x1; + id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0); id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN | NVME_CTRL_SGLS_BITBUCKET); diff --git a/hw/block/nvme.h b/hw/block/nvme.h index b8f5f2d6ff..cb2b5175f1 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -69,6 +69,7 @@ static inline const char *nvme_io_opc_str(uint8_t opc) case NVME_CMD_COMPARE: return "NVME_NVM_CMD_COMPARE"; case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES"; case NVME_CMD_DSM: return "NVME_NVM_CMD_DSM"; + case NVME_CMD_COPY: return "NVME_NVM_CMD_COPY"; case NVME_CMD_ZONE_MGMT_SEND: return "NVME_ZONED_CMD_MGMT_SEND"; case NVME_CMD_ZONE_MGMT_RECV: return "NVME_ZONED_CMD_MGMT_RECV"; case NVME_CMD_ZONE_APPEND: return "NVME_ZONED_CMD_ZONE_APPEND"; diff --git a/hw/block/trace-events b/hw/block/trace-events index d32475c398..4b5ee04024 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -43,12 +43,17 @@ pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode, const char *opna pci_nvme_read(uint16_t cid, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64"" pci_nvme_write(uint16_t cid, const char *verb, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" opname '%s' nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64"" pci_nvme_rw_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'" +pci_nvme_copy(uint16_t cid, uint32_t nsid, uint16_t nr, uint8_t format) "cid %"PRIu16" nsid %"PRIu32" nr %"PRIu16" format 0x%"PRIx8"" +pci_nvme_copy_source_range(uint64_t slba, uint32_t nlb) "slba 0x%"PRIx64" nlb %"PRIu32"" +pci_nvme_copy_in_complete(uint16_t cid) "cid %"PRIu16"" +pci_nvme_copy_cb(uint16_t cid) "cid %"PRIu16"" pci_nvme_block_status(int64_t offset, int64_t bytes, int64_t pnum, int ret, bool zeroed) "offset %"PRId64" bytes %"PRId64" pnum %"PRId64" ret 0x%x zeroed %d" pci_nvme_dsm(uint16_t cid, uint32_t nsid, uint32_t nr, uint32_t attr) "cid %"PRIu16" nsid %"PRIu32" nr %"PRIu32" attr 0x%"PRIx32"" pci_nvme_dsm_deallocate(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32"" pci_nvme_compare(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba 0x%"PRIx64" nlb %"PRIu32"" pci_nvme_compare_cb(uint16_t cid) "cid %"PRIu16"" pci_nvme_aio_discard_cb(uint16_t cid) "cid %"PRIu16"" +pci_nvme_aio_copy_in_cb(uint16_t cid) "cid %"PRIu16"" pci_nvme_aio_zone_reset_cb(uint16_t cid, uint64_t zslba) "cid %"PRIu16" zslba 0x%"PRIx64"" pci_nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16"" pci_nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d" @@ -113,6 +118,7 @@ pci_nvme_err_addr_read(uint64_t addr) "addr 0x%"PRIx64"" pci_nvme_err_addr_write(uint64_t addr) "addr 0x%"PRIx64"" pci_nvme_err_cfs(void) "controller fatal status" pci_nvme_err_aio(uint16_t cid, const char *errname, uint16_t status) "cid %"PRIu16" err '%s' status 0x%"PRIx16"" +pci_nvme_err_copy_invalid_format(uint8_t format) "format 0x%"PRIx8"" pci_nvme_err_invalid_sgld(uint16_t cid, uint8_t typ) "cid %"PRIu16" type 0x%"PRIx8"" pci_nvme_err_invalid_num_sgld(uint16_t cid, uint8_t typ) "cid %"PRIu16" type 0x%"PRIx8"" pci_nvme_err_invalid_sgl_excess_length(uint16_t cid) "cid %"PRIu16"" From 92323c8c2566b8ea4cdfe8e72a22d2651b0ee6af Mon Sep 17 00:00:00 2001 From: Dmitry Fomichev Date: Mon, 8 Feb 2021 09:32:56 +0900 Subject: [PATCH 12/38] hw/block/nvme: fix Close Zone Implicitly and Explicitly Open zones can be closed by Close Zone management function. This got broken by a recent commit ("hw/block/nvme: refactor zone resource management") and now such commands fail with Invalid Zone State Transition status. Modify nvm_zrm_close() function to make Close Zone work correctly. Signed-off-by: Dmitry Fomichev Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index ab4723ff31..56ef07b74d 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1319,14 +1319,13 @@ static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone) static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone) { switch (nvme_get_zone_state(zone)) { - case NVME_ZONE_STATE_CLOSED: - return NVME_SUCCESS; - case NVME_ZONE_STATE_EXPLICITLY_OPEN: case NVME_ZONE_STATE_IMPLICITLY_OPEN: nvme_aor_dec_open(ns); nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED); /* fall through */ + case NVME_ZONE_STATE_CLOSED: + return NVME_SUCCESS; default: return NVME_ZONE_INVAL_TRANSITION; From 9ae390046164e8b62fbdc48d2c6de8ee6fbd3cdc Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Tue, 26 Jan 2021 12:32:29 +0100 Subject: [PATCH 13/38] hw/block/nvme: add missing mor/mar constraint checks Firstly, if zoned.max_active is non-zero, zoned.max_open must be less than or equal to zoned.max_active. Secondly, if only zones.max_active is set, we have to explicitly set zones.max_open or we end up with an invalid MAR/MOR configuration. This is an artifact of the parameters not being zeroes-based like in the spec. Cc: Dmitry Fomichev Reported-by: Gollu Appalanaidu Signed-off-by: Klaus Jensen Reviewed-by: Dmitry Fomichev --- hw/block/nvme-ns.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index fd73d03211..0e87600204 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -163,6 +163,18 @@ static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace *ns, Error **errp) return -1; } + if (ns->params.max_active_zones) { + if (ns->params.max_open_zones > ns->params.max_active_zones) { + error_setg(errp, "max_open_zones (%u) exceeds max_active_zones (%u)", + ns->params.max_open_zones, ns->params.max_active_zones); + return -1; + } + + if (!ns->params.max_open_zones) { + ns->params.max_open_zones = ns->params.max_active_zones; + } + } + if (ns->params.zd_extension_size) { if (ns->params.zd_extension_size & 0x3f) { error_setg(errp, From 2c7e2ad243b92f02555498392fb4ce761db8ceb3 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 8 Feb 2021 09:19:41 +0100 Subject: [PATCH 14/38] hw/block/nvme: improve invalid zasl value reporting The Zone Append Size Limit (ZASL) must be at least 4096 bytes, so improve the user experience by adding an early parameter check in nvme_check_constraints. When ZASL is still too small due to the host configuring the device for an even larger page size, convert the trace point in nvme_start_ctrl to an NVME_GUEST_ERR such that this is logged by QEMU instead of only traced. Reported-by: Corne Cc: Dmitry Fomichev Reviewed-by: Dmitry Fomichev Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 56ef07b74d..2addaf7c4f 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -3997,8 +3997,10 @@ static int nvme_start_ctrl(NvmeCtrl *n) n->zasl = n->params.mdts; } else { if (n->params.zasl_bs < n->page_size) { - trace_pci_nvme_err_startfail_zasl_too_small(n->params.zasl_bs, - n->page_size); + NVME_GUEST_ERR(pci_nvme_err_startfail_zasl_too_small, + "Zone Append Size Limit (ZASL) of %d bytes is too " + "small; must be at least %d bytes", + n->params.zasl_bs, n->page_size); return -1; } n->zasl = 31 - clz32(n->params.zasl_bs / n->page_size); @@ -4517,6 +4519,12 @@ static void nvme_check_constraints(NvmeCtrl *n, Error **errp) error_setg(errp, "zone append size limit has to be a power of 2"); return; } + + if (n->params.zasl_bs < 4096) { + error_setg(errp, "zone append size limit must be at least " + "4096 bytes"); + return; + } } } From 594a2b742b15a81e3bb41938c25ad6520c38e3cc Mon Sep 17 00:00:00 2001 From: Gollu Appalanaidu Date: Mon, 8 Feb 2021 18:40:31 +0530 Subject: [PATCH 15/38] hw/block/nvme: use locally assigned QEMU IEEE OUI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 6eb7a071292a ("hw/block/nvme: change controller pci id") changed the controller to use a Red Hat assigned PCI Device and Vendor ID, but did not change the IEEE OUI away from the Intel IEEE OUI. Fix that and use the locally assigned QEMU IEEE OUI instead if the `use-intel-id` parameter is not explicitly set. Also reverse the Intel IEEE OUI bytes. Signed-off-by: Gollu Appalanaidu Signed-off-by: Klaus Jensen Reviewed-by: Philippe Mathieu-Daudé --- hw/block/nvme.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 2addaf7c4f..a54ef34ce5 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -4707,9 +4707,17 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->cntlid = cpu_to_le16(n->cntlid); id->rab = 6; - id->ieee[0] = 0x00; - id->ieee[1] = 0x02; - id->ieee[2] = 0xb3; + + if (n->params.use_intel_id) { + id->ieee[0] = 0xb3; + id->ieee[1] = 0x02; + id->ieee[2] = 0x00; + } else { + id->ieee[0] = 0x00; + id->ieee[1] = 0x54; + id->ieee[2] = 0x52; + } + id->mdts = n->params.mdts; id->ver = cpu_to_le32(NVME_SPEC_VER); id->oacs = cpu_to_le16(0); From c94973288cd9cfdb0dc23ae84ba256a7345c372e Mon Sep 17 00:00:00 2001 From: Gollu Appalanaidu Date: Mon, 25 Jan 2021 15:09:24 +0530 Subject: [PATCH 16/38] hw/block/nvme: add broadcast nsid support flush command Add support for using the broadcast nsid to issue a flush on all namespaces through a single command. Signed-off-by: Gollu Appalanaidu Reviewed-by: Klaus Jensen Acked-by: Stefan Hajnoczi Acked-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 124 +++++++++++++++++++++++++++++++++++++++--- hw/block/trace-events | 2 + include/block/nvme.h | 8 +++ 3 files changed, 127 insertions(+), 7 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index a54ef34ce5..db1a3aabd8 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1466,6 +1466,41 @@ static void nvme_rw_cb(void *opaque, int ret) nvme_enqueue_req_completion(nvme_cq(req), req); } +struct nvme_aio_flush_ctx { + NvmeRequest *req; + NvmeNamespace *ns; + BlockAcctCookie acct; +}; + +static void nvme_aio_flush_cb(void *opaque, int ret) +{ + struct nvme_aio_flush_ctx *ctx = opaque; + NvmeRequest *req = ctx->req; + uintptr_t *num_flushes = (uintptr_t *)&req->opaque; + + BlockBackend *blk = ctx->ns->blkconf.blk; + BlockAcctCookie *acct = &ctx->acct; + BlockAcctStats *stats = blk_get_stats(blk); + + trace_pci_nvme_aio_flush_cb(nvme_cid(req), blk_name(blk)); + + if (!ret) { + block_acct_done(stats, acct); + } else { + block_acct_failed(stats, acct); + nvme_aio_err(req, ret); + } + + (*num_flushes)--; + g_free(ctx); + + if (*num_flushes) { + return; + } + + nvme_enqueue_req_completion(nvme_cq(req), req); +} + static void nvme_aio_discard_cb(void *opaque, int ret) { NvmeRequest *req = opaque; @@ -1949,10 +1984,56 @@ static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req) static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req) { - block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0, - BLOCK_ACCT_FLUSH); - req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_rw_cb, req); - return NVME_NO_COMPLETE; + uint32_t nsid = le32_to_cpu(req->cmd.nsid); + uintptr_t *num_flushes = (uintptr_t *)&req->opaque; + uint16_t status; + struct nvme_aio_flush_ctx *ctx; + NvmeNamespace *ns; + + trace_pci_nvme_flush(nvme_cid(req), nsid); + + if (nsid != NVME_NSID_BROADCAST) { + req->ns = nvme_ns(n, nsid); + if (unlikely(!req->ns)) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0, + BLOCK_ACCT_FLUSH); + req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_rw_cb, req); + return NVME_NO_COMPLETE; + } + + /* 1-initialize; see comment in nvme_dsm */ + *num_flushes = 1; + + for (int i = 1; i <= n->num_namespaces; i++) { + ns = nvme_ns(n, i); + if (!ns) { + continue; + } + + ctx = g_new(struct nvme_aio_flush_ctx, 1); + ctx->req = req; + ctx->ns = ns; + + (*num_flushes)++; + + block_acct_start(blk_get_stats(ns->blkconf.blk), &ctx->acct, 0, + BLOCK_ACCT_FLUSH); + blk_aio_flush(ns->blkconf.blk, nvme_aio_flush_cb, ctx); + } + + /* account for the 1-initialization */ + (*num_flushes)--; + + if (*num_flushes) { + status = NVME_NO_COMPLETE; + } else { + status = req->status; + } + + return status; } static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req) @@ -2608,6 +2689,29 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) return NVME_INVALID_NSID | NVME_DNR; } + /* + * In the base NVM command set, Flush may apply to all namespaces + * (indicated by NSID being set to 0xFFFFFFFF). But if that feature is used + * along with TP 4056 (Namespace Types), it may be pretty screwed up. + * + * If NSID is indeed set to 0xFFFFFFFF, we simply cannot associate the + * opcode with a specific command since we cannot determine a unique I/O + * command set. Opcode 0x0 could have any other meaning than something + * equivalent to flushing and say it DOES have completely different + * semantics in some other command set - does an NSID of 0xFFFFFFFF then + * mean "for all namespaces, apply whatever command set specific command + * that uses the 0x0 opcode?" Or does it mean "for all namespaces, apply + * whatever command that uses the 0x0 opcode if, and only if, it allows + * NSID to be 0xFFFFFFFF"? + * + * Anyway (and luckily), for now, we do not care about this since the + * device only supports namespace types that includes the NVM Flush command + * (NVM and Zoned), so always do an NVM Flush. + */ + if (req->cmd.opcode == NVME_CMD_FLUSH) { + return nvme_flush(n, req); + } + req->ns = nvme_ns(n, nsid); if (unlikely(!req->ns)) { return NVME_INVALID_FIELD | NVME_DNR; @@ -2619,8 +2723,6 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) } switch (req->cmd.opcode) { - case NVME_CMD_FLUSH: - return nvme_flush(n, req); case NVME_CMD_WRITE_ZEROES: return nvme_write_zeroes(n, req); case NVME_CMD_ZONE_APPEND: @@ -4750,7 +4852,15 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) NVME_ONCS_FEATURES | NVME_ONCS_DSM | NVME_ONCS_COMPARE | NVME_ONCS_COPY); - id->vwc = (0x2 << 1) | 0x1; + /* + * NOTE: If this device ever supports a command set that does NOT use 0x0 + * as a Flush-equivalent operation, support for the broadcast NSID in Flush + * should probably be removed. + * + * See comment in nvme_io_cmd. + */ + id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT; + id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0); id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN | NVME_CTRL_SGLS_BITBUCKET); diff --git a/hw/block/trace-events b/hw/block/trace-events index 4b5ee04024..b04f7a3e18 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -40,6 +40,7 @@ pci_nvme_map_prp(uint64_t trans_len, uint32_t len, uint64_t prp1, uint64_t prp2, pci_nvme_map_sgl(uint16_t cid, uint8_t typ, uint64_t len) "cid %"PRIu16" type 0x%"PRIx8" len %"PRIu64"" pci_nvme_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'" pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'" +pci_nvme_flush(uint16_t cid, uint32_t nsid) "cid %"PRIu16" nsid %"PRIu32"" pci_nvme_read(uint16_t cid, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64"" pci_nvme_write(uint16_t cid, const char *verb, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" opname '%s' nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64"" pci_nvme_rw_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'" @@ -55,6 +56,7 @@ pci_nvme_compare_cb(uint16_t cid) "cid %"PRIu16"" pci_nvme_aio_discard_cb(uint16_t cid) "cid %"PRIu16"" pci_nvme_aio_copy_in_cb(uint16_t cid) "cid %"PRIu16"" pci_nvme_aio_zone_reset_cb(uint16_t cid, uint64_t zslba) "cid %"PRIu16" zslba 0x%"PRIx64"" +pci_nvme_aio_flush_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'" pci_nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16"" pci_nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d" pci_nvme_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16"" diff --git a/include/block/nvme.h b/include/block/nvme.h index 9f8eb3988c..b23f3ae227 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -1062,6 +1062,14 @@ enum NvmeIdCtrlOcfs { NVME_OCFS_COPY_FORMAT_0 = 1 << 0, }; +enum NvmeIdctrlVwc { + NVME_VWC_PRESENT = 1 << 0, + NVME_VWC_NSID_BROADCAST_NO_SUPPORT = 0 << 1, + NVME_VWC_NSID_BROADCAST_RESERVED = 1 << 1, + NVME_VWC_NSID_BROADCAST_CTRL_SPEC = 2 << 1, + NVME_VWC_NSID_BROADCAST_SUPPORT = 3 << 1, +}; + enum NvmeIdCtrlFrmw { NVME_FRMW_SLOT1_RO = 1 << 0, }; From 5b8bb923ccf749d500593d6f1f0a210062285532 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 22 Feb 2021 20:13:22 +0100 Subject: [PATCH 17/38] hw/block/nvme: document 'mdts' nvme device parameter Document the 'mdts' nvme device parameter. Signed-off-by: Klaus Jensen Reviewed-by: Keith Busch --- hw/block/nvme.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index db1a3aabd8..6921b1957d 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -72,6 +72,12 @@ * completion when there are no outstanding AERs. When the maximum number of * enqueued events are reached, subsequent events will be dropped. * + * - `mdts` + * Indicates the maximum data transfer size for a command that transfers data + * between host-accessible memory and the controller. The value is specified + * as a power of two (2^n) and is in units of the minimum memory page size + * (CAP.MPSMIN). The default value is 7 (i.e. 512 KiB). + * * - `zoned.append_size_limit` * The maximum I/O size in bytes that is allowed in Zone Append command. * The default is 128KiB. Since internally this this value is maintained as From be5a1c27a3a5576323e910549071be635645aef1 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 22 Feb 2021 20:29:47 +0100 Subject: [PATCH 18/38] hw/block/nvme: deduplicate bad mdts trace event If mdts is exceeded, trace it from a single place. Signed-off-by: Klaus Jensen Reviewed-by: Keith Busch --- hw/block/nvme.c | 6 +----- hw/block/trace-events | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 6921b1957d..b7ec9dccc0 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1084,6 +1084,7 @@ static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len) uint8_t mdts = n->params.mdts; if (mdts && len > n->page_size << mdts) { + trace_pci_nvme_err_mdts(len); return NVME_INVALID_FIELD | NVME_DNR; } @@ -1954,7 +1955,6 @@ static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req) status = nvme_check_mdts(n, len); if (status) { - trace_pci_nvme_err_mdts(nvme_cid(req), len); return status; } @@ -2057,7 +2057,6 @@ static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req) status = nvme_check_mdts(n, data_size); if (status) { - trace_pci_nvme_err_mdts(nvme_cid(req), data_size); goto invalid; } @@ -2125,7 +2124,6 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append, if (!wrz) { status = nvme_check_mdts(n, data_size); if (status) { - trace_pci_nvme_err_mdts(nvme_cid(req), data_size); goto invalid; } } @@ -2619,7 +2617,6 @@ static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req) status = nvme_check_mdts(n, data_size); if (status) { - trace_pci_nvme_err_mdts(nvme_cid(req), data_size); return status; } @@ -3061,7 +3058,6 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req) status = nvme_check_mdts(n, len); if (status) { - trace_pci_nvme_err_mdts(nvme_cid(req), len); return status; } diff --git a/hw/block/trace-events b/hw/block/trace-events index b04f7a3e18..e1a85661cf 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -114,7 +114,7 @@ pci_nvme_clear_ns_close(uint32_t state, uint64_t slba) "zone state=%"PRIu32", sl pci_nvme_clear_ns_reset(uint32_t state, uint64_t slba) "zone state=%"PRIu32", slba=%"PRIu64" transitioned to Empty state" # nvme traces for error conditions -pci_nvme_err_mdts(uint16_t cid, size_t len) "cid %"PRIu16" len %zu" +pci_nvme_err_mdts(size_t len) "len %zu" pci_nvme_err_req_status(uint16_t cid, uint32_t nsid, uint16_t status, uint8_t opc) "cid %"PRIu16" nsid %"PRIu32" status 0x%"PRIx16" opc 0x%"PRIx8"" pci_nvme_err_addr_read(uint64_t addr) "addr 0x%"PRIx64"" pci_nvme_err_addr_write(uint64_t addr) "addr 0x%"PRIx64"" From 578d914b263c1ec71e567b90d744075ea3a8ea74 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 22 Feb 2021 19:27:58 +0100 Subject: [PATCH 19/38] hw/block/nvme: align zoned.zasl with mdts ZASL (Zone Append Size Limit) is defined exactly like MDTS (Maximum Data Transfer Size), that is, it is a value in units of the minimum memory page size (CAP.MPSMIN) and is reported as a power of two. The 'mdts' nvme device parameter is specified as in the spec, but the 'zoned.append_size_limit' parameter is specified in bytes. This is suboptimal for a number of reasons: 1. It is just plain confusing wrt. the definition of mdts. 2. There is a lot of complexity involved in validating the value; it must be a power of two, it should be larger than 4k, if it is zero we set it internally to mdts, but still report it as zero. 3. While "hw/block/nvme: improve invalid zasl value reporting" slightly improved the handling of the parameter, the validation is still wrong; it does not depend on CC.MPS, it depends on CAP.MPSMIN. And we are not even checking that it is actually less than or equal to MDTS, which is kinda the *one* condition it must satisfy. Fix this by defining zasl exactly like mdts and checking the one thing that it must satisfy (that it is less than or equal to mdts). Also, change the default value from 128KiB to 0 (aka, whatever mdts is). Signed-off-by: Klaus Jensen Reviewed-by: Keith Busch --- hw/block/nvme.c | 59 +++++++++++++------------------------------ hw/block/nvme.h | 4 +-- hw/block/trace-events | 2 +- 3 files changed, 19 insertions(+), 46 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index b7ec9dccc0..23e1c91ed2 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -22,8 +22,8 @@ * cmb_size_mb=, \ * [pmrdev=,] \ * max_ioqpairs=, \ - * aerl=, aer_max_queued=, \ - * mdts=,zoned.append_size_limit=, \ + * aerl=,aer_max_queued=, \ + * mdts=,zoned.zasl=, \ * subsys= * -device nvme-ns,drive=,bus=,nsid=,\ * zoned=, \ @@ -78,13 +78,11 @@ * as a power of two (2^n) and is in units of the minimum memory page size * (CAP.MPSMIN). The default value is 7 (i.e. 512 KiB). * - * - `zoned.append_size_limit` - * The maximum I/O size in bytes that is allowed in Zone Append command. - * The default is 128KiB. Since internally this this value is maintained as - * ZASL = log2( / ), some values assigned - * to this property may be rounded down and result in a lower maximum ZA - * data size being in effect. By setting this property to 0, users can make - * ZASL to be equal to MDTS. This property only affects zoned namespaces. + * - `zoned.zasl` + * Indicates the maximum data transfer size for the Zone Append command. Like + * `mdts`, the value is specified as a power of two (2^n) and is in units of + * the minimum memory page size (CAP.MPSMIN). The default value is 0 (i.e. + * defaulting to the value of `mdts`). * * nvme namespace device parameters * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -2144,10 +2142,9 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append, goto invalid; } - if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) { - trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl); - status = NVME_INVALID_FIELD; - goto invalid; + if (n->params.zasl && data_size > n->page_size << n->params.zasl) { + trace_pci_nvme_err_zasl(data_size); + return NVME_INVALID_FIELD | NVME_DNR; } slba = zone->w_ptr; @@ -3221,9 +3218,8 @@ static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req) if (c->csi == NVME_CSI_NVM) { return nvme_rpt_empty_id_struct(n, req); } else if (c->csi == NVME_CSI_ZONED) { - if (n->params.zasl_bs) { - id.zasl = n->zasl; - } + id.zasl = n->params.zasl; + return nvme_dma(n, (uint8_t *)&id, sizeof(id), DMA_DIRECTION_FROM_DEVICE, req); } @@ -4097,19 +4093,6 @@ static int nvme_start_ctrl(NvmeCtrl *n) nvme_init_sq(&n->admin_sq, n, n->bar.asq, 0, 0, NVME_AQA_ASQS(n->bar.aqa) + 1); - if (!n->params.zasl_bs) { - n->zasl = n->params.mdts; - } else { - if (n->params.zasl_bs < n->page_size) { - NVME_GUEST_ERR(pci_nvme_err_startfail_zasl_too_small, - "Zone Append Size Limit (ZASL) of %d bytes is too " - "small; must be at least %d bytes", - n->params.zasl_bs, n->page_size); - return -1; - } - n->zasl = 31 - clz32(n->params.zasl_bs / n->page_size); - } - nvme_set_timestamp(n, 0ULL); QTAILQ_INIT(&n->aer_queue); @@ -4618,17 +4601,10 @@ static void nvme_check_constraints(NvmeCtrl *n, Error **errp) host_memory_backend_set_mapped(n->pmr.dev, true); } - if (n->params.zasl_bs) { - if (!is_power_of_2(n->params.zasl_bs)) { - error_setg(errp, "zone append size limit has to be a power of 2"); - return; - } - - if (n->params.zasl_bs < 4096) { - error_setg(errp, "zone append size limit must be at least " - "4096 bytes"); - return; - } + if (n->params.zasl > n->params.mdts) { + error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less " + "than or equal to mdts (Maximum Data Transfer Size)"); + return; } } @@ -4997,8 +4973,7 @@ static Property nvme_props[] = { DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7), DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false), DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false), - DEFINE_PROP_SIZE32("zoned.append_size_limit", NvmeCtrl, params.zasl_bs, - NVME_DEFAULT_MAX_ZA_SIZE), + DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/block/nvme.h b/hw/block/nvme.h index cb2b5175f1..f45ace0cff 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -20,7 +20,7 @@ typedef struct NvmeParams { uint32_t aer_max_queued; uint8_t mdts; bool use_intel_id; - uint32_t zasl_bs; + uint8_t zasl; bool legacy_cmb; } NvmeParams; @@ -171,8 +171,6 @@ typedef struct NvmeCtrl { QTAILQ_HEAD(, NvmeAsyncEvent) aer_queue; int aer_queued; - uint8_t zasl; - NvmeSubsystem *subsys; NvmeNamespace namespace; diff --git a/hw/block/trace-events b/hw/block/trace-events index e1a85661cf..25ba51ea54 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -115,6 +115,7 @@ pci_nvme_clear_ns_reset(uint32_t state, uint64_t slba) "zone state=%"PRIu32", sl # nvme traces for error conditions pci_nvme_err_mdts(size_t len) "len %zu" +pci_nvme_err_zasl(size_t len) "len %zu" pci_nvme_err_req_status(uint16_t cid, uint32_t nsid, uint16_t status, uint8_t opc) "cid %"PRIu16" nsid %"PRIu32" status 0x%"PRIx16" opc 0x%"PRIx8"" pci_nvme_err_addr_read(uint64_t addr) "addr 0x%"PRIx64"" pci_nvme_err_addr_write(uint64_t addr) "addr 0x%"PRIx64"" @@ -144,7 +145,6 @@ pci_nvme_err_zone_boundary(uint64_t slba, uint32_t nlb, uint64_t zcap) "lba 0x%" pci_nvme_err_zone_invalid_write(uint64_t slba, uint64_t wp) "lba 0x%"PRIx64" wp 0x%"PRIx64"" pci_nvme_err_zone_write_not_ok(uint64_t slba, uint32_t nlb, uint16_t status) "slba=%"PRIu64", nlb=%"PRIu32", status=0x%"PRIx16"" pci_nvme_err_zone_read_not_ok(uint64_t slba, uint32_t nlb, uint16_t status) "slba=%"PRIu64", nlb=%"PRIu32", status=0x%"PRIx16"" -pci_nvme_err_append_too_large(uint64_t slba, uint32_t nlb, uint8_t zasl) "slba=%"PRIu64", nlb=%"PRIu32", zasl=%"PRIu8"" pci_nvme_err_insuff_active_res(uint32_t max_active) "max_active=%"PRIu32" zone limit exceeded" pci_nvme_err_insuff_open_res(uint32_t max_open) "max_open=%"PRIu32" zone limit exceeded" pci_nvme_err_zd_extension_map_error(uint32_t zone_idx) "can't map descriptor extension for zone_idx=%"PRIu32"" From 8c4d305f31f09064988e7f0a543dc94b7dbff2b2 Mon Sep 17 00:00:00 2001 From: Gollu Appalanaidu Date: Mon, 22 Feb 2021 19:31:19 +0100 Subject: [PATCH 20/38] hw/block/nvme: remove unnecessary endian conversion Remove an unnecessary le_to_cpu conversion in Identify. Signed-off-by: Gollu Appalanaidu Signed-off-by: Klaus Jensen Reviewed-by: Minwoo Im --- hw/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 23e1c91ed2..64340a00df 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -3422,7 +3422,7 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req) { NvmeIdentify *c = (NvmeIdentify *)&req->cmd; - switch (le32_to_cpu(c->cns)) { + switch (c->cns) { case NVME_ID_CNS_NS: /* fall through */ case NVME_ID_CNS_NS_PRESENT: From 49f0eba8b2bdc0c9aeaac620b978da7cb0fa947f Mon Sep 17 00:00:00 2001 From: Gollu Appalanaidu Date: Mon, 22 Feb 2021 19:32:21 +0100 Subject: [PATCH 21/38] hw/block/nvme: add identify trace event Add a trace event for the Identify command. Signed-off-by: Gollu Appalanaidu Signed-off-by: Klaus Jensen Reviewed-by: Minwoo Im --- hw/block/nvme.c | 3 +++ hw/block/trace-events | 1 + 2 files changed, 4 insertions(+) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 64340a00df..f5538fd00f 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -3422,6 +3422,9 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req) { NvmeIdentify *c = (NvmeIdentify *)&req->cmd; + trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid), + c->csi); + switch (c->cns) { case NVME_ID_CNS_NS: /* fall through */ diff --git a/hw/block/trace-events b/hw/block/trace-events index 25ba51ea54..c165ee2a97 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -61,6 +61,7 @@ pci_nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, pci_nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d" pci_nvme_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16"" pci_nvme_del_cq(uint16_t cqid) "deleted completion queue, cqid=%"PRIu16"" +pci_nvme_identify(uint16_t cid, uint8_t cns, uint16_t ctrlid, uint8_t csi) "cid %"PRIu16" cns 0x%"PRIx8" ctrlid %"PRIu16" csi 0x%"PRIx8"" pci_nvme_identify_ctrl(void) "identify controller" pci_nvme_identify_ctrl_csi(uint8_t csi) "identify controller, csi=0x%"PRIx8"" pci_nvme_identify_ns(uint32_t ns) "nsid %"PRIu32"" From f4f872b532a53da7bc734cdb7cb166ec22d617d1 Mon Sep 17 00:00:00 2001 From: Gollu Appalanaidu Date: Mon, 22 Feb 2021 19:36:09 +0100 Subject: [PATCH 22/38] hw/block/nvme: fix potential compilation error assert may be compiled to a noop and we could end up returning an uninitialized status. Fix this by always returning Internal Device Error as a fallback. Note that, as pointed out by Philippe, per commit 262a69f4282 ("osdep.h: Prohibit disabling assert() in supported builds") this shouldn't be possible. But clean it up so we don't worry about it again. Signed-off-by: Gollu Appalanaidu [k.jensen: split commit] Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index f5538fd00f..de3d0ca51b 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1246,8 +1246,6 @@ static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone, static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone) { - uint16_t status; - switch (nvme_get_zone_state(zone)) { case NVME_ZONE_STATE_EMPTY: case NVME_ZONE_STATE_IMPLICITLY_OPEN: @@ -1255,16 +1253,14 @@ static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone) case NVME_ZONE_STATE_FULL: case NVME_ZONE_STATE_CLOSED: case NVME_ZONE_STATE_READ_ONLY: - status = NVME_SUCCESS; - break; + return NVME_SUCCESS; case NVME_ZONE_STATE_OFFLINE: - status = NVME_ZONE_OFFLINE; - break; + return NVME_ZONE_OFFLINE; default: assert(false); } - return status; + return NVME_INTERNAL_DEV_ERROR; } static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba, From 57331f9355431d86636580edf4847e299c4b3ad7 Mon Sep 17 00:00:00 2001 From: Gollu Appalanaidu Date: Mon, 22 Feb 2021 19:38:46 +0100 Subject: [PATCH 23/38] hw/block/nvme: add trace event for zone read check Add a trace event for the offline zone condition when checking zone read. Signed-off-by: Gollu Appalanaidu [k.jensen: split commit] Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index de3d0ca51b..b81c4c3705 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1255,6 +1255,7 @@ static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone) case NVME_ZONE_STATE_READ_ONLY: return NVME_SUCCESS; case NVME_ZONE_STATE_OFFLINE: + trace_pci_nvme_err_zone_is_offline(zone->d.zslba); return NVME_ZONE_OFFLINE; default: assert(false); From 67ce28a1fdcf73e2c026dbc43bb8fb6dc9a56aed Mon Sep 17 00:00:00 2001 From: Gollu Appalanaidu Date: Sun, 21 Feb 2021 19:39:36 +0100 Subject: [PATCH 24/38] hw/block/nvme: report non-mdts command size limit for dsm Dataset Management is not subject to MDTS, but exceeded a certain size per range causes internal looping. Report this limit (DMRSL) in the NVM command set specific identify controller data structure. Signed-off-by: Gollu Appalanaidu Signed-off-by: Klaus Jensen Reviewed-by: Keith Busch --- hw/block/nvme.c | 27 +++++++++++++++++++-------- hw/block/nvme.h | 2 ++ hw/block/trace-events | 1 + include/block/nvme.h | 11 +++++++++++ 4 files changed, 33 insertions(+), 8 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index b81c4c3705..30aef5b09e 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1789,6 +1789,10 @@ static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req) trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba, nlb); + if (nlb > n->dmrsl) { + trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl); + } + offset = nvme_l2b(ns, slba); len = nvme_l2b(ns, nlb); @@ -3208,20 +3212,24 @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req) static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req) { NvmeIdentify *c = (NvmeIdentify *)&req->cmd; - NvmeIdCtrlZoned id = {}; + uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {}; trace_pci_nvme_identify_ctrl_csi(c->csi); - if (c->csi == NVME_CSI_NVM) { - return nvme_rpt_empty_id_struct(n, req); - } else if (c->csi == NVME_CSI_ZONED) { - id.zasl = n->params.zasl; + switch (c->csi) { + case NVME_CSI_NVM: + ((NvmeIdCtrlNvm *)&id)->dmrsl = cpu_to_le32(n->dmrsl); + break; - return nvme_dma(n, (uint8_t *)&id, sizeof(id), - DMA_DIRECTION_FROM_DEVICE, req); + case NVME_CSI_ZONED: + ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl; + break; + + default: + return NVME_INVALID_FIELD | NVME_DNR; } - return NVME_INVALID_FIELD | NVME_DNR; + return nvme_dma(n, id, sizeof(id), DMA_DIRECTION_FROM_DEVICE, req); } static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req) @@ -4655,6 +4663,9 @@ int nvme_register_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) n->namespaces[nsid - 1] = ns; + n->dmrsl = MIN_NON_ZERO(n->dmrsl, + BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1)); + return 0; } diff --git a/hw/block/nvme.h b/hw/block/nvme.h index f45ace0cff..294fac1def 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -171,6 +171,8 @@ typedef struct NvmeCtrl { QTAILQ_HEAD(, NvmeAsyncEvent) aer_queue; int aer_queued; + uint32_t dmrsl; + NvmeSubsystem *subsys; NvmeNamespace namespace; diff --git a/hw/block/trace-events b/hw/block/trace-events index c165ee2a97..8deeacc8c3 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -51,6 +51,7 @@ pci_nvme_copy_cb(uint16_t cid) "cid %"PRIu16"" pci_nvme_block_status(int64_t offset, int64_t bytes, int64_t pnum, int ret, bool zeroed) "offset %"PRId64" bytes %"PRId64" pnum %"PRId64" ret 0x%x zeroed %d" pci_nvme_dsm(uint16_t cid, uint32_t nsid, uint32_t nr, uint32_t attr) "cid %"PRIu16" nsid %"PRIu32" nr %"PRIu32" attr 0x%"PRIx32"" pci_nvme_dsm_deallocate(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32"" +pci_nvme_dsm_single_range_limit_exceeded(uint32_t nlb, uint32_t dmrsl) "nlb %"PRIu32" dmrsl %"PRIu32"" pci_nvme_compare(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba 0x%"PRIx64" nlb %"PRIu32"" pci_nvme_compare_cb(uint16_t cid) "cid %"PRIu16"" pci_nvme_aio_discard_cb(uint16_t cid) "cid %"PRIu16"" diff --git a/include/block/nvme.h b/include/block/nvme.h index b23f3ae227..16d8c4c90f 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -1041,6 +1041,16 @@ typedef struct NvmeIdCtrlZoned { uint8_t rsvd1[4095]; } NvmeIdCtrlZoned; +typedef struct NvmeIdCtrlNvm { + uint8_t vsl; + uint8_t wzsl; + uint8_t wusl; + uint8_t dmrl; + uint32_t dmrsl; + uint64_t dmsl; + uint8_t rsvd16[4080]; +} NvmeIdCtrlNvm; + enum NvmeIdCtrlOacs { NVME_OACS_SECURITY = 1 << 0, NVME_OACS_FORMAT = 1 << 1, @@ -1396,6 +1406,7 @@ static inline void _nvme_check_size(void) QEMU_BUILD_BUG_ON(sizeof(NvmeEffectsLog) != 4096); QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrl) != 4096); QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrlZoned) != 4096); + QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrlNvm) != 4096); QEMU_BUILD_BUG_ON(sizeof(NvmeLBAF) != 4); QEMU_BUILD_BUG_ON(sizeof(NvmeLBAFE) != 16); QEMU_BUILD_BUG_ON(sizeof(NvmeIdNs) != 4096); From ba7b81e769c3d65dc18d1d31b8c9c5b2b0a65cdd Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Tue, 2 Feb 2021 14:38:57 +0100 Subject: [PATCH 25/38] hw/block/nvme: remove redundant len member in compare context The 'len' member of the nvme_compare_ctx struct is redundant since the same information is available in the 'iov' member. Signed-off-by: Klaus Jensen Reviewed-by: Minwoo Im Reviewed-by: Keith Busch --- hw/block/nvme.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 30aef5b09e..fcdd6b7cb9 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1703,7 +1703,6 @@ static void nvme_aio_copy_in_cb(void *opaque, int ret) struct nvme_compare_ctx { QEMUIOVector iov; uint8_t *bounce; - size_t len; }; static void nvme_compare_cb(void *opaque, int ret) @@ -1724,16 +1723,16 @@ static void nvme_compare_cb(void *opaque, int ret) goto out; } - buf = g_malloc(ctx->len); + buf = g_malloc(ctx->iov.size); - status = nvme_dma(nvme_ctrl(req), buf, ctx->len, DMA_DIRECTION_TO_DEVICE, - req); + status = nvme_dma(nvme_ctrl(req), buf, ctx->iov.size, + DMA_DIRECTION_TO_DEVICE, req); if (status) { req->status = status; goto out; } - if (memcmp(buf, ctx->bounce, ctx->len)) { + if (memcmp(buf, ctx->bounce, ctx->iov.size)) { req->status = NVME_CMP_FAILURE; } @@ -1974,7 +1973,6 @@ static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req) ctx = g_new(struct nvme_compare_ctx, 1); ctx->bounce = bounce; - ctx->len = len; req->opaque = ctx; From d90ba23a846f8c9cd8d238e8391e6be5881cddb4 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Thu, 4 Feb 2021 21:58:51 +0100 Subject: [PATCH 26/38] hw/block/nvme: remove block accounting for write zeroes A Write Zeroes commands should not be counted in either the 'Data Units Written' or in 'Host Write Commands' SMART/Health Information Log page. Signed-off-by: Klaus Jensen Reviewed-by: Minwoo Im Reviewed-by: Keith Busch --- hw/block/nvme.c | 1 - 1 file changed, 1 deletion(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index fcdd6b7cb9..88e8008985 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -2181,7 +2181,6 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append, nvme_rw_cb, req); } } else { - block_acct_start(blk_get_stats(blk), &req->acct, 0, BLOCK_ACCT_WRITE); req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size, BDRV_REQ_MAY_UNMAP, nvme_rw_cb, req); From 569dbe19c415865a3b2a1ca806f780d1bd5da2db Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Sat, 6 Feb 2021 22:59:26 +0100 Subject: [PATCH 27/38] hw/block/nvme: fix strerror printing Fix missing sign inversion. Signed-off-by: Klaus Jensen Reviewed-by: Minwoo Im Reviewed-by: Keith Busch --- hw/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 88e8008985..8f27fa7450 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1164,7 +1164,7 @@ static void nvme_aio_err(NvmeRequest *req, int ret) break; } - trace_pci_nvme_err_aio(nvme_cid(req), strerror(ret), status); + trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), status); error_setg_errno(&local_err, -ret, "aio failed"); error_report_err(local_err); From f80a1c331ac2fe7428ddd76b1f059642d6a91338 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Sun, 7 Feb 2021 21:06:01 +0100 Subject: [PATCH 28/38] hw/block/nvme: try to deal with the iov/qsg duality Introduce NvmeSg and try to deal with that pesky qsg/iov duality that haunts all the memory-related functions. Signed-off-by: Klaus Jensen Reviewed-by: Keith Busch --- hw/block/nvme.c | 191 ++++++++++++++++++++++++++---------------------- hw/block/nvme.h | 17 ++++- 2 files changed, 117 insertions(+), 91 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 8f27fa7450..a1e28c6570 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -441,15 +441,31 @@ static void nvme_req_clear(NvmeRequest *req) req->status = NVME_SUCCESS; } -static void nvme_req_exit(NvmeRequest *req) +static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma) { - if (req->qsg.sg) { - qemu_sglist_destroy(&req->qsg); + if (dma) { + pci_dma_sglist_init(&sg->qsg, &n->parent_obj, 0); + sg->flags = NVME_SG_DMA; + } else { + qemu_iovec_init(&sg->iov, 0); } - if (req->iov.iov) { - qemu_iovec_destroy(&req->iov); + sg->flags |= NVME_SG_ALLOC; +} + +static inline void nvme_sg_unmap(NvmeSg *sg) +{ + if (!(sg->flags & NVME_SG_ALLOC)) { + return; } + + if (sg->flags & NVME_SG_DMA) { + qemu_sglist_destroy(&sg->qsg); + } else { + qemu_iovec_destroy(&sg->iov); + } + + memset(sg, 0x0, sizeof(*sg)); } static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr, @@ -486,8 +502,7 @@ static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr, return NVME_SUCCESS; } -static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, - hwaddr addr, size_t len) +static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len) { bool cmb = false, pmr = false; @@ -504,38 +519,31 @@ static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, } if (cmb || pmr) { - if (qsg && qsg->sg) { + if (sg->flags & NVME_SG_DMA) { return NVME_INVALID_USE_OF_CMB | NVME_DNR; } - assert(iov); - - if (!iov->iov) { - qemu_iovec_init(iov, 1); - } - if (cmb) { - return nvme_map_addr_cmb(n, iov, addr, len); + return nvme_map_addr_cmb(n, &sg->iov, addr, len); } else { - return nvme_map_addr_pmr(n, iov, addr, len); + return nvme_map_addr_pmr(n, &sg->iov, addr, len); } } - if (iov && iov->iov) { + if (!(sg->flags & NVME_SG_DMA)) { return NVME_INVALID_USE_OF_CMB | NVME_DNR; } - assert(qsg); - - if (!qsg->sg) { - pci_dma_sglist_init(qsg, &n->parent_obj, 1); - } - - qemu_sglist_add(qsg, addr, len); + qemu_sglist_add(&sg->qsg, addr, len); return NVME_SUCCESS; } +static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr) +{ + return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr)); +} + static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, uint32_t len, NvmeRequest *req) { @@ -545,20 +553,13 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, uint16_t status; int ret; - QEMUSGList *qsg = &req->qsg; - QEMUIOVector *iov = &req->iov; - trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps); - if (nvme_addr_is_cmb(n, prp1) || (nvme_addr_is_pmr(n, prp1))) { - qemu_iovec_init(iov, num_prps); - } else { - pci_dma_sglist_init(qsg, &n->parent_obj, num_prps); - } + nvme_sg_init(n, &req->sg, nvme_addr_is_dma(n, prp1)); - status = nvme_map_addr(n, qsg, iov, prp1, trans_len); + status = nvme_map_addr(n, &req->sg, prp1, trans_len); if (status) { - return status; + goto unmap; } len -= trans_len; @@ -573,7 +574,8 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans); if (ret) { trace_pci_nvme_err_addr_read(prp2); - return NVME_DATA_TRAS_ERROR; + status = NVME_DATA_TRAS_ERROR; + goto unmap; } while (len != 0) { uint64_t prp_ent = le64_to_cpu(prp_list[i]); @@ -581,7 +583,8 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, if (i == n->max_prp_ents - 1 && len > n->page_size) { if (unlikely(prp_ent & (n->page_size - 1))) { trace_pci_nvme_err_invalid_prplist_ent(prp_ent); - return NVME_INVALID_PRP_OFFSET | NVME_DNR; + status = NVME_INVALID_PRP_OFFSET | NVME_DNR; + goto unmap; } i = 0; @@ -591,20 +594,22 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, prp_trans); if (ret) { trace_pci_nvme_err_addr_read(prp_ent); - return NVME_DATA_TRAS_ERROR; + status = NVME_DATA_TRAS_ERROR; + goto unmap; } prp_ent = le64_to_cpu(prp_list[i]); } if (unlikely(prp_ent & (n->page_size - 1))) { trace_pci_nvme_err_invalid_prplist_ent(prp_ent); - return NVME_INVALID_PRP_OFFSET | NVME_DNR; + status = NVME_INVALID_PRP_OFFSET | NVME_DNR; + goto unmap; } trans_len = MIN(len, n->page_size); - status = nvme_map_addr(n, qsg, iov, prp_ent, trans_len); + status = nvme_map_addr(n, &req->sg, prp_ent, trans_len); if (status) { - return status; + goto unmap; } len -= trans_len; @@ -613,24 +618,28 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, } else { if (unlikely(prp2 & (n->page_size - 1))) { trace_pci_nvme_err_invalid_prp2_align(prp2); - return NVME_INVALID_PRP_OFFSET | NVME_DNR; + status = NVME_INVALID_PRP_OFFSET | NVME_DNR; + goto unmap; } - status = nvme_map_addr(n, qsg, iov, prp2, len); + status = nvme_map_addr(n, &req->sg, prp2, len); if (status) { - return status; + goto unmap; } } } return NVME_SUCCESS; + +unmap: + nvme_sg_unmap(&req->sg); + return status; } /* * Map 'nsgld' data descriptors from 'segment'. The function will subtract the * number of bytes mapped in len. */ -static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList *qsg, - QEMUIOVector *iov, +static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor *segment, uint64_t nsgld, size_t *len, NvmeRequest *req) { @@ -688,7 +697,7 @@ static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList *qsg, return NVME_DATA_SGL_LEN_INVALID | NVME_DNR; } - status = nvme_map_addr(n, qsg, iov, addr, trans_len); + status = nvme_map_addr(n, sg, addr, trans_len); if (status) { return status; } @@ -700,9 +709,8 @@ next: return NVME_SUCCESS; } -static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, - NvmeSglDescriptor sgl, size_t len, - NvmeRequest *req) +static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl, + size_t len, NvmeRequest *req) { /* * Read the segment in chunks of 256 descriptors (one 4k page) to avoid @@ -725,12 +733,14 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, trace_pci_nvme_map_sgl(nvme_cid(req), NVME_SGL_TYPE(sgl.type), len); + nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr)); + /* * If the entire transfer can be described with a single data block it can * be mapped directly. */ if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) { - status = nvme_map_sgl_data(n, qsg, iov, sgld, 1, &len, req); + status = nvme_map_sgl_data(n, sg, sgld, 1, &len, req); if (status) { goto unmap; } @@ -768,7 +778,7 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, goto unmap; } - status = nvme_map_sgl_data(n, qsg, iov, segment, SEG_CHUNK_SIZE, + status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE, &len, req); if (status) { goto unmap; @@ -795,7 +805,7 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, switch (NVME_SGL_TYPE(last_sgld->type)) { case NVME_SGL_DESCR_TYPE_DATA_BLOCK: case NVME_SGL_DESCR_TYPE_BIT_BUCKET: - status = nvme_map_sgl_data(n, qsg, iov, segment, nsgld, &len, req); + status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, req); if (status) { goto unmap; } @@ -822,7 +832,7 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, * Do not map the last descriptor; it will be a Segment or Last Segment * descriptor and is handled by the next iteration. */ - status = nvme_map_sgl_data(n, qsg, iov, segment, nsgld - 1, &len, req); + status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, req); if (status) { goto unmap; } @@ -838,14 +848,7 @@ out: return NVME_SUCCESS; unmap: - if (iov->iov) { - qemu_iovec_destroy(iov); - } - - if (qsg->sg) { - qemu_sglist_destroy(qsg); - } - + nvme_sg_unmap(sg); return status; } @@ -866,8 +869,7 @@ static uint16_t nvme_map_dptr(NvmeCtrl *n, size_t len, NvmeRequest *req) return NVME_INVALID_FIELD | NVME_DNR; } - return nvme_map_sgl(n, &req->qsg, &req->iov, req->cmd.dptr.sgl, len, - req); + return nvme_map_sgl(n, &req->sg, req->cmd.dptr.sgl, len, req); default: return NVME_INVALID_FIELD; } @@ -883,16 +885,13 @@ static uint16_t nvme_dma(NvmeCtrl *n, uint8_t *ptr, uint32_t len, return status; } - /* assert that only one of qsg and iov carries data */ - assert((req->qsg.nsg > 0) != (req->iov.niov > 0)); - - if (req->qsg.nsg > 0) { + if (req->sg.flags & NVME_SG_DMA) { uint64_t residual; if (dir == DMA_DIRECTION_TO_DEVICE) { - residual = dma_buf_write(ptr, len, &req->qsg); + residual = dma_buf_write(ptr, len, &req->sg.qsg); } else { - residual = dma_buf_read(ptr, len, &req->qsg); + residual = dma_buf_read(ptr, len, &req->sg.qsg); } if (unlikely(residual)) { @@ -903,9 +902,9 @@ static uint16_t nvme_dma(NvmeCtrl *n, uint8_t *ptr, uint32_t len, size_t bytes; if (dir == DMA_DIRECTION_TO_DEVICE) { - bytes = qemu_iovec_to_buf(&req->iov, 0, ptr, len); + bytes = qemu_iovec_to_buf(&req->sg.iov, 0, ptr, len); } else { - bytes = qemu_iovec_from_buf(&req->iov, 0, ptr, len); + bytes = qemu_iovec_from_buf(&req->sg.iov, 0, ptr, len); } if (unlikely(bytes != len)) { @@ -917,6 +916,32 @@ static uint16_t nvme_dma(NvmeCtrl *n, uint8_t *ptr, uint32_t len, return status; } +static inline void nvme_blk_read(BlockBackend *blk, int64_t offset, + BlockCompletionFunc *cb, NvmeRequest *req) +{ + assert(req->sg.flags & NVME_SG_ALLOC); + + if (req->sg.flags & NVME_SG_DMA) { + req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE, + cb, req); + } else { + req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req); + } +} + +static inline void nvme_blk_write(BlockBackend *blk, int64_t offset, + BlockCompletionFunc *cb, NvmeRequest *req) +{ + assert(req->sg.flags & NVME_SG_ALLOC); + + if (req->sg.flags & NVME_SG_DMA) { + req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE, + cb, req); + } else { + req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req); + } +} + static void nvme_post_cqes(void *opaque) { NvmeCQueue *cq = opaque; @@ -947,7 +972,7 @@ static void nvme_post_cqes(void *opaque) } QTAILQ_REMOVE(&cq->req_list, req, entry); nvme_inc_cq_tail(cq); - nvme_req_exit(req); + nvme_sg_unmap(&req->sg); QTAILQ_INSERT_TAIL(&sq->req_list, req, entry); } if (cq->tail != cq->head) { @@ -1644,14 +1669,14 @@ static void nvme_copy_in_complete(NvmeRequest *req) zone->w_ptr += ctx->nlb; } - qemu_iovec_init(&req->iov, 1); - qemu_iovec_add(&req->iov, ctx->bounce, nvme_l2b(ns, ctx->nlb)); + qemu_iovec_init(&req->sg.iov, 1); + qemu_iovec_add(&req->sg.iov, ctx->bounce, nvme_l2b(ns, ctx->nlb)); block_acct_start(blk_get_stats(ns->blkconf.blk), &req->acct, 0, BLOCK_ACCT_WRITE); req->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, sdlba), - &req->iov, 0, nvme_copy_cb, req); + &req->sg.iov, 0, nvme_copy_cb, req); return; @@ -2087,13 +2112,7 @@ static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req) block_acct_start(blk_get_stats(blk), &req->acct, data_size, BLOCK_ACCT_READ); - if (req->qsg.sg) { - req->aiocb = dma_blk_read(blk, &req->qsg, data_offset, - BDRV_SECTOR_SIZE, nvme_rw_cb, req); - } else { - req->aiocb = blk_aio_preadv(blk, data_offset, &req->iov, 0, - nvme_rw_cb, req); - } + nvme_blk_read(blk, data_offset, nvme_rw_cb, req); return NVME_NO_COMPLETE; invalid: @@ -2173,13 +2192,7 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append, block_acct_start(blk_get_stats(blk), &req->acct, data_size, BLOCK_ACCT_WRITE); - if (req->qsg.sg) { - req->aiocb = dma_blk_write(blk, &req->qsg, data_offset, - BDRV_SECTOR_SIZE, nvme_rw_cb, req); - } else { - req->aiocb = blk_aio_pwritev(blk, data_offset, &req->iov, 0, - nvme_rw_cb, req); - } + nvme_blk_write(blk, data_offset, nvme_rw_cb, req); } else { req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size, BDRV_REQ_MAY_UNMAP, nvme_rw_cb, diff --git a/hw/block/nvme.h b/hw/block/nvme.h index 294fac1def..96afefa8c9 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -29,6 +29,20 @@ typedef struct NvmeAsyncEvent { NvmeAerResult result; } NvmeAsyncEvent; +enum { + NVME_SG_ALLOC = 1 << 0, + NVME_SG_DMA = 1 << 1, +}; + +typedef struct NvmeSg { + int flags; + + union { + QEMUSGList qsg; + QEMUIOVector iov; + }; +} NvmeSg; + typedef struct NvmeRequest { struct NvmeSQueue *sq; struct NvmeNamespace *ns; @@ -38,8 +52,7 @@ typedef struct NvmeRequest { NvmeCqe cqe; NvmeCmd cmd; BlockAcctCookie acct; - QEMUSGList qsg; - QEMUIOVector iov; + NvmeSg sg; QTAILQ_ENTRY(NvmeRequest)entry; } NvmeRequest; From 073d12d99871d0d500f44bd49cb0c45df14cf2c3 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Sun, 7 Feb 2021 21:21:45 +0100 Subject: [PATCH 29/38] hw/block/nvme: remove the req dependency in map functions The PRP and SGL mapping functions does not have any particular need for the entire NvmeRequest as a parameter. Clean it up. Signed-off-by: Klaus Jensen Reviewed-by: Keith Busch --- hw/block/nvme.c | 61 ++++++++++++++++++++++--------------------- hw/block/trace-events | 4 +-- 2 files changed, 33 insertions(+), 32 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index a1e28c6570..59942f8811 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -544,8 +544,8 @@ static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr) return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr)); } -static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, - uint32_t len, NvmeRequest *req) +static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1, + uint64_t prp2, uint32_t len) { hwaddr trans_len = n->page_size - (prp1 % n->page_size); trans_len = MIN(len, trans_len); @@ -555,9 +555,9 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps); - nvme_sg_init(n, &req->sg, nvme_addr_is_dma(n, prp1)); + nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1)); - status = nvme_map_addr(n, &req->sg, prp1, trans_len); + status = nvme_map_addr(n, sg, prp1, trans_len); if (status) { goto unmap; } @@ -607,7 +607,7 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, } trans_len = MIN(len, n->page_size); - status = nvme_map_addr(n, &req->sg, prp_ent, trans_len); + status = nvme_map_addr(n, sg, prp_ent, trans_len); if (status) { goto unmap; } @@ -621,7 +621,7 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, status = NVME_INVALID_PRP_OFFSET | NVME_DNR; goto unmap; } - status = nvme_map_addr(n, &req->sg, prp2, len); + status = nvme_map_addr(n, sg, prp2, len); if (status) { goto unmap; } @@ -631,7 +631,7 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, return NVME_SUCCESS; unmap: - nvme_sg_unmap(&req->sg); + nvme_sg_unmap(sg); return status; } @@ -641,7 +641,7 @@ unmap: */ static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor *segment, uint64_t nsgld, - size_t *len, NvmeRequest *req) + size_t *len, NvmeCmd *cmd) { dma_addr_t addr, trans_len; uint32_t dlen; @@ -652,7 +652,7 @@ static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg, switch (type) { case NVME_SGL_DESCR_TYPE_BIT_BUCKET: - if (req->cmd.opcode == NVME_CMD_WRITE) { + if (cmd->opcode == NVME_CMD_WRITE) { continue; } case NVME_SGL_DESCR_TYPE_DATA_BLOCK: @@ -681,7 +681,7 @@ static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg, break; } - trace_pci_nvme_err_invalid_sgl_excess_length(nvme_cid(req)); + trace_pci_nvme_err_invalid_sgl_excess_length(dlen); return NVME_DATA_SGL_LEN_INVALID | NVME_DNR; } @@ -710,7 +710,7 @@ next: } static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl, - size_t len, NvmeRequest *req) + size_t len, NvmeCmd *cmd) { /* * Read the segment in chunks of 256 descriptors (one 4k page) to avoid @@ -731,7 +731,7 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl, sgld = &sgl; addr = le64_to_cpu(sgl.addr); - trace_pci_nvme_map_sgl(nvme_cid(req), NVME_SGL_TYPE(sgl.type), len); + trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len); nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr)); @@ -740,7 +740,7 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl, * be mapped directly. */ if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) { - status = nvme_map_sgl_data(n, sg, sgld, 1, &len, req); + status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd); if (status) { goto unmap; } @@ -779,7 +779,7 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl, } status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE, - &len, req); + &len, cmd); if (status) { goto unmap; } @@ -805,7 +805,7 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl, switch (NVME_SGL_TYPE(last_sgld->type)) { case NVME_SGL_DESCR_TYPE_DATA_BLOCK: case NVME_SGL_DESCR_TYPE_BIT_BUCKET: - status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, req); + status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd); if (status) { goto unmap; } @@ -832,7 +832,7 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl, * Do not map the last descriptor; it will be a Segment or Last Segment * descriptor and is handled by the next iteration. */ - status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, req); + status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd); if (status) { goto unmap; } @@ -852,24 +852,20 @@ unmap: return status; } -static uint16_t nvme_map_dptr(NvmeCtrl *n, size_t len, NvmeRequest *req) +static uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len, + NvmeCmd *cmd) { uint64_t prp1, prp2; - switch (NVME_CMD_FLAGS_PSDT(req->cmd.flags)) { + switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) { case NVME_PSDT_PRP: - prp1 = le64_to_cpu(req->cmd.dptr.prp1); - prp2 = le64_to_cpu(req->cmd.dptr.prp2); + prp1 = le64_to_cpu(cmd->dptr.prp1); + prp2 = le64_to_cpu(cmd->dptr.prp2); - return nvme_map_prp(n, prp1, prp2, len, req); + return nvme_map_prp(n, sg, prp1, prp2, len); case NVME_PSDT_SGL_MPTR_CONTIGUOUS: case NVME_PSDT_SGL_MPTR_SGL: - /* SGLs shall not be used for Admin commands in NVMe over PCIe */ - if (!req->sq->sqid) { - return NVME_INVALID_FIELD | NVME_DNR; - } - - return nvme_map_sgl(n, &req->sg, req->cmd.dptr.sgl, len, req); + return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd); default: return NVME_INVALID_FIELD; } @@ -880,7 +876,7 @@ static uint16_t nvme_dma(NvmeCtrl *n, uint8_t *ptr, uint32_t len, { uint16_t status = NVME_SUCCESS; - status = nvme_map_dptr(n, len, req); + status = nvme_map_dptr(n, &req->sg, len, &req->cmd); if (status) { return status; } @@ -2096,7 +2092,7 @@ static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req) } } - status = nvme_map_dptr(n, data_size, req); + status = nvme_map_dptr(n, &req->sg, data_size, &req->cmd); if (status) { goto invalid; } @@ -2185,7 +2181,7 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append, data_offset = nvme_l2b(ns, slba); if (!wrz) { - status = nvme_map_dptr(n, data_size, req); + status = nvme_map_dptr(n, &req->sg, data_size, &req->cmd); if (status) { goto invalid; } @@ -3867,6 +3863,11 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req) return NVME_INVALID_OPCODE | NVME_DNR; } + /* SGLs shall not be used for Admin commands in NVMe over PCIe */ + if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) { + return NVME_INVALID_FIELD | NVME_DNR; + } + switch (req->cmd.opcode) { case NVME_ADM_CMD_DELETE_SQ: return nvme_del_sq(n, req); diff --git a/hw/block/trace-events b/hw/block/trace-events index 8deeacc8c3..60a076cea5 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -37,7 +37,7 @@ pci_nvme_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2 pci_nvme_map_addr(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64"" pci_nvme_map_addr_cmb(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64"" pci_nvme_map_prp(uint64_t trans_len, uint32_t len, uint64_t prp1, uint64_t prp2, int num_prps) "trans_len %"PRIu64" len %"PRIu32" prp1 0x%"PRIx64" prp2 0x%"PRIx64" num_prps %d" -pci_nvme_map_sgl(uint16_t cid, uint8_t typ, uint64_t len) "cid %"PRIu16" type 0x%"PRIx8" len %"PRIu64"" +pci_nvme_map_sgl(uint8_t typ, uint64_t len) "type 0x%"PRIx8" len %"PRIu64"" pci_nvme_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'" pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'" pci_nvme_flush(uint16_t cid, uint32_t nsid) "cid %"PRIu16" nsid %"PRIu32"" @@ -126,7 +126,7 @@ pci_nvme_err_aio(uint16_t cid, const char *errname, uint16_t status) "cid %"PRIu pci_nvme_err_copy_invalid_format(uint8_t format) "format 0x%"PRIx8"" pci_nvme_err_invalid_sgld(uint16_t cid, uint8_t typ) "cid %"PRIu16" type 0x%"PRIx8"" pci_nvme_err_invalid_num_sgld(uint16_t cid, uint8_t typ) "cid %"PRIu16" type 0x%"PRIx8"" -pci_nvme_err_invalid_sgl_excess_length(uint16_t cid) "cid %"PRIu16"" +pci_nvme_err_invalid_sgl_excess_length(uint32_t residual) "residual %"PRIu32"" pci_nvme_err_invalid_dma(void) "PRP/SGL is too small for transfer size" pci_nvme_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is not page aligned: 0x%"PRIx64"" pci_nvme_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PRIx64"" From 81d07f4ff51a2b5249f940975acfb9b0d787d593 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Tue, 15 Dec 2020 19:18:25 +0100 Subject: [PATCH 30/38] hw/block/nvme: refactor nvme_dma The nvme_dma function doesn't just do DMA (QEMUSGList-based) memory transfers; it also handles QEMUIOVector copies. Introduce the NvmeTxDirection enum and rename to nvme_tx. Remove mapping of PRPs/SGLs from nvme_tx and instead assert that they have been mapped previously. This allows more fine-grained use in subsequent patches. Add new (better named) helpers, nvme_{c2h,h2c}, that does both PRP/SGL mapping and transfer. Signed-off-by: Klaus Jensen Reviewed-by: Keith Busch --- hw/block/nvme.c | 138 ++++++++++++++++++++++++++---------------------- 1 file changed, 76 insertions(+), 62 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 59942f8811..bfab80c457 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -871,45 +871,71 @@ static uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len, } } -static uint16_t nvme_dma(NvmeCtrl *n, uint8_t *ptr, uint32_t len, - DMADirection dir, NvmeRequest *req) +typedef enum NvmeTxDirection { + NVME_TX_DIRECTION_TO_DEVICE = 0, + NVME_TX_DIRECTION_FROM_DEVICE = 1, +} NvmeTxDirection; + +static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr, uint32_t len, + NvmeTxDirection dir) { - uint16_t status = NVME_SUCCESS; + assert(sg->flags & NVME_SG_ALLOC); + + if (sg->flags & NVME_SG_DMA) { + uint64_t residual; + + if (dir == NVME_TX_DIRECTION_TO_DEVICE) { + residual = dma_buf_write(ptr, len, &sg->qsg); + } else { + residual = dma_buf_read(ptr, len, &sg->qsg); + } + + if (unlikely(residual)) { + trace_pci_nvme_err_invalid_dma(); + return NVME_INVALID_FIELD | NVME_DNR; + } + } else { + size_t bytes; + + if (dir == NVME_TX_DIRECTION_TO_DEVICE) { + bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len); + } else { + bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len); + } + + if (unlikely(bytes != len)) { + trace_pci_nvme_err_invalid_dma(); + return NVME_INVALID_FIELD | NVME_DNR; + } + } + + return NVME_SUCCESS; +} + +static inline uint16_t nvme_c2h(NvmeCtrl *n, uint8_t *ptr, uint32_t len, + NvmeRequest *req) +{ + uint16_t status; status = nvme_map_dptr(n, &req->sg, len, &req->cmd); if (status) { return status; } - if (req->sg.flags & NVME_SG_DMA) { - uint64_t residual; + return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE); +} - if (dir == DMA_DIRECTION_TO_DEVICE) { - residual = dma_buf_write(ptr, len, &req->sg.qsg); - } else { - residual = dma_buf_read(ptr, len, &req->sg.qsg); - } +static inline uint16_t nvme_h2c(NvmeCtrl *n, uint8_t *ptr, uint32_t len, + NvmeRequest *req) +{ + uint16_t status; - if (unlikely(residual)) { - trace_pci_nvme_err_invalid_dma(); - status = NVME_INVALID_FIELD | NVME_DNR; - } - } else { - size_t bytes; - - if (dir == DMA_DIRECTION_TO_DEVICE) { - bytes = qemu_iovec_to_buf(&req->sg.iov, 0, ptr, len); - } else { - bytes = qemu_iovec_from_buf(&req->sg.iov, 0, ptr, len); - } - - if (unlikely(bytes != len)) { - trace_pci_nvme_err_invalid_dma(); - status = NVME_INVALID_FIELD | NVME_DNR; - } + status = nvme_map_dptr(n, &req->sg, len, &req->cmd); + if (status) { + return status; } - return status; + return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE); } static inline void nvme_blk_read(BlockBackend *blk, int64_t offset, @@ -1746,8 +1772,7 @@ static void nvme_compare_cb(void *opaque, int ret) buf = g_malloc(ctx->iov.size); - status = nvme_dma(nvme_ctrl(req), buf, ctx->iov.size, - DMA_DIRECTION_TO_DEVICE, req); + status = nvme_h2c(nvme_ctrl(req), buf, ctx->iov.size, req); if (status) { req->status = status; goto out; @@ -1783,8 +1808,7 @@ static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req) NvmeDsmRange range[nr]; uintptr_t *discards = (uintptr_t *)&req->opaque; - status = nvme_dma(n, (uint8_t *)range, sizeof(range), - DMA_DIRECTION_TO_DEVICE, req); + status = nvme_h2c(n, (uint8_t *)range, sizeof(range), req); if (status) { return status; } @@ -1870,8 +1894,8 @@ static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req) range = g_new(NvmeCopySourceRange, nr); - status = nvme_dma(n, (uint8_t *)range, nr * sizeof(NvmeCopySourceRange), - DMA_DIRECTION_TO_DEVICE, req); + status = nvme_h2c(n, (uint8_t *)range, nr * sizeof(NvmeCopySourceRange), + req); if (status) { return status; } @@ -2522,8 +2546,7 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req) return NVME_INVALID_FIELD | NVME_DNR; } zd_ext = nvme_get_zd_extension(ns, zone_idx); - status = nvme_dma(n, zd_ext, ns->params.zd_extension_size, - DMA_DIRECTION_TO_DEVICE, req); + status = nvme_h2c(n, zd_ext, ns->params.zd_extension_size, req); if (status) { trace_pci_nvme_err_zd_extension_map_error(zone_idx); return status; @@ -2677,8 +2700,7 @@ static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req) } } - status = nvme_dma(n, (uint8_t *)buf, data_size, - DMA_DIRECTION_FROM_DEVICE, req); + status = nvme_c2h(n, (uint8_t *)buf, data_size, req); g_free(buf); @@ -2944,8 +2966,7 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, nvme_clear_events(n, NVME_AER_TYPE_SMART); } - return nvme_dma(n, (uint8_t *) &smart + off, trans_len, - DMA_DIRECTION_FROM_DEVICE, req); + return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req); } static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off, @@ -2963,8 +2984,7 @@ static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off, strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' '); trans_len = MIN(sizeof(fw_log) - off, buf_len); - return nvme_dma(n, (uint8_t *) &fw_log + off, trans_len, - DMA_DIRECTION_FROM_DEVICE, req); + return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req); } static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, @@ -2984,8 +3004,7 @@ static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, memset(&errlog, 0x0, sizeof(errlog)); trans_len = MIN(sizeof(errlog) - off, buf_len); - return nvme_dma(n, (uint8_t *)&errlog, trans_len, - DMA_DIRECTION_FROM_DEVICE, req); + return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req); } static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len, @@ -3025,8 +3044,7 @@ static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len, trans_len = MIN(sizeof(log) - off, buf_len); - return nvme_dma(n, ((uint8_t *)&log) + off, trans_len, - DMA_DIRECTION_FROM_DEVICE, req); + return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req); } static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req) @@ -3194,7 +3212,7 @@ static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req) { uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {}; - return nvme_dma(n, id, sizeof(id), DMA_DIRECTION_FROM_DEVICE, req); + return nvme_c2h(n, id, sizeof(id), req); } static inline bool nvme_csi_has_nvm_support(NvmeNamespace *ns) @@ -3211,8 +3229,7 @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req) { trace_pci_nvme_identify_ctrl(); - return nvme_dma(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), - DMA_DIRECTION_FROM_DEVICE, req); + return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req); } static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req) @@ -3235,7 +3252,7 @@ static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req) return NVME_INVALID_FIELD | NVME_DNR; } - return nvme_dma(n, id, sizeof(id), DMA_DIRECTION_FROM_DEVICE, req); + return nvme_c2h(n, id, sizeof(id), req); } static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req) @@ -3256,8 +3273,7 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req) } if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) { - return nvme_dma(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), - DMA_DIRECTION_FROM_DEVICE, req); + return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req); } return NVME_INVALID_CMD_SET | NVME_DNR; @@ -3283,8 +3299,8 @@ static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req) if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) { return nvme_rpt_empty_id_struct(n, req); } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) { - return nvme_dma(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned), - DMA_DIRECTION_FROM_DEVICE, req); + return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned), + req); } return NVME_INVALID_FIELD | NVME_DNR; @@ -3326,7 +3342,7 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req) } } - return nvme_dma(n, list, data_len, DMA_DIRECTION_FROM_DEVICE, req); + return nvme_c2h(n, list, data_len, req); } static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req) @@ -3366,7 +3382,7 @@ static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req) } } - return nvme_dma(n, list, data_len, DMA_DIRECTION_FROM_DEVICE, req); + return nvme_c2h(n, list, data_len, req); } static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req) @@ -3413,7 +3429,7 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req) ns_descrs->csi.hdr.nidl = NVME_NIDL_CSI; ns_descrs->csi.v = ns->csi; - return nvme_dma(n, list, sizeof(list), DMA_DIRECTION_FROM_DEVICE, req); + return nvme_c2h(n, list, sizeof(list), req); } static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req) @@ -3426,7 +3442,7 @@ static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req) NVME_SET_CSI(*list, NVME_CSI_NVM); NVME_SET_CSI(*list, NVME_CSI_ZONED); - return nvme_dma(n, list, data_len, DMA_DIRECTION_FROM_DEVICE, req); + return nvme_c2h(n, list, data_len, req); } static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req) @@ -3518,8 +3534,7 @@ static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req) { uint64_t timestamp = nvme_get_timestamp(n); - return nvme_dma(n, (uint8_t *)×tamp, sizeof(timestamp), - DMA_DIRECTION_FROM_DEVICE, req); + return nvme_c2h(n, (uint8_t *)×tamp, sizeof(timestamp), req); } static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req) @@ -3680,8 +3695,7 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req) uint16_t ret; uint64_t timestamp; - ret = nvme_dma(n, (uint8_t *)×tamp, sizeof(timestamp), - DMA_DIRECTION_TO_DEVICE, req); + ret = nvme_h2c(n, (uint8_t *)×tamp, sizeof(timestamp), req); if (ret) { return ret; } From 037953b5b299daec4d92253858de32c15dd4e9f4 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sat, 6 Feb 2021 00:30:10 +0900 Subject: [PATCH 31/38] hw/block/nvme: support namespace detach Given that now we have nvme-subsys device supported, we can manage namespace allocated, but not attached: detached. This patch introduced a parameter for nvme-ns device named 'detached'. This parameter indicates whether the given namespace device is detached from a entire NVMe subsystem('subsys' given case, shared namespace) or a controller('bus' given case, private namespace). - Allocated namespace 1) Shared ns in the subsystem 'subsys0': -device nvme-ns,id=ns1,drive=blknvme0,nsid=1,subsys=subsys0,detached=true 2) Private ns for the controller 'nvme0' of the subsystem 'subsys0': -device nvme-subsys,id=subsys0 -device nvme,serial=foo,id=nvme0,subsys=subsys0 -device nvme-ns,id=ns1,drive=blknvme0,nsid=1,bus=nvme0,detached=true 3) (Invalid case) Controller 'nvme0' has no subsystem to manage ns: -device nvme,serial=foo,id=nvme0 -device nvme-ns,id=ns1,drive=blknvme0,nsid=1,bus=nvme0,detached=true Signed-off-by: Minwoo Im Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme-ns.c | 1 + hw/block/nvme-ns.h | 1 + hw/block/nvme-subsys.h | 1 + hw/block/nvme.c | 41 +++++++++++++++++++++++++++++++++++++++-- hw/block/nvme.h | 22 ++++++++++++++++++++++ 5 files changed, 64 insertions(+), 2 deletions(-) diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index 0e87600204..eda6a0c003 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -399,6 +399,7 @@ static Property nvme_ns_props[] = { DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf), DEFINE_PROP_LINK("subsys", NvmeNamespace, subsys, TYPE_NVME_SUBSYS, NvmeSubsystem *), + DEFINE_PROP_BOOL("detached", NvmeNamespace, params.detached, false), DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0), DEFINE_PROP_UUID("uuid", NvmeNamespace, params.uuid), DEFINE_PROP_UINT16("mssrl", NvmeNamespace, params.mssrl, 128), diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h index 7af6884862..b0c00e115d 100644 --- a/hw/block/nvme-ns.h +++ b/hw/block/nvme-ns.h @@ -26,6 +26,7 @@ typedef struct NvmeZone { } NvmeZone; typedef struct NvmeNamespaceParams { + bool detached; uint32_t nsid; QemuUUID uuid; diff --git a/hw/block/nvme-subsys.h b/hw/block/nvme-subsys.h index 7a54a85120..507efcd23f 100644 --- a/hw/block/nvme-subsys.h +++ b/hw/block/nvme-subsys.h @@ -23,6 +23,7 @@ typedef struct NvmeSubsystem { uint8_t subnqn[256]; NvmeCtrl *ctrls[NVME_SUBSYS_MAX_CTRLS]; + /* Allocated namespaces for this subsystem */ NvmeNamespace *namespaces[NVME_SUBSYS_MAX_NAMESPACES]; struct { diff --git a/hw/block/nvme.c b/hw/block/nvme.c index bfab80c457..1790a66cb8 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -27,7 +27,7 @@ * subsys= * -device nvme-ns,drive=,bus=,nsid=,\ * zoned=, \ - * subsys= + * subsys=,detached= * * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By default, the @@ -91,6 +91,13 @@ * subsystem. Otherwise, `bus` must be given to attach this namespace to a * specific controller as a non-shared namespace. * + * - `detached` + * This parameter is only valid together with the `subsys` parameter. If left + * at the default value (`false/off`), the namespace will be attached to all + * controllers in the NVMe subsystem at boot-up. If set to `true/on`, the + * namespace will be be available in the subsystem not not attached to any + * controllers. + * * Setting `zoned` to true selects Zoned Command Set at the namespace. * In this case, the following namespace properties are available to configure * zoned operation: @@ -4655,6 +4662,20 @@ static void nvme_init_state(NvmeCtrl *n) n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1); } +static int nvme_attach_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) +{ + if (nvme_ns_is_attached(n, ns)) { + error_setg(errp, + "namespace %d is already attached to controller %d", + nvme_nsid(ns), n->cntlid); + return -1; + } + + nvme_ns_attach(n, ns); + + return 0; +} + int nvme_register_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) { uint32_t nsid = nvme_nsid(ns); @@ -4686,7 +4707,23 @@ int nvme_register_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) trace_pci_nvme_register_namespace(nsid); - n->namespaces[nsid - 1] = ns; + /* + * If subsys is not given, namespae is always attached to the controller + * because there's no subsystem to manage namespace allocation. + */ + if (!n->subsys) { + if (ns->params.detached) { + error_setg(errp, + "detached needs nvme-subsys specified nvme or nvme-ns"); + return -1; + } + + return nvme_attach_namespace(n, ns, errp); + } else { + if (!ns->params.detached) { + return nvme_attach_namespace(n, ns, errp); + } + } n->dmrsl = MIN_NON_ZERO(n->dmrsl, BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1)); diff --git a/hw/block/nvme.h b/hw/block/nvme.h index 96afefa8c9..cd8d406344 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -189,6 +189,10 @@ typedef struct NvmeCtrl { NvmeSubsystem *subsys; NvmeNamespace namespace; + /* + * Attached namespaces to this controller. If subsys is not given, all + * namespaces in this list will always be attached. + */ NvmeNamespace *namespaces[NVME_MAX_NAMESPACES]; NvmeSQueue **sq; NvmeCQueue **cq; @@ -207,6 +211,24 @@ static inline NvmeNamespace *nvme_ns(NvmeCtrl *n, uint32_t nsid) return n->namespaces[nsid - 1]; } +static inline bool nvme_ns_is_attached(NvmeCtrl *n, NvmeNamespace *ns) +{ + int nsid; + + for (nsid = 1; nsid <= n->num_namespaces; nsid++) { + if (nvme_ns(n, nsid) == ns) { + return true; + } + } + + return false; +} + +static inline void nvme_ns_attach(NvmeCtrl *n, NvmeNamespace *ns) +{ + n->namespaces[nvme_nsid(ns) - 1] = ns; +} + static inline NvmeCQueue *nvme_cq(NvmeRequest *req) { NvmeSQueue *sq = req->sq; From 5215e60600b0bc0a011f4456e0f0a0d9376d9133 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sat, 6 Feb 2021 02:03:20 +0900 Subject: [PATCH 32/38] hw/block/nvme: fix namespaces array to 1-based subsys->namespaces array used to be sized to NVME_SUBSYS_MAX_NAMESPACES. But subsys->namespaces are being accessed with 1-based namespace id which means the very first array entry will always be empty(NULL). Signed-off-by: Minwoo Im Reviewed-by: Keith Busch Reviewed-by: Klaus Jensen Tested-by: Klaus Jensen Signed-off-by: Klaus Jensen --- hw/block/nvme-subsys.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/block/nvme-subsys.h b/hw/block/nvme-subsys.h index 507efcd23f..20d34004c6 100644 --- a/hw/block/nvme-subsys.h +++ b/hw/block/nvme-subsys.h @@ -24,7 +24,7 @@ typedef struct NvmeSubsystem { NvmeCtrl *ctrls[NVME_SUBSYS_MAX_CTRLS]; /* Allocated namespaces for this subsystem */ - NvmeNamespace *namespaces[NVME_SUBSYS_MAX_NAMESPACES]; + NvmeNamespace *namespaces[NVME_SUBSYS_MAX_NAMESPACES + 1]; struct { char *nqn; From 92cad003c131c1866580beb4c00e19551652be8d Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sat, 6 Feb 2021 02:09:20 +0900 Subject: [PATCH 33/38] hw/block/nvme: fix allocated namespace list to 256 Expand allocated namespace list (subsys->namespaces) to have 256 entries which is a value lager than at least NVME_MAX_NAMESPACES which is for attached namespace list in a controller. Allocated namespace list should at least larger than attached namespace list. n->num_namespaces = NVME_MAX_NAMESPACES; The above line will set the NN field by id->nn so that the subsystem should also prepare at least this number of namespace list entries. Signed-off-by: Minwoo Im Reviewed-by: Keith Busch Reviewed-by: Klaus Jensen Tested-by: Klaus Jensen Signed-off-by: Klaus Jensen --- hw/block/nvme-subsys.h | 2 +- hw/block/nvme.h | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/hw/block/nvme-subsys.h b/hw/block/nvme-subsys.h index 20d34004c6..65a8bcda03 100644 --- a/hw/block/nvme-subsys.h +++ b/hw/block/nvme-subsys.h @@ -14,7 +14,7 @@ OBJECT_CHECK(NvmeSubsystem, (obj), TYPE_NVME_SUBSYS) #define NVME_SUBSYS_MAX_CTRLS 32 -#define NVME_SUBSYS_MAX_NAMESPACES 32 +#define NVME_SUBSYS_MAX_NAMESPACES 256 typedef struct NvmeCtrl NvmeCtrl; typedef struct NvmeNamespace NvmeNamespace; diff --git a/hw/block/nvme.h b/hw/block/nvme.h index cd8d406344..85a7b5a14f 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -10,6 +10,12 @@ #define NVME_DEFAULT_ZONE_SIZE (128 * MiB) #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB) +/* + * Subsystem namespace list for allocated namespaces should be larger than + * attached namespace list in a controller. + */ +QEMU_BUILD_BUG_ON(NVME_MAX_NAMESPACES > NVME_SUBSYS_MAX_NAMESPACES); + typedef struct NvmeParams { char *serial; uint32_t num_queues; /* deprecated since 5.1 */ From 94d8d6d1678156dfc7244beef75c05db52965d60 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sat, 6 Feb 2021 02:15:10 +0900 Subject: [PATCH 34/38] hw/block/nvme: support allocated namespace type From NVMe spec 1.4b "6.1.5. NSID and Namespace Relationships" defines valid namespace types: - Unallocated: Not exists in the NVMe subsystem - Allocated: Exists in the NVMe subsystem - Inactive: Not attached to the controller - Active: Attached to the controller This patch added support for allocated, but not attached namespace type: !nvme_ns(n, nsid) && nvme_subsys_ns(n->subsys, nsid) nvme_ns() returns attached namespace instance of the given controller and nvme_subsys_ns() returns allocated namespace instance in the subsystem. Signed-off-by: Minwoo Im Reviewed-by: Keith Busch Reviewed-by: Klaus Jensen Tested-by: Klaus Jensen Signed-off-by: Klaus Jensen --- hw/block/nvme-subsys.h | 13 +++++++++ hw/block/nvme.c | 63 +++++++++++++++++++++++++++++++----------- 2 files changed, 60 insertions(+), 16 deletions(-) diff --git a/hw/block/nvme-subsys.h b/hw/block/nvme-subsys.h index 65a8bcda03..83a6427b6e 100644 --- a/hw/block/nvme-subsys.h +++ b/hw/block/nvme-subsys.h @@ -34,4 +34,17 @@ typedef struct NvmeSubsystem { int nvme_subsys_register_ctrl(NvmeCtrl *n, Error **errp); int nvme_subsys_register_ns(NvmeNamespace *ns, Error **errp); +/* + * Return allocated namespace of the specified nsid in the subsystem. + */ +static inline NvmeNamespace *nvme_subsys_ns(NvmeSubsystem *subsys, + uint32_t nsid) +{ + if (!subsys) { + return NULL; + } + + return subsys->namespaces[nsid]; +} + #endif /* NVME_SUBSYS_H */ diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 1790a66cb8..414473c19c 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -3262,7 +3262,7 @@ static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req) return nvme_c2h(n, id, sizeof(id), req); } -static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req) +static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active) { NvmeNamespace *ns; NvmeIdentify *c = (NvmeIdentify *)&req->cmd; @@ -3276,7 +3276,14 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req) ns = nvme_ns(n, nsid); if (unlikely(!ns)) { - return nvme_rpt_empty_id_struct(n, req); + if (!active) { + ns = nvme_subsys_ns(n->subsys, nsid); + if (!ns) { + return nvme_rpt_empty_id_struct(n, req); + } + } else { + return nvme_rpt_empty_id_struct(n, req); + } } if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) { @@ -3286,7 +3293,8 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req) return NVME_INVALID_CMD_SET | NVME_DNR; } -static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req) +static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req, + bool active) { NvmeNamespace *ns; NvmeIdentify *c = (NvmeIdentify *)&req->cmd; @@ -3300,7 +3308,14 @@ static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req) ns = nvme_ns(n, nsid); if (unlikely(!ns)) { - return nvme_rpt_empty_id_struct(n, req); + if (!active) { + ns = nvme_subsys_ns(n->subsys, nsid); + if (!ns) { + return nvme_rpt_empty_id_struct(n, req); + } + } else { + return nvme_rpt_empty_id_struct(n, req); + } } if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) { @@ -3313,7 +3328,8 @@ static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req) return NVME_INVALID_FIELD | NVME_DNR; } -static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req) +static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req, + bool active) { NvmeNamespace *ns; NvmeIdentify *c = (NvmeIdentify *)&req->cmd; @@ -3338,7 +3354,14 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req) for (i = 1; i <= n->num_namespaces; i++) { ns = nvme_ns(n, i); if (!ns) { - continue; + if (!active) { + ns = nvme_subsys_ns(n->subsys, i); + if (!ns) { + continue; + } + } else { + continue; + } } if (ns->params.nsid <= min_nsid) { continue; @@ -3352,7 +3375,8 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req) return nvme_c2h(n, list, data_len, req); } -static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req) +static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req, + bool active) { NvmeNamespace *ns; NvmeIdentify *c = (NvmeIdentify *)&req->cmd; @@ -3378,7 +3402,14 @@ static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req) for (i = 1; i <= n->num_namespaces; i++) { ns = nvme_ns(n, i); if (!ns) { - continue; + if (!active) { + ns = nvme_subsys_ns(n->subsys, i); + if (!ns) { + continue; + } + } else { + continue; + } } if (ns->params.nsid <= min_nsid || c->csi != ns->csi) { continue; @@ -3461,25 +3492,25 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req) switch (c->cns) { case NVME_ID_CNS_NS: - /* fall through */ + return nvme_identify_ns(n, req, true); case NVME_ID_CNS_NS_PRESENT: - return nvme_identify_ns(n, req); + return nvme_identify_ns(n, req, false); case NVME_ID_CNS_CS_NS: - /* fall through */ + return nvme_identify_ns_csi(n, req, true); case NVME_ID_CNS_CS_NS_PRESENT: - return nvme_identify_ns_csi(n, req); + return nvme_identify_ns_csi(n, req, false); case NVME_ID_CNS_CTRL: return nvme_identify_ctrl(n, req); case NVME_ID_CNS_CS_CTRL: return nvme_identify_ctrl_csi(n, req); case NVME_ID_CNS_NS_ACTIVE_LIST: - /* fall through */ + return nvme_identify_nslist(n, req, true); case NVME_ID_CNS_NS_PRESENT_LIST: - return nvme_identify_nslist(n, req); + return nvme_identify_nslist(n, req, false); case NVME_ID_CNS_CS_NS_ACTIVE_LIST: - /* fall through */ + return nvme_identify_nslist_csi(n, req, true); case NVME_ID_CNS_CS_NS_PRESENT_LIST: - return nvme_identify_nslist_csi(n, req); + return nvme_identify_nslist_csi(n, req, false); case NVME_ID_CNS_NS_DESCR_LIST: return nvme_identify_ns_descr_list(n, req); case NVME_ID_CNS_IO_COMMAND_SET: From 1f46660788542ae7f86e18bc4de14bc4b642423d Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sat, 6 Feb 2021 12:16:52 +0900 Subject: [PATCH 35/38] hw/block/nvme: refactor nvme_select_ns_iocs This patch has no functional changes. This patch just refactored nvme_select_ns_iocs() to iterate the attached namespaces of the controlller and make it invoke __nvme_select_ns_iocs(). Signed-off-by: Minwoo Im Reviewed-by: Keith Busch Reviewed-by: Klaus Jensen Tested-by: Klaus Jensen Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 414473c19c..b4843d3bcf 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -4042,6 +4042,25 @@ static void nvme_ctrl_shutdown(NvmeCtrl *n) } } +static void __nvme_select_ns_iocs(NvmeCtrl *n, NvmeNamespace *ns) +{ + ns->iocs = nvme_cse_iocs_none; + switch (ns->csi) { + case NVME_CSI_NVM: + if (NVME_CC_CSS(n->bar.cc) != NVME_CC_CSS_ADMIN_ONLY) { + ns->iocs = nvme_cse_iocs_nvm; + } + break; + case NVME_CSI_ZONED: + if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_CSI) { + ns->iocs = nvme_cse_iocs_zoned; + } else if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_NVM) { + ns->iocs = nvme_cse_iocs_nvm; + } + break; + } +} + static void nvme_select_ns_iocs(NvmeCtrl *n) { NvmeNamespace *ns; @@ -4052,21 +4071,8 @@ static void nvme_select_ns_iocs(NvmeCtrl *n) if (!ns) { continue; } - ns->iocs = nvme_cse_iocs_none; - switch (ns->csi) { - case NVME_CSI_NVM: - if (NVME_CC_CSS(n->bar.cc) != NVME_CC_CSS_ADMIN_ONLY) { - ns->iocs = nvme_cse_iocs_nvm; - } - break; - case NVME_CSI_ZONED: - if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_CSI) { - ns->iocs = nvme_cse_iocs_zoned; - } else if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_NVM) { - ns->iocs = nvme_cse_iocs_nvm; - } - break; - } + + __nvme_select_ns_iocs(n, ns); } } From 645ce1a70cb6bedc85a11edb547db091375dea55 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sat, 6 Feb 2021 12:18:09 +0900 Subject: [PATCH 36/38] hw/block/nvme: support namespace attachment command This patch supports Namespace Attachment command for the pre-defined nvme-ns device nodes. Of course, attach/detach namespace should only be supported in case 'subsys' is given. This is because if we detach a namespace from a controller, somebody needs to manage the detached, but allocated namespace in the NVMe subsystem. As command effect for the namespace attachment command is registered, the host will be notified that namespace inventory is changed so that host will rescan the namespace inventory after this command. For example, kernel driver manages this command effect via passthru IOCTL. Signed-off-by: Minwoo Im Reviewed-by: Keith Busch Reviewed-by: Klaus Jensen Tested-by: Klaus Jensen [k.jensen: rebased for dma refactor] Signed-off-by: Klaus Jensen --- hw/block/nvme-subsys.h | 10 +++++++ hw/block/nvme.c | 60 +++++++++++++++++++++++++++++++++++++++++- hw/block/nvme.h | 5 ++++ hw/block/trace-events | 2 ++ include/block/nvme.h | 6 +++++ 5 files changed, 82 insertions(+), 1 deletion(-) diff --git a/hw/block/nvme-subsys.h b/hw/block/nvme-subsys.h index 83a6427b6e..fb66ae752a 100644 --- a/hw/block/nvme-subsys.h +++ b/hw/block/nvme-subsys.h @@ -34,6 +34,16 @@ typedef struct NvmeSubsystem { int nvme_subsys_register_ctrl(NvmeCtrl *n, Error **errp); int nvme_subsys_register_ns(NvmeNamespace *ns, Error **errp); +static inline NvmeCtrl *nvme_subsys_ctrl(NvmeSubsystem *subsys, + uint32_t cntlid) +{ + if (!subsys) { + return NULL; + } + + return subsys->ctrls[cntlid]; +} + /* * Return allocated namespace of the specified nsid in the subsystem. */ diff --git a/hw/block/nvme.c b/hw/block/nvme.c index b4843d3bcf..86fbab1fc4 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -196,6 +196,7 @@ static const uint32_t nvme_cse_acs[256] = { [NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP, [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP, [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP, + [NVME_ADM_CMD_NS_ATTACHMENT] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC, }; static const uint32_t nvme_cse_iocs_none[256]; @@ -3905,6 +3906,61 @@ static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req) return NVME_NO_COMPLETE; } +static void __nvme_select_ns_iocs(NvmeCtrl *n, NvmeNamespace *ns); +static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req) +{ + NvmeNamespace *ns; + NvmeCtrl *ctrl; + uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {}; + uint32_t nsid = le32_to_cpu(req->cmd.nsid); + uint32_t dw10 = le32_to_cpu(req->cmd.cdw10); + bool attach = !(dw10 & 0xf); + uint16_t *nr_ids = &list[0]; + uint16_t *ids = &list[1]; + uint16_t ret; + int i; + + trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf); + + ns = nvme_subsys_ns(n->subsys, nsid); + if (!ns) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + ret = nvme_h2c(n, (uint8_t *)list, 4096, req); + if (ret) { + return ret; + } + + if (!*nr_ids) { + return NVME_NS_CTRL_LIST_INVALID | NVME_DNR; + } + + for (i = 0; i < *nr_ids; i++) { + ctrl = nvme_subsys_ctrl(n->subsys, ids[i]); + if (!ctrl) { + return NVME_NS_CTRL_LIST_INVALID | NVME_DNR; + } + + if (attach) { + if (nvme_ns_is_attached(ctrl, ns)) { + return NVME_NS_ALREADY_ATTACHED | NVME_DNR; + } + + nvme_ns_attach(ctrl, ns); + __nvme_select_ns_iocs(ctrl, ns); + } else { + if (!nvme_ns_is_attached(ctrl, ns)) { + return NVME_NS_NOT_ATTACHED | NVME_DNR; + } + + nvme_ns_detach(ctrl, ns); + } + } + + return NVME_SUCCESS; +} + static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req) { trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode, @@ -3941,6 +3997,8 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req) return nvme_get_feature(n, req); case NVME_ADM_CMD_ASYNC_EV_REQ: return nvme_aer(n, req); + case NVME_ADM_CMD_NS_ATTACHMENT: + return nvme_ns_attachment(n, req); default: assert(false); } @@ -4910,7 +4968,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->mdts = n->params.mdts; id->ver = cpu_to_le32(NVME_SPEC_VER); - id->oacs = cpu_to_le16(0); + id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT); id->cntrltype = 0x1; /* diff --git a/hw/block/nvme.h b/hw/block/nvme.h index 85a7b5a14f..1287bc2cd1 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -235,6 +235,11 @@ static inline void nvme_ns_attach(NvmeCtrl *n, NvmeNamespace *ns) n->namespaces[nvme_nsid(ns) - 1] = ns; } +static inline void nvme_ns_detach(NvmeCtrl *n, NvmeNamespace *ns) +{ + n->namespaces[nvme_nsid(ns) - 1] = NULL; +} + static inline NvmeCQueue *nvme_cq(NvmeRequest *req) { NvmeSQueue *sq = req->sq; diff --git a/hw/block/trace-events b/hw/block/trace-events index 60a076cea5..c5dba935a0 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -84,6 +84,8 @@ pci_nvme_aer(uint16_t cid) "cid %"PRIu16"" pci_nvme_aer_aerl_exceeded(void) "aerl exceeded" pci_nvme_aer_masked(uint8_t type, uint8_t mask) "type 0x%"PRIx8" mask 0x%"PRIx8"" pci_nvme_aer_post_cqe(uint8_t typ, uint8_t info, uint8_t log_page) "type 0x%"PRIx8" info 0x%"PRIx8" lid 0x%"PRIx8"" +pci_nvme_ns_attachment(uint16_t cid, uint8_t sel) "cid %"PRIu16", sel=0x%"PRIx8"" +pci_nvme_ns_attachment_attach(uint16_t cntlid, uint32_t nsid) "cntlid=0x%"PRIx16", nsid=0x%"PRIx32"" pci_nvme_enqueue_event(uint8_t typ, uint8_t info, uint8_t log_page) "type 0x%"PRIx8" info 0x%"PRIx8" lid 0x%"PRIx8"" pci_nvme_enqueue_event_noqueue(int queued) "queued %d" pci_nvme_enqueue_event_masked(uint8_t typ) "type 0x%"PRIx8"" diff --git a/include/block/nvme.h b/include/block/nvme.h index 16d8c4c90f..03471a4d5a 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -566,6 +566,7 @@ enum NvmeAdminCommands { NVME_ADM_CMD_ASYNC_EV_REQ = 0x0c, NVME_ADM_CMD_ACTIVATE_FW = 0x10, NVME_ADM_CMD_DOWNLOAD_FW = 0x11, + NVME_ADM_CMD_NS_ATTACHMENT = 0x15, NVME_ADM_CMD_FORMAT_NVM = 0x80, NVME_ADM_CMD_SECURITY_SEND = 0x81, NVME_ADM_CMD_SECURITY_RECV = 0x82, @@ -836,6 +837,9 @@ enum NvmeStatusCodes { NVME_FEAT_NOT_CHANGEABLE = 0x010e, NVME_FEAT_NOT_NS_SPEC = 0x010f, NVME_FW_REQ_SUSYSTEM_RESET = 0x0110, + NVME_NS_ALREADY_ATTACHED = 0x0118, + NVME_NS_NOT_ATTACHED = 0x011A, + NVME_NS_CTRL_LIST_INVALID = 0x011C, NVME_CONFLICTING_ATTRS = 0x0180, NVME_INVALID_PROT_INFO = 0x0181, NVME_WRITE_TO_RO = 0x0182, @@ -951,6 +955,7 @@ typedef struct QEMU_PACKED NvmePSD { uint8_t resv[16]; } NvmePSD; +#define NVME_CONTROLLER_LIST_SIZE 2048 #define NVME_IDENTIFY_DATA_SIZE 4096 enum NvmeIdCns { @@ -1055,6 +1060,7 @@ enum NvmeIdCtrlOacs { NVME_OACS_SECURITY = 1 << 0, NVME_OACS_FORMAT = 1 << 1, NVME_OACS_FW = 1 << 2, + NVME_OACS_NS_MGMT = 1 << 3, }; enum NvmeIdCtrlOncs { From f432fdfa1215bc3a00468b2e711176be279b0fd2 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sun, 28 Feb 2021 17:51:02 +0900 Subject: [PATCH 37/38] hw/block/nvme: support changed namespace asynchronous event If namespace inventory is changed due to some reasons (e.g., namespace attachment/detachment), controller can send out event notifier to the host to manage namespaces. This patch sends out the AEN to the host after either attach or detach namespaces from controllers. To support clear of the event from the controller, this patch also implemented Get Log Page command for Changed Namespace List log type. To return namespace id list through the command, when namespace inventory is updated, id is added to the per-controller list (changed_ns_list). To indicate the support of this async event, this patch set OAES(Optional Asynchronous Events Supported) in Identify Controller data structure. Signed-off-by: Minwoo Im Reviewed-by: Keith Busch Reviewed-by: Klaus Jensen Tested-by: Klaus Jensen Signed-off-by: Klaus Jensen --- hw/block/nvme-ns.h | 1 + hw/block/nvme.c | 56 ++++++++++++++++++++++++++++++++++++++++++++ hw/block/nvme.h | 4 ++++ include/block/nvme.h | 7 ++++++ 4 files changed, 68 insertions(+) diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h index b0c00e115d..318d3aebe1 100644 --- a/hw/block/nvme-ns.h +++ b/hw/block/nvme-ns.h @@ -53,6 +53,7 @@ typedef struct NvmeNamespace { uint8_t csi; NvmeSubsystem *subsys; + QTAILQ_ENTRY(NvmeNamespace) entry; NvmeIdNsZoned *id_ns_zoned; NvmeZone *zone_array; diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 86fbab1fc4..cf0bb508aa 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -3015,6 +3015,48 @@ static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req); } +static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, + uint64_t off, NvmeRequest *req) +{ + uint32_t nslist[1024]; + uint32_t trans_len; + int i = 0; + uint32_t nsid; + + memset(nslist, 0x0, sizeof(nslist)); + trans_len = MIN(sizeof(nslist) - off, buf_len); + + while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) != + NVME_CHANGED_NSID_SIZE) { + /* + * If more than 1024 namespaces, the first entry in the log page should + * be set to 0xffffffff and the others to 0 as spec. + */ + if (i == ARRAY_SIZE(nslist)) { + memset(nslist, 0x0, sizeof(nslist)); + nslist[0] = 0xffffffff; + break; + } + + nslist[i++] = nsid; + clear_bit(nsid, n->changed_nsids); + } + + /* + * Remove all the remaining list entries in case returns directly due to + * more than 1024 namespaces. + */ + if (nslist[0] == 0xffffffff) { + bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE); + } + + if (!rae) { + nvme_clear_events(n, NVME_AER_TYPE_NOTICE); + } + + return nvme_c2h(n, ((uint8_t *)nslist) + off, trans_len, req); +} + static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len, uint64_t off, NvmeRequest *req) { @@ -3098,6 +3140,8 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req) return nvme_smart_info(n, rae, len, off, req); case NVME_LOG_FW_SLOT_INFO: return nvme_fw_log_info(n, len, off, req); + case NVME_LOG_CHANGED_NSLIST: + return nvme_changed_nslist(n, rae, len, off, req); case NVME_LOG_CMD_EFFECTS: return nvme_cmd_effects(n, csi, len, off, req); default: @@ -3956,6 +4000,16 @@ static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req) nvme_ns_detach(ctrl, ns); } + + /* + * Add namespace id to the changed namespace id list for event clearing + * via Get Log Page command. + */ + if (!test_and_set_bit(nsid, ctrl->changed_nsids)) { + nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE, + NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED, + NVME_LOG_CHANGED_NSLIST); + } } return NVME_SUCCESS; @@ -4954,6 +5008,8 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->cntlid = cpu_to_le16(n->cntlid); + id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR); + id->rab = 6; if (n->params.use_intel_id) { diff --git a/hw/block/nvme.h b/hw/block/nvme.h index 1287bc2cd1..4955d649c7 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -192,6 +192,10 @@ typedef struct NvmeCtrl { uint32_t dmrsl; + /* Namespace ID is started with 1 so bitmap should be 1-based */ +#define NVME_CHANGED_NSID_SIZE (NVME_MAX_NAMESPACES + 1) + DECLARE_BITMAP(changed_nsids, NVME_CHANGED_NSID_SIZE); + NvmeSubsystem *subsys; NvmeNamespace namespace; diff --git a/include/block/nvme.h b/include/block/nvme.h index 03471a4d5a..7ee887022a 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -760,6 +760,7 @@ typedef struct QEMU_PACKED NvmeCopySourceRange { enum NvmeAsyncEventRequest { NVME_AER_TYPE_ERROR = 0, NVME_AER_TYPE_SMART = 1, + NVME_AER_TYPE_NOTICE = 2, NVME_AER_TYPE_IO_SPECIFIC = 6, NVME_AER_TYPE_VENDOR_SPECIFIC = 7, NVME_AER_INFO_ERR_INVALID_DB_REGISTER = 0, @@ -771,6 +772,7 @@ enum NvmeAsyncEventRequest { NVME_AER_INFO_SMART_RELIABILITY = 0, NVME_AER_INFO_SMART_TEMP_THRESH = 1, NVME_AER_INFO_SMART_SPARE_THRESH = 2, + NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED = 0, }; typedef struct QEMU_PACKED NvmeAerResult { @@ -940,6 +942,7 @@ enum NvmeLogIdentifier { NVME_LOG_ERROR_INFO = 0x01, NVME_LOG_SMART_INFO = 0x02, NVME_LOG_FW_SLOT_INFO = 0x03, + NVME_LOG_CHANGED_NSLIST = 0x04, NVME_LOG_CMD_EFFECTS = 0x05, }; @@ -1056,6 +1059,10 @@ typedef struct NvmeIdCtrlNvm { uint8_t rsvd16[4080]; } NvmeIdCtrlNvm; +enum NvmeIdCtrlOaes { + NVME_OAES_NS_ATTR = 1 << 8, +}; + enum NvmeIdCtrlOacs { NVME_OACS_SECURITY = 1 << 0, NVME_OACS_FORMAT = 1 << 1, From 23fb7dfeca17c55e4329ca98459d33fc204c1f59 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Thu, 11 Feb 2021 00:10:25 +0900 Subject: [PATCH 38/38] hw/block/nvme: support Identify NS Attached Controller List Support Identify command for Namespace attached controller list. This command handler will traverse the controller instances in the given subsystem to figure out whether the specified nsid is attached to the controllers or not. The 4096bytes Identify data will return with the first entry (16bits) indicating the number of the controller id entries. So, the data can hold up to 2047 entries for the controller ids. Signed-off-by: Minwoo Im Reviewed-by: Keith Busch Reviewed-by: Klaus Jensen Tested-by: Klaus Jensen [k.jensen: rebased for dma refactor] Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 41 +++++++++++++++++++++++++++++++++++++++++ hw/block/trace-events | 1 + include/block/nvme.h | 1 + 3 files changed, 43 insertions(+) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index cf0bb508aa..d439e44db8 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -3338,6 +3338,45 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active) return NVME_INVALID_CMD_SET | NVME_DNR; } +static uint16_t nvme_identify_ns_attached_list(NvmeCtrl *n, NvmeRequest *req) +{ + NvmeIdentify *c = (NvmeIdentify *)&req->cmd; + uint16_t min_id = le16_to_cpu(c->ctrlid); + uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {}; + uint16_t *ids = &list[1]; + NvmeNamespace *ns; + NvmeCtrl *ctrl; + int cntlid, nr_ids = 0; + + trace_pci_nvme_identify_ns_attached_list(min_id); + + if (c->nsid == NVME_NSID_BROADCAST) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + ns = nvme_subsys_ns(n->subsys, c->nsid); + if (!ns) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) { + ctrl = nvme_subsys_ctrl(n->subsys, cntlid); + if (!ctrl) { + continue; + } + + if (!nvme_ns_is_attached(ctrl, ns)) { + continue; + } + + ids[nr_ids++] = cntlid; + } + + list[0] = nr_ids; + + return nvme_c2h(n, (uint8_t *)list, sizeof(list), req); +} + static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req, bool active) { @@ -3540,6 +3579,8 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req) return nvme_identify_ns(n, req, true); case NVME_ID_CNS_NS_PRESENT: return nvme_identify_ns(n, req, false); + case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST: + return nvme_identify_ns_attached_list(n, req); case NVME_ID_CNS_CS_NS: return nvme_identify_ns_csi(n, req, true); case NVME_ID_CNS_CS_NS_PRESENT: diff --git a/hw/block/trace-events b/hw/block/trace-events index c5dba935a0..ef06d2ea74 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -66,6 +66,7 @@ pci_nvme_identify(uint16_t cid, uint8_t cns, uint16_t ctrlid, uint8_t csi) "cid pci_nvme_identify_ctrl(void) "identify controller" pci_nvme_identify_ctrl_csi(uint8_t csi) "identify controller, csi=0x%"PRIx8"" pci_nvme_identify_ns(uint32_t ns) "nsid %"PRIu32"" +pci_nvme_identify_ns_attached_list(uint16_t cntid) "cntid=%"PRIu16"" pci_nvme_identify_ns_csi(uint32_t ns, uint8_t csi) "nsid=%"PRIu32", csi=0x%"PRIx8"" pci_nvme_identify_nslist(uint32_t ns) "nsid %"PRIu32"" pci_nvme_identify_nslist_csi(uint16_t ns, uint8_t csi) "nsid=%"PRIu16", csi=0x%"PRIx8"" diff --git a/include/block/nvme.h b/include/block/nvme.h index 7ee887022a..372d0f2799 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -971,6 +971,7 @@ enum NvmeIdCns { NVME_ID_CNS_CS_NS_ACTIVE_LIST = 0x07, NVME_ID_CNS_NS_PRESENT_LIST = 0x10, NVME_ID_CNS_NS_PRESENT = 0x11, + NVME_ID_CNS_NS_ATTACHED_CTRL_LIST = 0x12, NVME_ID_CNS_CS_NS_PRESENT_LIST = 0x1a, NVME_ID_CNS_CS_NS_PRESENT = 0x1b, NVME_ID_CNS_IO_COMMAND_SET = 0x1c,