From 5f62d00f4d293b79f6ccb017638c111e764e4f0b Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Thu, 17 Sep 2020 14:48:45 +0200 Subject: [PATCH 01/30] hw/block/nvme: fix typo in trace event MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a typo in the sq doorbell trace event. Signed-off-by: Klaus Jensen Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Keith Busch --- hw/block/trace-events | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/block/trace-events b/hw/block/trace-events index ec94c56a41..8ff4cbc493 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -70,7 +70,7 @@ pci_nvme_enqueue_req_completion(uint16_t cid, uint16_t cqid, uint16_t status) "c pci_nvme_mmio_read(uint64_t addr) "addr 0x%"PRIx64"" pci_nvme_mmio_write(uint64_t addr, uint64_t data) "addr 0x%"PRIx64" data 0x%"PRIx64"" pci_nvme_mmio_doorbell_cq(uint16_t cqid, uint16_t new_head) "cqid %"PRIu16" new_head %"PRIu16"" -pci_nvme_mmio_doorbell_sq(uint16_t sqid, uint16_t new_tail) "cqid %"PRIu16" new_tail %"PRIu16"" +pci_nvme_mmio_doorbell_sq(uint16_t sqid, uint16_t new_tail) "sqid %"PRIu16" new_tail %"PRIu16"" pci_nvme_mmio_intm_set(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask set, data=0x%"PRIx64", new_mask=0x%"PRIx64"" pci_nvme_mmio_intm_clr(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask clr, data=0x%"PRIx64", new_mask=0x%"PRIx64"" pci_nvme_mmio_cfg(uint64_t data) "wrote MMIO, config controller config=0x%"PRIx64"" From 195cc354696d75e9625cf303a0791404b3215501 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Tue, 1 Oct 2019 13:40:24 +0200 Subject: [PATCH 02/30] pci: pass along the return value of dma_memory_rw MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some devices might want to know the return value of dma_memory_rw, so pass it along instead of ignoring it. There are no existing users of the return value, so this patch should be safe. 
Signed-off-by: Klaus Jensen Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Michael S. Tsirkin Acked-by: Keith Busch --- include/hw/pci/pci.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index 0a59a06b14..f19ffe6b4f 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -783,8 +783,7 @@ static inline AddressSpace *pci_get_address_space(PCIDevice *dev) static inline int pci_dma_rw(PCIDevice *dev, dma_addr_t addr, void *buf, dma_addr_t len, DMADirection dir) { - dma_memory_rw(pci_get_address_space(dev), addr, buf, len, dir); - return 0; + return dma_memory_rw(pci_get_address_space(dev), addr, buf, len, dir); } static inline int pci_dma_read(PCIDevice *dev, dma_addr_t addr, From 94cbcf530f9d9789b75ef1ae227ba4a4f702bf7d Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Fri, 11 Oct 2019 08:32:00 +0200 Subject: [PATCH 03/30] hw/block/nvme: handle dma errors Handling DMA errors gracefully is required for the device to pass the block/011 test ("disable PCI device while doing I/O") in the blktests suite. With this patch the device sets the Controller Fatal Status bit in the CSTS register when failing to read from a submission queue or writing to a completion queue; expecting the host to reset the controller. If DMA errors occur at any other point in the execution of the command (say, while mapping the PRPs), the command is aborted with a Data Transfer Error status code. 
Signed-off-by: Klaus Jensen Reviewed-by: Keith Busch --- hw/block/nvme.c | 41 +++++++++++++++++++++++++++++++---------- hw/block/trace-events | 3 +++ 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 44fa5b9076..7d328c08b8 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -140,14 +140,14 @@ static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr) return &n->cmbuf[addr - n->ctrl_mem.addr]; } -static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size) +static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size) { if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) { memcpy(buf, nvme_addr_to_cmb(n, addr), size); - return; + return 0; } - pci_dma_read(&n->parent_obj, addr, buf, size); + return pci_dma_read(&n->parent_obj, addr, buf, size); } static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid) @@ -307,6 +307,7 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, int num_prps = (len >> n->page_bits) + 1; uint16_t status; bool prp_list_in_cmb = false; + int ret; QEMUSGList *qsg = &req->qsg; QEMUIOVector *iov = &req->iov; @@ -347,7 +348,11 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, nents = (len + n->page_size - 1) >> n->page_bits; prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t); - nvme_addr_read(n, prp2, (void *)prp_list, prp_trans); + ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans); + if (ret) { + trace_pci_nvme_err_addr_read(prp2); + return NVME_DATA_TRAS_ERROR; + } while (len != 0) { uint64_t prp_ent = le64_to_cpu(prp_list[i]); @@ -364,8 +369,12 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, i = 0; nents = (len + n->page_size - 1) >> n->page_bits; prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t); - nvme_addr_read(n, prp_ent, (void *)prp_list, - prp_trans); + ret = nvme_addr_read(n, prp_ent, (void *)prp_list, + prp_trans); + if (ret) { + 
trace_pci_nvme_err_addr_read(prp_ent); + return NVME_DATA_TRAS_ERROR; + } prp_ent = le64_to_cpu(prp_list[i]); } @@ -457,6 +466,7 @@ static void nvme_post_cqes(void *opaque) NvmeCQueue *cq = opaque; NvmeCtrl *n = cq->ctrl; NvmeRequest *req, *next; + int ret; QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) { NvmeSQueue *sq; @@ -466,15 +476,21 @@ static void nvme_post_cqes(void *opaque) break; } - QTAILQ_REMOVE(&cq->req_list, req, entry); sq = req->sq; req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase); req->cqe.sq_id = cpu_to_le16(sq->sqid); req->cqe.sq_head = cpu_to_le16(sq->head); addr = cq->dma_addr + cq->tail * n->cqe_size; + ret = pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe, + sizeof(req->cqe)); + if (ret) { + trace_pci_nvme_err_addr_write(addr); + trace_pci_nvme_err_cfs(); + n->bar.csts = NVME_CSTS_FAILED; + break; + } + QTAILQ_REMOVE(&cq->req_list, req, entry); nvme_inc_cq_tail(cq); - pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe, - sizeof(req->cqe)); nvme_req_exit(req); QTAILQ_INSERT_TAIL(&sq->req_list, req, entry); } @@ -1606,7 +1622,12 @@ static void nvme_process_sq(void *opaque) while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) { addr = sq->dma_addr + sq->head * n->sqe_size; - nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd)); + if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) { + trace_pci_nvme_err_addr_read(addr); + trace_pci_nvme_err_cfs(); + n->bar.csts = NVME_CSTS_FAILED; + break; + } nvme_inc_sq_head(sq); req = QTAILQ_FIRST(&sq->req_list); diff --git a/hw/block/trace-events b/hw/block/trace-events index 8ff4cbc493..5589db4a01 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -86,6 +86,9 @@ pci_nvme_mmio_shutdown_cleared(void) "shutdown bit cleared" # nvme traces for error conditions pci_nvme_err_mdts(uint16_t cid, size_t len) "cid %"PRIu16" len %zu" +pci_nvme_err_addr_read(uint64_t addr) "addr 0x%"PRIx64"" +pci_nvme_err_addr_write(uint64_t addr) "addr 0x%"PRIx64"" 
+pci_nvme_err_cfs(void) "controller fatal status" pci_nvme_err_invalid_dma(void) "PRP/SGL is too small for transfer size" pci_nvme_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is null or not page aligned: 0x%"PRIx64"" pci_nvme_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PRIx64"" From c6056bd1ce63d6d4eac6c54126c32838624efdac Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 24 Aug 2020 08:48:55 +0200 Subject: [PATCH 04/30] hw/block/nvme: commonize nvme_rw error handling Move common error handling to a label. Signed-off-by: Klaus Jensen Reviewed-by: Keith Busch --- hw/block/nvme.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 7d328c08b8..0ac9d85663 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -687,20 +687,18 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req) status = nvme_check_mdts(n, data_size); if (status) { trace_pci_nvme_err_mdts(nvme_cid(req), data_size); - block_acct_invalid(blk_get_stats(n->conf.blk), acct); - return status; + goto invalid; } status = nvme_check_bounds(n, ns, slba, nlb); if (status) { trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze); - block_acct_invalid(blk_get_stats(n->conf.blk), acct); - return status; + goto invalid; } - if (nvme_map_dptr(n, data_size, req)) { - block_acct_invalid(blk_get_stats(n->conf.blk), acct); - return NVME_INVALID_FIELD | NVME_DNR; + status = nvme_map_dptr(n, data_size, req); + if (status) { + goto invalid; } if (req->qsg.nsg > 0) { @@ -722,6 +720,10 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req) } return NVME_NO_COMPLETE; + +invalid: + block_acct_invalid(blk_get_stats(n->conf.blk), acct); + return status; } static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) From 2750384669ee038bd1f1fa33bbd4a660c351ea90 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 24 Aug 2020 08:58:56 +0200 Subject: [PATCH 05/30] hw/block/nvme: alignment style fixes 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Style fixes. Signed-off-by: Klaus Jensen Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Keith Busch --- hw/block/nvme.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 0ac9d85663..d6d8324fa1 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -634,7 +634,7 @@ static void nvme_rw_cb(void *opaque, int ret) static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req) { block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0, - BLOCK_ACCT_FLUSH); + BLOCK_ACCT_FLUSH); req->aiocb = blk_aio_flush(n->conf.blk, nvme_rw_cb, req); return NVME_NO_COMPLETE; @@ -663,7 +663,7 @@ static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req) block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0, BLOCK_ACCT_WRITE); req->aiocb = blk_aio_pwrite_zeroes(n->conf.blk, offset, count, - BDRV_REQ_MAY_UNMAP, nvme_rw_cb, req); + BDRV_REQ_MAY_UNMAP, nvme_rw_cb, req); return NVME_NO_COMPLETE; } @@ -803,7 +803,7 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req) } static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr, - uint16_t sqid, uint16_t cqid, uint16_t size) + uint16_t sqid, uint16_t cqid, uint16_t size) { int i; NvmeCQueue *cq; @@ -1058,7 +1058,8 @@ static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req) } static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr, - uint16_t cqid, uint16_t vector, uint16_t size, uint16_t irq_enabled) + uint16_t cqid, uint16_t vector, uint16_t size, + uint16_t irq_enabled) { int ret; @@ -1118,7 +1119,7 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req) cq = g_malloc0(sizeof(*cq)); nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1, - NVME_CQ_FLAGS_IEN(qflags)); + NVME_CQ_FLAGS_IEN(qflags)); /* * It is only required to set qs_created when creating a completion queue; @@ -1515,7 +1516,7 @@ static uint16_t 
nvme_set_feature(NvmeCtrl *n, NvmeRequest *req) } if (((n->temperature >= n->features.temp_thresh_hi) || - (n->temperature <= n->features.temp_thresh_low)) && + (n->temperature <= n->features.temp_thresh_low)) && NVME_AEC_SMART(n->features.async_config) & NVME_SMART_TEMPERATURE) { nvme_enqueue_event(n, NVME_AER_TYPE_SMART, NVME_AER_INFO_SMART_TEMP_THRESH, @@ -1765,9 +1766,9 @@ static int nvme_start_ctrl(NvmeCtrl *n) n->cqe_size = 1 << NVME_CC_IOCQES(n->bar.cc); n->sqe_size = 1 << NVME_CC_IOSQES(n->bar.cc); nvme_init_cq(&n->admin_cq, n, n->bar.acq, 0, 0, - NVME_AQA_ACQS(n->bar.aqa) + 1, 1); + NVME_AQA_ACQS(n->bar.aqa) + 1, 1); nvme_init_sq(&n->admin_sq, n, n->bar.asq, 0, 0, - NVME_AQA_ASQS(n->bar.aqa) + 1); + NVME_AQA_ASQS(n->bar.aqa) + 1); nvme_set_timestamp(n, 0ULL); @@ -1777,7 +1778,7 @@ static int nvme_start_ctrl(NvmeCtrl *n) } static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data, - unsigned size) + unsigned size) { if (unlikely(offset & (sizeof(uint32_t) - 1))) { NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32, @@ -1920,7 +1921,7 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data, "invalid write to PMRSWTP register, ignored"); return; case 0xE14: /* TODO PMRMSC */ - break; + break; default: NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid, "invalid MMIO write," @@ -2096,7 +2097,7 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val) } static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data, - unsigned size) + unsigned size) { NvmeCtrl *n = (NvmeCtrl *)opaque; @@ -2120,7 +2121,7 @@ static const MemoryRegionOps nvme_mmio_ops = { }; static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data, - unsigned size) + unsigned size) { NvmeCtrl *n = (NvmeCtrl *)opaque; stn_le_p(&n->cmbuf[addr], size, data); From 9994f72bd8c379eda01503ef6a7b06b7900110c5 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 24 Aug 2020 08:59:41 +0200 Subject: [PATCH 06/30] hw/block/nvme: add a lba to bytes helper Add the 
nvme_l2b helper and use it for converting NLB and SLBA to byte counts and offsets. Signed-off-by: Klaus Jensen Reviewed-by: Keith Busch --- hw/block/nvme.c | 12 ++++-------- hw/block/nvme.h | 6 ++++++ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index d6d8324fa1..59338b4232 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -644,12 +644,10 @@ static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req) { NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; NvmeNamespace *ns = req->ns; - const uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas); - const uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds; uint64_t slba = le64_to_cpu(rw->slba); uint32_t nlb = le16_to_cpu(rw->nlb) + 1; - uint64_t offset = slba << data_shift; - uint32_t count = nlb << data_shift; + uint64_t offset = nvme_l2b(ns, slba); + uint32_t count = nvme_l2b(ns, nlb); uint16_t status; trace_pci_nvme_write_zeroes(nvme_cid(req), slba, nlb); @@ -674,10 +672,8 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req) uint32_t nlb = le32_to_cpu(rw->nlb) + 1; uint64_t slba = le64_to_cpu(rw->slba); - uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas); - uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds; - uint64_t data_size = (uint64_t)nlb << data_shift; - uint64_t data_offset = slba << data_shift; + uint64_t data_size = nvme_l2b(ns, nlb); + uint64_t data_offset = nvme_l2b(ns, slba); int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0; enum BlockAcctType acct = is_write ? 
BLOCK_ACCT_WRITE : BLOCK_ACCT_READ; uint16_t status; diff --git a/hw/block/nvme.h b/hw/block/nvme.h index 52ba794f2e..1675c1e075 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -77,6 +77,12 @@ static inline uint8_t nvme_ns_lbads(NvmeNamespace *ns) return nvme_ns_lbaf(ns)->ds; } +/* convert an LBA to the equivalent in bytes */ +static inline size_t nvme_l2b(NvmeNamespace *ns, uint64_t lba) +{ + return lba << nvme_ns_lbads(ns); +} + #define TYPE_NVME "nvme" #define NVME(obj) \ OBJECT_CHECK(NvmeCtrl, (obj), TYPE_NVME) From fd90f26cc732b5c0f51140ba0d1f7fd31e8bf910 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 24 Aug 2020 11:55:46 +0200 Subject: [PATCH 07/30] hw/block/nvme: fix endian conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The raw NLB field is a 16 bit value, so use le16_to_cpu instead of le32_to_cpu and cast to uint32_t before incrementing the value to not wrap around. Signed-off-by: Klaus Jensen Reviewed-by: Keith Busch Reviewed-by: Philippe Mathieu-Daudé --- hw/block/nvme.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 59338b4232..158843c14a 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -645,7 +645,7 @@ static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req) NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; NvmeNamespace *ns = req->ns; uint64_t slba = le64_to_cpu(rw->slba); - uint32_t nlb = le16_to_cpu(rw->nlb) + 1; + uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1; uint64_t offset = nvme_l2b(ns, slba); uint32_t count = nvme_l2b(ns, nlb); uint16_t status; @@ -669,7 +669,7 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req) { NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; NvmeNamespace *ns = req->ns; - uint32_t nlb = le32_to_cpu(rw->nlb) + 1; + uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1; uint64_t slba = le64_to_cpu(rw->slba); uint64_t data_size = nvme_l2b(ns, nlb); From e2f79209cd0129b8f27b87908de09cdc44a94baa Mon 
Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 24 Aug 2020 22:11:33 +0200 Subject: [PATCH 08/30] hw/block/nvme: add symbolic command name to trace events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the symbolic command name to the pci_nvme_{io,admin}_cmd and pci_nvme_rw trace events. Signed-off-by: Klaus Jensen Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Keith Busch --- hw/block/nvme.c | 8 +++++--- hw/block/nvme.h | 28 ++++++++++++++++++++++++++++ hw/block/trace-events | 6 +++--- 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 158843c14a..961e6ffc5b 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -678,7 +678,8 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req) enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ; uint16_t status; - trace_pci_nvme_rw(is_write ? "write" : "read", nlb, data_size, slba); + trace_pci_nvme_rw(nvme_cid(req), nvme_io_opc_str(rw->opcode), nlb, + data_size, slba); status = nvme_check_mdts(n, data_size); if (status) { @@ -727,7 +728,7 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) uint32_t nsid = le32_to_cpu(req->cmd.nsid); trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req), - req->cmd.opcode); + req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode)); if (unlikely(nsid == 0 || nsid > n->num_namespaces)) { trace_pci_nvme_err_invalid_ns(nsid, n->num_namespaces); @@ -1579,7 +1580,8 @@ static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req) static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req) { - trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode); + trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode, + nvme_adm_opc_str(req->cmd.opcode)); switch (req->cmd.opcode) { case NVME_ADM_CMD_DELETE_SQ: diff --git a/hw/block/nvme.h b/hw/block/nvme.h index 1675c1e075..ce9e931420 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -32,6 +32,34 
@@ typedef struct NvmeRequest { QTAILQ_ENTRY(NvmeRequest)entry; } NvmeRequest; +static inline const char *nvme_adm_opc_str(uint8_t opc) +{ + switch (opc) { + case NVME_ADM_CMD_DELETE_SQ: return "NVME_ADM_CMD_DELETE_SQ"; + case NVME_ADM_CMD_CREATE_SQ: return "NVME_ADM_CMD_CREATE_SQ"; + case NVME_ADM_CMD_GET_LOG_PAGE: return "NVME_ADM_CMD_GET_LOG_PAGE"; + case NVME_ADM_CMD_DELETE_CQ: return "NVME_ADM_CMD_DELETE_CQ"; + case NVME_ADM_CMD_CREATE_CQ: return "NVME_ADM_CMD_CREATE_CQ"; + case NVME_ADM_CMD_IDENTIFY: return "NVME_ADM_CMD_IDENTIFY"; + case NVME_ADM_CMD_ABORT: return "NVME_ADM_CMD_ABORT"; + case NVME_ADM_CMD_SET_FEATURES: return "NVME_ADM_CMD_SET_FEATURES"; + case NVME_ADM_CMD_GET_FEATURES: return "NVME_ADM_CMD_GET_FEATURES"; + case NVME_ADM_CMD_ASYNC_EV_REQ: return "NVME_ADM_CMD_ASYNC_EV_REQ"; + default: return "NVME_ADM_CMD_UNKNOWN"; + } +} + +static inline const char *nvme_io_opc_str(uint8_t opc) +{ + switch (opc) { + case NVME_CMD_FLUSH: return "NVME_NVM_CMD_FLUSH"; + case NVME_CMD_WRITE: return "NVME_NVM_CMD_WRITE"; + case NVME_CMD_READ: return "NVME_NVM_CMD_READ"; + case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES"; + default: return "NVME_NVM_CMD_UNKNOWN"; + } +} + typedef struct NvmeSQueue { struct NvmeCtrl *ctrl; uint16_t sqid; diff --git a/hw/block/trace-events b/hw/block/trace-events index 5589db4a01..024786f483 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -36,9 +36,9 @@ pci_nvme_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2 pci_nvme_map_addr(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64"" pci_nvme_map_addr_cmb(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64"" pci_nvme_map_prp(uint64_t trans_len, uint32_t len, uint64_t prp1, uint64_t prp2, int num_prps) "trans_len %"PRIu64" len %"PRIu32" prp1 0x%"PRIx64" prp2 0x%"PRIx64" num_prps %d" -pci_nvme_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode) "cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 
0x%"PRIx8"" -pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode) "cid %"PRIu16" sqid %"PRIu16" opc 0x%"PRIx8"" -pci_nvme_rw(const char *verb, uint32_t blk_count, uint64_t byte_count, uint64_t lba) "%s %"PRIu32" blocks (%"PRIu64" bytes) from LBA %"PRIu64"" +pci_nvme_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'" +pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'" +pci_nvme_rw(uint16_t cid, const char *verb, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" '%s' nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64"" pci_nvme_rw_cb(uint16_t cid) "cid %"PRIu16"" pci_nvme_write_zeroes(uint16_t cid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" slba %"PRIu64" nlb %"PRIu32"" pci_nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16"" From 6a09a3d737adbacd2b1d19596f0ac05de2a08aa7 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 24 Aug 2020 12:43:38 +0200 Subject: [PATCH 09/30] hw/block/nvme: refactor aio submission This pulls block layer aio submission/completion to common functions. For completions, additionally map an AIO error to the Unrecovered Read and Write Fault status codes. 
Signed-off-by: Klaus Jensen Reviewed-by: Keith Busch --- hw/block/nvme.c | 136 ++++++++++++++++++++++++++++++------------ hw/block/nvme.h | 14 +++++ hw/block/trace-events | 4 +- 3 files changed, 114 insertions(+), 40 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 961e6ffc5b..84cde40fad 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -614,30 +614,110 @@ static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns, static void nvme_rw_cb(void *opaque, int ret) { NvmeRequest *req = opaque; - NvmeSQueue *sq = req->sq; - NvmeCtrl *n = sq->ctrl; - NvmeCQueue *cq = n->cq[sq->cqid]; + NvmeCtrl *n = nvme_ctrl(req); - trace_pci_nvme_rw_cb(nvme_cid(req)); + BlockBackend *blk = n->conf.blk; + BlockAcctCookie *acct = &req->acct; + BlockAcctStats *stats = blk_get_stats(blk); + + Error *local_err = NULL; + + trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk)); if (!ret) { - block_acct_done(blk_get_stats(n->conf.blk), &req->acct); + block_acct_done(stats, acct); req->status = NVME_SUCCESS; } else { - block_acct_failed(blk_get_stats(n->conf.blk), &req->acct); - req->status = NVME_INTERNAL_DEV_ERROR; + uint16_t status; + + block_acct_failed(stats, acct); + + switch (req->cmd.opcode) { + case NVME_CMD_READ: + status = NVME_UNRECOVERED_READ; + break; + case NVME_CMD_FLUSH: + case NVME_CMD_WRITE: + case NVME_CMD_WRITE_ZEROES: + status = NVME_WRITE_FAULT; + break; + default: + status = NVME_INTERNAL_DEV_ERROR; + break; + } + + trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), status); + + error_setg_errno(&local_err, -ret, "aio failed"); + error_report_err(local_err); + + req->status = status; } - nvme_enqueue_req_completion(cq, req); + nvme_enqueue_req_completion(nvme_cq(req), req); +} + +static uint16_t nvme_do_aio(BlockBackend *blk, int64_t offset, size_t len, + NvmeRequest *req) +{ + BlockAcctCookie *acct = &req->acct; + BlockAcctStats *stats = blk_get_stats(blk); + + bool is_write = false; + + trace_pci_nvme_do_aio(nvme_cid(req), 
req->cmd.opcode, + nvme_io_opc_str(req->cmd.opcode), blk_name(blk), + offset, len); + + switch (req->cmd.opcode) { + case NVME_CMD_FLUSH: + block_acct_start(stats, acct, 0, BLOCK_ACCT_FLUSH); + req->aiocb = blk_aio_flush(blk, nvme_rw_cb, req); + break; + + case NVME_CMD_WRITE_ZEROES: + block_acct_start(stats, acct, len, BLOCK_ACCT_WRITE); + req->aiocb = blk_aio_pwrite_zeroes(blk, offset, len, + BDRV_REQ_MAY_UNMAP, nvme_rw_cb, + req); + break; + + case NVME_CMD_WRITE: + is_write = true; + + /* fallthrough */ + + case NVME_CMD_READ: + block_acct_start(stats, acct, len, + is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ); + + if (req->qsg.sg) { + if (is_write) { + req->aiocb = dma_blk_write(blk, &req->qsg, offset, + BDRV_SECTOR_SIZE, nvme_rw_cb, req); + } else { + req->aiocb = dma_blk_read(blk, &req->qsg, offset, + BDRV_SECTOR_SIZE, nvme_rw_cb, req); + } + } else { + if (is_write) { + req->aiocb = blk_aio_pwritev(blk, offset, &req->iov, 0, + nvme_rw_cb, req); + } else { + req->aiocb = blk_aio_preadv(blk, offset, &req->iov, 0, + nvme_rw_cb, req); + } + } + + break; + } + + return NVME_NO_COMPLETE; } static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req) { - block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0, - BLOCK_ACCT_FLUSH); - req->aiocb = blk_aio_flush(n->conf.blk, nvme_rw_cb, req); - - return NVME_NO_COMPLETE; + return nvme_do_aio(n->conf.blk, 0, 0, req); } static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req) @@ -658,11 +738,7 @@ static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req) return status; } - block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0, - BLOCK_ACCT_WRITE); - req->aiocb = blk_aio_pwrite_zeroes(n->conf.blk, offset, count, - BDRV_REQ_MAY_UNMAP, nvme_rw_cb, req); - return NVME_NO_COMPLETE; + return nvme_do_aio(n->conf.blk, offset, count, req); } static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req) @@ -674,8 +750,8 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req) uint64_t data_size = nvme_l2b(ns, nlb); 
uint64_t data_offset = nvme_l2b(ns, slba); - int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0; - enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ; + enum BlockAcctType acct = req->cmd.opcode == NVME_CMD_WRITE ? + BLOCK_ACCT_WRITE : BLOCK_ACCT_READ; uint16_t status; trace_pci_nvme_rw(nvme_cid(req), nvme_io_opc_str(rw->opcode), nlb, @@ -698,25 +774,7 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req) goto invalid; } - if (req->qsg.nsg > 0) { - block_acct_start(blk_get_stats(n->conf.blk), &req->acct, req->qsg.size, - acct); - req->aiocb = is_write ? - dma_blk_write(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE, - nvme_rw_cb, req) : - dma_blk_read(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE, - nvme_rw_cb, req); - } else { - block_acct_start(blk_get_stats(n->conf.blk), &req->acct, req->iov.size, - acct); - req->aiocb = is_write ? - blk_aio_pwritev(n->conf.blk, data_offset, &req->iov, 0, nvme_rw_cb, - req) : - blk_aio_preadv(n->conf.blk, data_offset, &req->iov, 0, nvme_rw_cb, - req); - } - - return NVME_NO_COMPLETE; + return nvme_do_aio(n->conf.blk, data_offset, data_size, req); invalid: block_acct_invalid(blk_get_stats(n->conf.blk), acct); diff --git a/hw/block/nvme.h b/hw/block/nvme.h index ce9e931420..f355eccb32 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -171,4 +171,18 @@ static inline uint64_t nvme_ns_nlbas(NvmeCtrl *n, NvmeNamespace *ns) return n->ns_size >> nvme_ns_lbads(ns); } +static inline NvmeCQueue *nvme_cq(NvmeRequest *req) +{ + NvmeSQueue *sq = req->sq; + NvmeCtrl *n = sq->ctrl; + + return n->cq[sq->cqid]; +} + +static inline NvmeCtrl *nvme_ctrl(NvmeRequest *req) +{ + NvmeSQueue *sq = req->sq; + return sq->ctrl; +} + #endif /* HW_NVME_H */ diff --git a/hw/block/trace-events b/hw/block/trace-events index 024786f483..9a6c5fb3dd 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -39,8 +39,9 @@ pci_nvme_map_prp(uint64_t trans_len, uint32_t len, uint64_t prp1, uint64_t prp2, 
pci_nvme_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'" pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'" pci_nvme_rw(uint16_t cid, const char *verb, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" '%s' nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64"" -pci_nvme_rw_cb(uint16_t cid) "cid %"PRIu16"" +pci_nvme_rw_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'" pci_nvme_write_zeroes(uint16_t cid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" slba %"PRIu64" nlb %"PRIu32"" +pci_nvme_do_aio(uint16_t cid, uint8_t opc, const char *opname, const char *blkname, int64_t offset, size_t len) "cid %"PRIu16" opc 0x%"PRIx8" opname '%s' blk '%s' offset %"PRId64" len %zu" pci_nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16"" pci_nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d" pci_nvme_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16"" @@ -89,6 +90,7 @@ pci_nvme_err_mdts(uint16_t cid, size_t len) "cid %"PRIu16" len %zu" pci_nvme_err_addr_read(uint64_t addr) "addr 0x%"PRIx64"" pci_nvme_err_addr_write(uint64_t addr) "addr 0x%"PRIx64"" pci_nvme_err_cfs(void) "controller fatal status" +pci_nvme_err_aio(uint16_t cid, const char *errname, uint16_t status) "cid %"PRIu16" err '%s' status 0x%"PRIx16"" pci_nvme_err_invalid_dma(void) "PRP/SGL is too small for transfer size" pci_nvme_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is null or not page aligned: 0x%"PRIx64"" pci_nvme_err_invalid_prp2_align(uint64_t prp2) "PRP2 is 
not page aligned: 0x%"PRIx64"" From 92a10ec17f3ae7221b23f3eaefa29066e10d7973 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 24 Aug 2020 13:32:06 +0200 Subject: [PATCH 10/30] hw/block/nvme: default request status to success MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make the default request status NVME_SUCCESS so only error status codes have to be set. Signed-off-by: Klaus Jensen Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Keith Busch --- hw/block/nvme.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 84cde40fad..0e916d48d7 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -230,6 +230,7 @@ static void nvme_req_clear(NvmeRequest *req) { req->ns = NULL; memset(&req->cqe, 0x0, sizeof(req->cqe)); + req->status = NVME_SUCCESS; } static void nvme_req_exit(NvmeRequest *req) @@ -546,8 +547,6 @@ static void nvme_process_aers(void *opaque) result->log_page = event->result.log_page; g_free(event); - req->status = NVME_SUCCESS; - trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info, result->log_page); @@ -626,7 +625,6 @@ static void nvme_rw_cb(void *opaque, int ret) if (!ret) { block_acct_done(stats, acct); - req->status = NVME_SUCCESS; } else { uint16_t status; From 6e0ac3a03f3ab0d0b69d086e3226bac77a20a468 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 30 Mar 2020 23:23:15 +0200 Subject: [PATCH 11/30] hw/block/nvme: harden cmb access Since the controller has only supported PRPs so far it has not been required to check the ending address (addr + len - 1) of the CMB access for validity since it has been guaranteed to be in range of the CMB. This changes when the controller adds support for SGLs (next patch), so add that check. 
Signed-off-by: Klaus Jensen Reviewed-by: Keith Busch --- hw/block/nvme.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 0e916d48d7..c0f1f8ccd4 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -142,7 +142,12 @@ static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr) static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size) { - if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) { + hwaddr hi = addr + size - 1; + if (hi < addr) { + return 1; + } + + if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) { memcpy(buf, nvme_addr_to_cmb(n, addr), size); return 0; } From cba0a8a344fea94aa2212e105611b8e099343cb1 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Fri, 12 Apr 2019 20:53:16 +0200 Subject: [PATCH 12/30] hw/block/nvme: add support for scatter gather lists For now, support the Data Block, Segment and Last Segment descriptor types. See NVM Express 1.3d, Section 4.4 ("Scatter Gather List (SGL)"). Signed-off-by: Klaus Jensen Reviewed-by: Keith Busch --- hw/block/nvme.c | 329 ++++++++++++++++++++++++++++++++++-------- hw/block/trace-events | 4 + include/block/nvme.h | 6 +- 3 files changed, 279 insertions(+), 60 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index c0f1f8ccd4..63d0a17bc5 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -413,13 +413,262 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, return NVME_SUCCESS; } -static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len, - uint64_t prp1, uint64_t prp2, DMADirection dir, +/* + * Map 'nsgld' data descriptors from 'segment'. The function will subtract the + * number of bytes mapped in len. 
+ */ +static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList *qsg, + QEMUIOVector *iov, + NvmeSglDescriptor *segment, uint64_t nsgld, + size_t *len, NvmeRequest *req) +{ + dma_addr_t addr, trans_len; + uint32_t dlen; + uint16_t status; + + for (int i = 0; i < nsgld; i++) { + uint8_t type = NVME_SGL_TYPE(segment[i].type); + + switch (type) { + case NVME_SGL_DESCR_TYPE_DATA_BLOCK: + break; + case NVME_SGL_DESCR_TYPE_SEGMENT: + case NVME_SGL_DESCR_TYPE_LAST_SEGMENT: + return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR; + default: + return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR; + } + + dlen = le32_to_cpu(segment[i].len); + if (!dlen) { + continue; + } + + if (*len == 0) { + /* + * All data has been mapped, but the SGL contains additional + * segments and/or descriptors. The controller might accept + * ignoring the rest of the SGL. + */ + uint16_t sgls = le16_to_cpu(n->id_ctrl.sgls); + if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) { + break; + } + + trace_pci_nvme_err_invalid_sgl_excess_length(nvme_cid(req)); + return NVME_DATA_SGL_LEN_INVALID | NVME_DNR; + } + + trans_len = MIN(*len, dlen); + addr = le64_to_cpu(segment[i].addr); + + if (UINT64_MAX - addr < dlen) { + return NVME_DATA_SGL_LEN_INVALID | NVME_DNR; + } + + status = nvme_map_addr(n, qsg, iov, addr, trans_len); + if (status) { + return status; + } + + *len -= trans_len; + } + + return NVME_SUCCESS; +} + +static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, + NvmeSglDescriptor sgl, size_t len, NvmeRequest *req) +{ + /* + * Read the segment in chunks of 256 descriptors (one 4k page) to avoid + * dynamically allocating a potentially huge SGL. The spec allows the SGL + * to be larger (as in number of bytes required to describe the SGL + * descriptors and segment chain) than the command transfer size, so it is + * not bounded by MDTS. 
+ */ + const int SEG_CHUNK_SIZE = 256; + + NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld; + uint64_t nsgld; + uint32_t seg_len; + uint16_t status; + bool sgl_in_cmb = false; + hwaddr addr; + int ret; + + sgld = &sgl; + addr = le64_to_cpu(sgl.addr); + + trace_pci_nvme_map_sgl(nvme_cid(req), NVME_SGL_TYPE(sgl.type), len); + + /* + * If the entire transfer can be described with a single data block it can + * be mapped directly. + */ + if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) { + status = nvme_map_sgl_data(n, qsg, iov, sgld, 1, &len, req); + if (status) { + goto unmap; + } + + goto out; + } + + /* + * If the segment is located in the CMB, the submission queue of the + * request must also reside there. + */ + if (nvme_addr_is_cmb(n, addr)) { + if (!nvme_addr_is_cmb(n, req->sq->dma_addr)) { + return NVME_INVALID_USE_OF_CMB | NVME_DNR; + } + + sgl_in_cmb = true; + } + + for (;;) { + switch (NVME_SGL_TYPE(sgld->type)) { + case NVME_SGL_DESCR_TYPE_SEGMENT: + case NVME_SGL_DESCR_TYPE_LAST_SEGMENT: + break; + default: + return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR; + } + + seg_len = le32_to_cpu(sgld->len); + + /* check the length of the (Last) Segment descriptor */ + if (!seg_len || seg_len & 0xf) { + return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR; + } + + if (UINT64_MAX - addr < seg_len) { + return NVME_DATA_SGL_LEN_INVALID | NVME_DNR; + } + + nsgld = seg_len / sizeof(NvmeSglDescriptor); + + while (nsgld > SEG_CHUNK_SIZE) { + if (nvme_addr_read(n, addr, segment, sizeof(segment))) { + trace_pci_nvme_err_addr_read(addr); + status = NVME_DATA_TRAS_ERROR; + goto unmap; + } + + status = nvme_map_sgl_data(n, qsg, iov, segment, SEG_CHUNK_SIZE, + &len, req); + if (status) { + goto unmap; + } + + nsgld -= SEG_CHUNK_SIZE; + addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor); + } + + ret = nvme_addr_read(n, addr, segment, nsgld * + sizeof(NvmeSglDescriptor)); + if (ret) { + trace_pci_nvme_err_addr_read(addr); + status = NVME_DATA_TRAS_ERROR; + goto 
unmap; + } + + last_sgld = &segment[nsgld - 1]; + + /* if the segment ends with a Data Block, then we are done */ + if (NVME_SGL_TYPE(last_sgld->type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) { + status = nvme_map_sgl_data(n, qsg, iov, segment, nsgld, &len, req); + if (status) { + goto unmap; + } + + goto out; + } + + /* + * If the last descriptor was not a Data Block, then the current + * segment must not be a Last Segment. + */ + if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) { + status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR; + goto unmap; + } + + sgld = last_sgld; + addr = le64_to_cpu(sgld->addr); + + /* + * Do not map the last descriptor; it will be a Segment or Last Segment + * descriptor and is handled by the next iteration. + */ + status = nvme_map_sgl_data(n, qsg, iov, segment, nsgld - 1, &len, req); + if (status) { + goto unmap; + } + + /* + * If the next segment is in the CMB, make sure that the sgl was + * already located there. + */ + if (sgl_in_cmb != nvme_addr_is_cmb(n, addr)) { + status = NVME_INVALID_USE_OF_CMB | NVME_DNR; + goto unmap; + } + } + +out: + /* if there is any residual left in len, the SGL was too short */ + if (len) { + status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR; + goto unmap; + } + + return NVME_SUCCESS; + +unmap: + if (iov->iov) { + qemu_iovec_destroy(iov); + } + + if (qsg->sg) { + qemu_sglist_destroy(qsg); + } + + return status; +} + +static uint16_t nvme_map_dptr(NvmeCtrl *n, size_t len, NvmeRequest *req) +{ + uint64_t prp1, prp2; + + switch (NVME_CMD_FLAGS_PSDT(req->cmd.flags)) { + case NVME_PSDT_PRP: + prp1 = le64_to_cpu(req->cmd.dptr.prp1); + prp2 = le64_to_cpu(req->cmd.dptr.prp2); + + return nvme_map_prp(n, prp1, prp2, len, req); + case NVME_PSDT_SGL_MPTR_CONTIGUOUS: + case NVME_PSDT_SGL_MPTR_SGL: + /* SGLs shall not be used for Admin commands in NVMe over PCIe */ + if (!req->sq->sqid) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + return nvme_map_sgl(n, &req->qsg, &req->iov, req->cmd.dptr.sgl, len, + req); 
+ default: + return NVME_INVALID_FIELD; + } +} + +static uint16_t nvme_dma(NvmeCtrl *n, uint8_t *ptr, uint32_t len, + DMADirection dir, NvmeRequest *req) { uint16_t status = NVME_SUCCESS; - status = nvme_map_prp(n, prp1, prp2, len, req); + status = nvme_map_dptr(n, len, req); if (status) { return status; } @@ -458,15 +707,6 @@ static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len, return status; } -static uint16_t nvme_map_dptr(NvmeCtrl *n, size_t len, NvmeRequest *req) -{ - NvmeCmd *cmd = &req->cmd; - uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1); - uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2); - - return nvme_map_prp(n, prp1, prp2, len, req); -} - static void nvme_post_cqes(void *opaque) { NvmeCQueue *cq = opaque; @@ -929,10 +1169,7 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req) static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, uint64_t off, NvmeRequest *req) { - NvmeCmd *cmd = &req->cmd; - uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1); - uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2); - uint32_t nsid = le32_to_cpu(cmd->nsid); + uint32_t nsid = le32_to_cpu(req->cmd.nsid); uint32_t trans_len; time_t current_ms; @@ -981,17 +1218,14 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, nvme_clear_events(n, NVME_AER_TYPE_SMART); } - return nvme_dma_prp(n, (uint8_t *) &smart + off, trans_len, prp1, prp2, - DMA_DIRECTION_FROM_DEVICE, req); + return nvme_dma(n, (uint8_t *) &smart + off, trans_len, + DMA_DIRECTION_FROM_DEVICE, req); } static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off, NvmeRequest *req) { uint32_t trans_len; - NvmeCmd *cmd = &req->cmd; - uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1); - uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2); NvmeFwSlotInfoLog fw_log = { .afi = 0x1, }; @@ -1004,17 +1238,14 @@ static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off, trans_len = MIN(sizeof(fw_log) - off, buf_len); - return nvme_dma_prp(n, (uint8_t *) 
&fw_log + off, trans_len, prp1, prp2, - DMA_DIRECTION_FROM_DEVICE, req); + return nvme_dma(n, (uint8_t *) &fw_log + off, trans_len, + DMA_DIRECTION_FROM_DEVICE, req); } static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, uint64_t off, NvmeRequest *req) { uint32_t trans_len; - NvmeCmd *cmd = &req->cmd; - uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1); - uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2); NvmeErrorLog errlog; if (!rae) { @@ -1029,8 +1260,8 @@ static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, trans_len = MIN(sizeof(errlog) - off, buf_len); - return nvme_dma_prp(n, (uint8_t *)&errlog, trans_len, prp1, prp2, - DMA_DIRECTION_FROM_DEVICE, req); + return nvme_dma(n, (uint8_t *)&errlog, trans_len, + DMA_DIRECTION_FROM_DEVICE, req); } static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req) @@ -1190,14 +1421,10 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req) static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req) { - NvmeIdentify *c = (NvmeIdentify *)&req->cmd; - uint64_t prp1 = le64_to_cpu(c->prp1); - uint64_t prp2 = le64_to_cpu(c->prp2); - trace_pci_nvme_identify_ctrl(); - return nvme_dma_prp(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), prp1, - prp2, DMA_DIRECTION_FROM_DEVICE, req); + return nvme_dma(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), + DMA_DIRECTION_FROM_DEVICE, req); } static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req) @@ -1205,8 +1432,6 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req) NvmeNamespace *ns; NvmeIdentify *c = (NvmeIdentify *)&req->cmd; uint32_t nsid = le32_to_cpu(c->nsid); - uint64_t prp1 = le64_to_cpu(c->prp1); - uint64_t prp2 = le64_to_cpu(c->prp2); trace_pci_nvme_identify_ns(nsid); @@ -1217,8 +1442,8 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req) ns = &n->namespaces[nsid - 1]; - return nvme_dma_prp(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns), prp1, - prp2, DMA_DIRECTION_FROM_DEVICE, req); + return 
nvme_dma(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns), + DMA_DIRECTION_FROM_DEVICE, req); } static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req) @@ -1226,8 +1451,6 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req) NvmeIdentify *c = (NvmeIdentify *)&req->cmd; static const int data_len = NVME_IDENTIFY_DATA_SIZE; uint32_t min_nsid = le32_to_cpu(c->nsid); - uint64_t prp1 = le64_to_cpu(c->prp1); - uint64_t prp2 = le64_to_cpu(c->prp2); uint32_t *list; uint16_t ret; int i, j = 0; @@ -1254,8 +1477,8 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req) break; } } - ret = nvme_dma_prp(n, (uint8_t *)list, data_len, prp1, prp2, - DMA_DIRECTION_FROM_DEVICE, req); + ret = nvme_dma(n, (uint8_t *)list, data_len, DMA_DIRECTION_FROM_DEVICE, + req); g_free(list); return ret; } @@ -1264,8 +1487,6 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req) { NvmeIdentify *c = (NvmeIdentify *)&req->cmd; uint32_t nsid = le32_to_cpu(c->nsid); - uint64_t prp1 = le64_to_cpu(c->prp1); - uint64_t prp2 = le64_to_cpu(c->prp2); uint8_t list[NVME_IDENTIFY_DATA_SIZE]; @@ -1297,8 +1518,8 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req) ns_descrs->uuid.hdr.nidl = NVME_NIDT_UUID_LEN; stl_be_p(&ns_descrs->uuid.v, nsid); - return nvme_dma_prp(n, list, NVME_IDENTIFY_DATA_SIZE, prp1, prp2, - DMA_DIRECTION_FROM_DEVICE, req); + return nvme_dma(n, list, NVME_IDENTIFY_DATA_SIZE, + DMA_DIRECTION_FROM_DEVICE, req); } static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req) @@ -1369,14 +1590,10 @@ static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n) static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req) { - NvmeCmd *cmd = &req->cmd; - uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1); - uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2); - uint64_t timestamp = nvme_get_timestamp(n); - return nvme_dma_prp(n, (uint8_t *)&timestamp, sizeof(timestamp), prp1, - prp2, DMA_DIRECTION_FROM_DEVICE, req); +
return nvme_dma(n, (uint8_t *)&timestamp, sizeof(timestamp), + DMA_DIRECTION_FROM_DEVICE, req); } static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req) @@ -1505,12 +1722,9 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req) { uint16_t ret; uint64_t timestamp; - NvmeCmd *cmd = &req->cmd; - uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1); - uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2); - ret = nvme_dma_prp(n, (uint8_t *)&timestamp, sizeof(timestamp), prp1, - prp2, DMA_DIRECTION_TO_DEVICE, req); + ret = nvme_dma(n, (uint8_t *)&timestamp, sizeof(timestamp), + DMA_DIRECTION_TO_DEVICE, req); if (ret != NVME_SUCCESS) { return ret; } @@ -2437,6 +2651,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->nn = cpu_to_le32(n->num_namespaces); id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP | NVME_ONCS_FEATURES); + id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN); subnqn = g_strdup_printf("nqn.2019-08.org.qemu:%s", n->params.serial); strpadcpy((char *)id->subnqn, sizeof(id->subnqn), subnqn, '\0'); diff --git a/hw/block/trace-events b/hw/block/trace-events index 9a6c5fb3dd..22ea635144 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -36,6 +36,7 @@ pci_nvme_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2 pci_nvme_map_addr(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64"" pci_nvme_map_addr_cmb(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64"" pci_nvme_map_prp(uint64_t trans_len, uint32_t len, uint64_t prp1, uint64_t prp2, int num_prps) "trans_len %"PRIu64" len %"PRIu32" prp1 0x%"PRIx64" prp2 0x%"PRIx64" num_prps %d" +pci_nvme_map_sgl(uint16_t cid, uint8_t typ, uint64_t len) "cid %"PRIu16" type 0x%"PRIx8" len %"PRIu64"" pci_nvme_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'" pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode, const char
*opname) "cid %"PRIu16" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'" pci_nvme_rw(uint16_t cid, const char *verb, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" '%s' nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64"" @@ -91,6 +92,9 @@ pci_nvme_err_addr_read(uint64_t addr) "addr 0x%"PRIx64"" pci_nvme_err_addr_write(uint64_t addr) "addr 0x%"PRIx64"" pci_nvme_err_cfs(void) "controller fatal status" pci_nvme_err_aio(uint16_t cid, const char *errname, uint16_t status) "cid %"PRIu16" err '%s' status 0x%"PRIx16"" +pci_nvme_err_invalid_sgld(uint16_t cid, uint8_t typ) "cid %"PRIu16" type 0x%"PRIx8"" +pci_nvme_err_invalid_num_sgld(uint16_t cid, uint8_t typ) "cid %"PRIu16" type 0x%"PRIx8"" +pci_nvme_err_invalid_sgl_excess_length(uint16_t cid) "cid %"PRIu16"" pci_nvme_err_invalid_dma(void) "PRP/SGL is too small for transfer size" pci_nvme_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is null or not page aligned: 0x%"PRIx64"" pci_nvme_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PRIx64"" diff --git a/include/block/nvme.h b/include/block/nvme.h index 65e68a82c8..58647bcdad 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -412,9 +412,9 @@ typedef union NvmeCmdDptr { } NvmeCmdDptr; enum NvmePsdt { - PSDT_PRP = 0x0, - PSDT_SGL_MPTR_CONTIGUOUS = 0x1, - PSDT_SGL_MPTR_SGL = 0x2, + NVME_PSDT_PRP = 0x0, + NVME_PSDT_SGL_MPTR_CONTIGUOUS = 0x1, + NVME_PSDT_SGL_MPTR_SGL = 0x2, }; typedef struct QEMU_PACKED NvmeCmd { From d97eee64fef35655bd06f5c44a07fdb83a6274ae Mon Sep 17 00:00:00 2001 From: Gollu Appalanaidu Date: Wed, 18 Mar 2020 14:11:19 +0530 Subject: [PATCH 13/30] hw/block/nvme: add support for sgl bit bucket descriptor This adds support for SGL descriptor type 0x1 (bit bucket descriptor). See the NVM Express v1.3d specification, Section 4.4 ("Scatter Gather List (SGL)"). 
Signed-off-by: Gollu Appalanaidu Signed-off-by: Klaus Jensen Reviewed-by: Keith Busch --- hw/block/nvme.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 63d0a17bc5..4f08f55a76 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -430,6 +430,10 @@ static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList *qsg, uint8_t type = NVME_SGL_TYPE(segment[i].type); switch (type) { + case NVME_SGL_DESCR_TYPE_BIT_BUCKET: + if (req->cmd.opcode == NVME_CMD_WRITE) { + continue; + } case NVME_SGL_DESCR_TYPE_DATA_BLOCK: break; case NVME_SGL_DESCR_TYPE_SEGMENT: @@ -440,6 +444,7 @@ static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList *qsg, } dlen = le32_to_cpu(segment[i].len); + if (!dlen) { continue; } @@ -460,6 +465,11 @@ static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList *qsg, } trans_len = MIN(*len, dlen); + + if (type == NVME_SGL_DESCR_TYPE_BIT_BUCKET) { + goto next; + } + addr = le64_to_cpu(segment[i].addr); if (UINT64_MAX - addr < dlen) { @@ -471,6 +481,7 @@ static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList *qsg, return status; } +next: *len -= trans_len; } @@ -540,7 +551,8 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, seg_len = le32_to_cpu(sgld->len); /* check the length of the (Last) Segment descriptor */ - if (!seg_len || seg_len & 0xf) { + if ((!seg_len || seg_len & 0xf) && + (NVME_SGL_TYPE(sgld->type) != NVME_SGL_DESCR_TYPE_BIT_BUCKET)) { return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR; } @@ -577,19 +589,27 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, last_sgld = &segment[nsgld - 1]; - /* if the segment ends with a Data Block, then we are done */ - if (NVME_SGL_TYPE(last_sgld->type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) { + /* + * If the segment ends with a Data Block or Bit Bucket Descriptor Type, + * then we are done. 
+ */ + switch (NVME_SGL_TYPE(last_sgld->type)) { + case NVME_SGL_DESCR_TYPE_DATA_BLOCK: + case NVME_SGL_DESCR_TYPE_BIT_BUCKET: status = nvme_map_sgl_data(n, qsg, iov, segment, nsgld, &len, req); if (status) { goto unmap; } goto out; + + default: + break; } /* - * If the last descriptor was not a Data Block, then the current - * segment must not be a Last Segment. + * If the last descriptor was not a Data Block or Bit Bucket, then the + * current segment must not be a Last Segment. */ if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) { status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR; @@ -2651,7 +2671,8 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->nn = cpu_to_le32(n->num_namespaces); id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP | NVME_ONCS_FEATURES); - id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN); + id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN | + NVME_CTRL_SGLS_BITBUCKET); subnqn = g_strdup_printf("nqn.2019-08.org.qemu:%s", n->params.serial); strpadcpy((char *)id->subnqn, sizeof(id->subnqn), subnqn, '\0'); From 7c9c350c15289b601766c391024e0bfea8a36ee0 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Sun, 23 Feb 2020 14:35:20 -0800 Subject: [PATCH 14/30] hw/block/nvme: refactor identify active namespace id list Prepare to support inactive namespaces. 
Signed-off-by: Klaus Jensen Reviewed-by: Maxim Levitsky Reviewed-by: Keith Busch --- hw/block/nvme.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 4f08f55a76..924279d602 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1473,7 +1473,7 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req) uint32_t min_nsid = le32_to_cpu(c->nsid); uint32_t *list; uint16_t ret; - int i, j = 0; + int j = 0; trace_pci_nvme_identify_nslist(min_nsid); @@ -1488,11 +1488,11 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req) } list = g_malloc0(data_len); - for (i = 0; i < n->num_namespaces; i++) { - if (i < min_nsid) { + for (int i = 1; i <= n->num_namespaces; i++) { + if (i <= min_nsid) { continue; } - list[j++] = cpu_to_le32(i + 1); + list[j++] = cpu_to_le32(i); if (j == data_len / sizeof(uint32_t)) { break; } From 7f0f1acedf159d00684d495d7a14d52220c1d16b Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Wed, 26 Jun 2019 08:51:06 +0200 Subject: [PATCH 15/30] hw/block/nvme: support multiple namespaces This adds support for multiple namespaces by introducing a new 'nvme-ns' device model. The nvme device creates a bus named from the device name ('id'). The nvme-ns devices then connect to this and registers themselves with the nvme device. This changes how an nvme device is created. Example with two namespaces: -drive file=nvme0n1.img,if=none,id=disk1 -drive file=nvme0n2.img,if=none,id=disk2 -device nvme,serial=deadbeef,id=nvme0 -device nvme-ns,drive=disk1,bus=nvme0,nsid=1 -device nvme-ns,drive=disk2,bus=nvme0,nsid=2 The drive property is kept on the nvme device to keep the change backward compatible, but the property is now optional. Specifying a drive for the nvme device will always create the namespace with nsid 1. 
Signed-off-by: Klaus Jensen Reviewed-by: Keith Busch Reviewed-by: Minwoo Im --- hw/block/meson.build | 2 +- hw/block/nvme-ns.c | 167 ++++++++++++++++++++++++++++ hw/block/nvme-ns.h | 74 +++++++++++++ hw/block/nvme.c | 249 +++++++++++++++++++++++++++--------------- hw/block/nvme.h | 46 ++++---- hw/block/trace-events | 6 +- 6 files changed, 428 insertions(+), 116 deletions(-) create mode 100644 hw/block/nvme-ns.c create mode 100644 hw/block/nvme-ns.h diff --git a/hw/block/meson.build b/hw/block/meson.build index 78cad8f7cb..602ca6c854 100644 --- a/hw/block/meson.build +++ b/hw/block/meson.build @@ -13,7 +13,7 @@ softmmu_ss.add(when: 'CONFIG_SSI_M25P80', if_true: files('m25p80.c')) softmmu_ss.add(when: 'CONFIG_SWIM', if_true: files('swim.c')) softmmu_ss.add(when: 'CONFIG_XEN', if_true: files('xen-block.c')) softmmu_ss.add(when: 'CONFIG_SH4', if_true: files('tc58128.c')) -softmmu_ss.add(when: 'CONFIG_NVME_PCI', if_true: files('nvme.c')) +softmmu_ss.add(when: 'CONFIG_NVME_PCI', if_true: files('nvme.c', 'nvme-ns.c')) specific_ss.add(when: 'CONFIG_VIRTIO_BLK', if_true: files('virtio-blk.c')) specific_ss.add(when: 'CONFIG_VHOST_USER_BLK', if_true: files('vhost-user-blk.c')) diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c new file mode 100644 index 0000000000..2ba0263dda --- /dev/null +++ b/hw/block/nvme-ns.c @@ -0,0 +1,167 @@ +/* + * QEMU NVM Express Virtual Namespace + * + * Copyright (c) 2019 CNEX Labs + * Copyright (c) 2020 Samsung Electronics + * + * Authors: + * Klaus Jensen + * + * This work is licensed under the terms of the GNU GPL, version 2. See the + * COPYING file in the top-level directory. 
+ * + */ + +#include "qemu/osdep.h" +#include "qemu/units.h" +#include "qemu/cutils.h" +#include "qemu/log.h" +#include "hw/block/block.h" +#include "hw/pci/pci.h" +#include "sysemu/sysemu.h" +#include "sysemu/block-backend.h" +#include "qapi/error.h" + +#include "hw/qdev-properties.h" +#include "hw/qdev-core.h" + +#include "nvme.h" +#include "nvme-ns.h" + +static void nvme_ns_init(NvmeNamespace *ns) +{ + NvmeIdNs *id_ns = &ns->id_ns; + + if (blk_get_flags(ns->blkconf.blk) & BDRV_O_UNMAP) { + ns->id_ns.dlfeat = 0x9; + } + + id_ns->lbaf[0].ds = BDRV_SECTOR_BITS; + + id_ns->nsze = cpu_to_le64(nvme_ns_nlbas(ns)); + + /* no thin provisioning */ + id_ns->ncap = id_ns->nsze; + id_ns->nuse = id_ns->ncap; +} + +static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) +{ + if (!blkconf_blocksizes(&ns->blkconf, errp)) { + return -1; + } + + if (!blkconf_apply_backend_options(&ns->blkconf, + blk_is_read_only(ns->blkconf.blk), + false, errp)) { + return -1; + } + + ns->size = blk_getlength(ns->blkconf.blk); + if (ns->size < 0) { + error_setg_errno(errp, -ns->size, "could not get blockdev size"); + return -1; + } + + if (blk_enable_write_cache(ns->blkconf.blk)) { + n->features.vwc = 0x1; + } + + return 0; +} + +static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp) +{ + if (!ns->blkconf.blk) { + error_setg(errp, "block backend not configured"); + return -1; + } + + return 0; +} + +int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) +{ + if (nvme_ns_check_constraints(ns, errp)) { + return -1; + } + + if (nvme_ns_init_blk(n, ns, errp)) { + return -1; + } + + nvme_ns_init(ns); + if (nvme_register_namespace(n, ns, errp)) { + return -1; + } + + return 0; +} + +void nvme_ns_drain(NvmeNamespace *ns) +{ + blk_drain(ns->blkconf.blk); +} + +void nvme_ns_flush(NvmeNamespace *ns) +{ + blk_flush(ns->blkconf.blk); +} + +static void nvme_ns_realize(DeviceState *dev, Error **errp) +{ + NvmeNamespace *ns = NVME_NS(dev); + BusState *s = 
qdev_get_parent_bus(dev); + NvmeCtrl *n = NVME(s->parent); + Error *local_err = NULL; + + if (nvme_ns_setup(n, ns, &local_err)) { + error_propagate_prepend(errp, local_err, + "could not setup namespace: "); + return; + } +} + +static Property nvme_ns_props[] = { + DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf), + DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0), + DEFINE_PROP_END_OF_LIST(), +}; + +static void nvme_ns_class_init(ObjectClass *oc, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(oc); + + set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); + + dc->bus_type = TYPE_NVME_BUS; + dc->realize = nvme_ns_realize; + device_class_set_props(dc, nvme_ns_props); + dc->desc = "Virtual NVMe namespace"; +} + +static void nvme_ns_instance_init(Object *obj) +{ + NvmeNamespace *ns = NVME_NS(obj); + char *bootindex = g_strdup_printf("/namespace@%d,0", ns->params.nsid); + + device_add_bootindex_property(obj, &ns->bootindex, "bootindex", + bootindex, DEVICE(obj)); + + g_free(bootindex); +} + +static const TypeInfo nvme_ns_info = { + .name = TYPE_NVME_NS, + .parent = TYPE_DEVICE, + .class_init = nvme_ns_class_init, + .instance_size = sizeof(NvmeNamespace), + .instance_init = nvme_ns_instance_init, +}; + +static void nvme_ns_register_types(void) +{ + type_register_static(&nvme_ns_info); +} + +type_init(nvme_ns_register_types) diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h new file mode 100644 index 0000000000..83734f4606 --- /dev/null +++ b/hw/block/nvme-ns.h @@ -0,0 +1,74 @@ +/* + * QEMU NVM Express Virtual Namespace + * + * Copyright (c) 2019 CNEX Labs + * Copyright (c) 2020 Samsung Electronics + * + * Authors: + * Klaus Jensen + * + * This work is licensed under the terms of the GNU GPL, version 2. See the + * COPYING file in the top-level directory. 
+ * + */ + +#ifndef NVME_NS_H +#define NVME_NS_H + +#define TYPE_NVME_NS "nvme-ns" +#define NVME_NS(obj) \ + OBJECT_CHECK(NvmeNamespace, (obj), TYPE_NVME_NS) + +typedef struct NvmeNamespaceParams { + uint32_t nsid; +} NvmeNamespaceParams; + +typedef struct NvmeNamespace { + DeviceState parent_obj; + BlockConf blkconf; + int32_t bootindex; + int64_t size; + NvmeIdNs id_ns; + + NvmeNamespaceParams params; +} NvmeNamespace; + +static inline uint32_t nvme_nsid(NvmeNamespace *ns) +{ + if (ns) { + return ns->params.nsid; + } + + return -1; +} + +static inline NvmeLBAF *nvme_ns_lbaf(NvmeNamespace *ns) +{ + NvmeIdNs *id_ns = &ns->id_ns; + return &id_ns->lbaf[NVME_ID_NS_FLBAS_INDEX(id_ns->flbas)]; +} + +static inline uint8_t nvme_ns_lbads(NvmeNamespace *ns) +{ + return nvme_ns_lbaf(ns)->ds; +} + +/* calculate the number of LBAs that the namespace can accommodate */ +static inline uint64_t nvme_ns_nlbas(NvmeNamespace *ns) +{ + return ns->size >> nvme_ns_lbads(ns); +} + +/* convert an LBA to the equivalent in bytes */ +static inline size_t nvme_l2b(NvmeNamespace *ns, uint64_t lba) +{ + return lba << nvme_ns_lbads(ns); +} + +typedef struct NvmeCtrl NvmeCtrl; + +int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp); +void nvme_ns_drain(NvmeNamespace *ns); +void nvme_ns_flush(NvmeNamespace *ns); + +#endif /* NVME_NS_H */ diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 924279d602..1af12f861a 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -17,12 +17,13 @@ /** * Usage: add options: * -drive file=,if=none,id= - * -device nvme,serial=,id=, \ * cmb_size_mb=, \ * [pmrdev=,] \ * max_ioqpairs=, \ * aerl=, aer_max_queued=, \ * mdts= + * -device nvme-ns,drive=,bus=bus_name,nsid= * * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
@@ -69,6 +70,7 @@ #include "qemu/cutils.h" #include "trace.h" #include "nvme.h" +#include "nvme-ns.h" #define NVME_MAX_IOQPAIRS 0xffff #define NVME_DB_SIZE 4 @@ -155,6 +157,11 @@ static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size) return pci_dma_read(&n->parent_obj, addr, buf, size); } +static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid) +{ + return nsid && (nsid == NVME_NSID_BROADCAST || nsid <= n->num_namespaces); +} + static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid) { return sqid < n->params.max_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1; @@ -878,9 +885,9 @@ static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns, static void nvme_rw_cb(void *opaque, int ret) { NvmeRequest *req = opaque; - NvmeCtrl *n = nvme_ctrl(req); + NvmeNamespace *ns = req->ns; - BlockBackend *blk = n->conf.blk; + BlockBackend *blk = ns->blkconf.blk; BlockAcctCookie *acct = &req->acct; BlockAcctStats *stats = blk_get_stats(blk); @@ -980,7 +987,8 @@ static uint16_t nvme_do_aio(BlockBackend *blk, int64_t offset, size_t len, static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req) { - return nvme_do_aio(n->conf.blk, 0, 0, req); + NvmeNamespace *ns = req->ns; + return nvme_do_aio(ns->blkconf.blk, 0, 0, req); } static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req) @@ -993,7 +1001,7 @@ static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req) uint32_t count = nvme_l2b(ns, nlb); uint16_t status; - trace_pci_nvme_write_zeroes(nvme_cid(req), slba, nlb); + trace_pci_nvme_write_zeroes(nvme_cid(req), nvme_nsid(ns), slba, nlb); status = nvme_check_bounds(n, ns, slba, nlb); if (status) { @@ -1001,7 +1009,7 @@ static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req) return status; } - return nvme_do_aio(n->conf.blk, offset, count, req); + return nvme_do_aio(ns->blkconf.blk, offset, count, req); } static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req) @@ -1017,8 +1025,8 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req) 
BLOCK_ACCT_WRITE : BLOCK_ACCT_READ; uint16_t status; - trace_pci_nvme_rw(nvme_cid(req), nvme_io_opc_str(rw->opcode), nlb, - data_size, slba); + trace_pci_nvme_rw(nvme_cid(req), nvme_io_opc_str(rw->opcode), + nvme_nsid(ns), nlb, data_size, slba); status = nvme_check_mdts(n, data_size); if (status) { @@ -1037,10 +1045,10 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req) goto invalid; } - return nvme_do_aio(n->conf.blk, data_offset, data_size, req); + return nvme_do_aio(ns->blkconf.blk, data_offset, data_size, req); invalid: - block_acct_invalid(blk_get_stats(n->conf.blk), acct); + block_acct_invalid(blk_get_stats(ns->blkconf.blk), acct); return status; } @@ -1051,12 +1059,15 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req), req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode)); - if (unlikely(nsid == 0 || nsid > n->num_namespaces)) { - trace_pci_nvme_err_invalid_ns(nsid, n->num_namespaces); + if (!nvme_nsid_valid(n, nsid)) { return NVME_INVALID_NSID | NVME_DNR; } - req->ns = &n->namespaces[nsid - 1]; + req->ns = nvme_ns(n, nsid); + if (unlikely(!req->ns)) { + return NVME_INVALID_FIELD | NVME_DNR; + } + switch (req->cmd.opcode) { case NVME_CMD_FLUSH: return nvme_flush(n, req); @@ -1196,18 +1207,24 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, uint64_t units_read = 0, units_written = 0; uint64_t read_commands = 0, write_commands = 0; NvmeSmartLog smart; - BlockAcctStats *s; if (nsid && nsid != 0xffffffff) { return NVME_INVALID_FIELD | NVME_DNR; } - s = blk_get_stats(n->conf.blk); + for (int i = 1; i <= n->num_namespaces; i++) { + NvmeNamespace *ns = nvme_ns(n, i); + if (!ns) { + continue; + } - units_read = s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS; - units_written = s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS; - read_commands = s->nr_ops[BLOCK_ACCT_READ]; - write_commands = s->nr_ops[BLOCK_ACCT_WRITE]; + BlockAcctStats *s = 
blk_get_stats(ns->blkconf.blk); + + units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS; + units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS; + read_commands += s->nr_ops[BLOCK_ACCT_READ]; + write_commands += s->nr_ops[BLOCK_ACCT_WRITE]; + } if (off > sizeof(smart)) { return NVME_INVALID_FIELD | NVME_DNR; @@ -1451,18 +1468,23 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req) { NvmeNamespace *ns; NvmeIdentify *c = (NvmeIdentify *)&req->cmd; + NvmeIdNs *id_ns, inactive = { 0 }; uint32_t nsid = le32_to_cpu(c->nsid); trace_pci_nvme_identify_ns(nsid); - if (unlikely(nsid == 0 || nsid > n->num_namespaces)) { - trace_pci_nvme_err_invalid_ns(nsid, n->num_namespaces); + if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) { return NVME_INVALID_NSID | NVME_DNR; } - ns = &n->namespaces[nsid - 1]; + ns = nvme_ns(n, nsid); + if (unlikely(!ns)) { + id_ns = &inactive; + } else { + id_ns = &ns->id_ns; + } - return nvme_dma(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns), + return nvme_dma(n, (uint8_t *)id_ns, sizeof(NvmeIdNs), DMA_DIRECTION_FROM_DEVICE, req); } @@ -1489,7 +1511,7 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req) list = g_malloc0(data_len); for (int i = 1; i <= n->num_namespaces; i++) { - if (i <= min_nsid) { + if (i <= min_nsid || !nvme_ns(n, i)) { continue; } list[j++] = cpu_to_le32(i); @@ -1507,7 +1529,6 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req) { NvmeIdentify *c = (NvmeIdentify *)&req->cmd; uint32_t nsid = le32_to_cpu(c->nsid); - uint8_t list[NVME_IDENTIFY_DATA_SIZE]; struct data { @@ -1521,11 +1542,14 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req) trace_pci_nvme_identify_ns_descr_list(nsid); - if (unlikely(nsid == 0 || nsid > n->num_namespaces)) { - trace_pci_nvme_err_invalid_ns(nsid, n->num_namespaces); + if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) { return NVME_INVALID_NSID | NVME_DNR; } + if 
(unlikely(!nvme_ns(n, nsid))) { + return NVME_INVALID_FIELD | NVME_DNR; + } + memset(list, 0x0, sizeof(list)); /* @@ -1638,7 +1662,7 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req) } if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) { - if (!nsid || nsid > n->num_namespaces) { + if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) { /* * The Reservation Notification Mask and Reservation Persistence * features require a status code of Invalid Field in Command when @@ -1648,6 +1672,10 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req) */ return NVME_INVALID_NSID | NVME_DNR; } + + if (!nvme_ns(n, nsid)) { + return NVME_INVALID_FIELD | NVME_DNR; + } } switch (sel) { @@ -1685,7 +1713,7 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req) return NVME_INVALID_FIELD | NVME_DNR; case NVME_VOLATILE_WRITE_CACHE: - result = blk_enable_write_cache(n->conf.blk); + result = n->features.vwc; trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled"); goto out; case NVME_ASYNCHRONOUS_EVENT_CONF: @@ -1756,6 +1784,8 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req) static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req) { + NvmeNamespace *ns; + NvmeCmd *cmd = &req->cmd; uint32_t dw10 = le32_to_cpu(cmd->cdw10); uint32_t dw11 = le32_to_cpu(cmd->cdw11); @@ -1774,12 +1804,18 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req) } if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) { - if (!nsid || (nsid != NVME_NSID_BROADCAST && - nsid > n->num_namespaces)) { - return NVME_INVALID_NSID | NVME_DNR; + if (nsid != NVME_NSID_BROADCAST) { + if (!nvme_nsid_valid(n, nsid)) { + return NVME_INVALID_NSID | NVME_DNR; + } + + ns = nvme_ns(n, nsid); + if (unlikely(!ns)) { + return NVME_INVALID_FIELD | NVME_DNR; + } } } else if (nsid && nsid != NVME_NSID_BROADCAST) { - if (nsid > n->num_namespaces) { + if (!nvme_nsid_valid(n, nsid)) { return NVME_INVALID_NSID | NVME_DNR; } @@ -1817,12 +1853,23 @@ 
static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req) break; case NVME_VOLATILE_WRITE_CACHE: - if (!(dw11 & 0x1) && blk_enable_write_cache(n->conf.blk)) { - blk_flush(n->conf.blk); + n->features.vwc = dw11 & 0x1; + + for (int i = 1; i <= n->num_namespaces; i++) { + ns = nvme_ns(n, i); + if (!ns) { + continue; + } + + if (!(dw11 & 0x1) && blk_enable_write_cache(ns->blkconf.blk)) { + blk_flush(ns->blkconf.blk); + } + + blk_set_enable_write_cache(ns->blkconf.blk, dw11 & 1); } - blk_set_enable_write_cache(n->conf.blk, dw11 & 1); break; + case NVME_NUMBER_OF_QUEUES: if (n->qs_created) { return NVME_CMD_SEQ_ERROR | NVME_DNR; @@ -1944,9 +1991,17 @@ static void nvme_process_sq(void *opaque) static void nvme_clear_ctrl(NvmeCtrl *n) { + NvmeNamespace *ns; int i; - blk_drain(n->conf.blk); + for (i = 1; i <= n->num_namespaces; i++) { + ns = nvme_ns(n, i); + if (!ns) { + continue; + } + + nvme_ns_drain(ns); + } for (i = 0; i < n->params.max_ioqpairs + 1; i++) { if (n->sq[i] != NULL) { @@ -1969,7 +2024,15 @@ static void nvme_clear_ctrl(NvmeCtrl *n) n->outstanding_aers = 0; n->qs_created = false; - blk_flush(n->conf.blk); + for (i = 1; i <= n->num_namespaces; i++) { + ns = nvme_ns(n, i); + if (!ns) { + continue; + } + + nvme_ns_flush(ns); + } + n->bar.cc = 0; } @@ -2447,6 +2510,11 @@ static void nvme_check_constraints(NvmeCtrl *n, Error **errp) params->max_ioqpairs = params->num_queues - 1; } + if (n->conf.blk) { + warn_report("drive property is deprecated; " + "please use an nvme-ns device instead"); + } + if (params->max_ioqpairs < 1 || params->max_ioqpairs > NVME_MAX_IOQPAIRS) { error_setg(errp, "max_ioqpairs must be between 1 and %d", @@ -2461,11 +2529,6 @@ static void nvme_check_constraints(NvmeCtrl *n, Error **errp) return; } - if (!n->conf.blk) { - error_setg(errp, "drive property not set"); - return; - } - if (!params->serial) { error_setg(errp, "serial property not set"); return; @@ -2489,11 +2552,10 @@ static void nvme_check_constraints(NvmeCtrl *n, Error 
**errp) static void nvme_init_state(NvmeCtrl *n) { - n->num_namespaces = 1; + n->num_namespaces = NVME_MAX_NAMESPACES; /* add one to max_ioqpairs to account for the admin queue pair */ n->reg_size = pow2ceil(sizeof(NvmeBar) + 2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE); - n->namespaces = g_new0(NvmeNamespace, n->num_namespaces); n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1); n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1); n->temperature = NVME_TEMPERATURE; @@ -2502,34 +2564,41 @@ static void nvme_init_state(NvmeCtrl *n) n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1); } -static void nvme_init_blk(NvmeCtrl *n, Error **errp) +int nvme_register_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) { - if (!blkconf_blocksizes(&n->conf, errp)) { - return; - } - blkconf_apply_backend_options(&n->conf, blk_is_read_only(n->conf.blk), - false, errp); -} + uint32_t nsid = nvme_nsid(ns); -static void nvme_init_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) -{ - int64_t bs_size; - NvmeIdNs *id_ns = &ns->id_ns; - - bs_size = blk_getlength(n->conf.blk); - if (bs_size < 0) { - error_setg_errno(errp, -bs_size, "could not get backing file size"); - return; + if (nsid > NVME_MAX_NAMESPACES) { + error_setg(errp, "invalid namespace id (must be between 0 and %d)", + NVME_MAX_NAMESPACES); + return -1; } - n->ns_size = bs_size; + if (!nsid) { + for (int i = 1; i <= n->num_namespaces; i++) { + NvmeNamespace *ns = nvme_ns(n, i); + if (!ns) { + nsid = i; + break; + } + } - id_ns->lbaf[0].ds = BDRV_SECTOR_BITS; - id_ns->nsze = cpu_to_le64(nvme_ns_nlbas(n, ns)); + if (!nsid) { + error_setg(errp, "no free namespace id"); + return -1; + } + } else { + if (n->namespaces[nsid - 1]) { + error_setg(errp, "namespace id '%d' is already in use", nsid); + return -1; + } + } - /* no thin provisioning */ - id_ns->ncap = id_ns->nsze; - id_ns->nuse = id_ns->ncap; + trace_pci_nvme_register_namespace(nsid); + + n->namespaces[nsid - 1] = ns; + + return 0; } static 
void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev) @@ -2671,6 +2740,8 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->nn = cpu_to_le32(n->num_namespaces); id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP | NVME_ONCS_FEATURES); + + id->vwc = 0x1; id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN | NVME_CTRL_SGLS_BITBUCKET); @@ -2681,9 +2752,6 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->psd[0].mp = cpu_to_le16(0x9c4); id->psd[0].enlat = cpu_to_le32(0x10); id->psd[0].exlat = cpu_to_le32(0x4); - if (blk_enable_write_cache(n->conf.blk)) { - id->vwc = 1; - } n->bar.cap = 0; NVME_CAP_SET_MQES(n->bar.cap, 0x7ff); @@ -2699,23 +2767,19 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) static void nvme_realize(PCIDevice *pci_dev, Error **errp) { NvmeCtrl *n = NVME(pci_dev); + NvmeNamespace *ns; Error *local_err = NULL; - int i; - nvme_check_constraints(n, &local_err); if (local_err) { error_propagate(errp, local_err); return; } - nvme_init_state(n); - nvme_init_blk(n, &local_err); - if (local_err) { - error_propagate(errp, local_err); - return; - } + qbus_create_inplace(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS, + &pci_dev->qdev, n->parent_obj.qdev.id); + nvme_init_state(n); nvme_init_pci(n, pci_dev, &local_err); if (local_err) { error_propagate(errp, local_err); @@ -2724,10 +2788,12 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp) nvme_init_ctrl(n, pci_dev); - for (i = 0; i < n->num_namespaces; i++) { - nvme_init_namespace(n, &n->namespaces[i], &local_err); - if (local_err) { - error_propagate(errp, local_err); + /* setup a namespace if the controller drive property was given */ + if (n->namespace.blkconf.blk) { + ns = &n->namespace; + ns->params.nsid = 1; + + if (nvme_ns_setup(n, ns, errp)) { return; } } @@ -2754,7 +2820,7 @@ static void nvme_exit(PCIDevice *pci_dev) } static Property nvme_props[] = { - DEFINE_BLOCK_PROPERTIES(NvmeCtrl, conf), + DEFINE_BLOCK_PROPERTIES(NvmeCtrl, 
namespace.blkconf), DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmrdev, TYPE_MEMORY_BACKEND, HostMemoryBackend *), DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial), @@ -2795,26 +2861,35 @@ static void nvme_instance_init(Object *obj) { NvmeCtrl *s = NVME(obj); - device_add_bootindex_property(obj, &s->conf.bootindex, - "bootindex", "/namespace@1,0", - DEVICE(obj)); + if (s->namespace.blkconf.blk) { + device_add_bootindex_property(obj, &s->namespace.blkconf.bootindex, + "bootindex", "/namespace@1,0", + DEVICE(obj)); + } } static const TypeInfo nvme_info = { .name = TYPE_NVME, .parent = TYPE_PCI_DEVICE, .instance_size = sizeof(NvmeCtrl), - .class_init = nvme_class_init, .instance_init = nvme_instance_init, + .class_init = nvme_class_init, .interfaces = (InterfaceInfo[]) { { INTERFACE_PCIE_DEVICE }, { } }, }; +static const TypeInfo nvme_bus_info = { + .name = TYPE_NVME_BUS, + .parent = TYPE_BUS, + .instance_size = sizeof(NvmeBus), +}; + static void nvme_register_types(void) { type_register_static(&nvme_info); + type_register_static(&nvme_bus_info); } type_init(nvme_register_types) diff --git a/hw/block/nvme.h b/hw/block/nvme.h index f355eccb32..d96ec15cdf 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -2,6 +2,9 @@ #define HW_NVME_H #include "block/nvme.h" +#include "nvme-ns.h" + +#define NVME_MAX_NAMESPACES 256 typedef struct NvmeParams { char *serial; @@ -90,26 +93,12 @@ typedef struct NvmeCQueue { QTAILQ_HEAD(, NvmeRequest) req_list; } NvmeCQueue; -typedef struct NvmeNamespace { - NvmeIdNs id_ns; -} NvmeNamespace; +#define TYPE_NVME_BUS "nvme-bus" +#define NVME_BUS(obj) OBJECT_CHECK(NvmeBus, (obj), TYPE_NVME_BUS) -static inline NvmeLBAF *nvme_ns_lbaf(NvmeNamespace *ns) -{ - NvmeIdNs *id_ns = &ns->id_ns; - return &id_ns->lbaf[NVME_ID_NS_FLBAS_INDEX(id_ns->flbas)]; -} - -static inline uint8_t nvme_ns_lbads(NvmeNamespace *ns) -{ - return nvme_ns_lbaf(ns)->ds; -} - -/* convert an LBA to the equivalent in bytes */ -static inline size_t nvme_l2b(NvmeNamespace *ns, 
uint64_t lba) -{ - return lba << nvme_ns_lbads(ns); -} +typedef struct NvmeBus { + BusState parent_bus; +} NvmeBus; #define TYPE_NVME "nvme" #define NVME(obj) \ @@ -121,6 +110,7 @@ typedef struct NvmeFeatureVal { uint16_t temp_thresh_low; }; uint32_t async_config; + uint32_t vwc; } NvmeFeatureVal; typedef struct NvmeCtrl { @@ -128,8 +118,9 @@ typedef struct NvmeCtrl { MemoryRegion iomem; MemoryRegion ctrl_mem; NvmeBar bar; - BlockConf conf; NvmeParams params; + NvmeBus bus; + BlockConf conf; bool qs_created; uint32_t page_size; @@ -140,7 +131,6 @@ typedef struct NvmeCtrl { uint32_t reg_size; uint32_t num_namespaces; uint32_t max_q_ents; - uint64_t ns_size; uint8_t outstanding_aers; uint8_t *cmbuf; uint32_t irq_status; @@ -156,7 +146,8 @@ typedef struct NvmeCtrl { QTAILQ_HEAD(, NvmeAsyncEvent) aer_queue; int aer_queued; - NvmeNamespace *namespaces; + NvmeNamespace namespace; + NvmeNamespace *namespaces[NVME_MAX_NAMESPACES]; NvmeSQueue **sq; NvmeCQueue **cq; NvmeSQueue admin_sq; @@ -165,10 +156,13 @@ typedef struct NvmeCtrl { NvmeFeatureVal features; } NvmeCtrl; -/* calculate the number of LBAs that the namespace can accomodate */ -static inline uint64_t nvme_ns_nlbas(NvmeCtrl *n, NvmeNamespace *ns) +static inline NvmeNamespace *nvme_ns(NvmeCtrl *n, uint32_t nsid) { - return n->ns_size >> nvme_ns_lbads(ns); + if (!nsid || nsid > n->num_namespaces) { + return NULL; + } + + return n->namespaces[nsid - 1]; } static inline NvmeCQueue *nvme_cq(NvmeRequest *req) @@ -185,4 +179,6 @@ static inline NvmeCtrl *nvme_ctrl(NvmeRequest *req) return sq->ctrl; } +int nvme_register_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp); + #endif /* HW_NVME_H */ diff --git a/hw/block/trace-events b/hw/block/trace-events index 22ea635144..446cca08e9 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -29,6 +29,7 @@ hd_geometry_guess(void *blk, uint32_t cyls, uint32_t heads, uint32_t secs, int t # nvme.c # nvme traces for successful events 
+pci_nvme_register_namespace(uint32_t nsid) "nsid %"PRIu32"" pci_nvme_irq_msix(uint32_t vector) "raising MSI-X IRQ vector %u" pci_nvme_irq_pin(void) "pulsing IRQ pin" pci_nvme_irq_masked(void) "IRQ is masked" @@ -39,9 +40,9 @@ pci_nvme_map_prp(uint64_t trans_len, uint32_t len, uint64_t prp1, uint64_t prp2, pci_nvme_map_sgl(uint16_t cid, uint8_t typ, uint64_t len) "cid %"PRIu16" type 0x%"PRIx8" len %"PRIu64"" pci_nvme_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'" pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'" -pci_nvme_rw(uint16_t cid, const char *verb, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" '%s' nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64"" +pci_nvme_rw(uint16_t cid, const char *verb, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" opname '%s' nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64"" pci_nvme_rw_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'" -pci_nvme_write_zeroes(uint16_t cid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" slba %"PRIu64" nlb %"PRIu32"" +pci_nvme_write_zeroes(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32"" pci_nvme_do_aio(uint16_t cid, uint8_t opc, const char *opname, const char *blkname, int64_t offset, size_t len) "cid %"PRIu16" opc 0x%"PRIx8" opname '%s' blk '%s' offset %"PRId64" len %zu" pci_nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16"" pci_nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", 
qflags=%"PRIu16", ien=%d" @@ -100,7 +101,6 @@ pci_nvme_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is null or no pci_nvme_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PRIx64"" pci_nvme_err_invalid_prp2_missing(void) "PRP2 is null and more data to be transferred" pci_nvme_err_invalid_prp(void) "invalid PRP" -pci_nvme_err_invalid_ns(uint32_t ns, uint32_t limit) "invalid namespace %u not within 1-%u" pci_nvme_err_invalid_opc(uint8_t opc) "invalid opcode 0x%"PRIx8"" pci_nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin opcode 0x%"PRIx8"" pci_nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) "Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64"" From c1e18246618b3401ba1769bf88d2bcdf49e947aa Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 13 Jan 2020 19:12:50 +0100 Subject: [PATCH 16/30] pci: allocate pci id for nvme The emulated nvme device (hw/block/nvme.c) is currently using an internal Intel device id. Prepare to change that by allocating a device id under the 1b36 (Red Hat, Inc.) vendor id. Signed-off-by: Klaus Jensen Acked-by: Gerd Hoffmann Reviewed-by: Maxim Levitsky Reviewed-by: Keith Busch --- MAINTAINERS | 1 + docs/specs/nvme.txt | 23 +++++++++++++++++++++++ docs/specs/pci-ids.txt | 1 + include/hw/pci/pci.h | 1 + 4 files changed, 26 insertions(+) create mode 100644 docs/specs/nvme.txt diff --git a/MAINTAINERS b/MAINTAINERS index ef6f5c7399..9e215088ce 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1877,6 +1877,7 @@ L: qemu-block@nongnu.org S: Supported F: hw/block/nvme* F: tests/qtest/nvme-test.c +F: docs/specs/nvme.txt T: git git://git.infradead.org/qemu-nvme.git nvme-next megasas diff --git a/docs/specs/nvme.txt b/docs/specs/nvme.txt new file mode 100644 index 0000000000..56d393884e --- /dev/null +++ b/docs/specs/nvme.txt @@ -0,0 +1,23 @@ +NVM Express Controller +====================== + +The nvme device (-device nvme) emulates an NVM Express Controller. 
+ + +Reference Specifications +------------------------ + +The device currently implements most mandatory features of NVMe v1.3d, see + + https://nvmexpress.org/resources/specifications/ + +for the specification. + + +Known issues +------------ + +* The accounting numbers in the SMART/Health are reset across power cycles + +* Interrupt Coalescing is not supported and is disabled by default in violation + of the specification. diff --git a/docs/specs/pci-ids.txt b/docs/specs/pci-ids.txt index 4d53e5c7d9..abbdbca6be 100644 --- a/docs/specs/pci-ids.txt +++ b/docs/specs/pci-ids.txt @@ -63,6 +63,7 @@ PCI devices (other than virtio): 1b36:000b PCIe Expander Bridge (-device pxb-pcie) 1b36:000d PCI xhci usb host adapter 1b36:000f mdpy (mdev sample device), linux/samples/vfio-mdev/mdpy.c +1b36:0010 PCIe NVMe device (-device nvme) All these devices are documented in docs/specs. diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index f19ffe6b4f..72ce649eee 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -106,6 +106,7 @@ extern bool pci_available; #define PCI_DEVICE_ID_REDHAT_XHCI 0x000d #define PCI_DEVICE_ID_REDHAT_PCIE_BRIDGE 0x000e #define PCI_DEVICE_ID_REDHAT_MDPY 0x000f +#define PCI_DEVICE_ID_REDHAT_NVME 0x0010 #define PCI_DEVICE_ID_REDHAT_QXL 0x0100 #define FMT_PCIBUS PRIx64 From 6eb7a071292a2f11065127ac152fa24248806021 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Fri, 27 Sep 2019 11:43:12 +0200 Subject: [PATCH 17/30] hw/block/nvme: change controller pci id There are two reasons for changing this: 1. The nvme device currently uses an internal Intel device id. 2. Since commits "nvme: fix write zeroes offset and count" and "nvme: support multiple namespaces" the controller device no longer has the quirks that the Linux kernel thinks it has. As the quirks are applied based on pci vendor and device id, change them to get rid of the quirks. 
To keep backward compatibility, add a new 'use-intel-id' parameter to the nvme device to force use of the Intel vendor and device id. This is off by default but add a compat property to set this for 5.1 machines and older. If a 5.1 machine is booted (or the use-intel-id parameter is explicitly set to true), the Linux kernel will just apply these unnecessary quirks: 1. NVME_QUIRK_IDENTIFY_CNS which says that the device does not support anything else than values 0x0 and 0x1 for CNS (Identify Namespace and Identify Controller). With multiple namespace support, this just means that the kernel will "scan" namespaces instead of using "Active Namespace ID list" (CNS 0x2). 2. NVME_QUIRK_DISABLE_WRITE_ZEROES. The nvme device started out with a broken Write Zeroes implementation which has since been fixed in commit 9d6459d21a6e ("nvme: fix write zeroes offset and count"). Signed-off-by: Klaus Jensen Reviewed-by: Keith Busch Reviewed-by: Maxim Levitsky --- hw/block/nvme.c | 12 ++++++++++-- hw/block/nvme.h | 1 + hw/core/machine.c | 1 + 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 1af12f861a..5768a6804f 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -2678,6 +2678,15 @@ static void nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) pci_conf[PCI_INTERRUPT_PIN] = 1; pci_config_set_prog_interface(pci_conf, 0x2); + + if (n->params.use_intel_id) { + pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL); + pci_config_set_device_id(pci_conf, 0x5845); + } else { + pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT); + pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME); + } + pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS); pcie_endpoint_cap_init(pci_dev, 0x80); @@ -2831,6 +2840,7 @@ static Property nvme_props[] = { DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3), DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64), DEFINE_PROP_UINT8("mdts", NvmeCtrl, 
params.mdts, 7), + DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false), DEFINE_PROP_END_OF_LIST(), }; @@ -2847,8 +2857,6 @@ static void nvme_class_init(ObjectClass *oc, void *data) pc->realize = nvme_realize; pc->exit = nvme_exit; pc->class_id = PCI_CLASS_STORAGE_EXPRESS; - pc->vendor_id = PCI_VENDOR_ID_INTEL; - pc->device_id = 0x5845; pc->revision = 2; set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); diff --git a/hw/block/nvme.h b/hw/block/nvme.h index d96ec15cdf..e080a2318a 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -15,6 +15,7 @@ typedef struct NvmeParams { uint8_t aerl; uint32_t aer_max_queued; uint8_t mdts; + bool use_intel_id; } NvmeParams; typedef struct NvmeAsyncEvent { diff --git a/hw/core/machine.c b/hw/core/machine.c index c5e0e79e6d..98b87f76cb 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -34,6 +34,7 @@ GlobalProperty hw_compat_5_1[] = { { "vhost-user-scsi", "num_queues", "1"}, { "virtio-blk-device", "num-queues", "1"}, { "virtio-scsi-device", "num_queues", "1"}, + { "nvme", "use-intel-id", "on"}, }; const size_t hw_compat_5_1_len = G_N_ELEMENTS(hw_compat_5_1); From b20804946bce7545ec9758d82806ee30a21b6211 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Thu, 1 Oct 2020 23:37:20 +0200 Subject: [PATCH 18/30] hw/block/nvme: update nsid when registered If the user does not specify an nsid parameter on the nvme-ns device, nvme_register_namespace will find the first free namespace id and assign that. This fix makes sure the assigned id is saved. 
Signed-off-by: Klaus Jensen Reviewed-by: Dmitry Fomichev --- hw/block/nvme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 5768a6804f..2225b944f9 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -2578,7 +2578,7 @@ int nvme_register_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) for (int i = 1; i <= n->num_namespaces; i++) { NvmeNamespace *ns = nvme_ns(n, i); if (!ns) { - nsid = i; + nsid = ns->params.nsid = i; break; } } From 8c125590dfa33699a267c797a41939c1ac8b77bf Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 30 Sep 2020 12:22:27 -0700 Subject: [PATCH 19/30] hw/block/nvme: remove pointless rw indirection The code switches on the opcode to invoke a function specific to that opcode. There's no point in consolidating back to a common function that just switches on that same opcode without any actual common code. Restore the opcode specific behavior without going back through another level of switches. Signed-off-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 91 ++++++++++++++----------------------------- hw/block/trace-events | 1 - 2 files changed, 29 insertions(+), 63 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 2225b944f9..a168f0bf4a 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -927,68 +927,12 @@ static void nvme_rw_cb(void *opaque, int ret) nvme_enqueue_req_completion(nvme_cq(req), req); } -static uint16_t nvme_do_aio(BlockBackend *blk, int64_t offset, size_t len, - NvmeRequest *req) -{ - BlockAcctCookie *acct = &req->acct; - BlockAcctStats *stats = blk_get_stats(blk); - - bool is_write = false; - - trace_pci_nvme_do_aio(nvme_cid(req), req->cmd.opcode, - nvme_io_opc_str(req->cmd.opcode), blk_name(blk), - offset, len); - - switch (req->cmd.opcode) { - case NVME_CMD_FLUSH: - block_acct_start(stats, acct, 0, BLOCK_ACCT_FLUSH); - req->aiocb = blk_aio_flush(blk, nvme_rw_cb, req); - break; - - case NVME_CMD_WRITE_ZEROES: - 
block_acct_start(stats, acct, len, BLOCK_ACCT_WRITE); - req->aiocb = blk_aio_pwrite_zeroes(blk, offset, len, - BDRV_REQ_MAY_UNMAP, nvme_rw_cb, - req); - break; - - case NVME_CMD_WRITE: - is_write = true; - - /* fallthrough */ - - case NVME_CMD_READ: - block_acct_start(stats, acct, len, - is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ); - - if (req->qsg.sg) { - if (is_write) { - req->aiocb = dma_blk_write(blk, &req->qsg, offset, - BDRV_SECTOR_SIZE, nvme_rw_cb, req); - } else { - req->aiocb = dma_blk_read(blk, &req->qsg, offset, - BDRV_SECTOR_SIZE, nvme_rw_cb, req); - } - } else { - if (is_write) { - req->aiocb = blk_aio_pwritev(blk, offset, &req->iov, 0, - nvme_rw_cb, req); - } else { - req->aiocb = blk_aio_preadv(blk, offset, &req->iov, 0, - nvme_rw_cb, req); - } - } - - break; - } - - return NVME_NO_COMPLETE; -} - static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req) { - NvmeNamespace *ns = req->ns; - return nvme_do_aio(ns->blkconf.blk, 0, 0, req); + block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0, + BLOCK_ACCT_FLUSH); + req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_rw_cb, req); + return NVME_NO_COMPLETE; } static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req) @@ -1009,7 +953,11 @@ static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req) return status; } - return nvme_do_aio(ns->blkconf.blk, offset, count, req); + block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0, + BLOCK_ACCT_WRITE); + req->aiocb = blk_aio_pwrite_zeroes(req->ns->blkconf.blk, offset, count, + BDRV_REQ_MAY_UNMAP, nvme_rw_cb, req); + return NVME_NO_COMPLETE; } static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req) @@ -1023,6 +971,7 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req) uint64_t data_offset = nvme_l2b(ns, slba); enum BlockAcctType acct = req->cmd.opcode == NVME_CMD_WRITE ? 
BLOCK_ACCT_WRITE : BLOCK_ACCT_READ; + BlockBackend *blk = ns->blkconf.blk; uint16_t status; trace_pci_nvme_rw(nvme_cid(req), nvme_io_opc_str(rw->opcode), @@ -1045,7 +994,25 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req) goto invalid; } - return nvme_do_aio(ns->blkconf.blk, data_offset, data_size, req); + block_acct_start(blk_get_stats(blk), &req->acct, data_size, acct); + if (req->qsg.sg) { + if (acct == BLOCK_ACCT_WRITE) { + req->aiocb = dma_blk_write(blk, &req->qsg, data_offset, + BDRV_SECTOR_SIZE, nvme_rw_cb, req); + } else { + req->aiocb = dma_blk_read(blk, &req->qsg, data_offset, + BDRV_SECTOR_SIZE, nvme_rw_cb, req); + } + } else { + if (acct == BLOCK_ACCT_WRITE) { + req->aiocb = blk_aio_pwritev(blk, data_offset, &req->iov, 0, + nvme_rw_cb, req); + } else { + req->aiocb = blk_aio_preadv(blk, data_offset, &req->iov, 0, + nvme_rw_cb, req); + } + } + return NVME_NO_COMPLETE; invalid: block_acct_invalid(blk_get_stats(ns->blkconf.blk), acct); diff --git a/hw/block/trace-events b/hw/block/trace-events index 446cca08e9..e56d688b88 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -43,7 +43,6 @@ pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode, const char *opna pci_nvme_rw(uint16_t cid, const char *verb, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" opname '%s' nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64"" pci_nvme_rw_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'" pci_nvme_write_zeroes(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32"" -pci_nvme_do_aio(uint16_t cid, uint8_t opc, const char *opname, const char *blkname, int64_t offset, size_t len) "cid %"PRIu16" opc 0x%"PRIx8" opname '%s' blk '%s' offset %"PRId64" len %zu" pci_nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", 
qsize=%"PRIu16", qflags=%"PRIu16"" pci_nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d" pci_nvme_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16"" From a740facfbd05c9dd630e1f992a9dc6b5444096a7 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 30 Sep 2020 10:01:02 -0700 Subject: [PATCH 20/30] hw/block/nvme: fix log page offset check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Return error if the requested offset starts after the size of the log being returned. Also, move the check earlier in the function so we're not doing unnecessary calculations. Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Dmitry Fomichev Signed-off-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index a168f0bf4a..aa725d1141 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1179,6 +1179,10 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, return NVME_INVALID_FIELD | NVME_DNR; } + if (off >= sizeof(smart)) { + return NVME_INVALID_FIELD | NVME_DNR; + } + for (int i = 1; i <= n->num_namespaces; i++) { NvmeNamespace *ns = nvme_ns(n, i); if (!ns) { @@ -1193,10 +1197,6 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, write_commands += s->nr_ops[BLOCK_ACCT_WRITE]; } - if (off > sizeof(smart)) { - return NVME_INVALID_FIELD | NVME_DNR; - } - trans_len = MIN(sizeof(smart) - off, buf_len); memset(&smart, 0x0, sizeof(smart)); @@ -1234,12 +1234,11 @@ static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off, .afi = 0x1, }; - strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' '); - - if (off > sizeof(fw_log)) { + if (off >= 
sizeof(fw_log)) { return NVME_INVALID_FIELD | NVME_DNR; } + strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' '); trans_len = MIN(sizeof(fw_log) - off, buf_len); return nvme_dma(n, (uint8_t *) &fw_log + off, trans_len, @@ -1252,16 +1251,15 @@ static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, uint32_t trans_len; NvmeErrorLog errlog; + if (off >= sizeof(errlog)) { + return NVME_INVALID_FIELD | NVME_DNR; + } + if (!rae) { nvme_clear_events(n, NVME_AER_TYPE_ERROR); } - if (off > sizeof(errlog)) { - return NVME_INVALID_FIELD | NVME_DNR; - } - memset(&errlog, 0x0, sizeof(errlog)); - trans_len = MIN(sizeof(errlog) - off, buf_len); return nvme_dma(n, (uint8_t *)&errlog, trans_len, From 2fbbecc5cd90ec00027a155f7044f2f70ed84f30 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 30 Sep 2020 10:15:50 -0700 Subject: [PATCH 21/30] hw/block/nvme: support per-namespace smart log Let the user specify a specific namespace if they want to get access stats for a specific namespace. 
Signed-off-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 63 +++++++++++++++++++++++++++----------------- include/block/nvme.h | 1 + 2 files changed, 40 insertions(+), 24 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index aa725d1141..5a9ae699af 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1164,48 +1164,63 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req) return NVME_SUCCESS; } +struct nvme_stats { + uint64_t units_read; + uint64_t units_written; + uint64_t read_commands; + uint64_t write_commands; +}; + +static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats) +{ + BlockAcctStats *s = blk_get_stats(ns->blkconf.blk); + + stats->units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS; + stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS; + stats->read_commands += s->nr_ops[BLOCK_ACCT_READ]; + stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE]; +} + static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, uint64_t off, NvmeRequest *req) { uint32_t nsid = le32_to_cpu(req->cmd.nsid); - + struct nvme_stats stats = { 0 }; + NvmeSmartLog smart = { 0 }; uint32_t trans_len; + NvmeNamespace *ns; time_t current_ms; - uint64_t units_read = 0, units_written = 0; - uint64_t read_commands = 0, write_commands = 0; - NvmeSmartLog smart; - - if (nsid && nsid != 0xffffffff) { - return NVME_INVALID_FIELD | NVME_DNR; - } if (off >= sizeof(smart)) { return NVME_INVALID_FIELD | NVME_DNR; } - for (int i = 1; i <= n->num_namespaces; i++) { - NvmeNamespace *ns = nvme_ns(n, i); + if (nsid != 0xffffffff) { + ns = nvme_ns(n, nsid); if (!ns) { - continue; + return NVME_INVALID_NSID | NVME_DNR; } + nvme_set_blk_stats(ns, &stats); + } else { + int i; - BlockAcctStats *s = blk_get_stats(ns->blkconf.blk); - - units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS; - units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS; - read_commands += 
s->nr_ops[BLOCK_ACCT_READ]; - write_commands += s->nr_ops[BLOCK_ACCT_WRITE]; + for (i = 1; i <= n->num_namespaces; i++) { + ns = nvme_ns(n, i); + if (!ns) { + continue; + } + nvme_set_blk_stats(ns, &stats); + } } trans_len = MIN(sizeof(smart) - off, buf_len); - memset(&smart, 0x0, sizeof(smart)); - - smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(units_read, 1000)); - smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(units_written, + smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read, + 1000)); + smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written, 1000)); - smart.host_read_commands[0] = cpu_to_le64(read_commands); - smart.host_write_commands[0] = cpu_to_le64(write_commands); + smart.host_read_commands[0] = cpu_to_le64(stats.read_commands); + smart.host_write_commands[0] = cpu_to_le64(stats.write_commands); smart.temperature = cpu_to_le16(n->temperature); @@ -2703,7 +2718,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->acl = 3; id->aerl = n->params.aerl; id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO; - id->lpa = NVME_LPA_EXTENDED; + id->lpa = NVME_LPA_NS_SMART | NVME_LPA_EXTENDED; /* recommended default value (~70 C) */ id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING); diff --git a/include/block/nvme.h b/include/block/nvme.h index 58647bcdad..868cf53f0b 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -849,6 +849,7 @@ enum NvmeIdCtrlFrmw { }; enum NvmeIdCtrlLpa { + NVME_LPA_NS_SMART = 1 << 0, NVME_LPA_EXTENDED = 1 << 2, }; From 492f9a8d79f2e815007e985cad8dd73b713722f0 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 30 Sep 2020 10:54:05 -0700 Subject: [PATCH 22/30] hw/block/nvme: validate command set selected Fail to start the controller if the user requests a command set that the controller does not support. 
Signed-off-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 6 +++++- hw/block/trace-events | 1 + include/block/nvme.h | 4 ++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 5a9ae699af..94db06cf72 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -2045,6 +2045,10 @@ static int nvme_start_ctrl(NvmeCtrl *n) trace_pci_nvme_err_startfail_acq_misaligned(n->bar.acq); return -1; } + if (unlikely(!(NVME_CAP_CSS(n->bar.cap) & (1 << NVME_CC_CSS(n->bar.cc))))) { + trace_pci_nvme_err_startfail_css(NVME_CC_CSS(n->bar.cc)); + return -1; + } if (unlikely(NVME_CC_MPS(n->bar.cc) < NVME_CAP_MPSMIN(n->bar.cap))) { trace_pci_nvme_err_startfail_page_too_small( @@ -2746,7 +2750,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) NVME_CAP_SET_MQES(n->bar.cap, 0x7ff); NVME_CAP_SET_CQR(n->bar.cap, 1); NVME_CAP_SET_TO(n->bar.cap, 0xf); - NVME_CAP_SET_CSS(n->bar.cap, 1); + NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_NVM); NVME_CAP_SET_MPSMAX(n->bar.cap, 4); n->bar.vs = NVME_SPEC_VER; diff --git a/hw/block/trace-events b/hw/block/trace-events index e56d688b88..7b28091bd6 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -132,6 +132,7 @@ pci_nvme_err_startfail_cqent_too_small(uint8_t log2ps, uint8_t maxlog2ps) "nvme_ pci_nvme_err_startfail_cqent_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the completion queue entry size is too large: log2size=%u, max=%u" pci_nvme_err_startfail_sqent_too_small(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the submission queue entry size is too small: log2size=%u, min=%u" pci_nvme_err_startfail_sqent_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_start_ctrl failed because the submission queue entry size is too large: log2size=%u, max=%u" +pci_nvme_err_startfail_css(uint8_t css) "nvme_start_ctrl failed because invalid command set selected:%u" pci_nvme_err_startfail_asqent_sz_zero(void) "nvme_start_ctrl 
failed because the admin submission queue size is zero" pci_nvme_err_startfail_acqent_sz_zero(void) "nvme_start_ctrl failed because the admin completion queue size is zero" pci_nvme_err_startfail(void) "setting controller enable bit failed" diff --git a/include/block/nvme.h b/include/block/nvme.h index 868cf53f0b..bc20a2ba5e 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -82,6 +82,10 @@ enum NvmeCapMask { #define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMR_MASK)\ << CAP_PMR_SHIFT) +enum NvmeCapCss { + NVME_CAP_CSS_NVM = 1 << 0, +}; + enum NvmeCcShift { CC_EN_SHIFT = 0, CC_CSS_SHIFT = 4, From 8c5cea85934eb7b580ced14f7f188e19880d4c1c Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 30 Sep 2020 10:58:03 -0700 Subject: [PATCH 23/30] hw/block/nvme: support for admin-only command set Signed-off-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 1 + include/block/nvme.h | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 94db06cf72..c1323ca869 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -2751,6 +2751,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) NVME_CAP_SET_CQR(n->bar.cap, 1); NVME_CAP_SET_TO(n->bar.cap, 0xf); NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_NVM); + NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_ADMIN_ONLY); NVME_CAP_SET_MPSMAX(n->bar.cap, 4); n->bar.vs = NVME_SPEC_VER; diff --git a/include/block/nvme.h b/include/block/nvme.h index bc20a2ba5e..521533fd2a 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -83,7 +83,8 @@ enum NvmeCapMask { << CAP_PMR_SHIFT) enum NvmeCapCss { - NVME_CAP_CSS_NVM = 1 << 0, + NVME_CAP_CSS_NVM = 1 << 0, + NVME_CAP_CSS_ADMIN_ONLY = 1 << 7, }; enum NvmeCcShift { From 1b48e4611a7a3ee3065d3bb8428f5f6acb5232fe Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Wed, 30 Sep 2020 01:19:07 +0200 Subject: [PATCH 24/30] hw/block/nvme: reject io commands if only admin command set selected If the host 
sets CC.CSS to 111b, all commands submitted to I/O queues should be completed with status Invalid Command Opcode. Note that this is technically a v1.4 feature, but it does not hurt to implement before we finally bump the reported version implemented. Reviewed-by: Dmitry Fomichev Signed-off-by: Klaus Jensen Signed-off-by: Keith Busch --- hw/block/nvme.c | 4 ++++ include/block/nvme.h | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index c1323ca869..32c35fe587 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1026,6 +1026,10 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req), req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode)); + if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_ADMIN_ONLY) { + return NVME_INVALID_OPCODE | NVME_DNR; + } + if (!nvme_nsid_valid(n, nsid)) { return NVME_INVALID_NSID | NVME_DNR; } diff --git a/include/block/nvme.h b/include/block/nvme.h index 521533fd2a..6de2d5aa75 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -115,6 +115,11 @@ enum NvmeCcMask { #define NVME_CC_IOSQES(cc) ((cc >> CC_IOSQES_SHIFT) & CC_IOSQES_MASK) #define NVME_CC_IOCQES(cc) ((cc >> CC_IOCQES_SHIFT) & CC_IOCQES_MASK) +enum NvmeCcCss { + NVME_CC_CSS_NVM = 0x0, + NVME_CC_CSS_ADMIN_ONLY = 0x7, +}; + enum NvmeCstsShift { CSTS_RDY_SHIFT = 0, CSTS_CFS_SHIFT = 1, From 976951048c2fb31ba1622c36d25b4cd69bea5a64 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Wed, 30 Sep 2020 01:19:04 +0200 Subject: [PATCH 25/30] hw/block/nvme: add nsid to get/setfeat trace events Include the namespace id in the pci_nvme_{get,set}feat trace events. 
Signed-off-by: Klaus Jensen Signed-off-by: Keith Busch --- hw/block/nvme.c | 4 ++-- hw/block/trace-events | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 32c35fe587..5fd5a45a28 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1639,7 +1639,7 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req) [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT, }; - trace_pci_nvme_getfeat(nvme_cid(req), fid, sel, dw11); + trace_pci_nvme_getfeat(nvme_cid(req), nsid, fid, sel, dw11); if (!nvme_feature_support[fid]) { return NVME_INVALID_FIELD | NVME_DNR; @@ -1777,7 +1777,7 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req) uint8_t fid = NVME_GETSETFEAT_FID(dw10); uint8_t save = NVME_SETFEAT_SAVE(dw10); - trace_pci_nvme_setfeat(nvme_cid(req), fid, save, dw11); + trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11); if (save) { return NVME_FID_NOT_SAVEABLE | NVME_DNR; diff --git a/hw/block/trace-events b/hw/block/trace-events index 7b28091bd6..2dc85281dc 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -52,8 +52,8 @@ pci_nvme_identify_ns(uint32_t ns) "nsid %"PRIu32"" pci_nvme_identify_nslist(uint32_t ns) "nsid %"PRIu32"" pci_nvme_identify_ns_descr_list(uint32_t ns) "nsid %"PRIu32"" pci_nvme_get_log(uint16_t cid, uint8_t lid, uint8_t lsp, uint8_t rae, uint32_t len, uint64_t off) "cid %"PRIu16" lid 0x%"PRIx8" lsp 0x%"PRIx8" rae 0x%"PRIx8" len %"PRIu32" off %"PRIu64"" -pci_nvme_getfeat(uint16_t cid, uint8_t fid, uint8_t sel, uint32_t cdw11) "cid %"PRIu16" fid 0x%"PRIx8" sel 0x%"PRIx8" cdw11 0x%"PRIx32"" -pci_nvme_setfeat(uint16_t cid, uint8_t fid, uint8_t save, uint32_t cdw11) "cid %"PRIu16" fid 0x%"PRIx8" save 0x%"PRIx8" cdw11 0x%"PRIx32"" +pci_nvme_getfeat(uint16_t cid, uint32_t nsid, uint8_t fid, uint8_t sel, uint32_t cdw11) "cid %"PRIu16" nsid 0x%"PRIx32" fid 0x%"PRIx8" sel 0x%"PRIx8" cdw11 0x%"PRIx32"" +pci_nvme_setfeat(uint16_t cid, uint32_t nsid, uint8_t fid, uint8_t 
save, uint32_t cdw11) "cid %"PRIu16" nsid 0x%"PRIx32" fid 0x%"PRIx8" save 0x%"PRIx8" cdw11 0x%"PRIx32"" pci_nvme_getfeat_vwcache(const char* result) "get feature volatile write cache, result=%s" pci_nvme_getfeat_numq(int result) "get feature number of queues, result=%d" pci_nvme_setfeat_numq(int reqcq, int reqsq, int gotcq, int gotsq) "requested cq_count=%d sq_count=%d, responding with cq_count=%d sq_count=%d" From dcd1496132704ca16fc857a40370a6b9209341ae Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Wed, 30 Sep 2020 01:19:05 +0200 Subject: [PATCH 26/30] hw/block/nvme: add trace event for requests with non-zero status code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a command results in a non-zero status code, trace it. Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Dmitry Fomichev Signed-off-by: Klaus Jensen Signed-off-by: Keith Busch --- hw/block/nvme.c | 6 ++++++ hw/block/trace-events | 1 + 2 files changed, 7 insertions(+) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 5fd5a45a28..b8c6be6318 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -777,6 +777,12 @@ static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req) assert(cq->cqid == req->sq->cqid); trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid, req->status); + + if (req->status) { + trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns), + req->status, req->cmd.opcode); + } + QTAILQ_REMOVE(&req->sq->out_req_list, req, entry); QTAILQ_INSERT_TAIL(&cq->req_list, req, entry); timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); diff --git a/hw/block/trace-events b/hw/block/trace-events index 2dc85281dc..cab9913b1f 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -88,6 +88,7 @@ pci_nvme_mmio_shutdown_cleared(void) "shutdown bit cleared" # nvme traces for error conditions pci_nvme_err_mdts(uint16_t cid, size_t len) "cid %"PRIu16" len %zu" +pci_nvme_err_req_status(uint16_t cid, 
uint32_t nsid, uint16_t status, uint8_t opc) "cid %"PRIu16" nsid %"PRIu32" status 0x%"PRIx16" opc 0x%"PRIx8"" pci_nvme_err_addr_read(uint64_t addr) "addr 0x%"PRIx64"" pci_nvme_err_addr_write(uint64_t addr) "addr 0x%"PRIx64"" pci_nvme_err_cfs(void) "controller fatal status" From b865cabf735be793789ad2c7eac97f47a1325966 Mon Sep 17 00:00:00 2001 From: Dmitry Fomichev Date: Mon, 28 Sep 2020 11:35:15 +0900 Subject: [PATCH 27/30] hw/block/nvme: report actual LBA data shift in LBAF Calculate the data shift value to report based on the set value of logical_block_size device property. In the process, use a local variable to calculate the LBA format index instead of the hardcoded value 0. This makes the code more readable and it will make it easier to add support for multiple LBA formats in the future. Signed-off-by: Dmitry Fomichev Signed-off-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme-ns.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index 2ba0263dda..31c80cdf5b 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -31,12 +31,13 @@ static void nvme_ns_init(NvmeNamespace *ns) { NvmeIdNs *id_ns = &ns->id_ns; + int lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas); if (blk_get_flags(ns->blkconf.blk) & BDRV_O_UNMAP) { ns->id_ns.dlfeat = 0x9; } - id_ns->lbaf[0].ds = BDRV_SECTOR_BITS; + id_ns->lbaf[lba_index].ds = 31 - clz32(ns->blkconf.logical_block_size); id_ns->nsze = cpu_to_le64(nvme_ns_nlbas(ns)); From 28fee5b5d02d59a2b039c71a0a72292b1bc7f75b Mon Sep 17 00:00:00 2001 From: Gollu Appalanaidu Date: Mon, 19 Oct 2020 12:41:31 +0530 Subject: [PATCH 28/30] hw/block/nvme: fix prp mapping status codes Address 0 is not an invalid address. Remove those invalid checks. Unaligned PRP2 and PRP list entries should result in Invalid PRP Offset status code and not Invalid Field. Fix that. See NVM Express v1.3d, Section 4.3 ("Physical Region Page Entry and List"). 
Suggested-by: Keith Busch Signed-off-by: Gollu Appalanaidu Signed-off-by: Klaus Jensen Reviewed-by: Keith Busch --- hw/block/nvme.c | 20 +++++--------------- hw/block/trace-events | 4 +--- include/block/nvme.h | 1 + 3 files changed, 7 insertions(+), 18 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index b8c6be6318..2896bb49b9 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -327,11 +327,6 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps); - if (unlikely(!prp1)) { - trace_pci_nvme_err_invalid_prp(); - return NVME_INVALID_FIELD | NVME_DNR; - } - if (nvme_addr_is_cmb(n, prp1)) { qemu_iovec_init(iov, num_prps); } else { @@ -345,11 +340,6 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, len -= trans_len; if (len) { - if (unlikely(!prp2)) { - trace_pci_nvme_err_invalid_prp2_missing(); - return NVME_INVALID_FIELD | NVME_DNR; - } - if (len > n->page_size) { uint64_t prp_list[n->max_prp_ents]; uint32_t nents, prp_trans; @@ -370,9 +360,9 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, uint64_t prp_ent = le64_to_cpu(prp_list[i]); if (i == n->max_prp_ents - 1 && len > n->page_size) { - if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) { + if (unlikely(prp_ent & (n->page_size - 1))) { trace_pci_nvme_err_invalid_prplist_ent(prp_ent); - return NVME_INVALID_FIELD | NVME_DNR; + return NVME_INVALID_PRP_OFFSET | NVME_DNR; } if (prp_list_in_cmb != nvme_addr_is_cmb(n, prp_ent)) { @@ -391,9 +381,9 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, prp_ent = le64_to_cpu(prp_list[i]); } - if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) { + if (unlikely(prp_ent & (n->page_size - 1))) { trace_pci_nvme_err_invalid_prplist_ent(prp_ent); - return NVME_INVALID_FIELD | NVME_DNR; + return NVME_INVALID_PRP_OFFSET | NVME_DNR; } trans_len = MIN(len, n->page_size); @@ -408,7 +398,7 @@ static uint16_t 
nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, } else { if (unlikely(prp2 & (n->page_size - 1))) { trace_pci_nvme_err_invalid_prp2_align(prp2); - return NVME_INVALID_FIELD | NVME_DNR; + return NVME_INVALID_PRP_OFFSET | NVME_DNR; } status = nvme_map_addr(n, qsg, iov, prp2, len); if (status) { diff --git a/hw/block/trace-events b/hw/block/trace-events index cab9913b1f..c1537e3ac0 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -97,10 +97,8 @@ pci_nvme_err_invalid_sgld(uint16_t cid, uint8_t typ) "cid %"PRIu16" type 0x%"PRI pci_nvme_err_invalid_num_sgld(uint16_t cid, uint8_t typ) "cid %"PRIu16" type 0x%"PRIx8"" pci_nvme_err_invalid_sgl_excess_length(uint16_t cid) "cid %"PRIu16"" pci_nvme_err_invalid_dma(void) "PRP/SGL is too small for transfer size" -pci_nvme_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is null or not page aligned: 0x%"PRIx64"" +pci_nvme_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is not page aligned: 0x%"PRIx64"" pci_nvme_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PRIx64"" -pci_nvme_err_invalid_prp2_missing(void) "PRP2 is null and more data to be transferred" -pci_nvme_err_invalid_prp(void) "invalid PRP" pci_nvme_err_invalid_opc(uint8_t opc) "invalid opcode 0x%"PRIx8"" pci_nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin opcode 0x%"PRIx8"" pci_nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) "Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64"" diff --git a/include/block/nvme.h b/include/block/nvme.h index 6de2d5aa75..8a46d9cf01 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -655,6 +655,7 @@ enum NvmeStatusCodes { NVME_MD_SGL_LEN_INVALID = 0x0010, NVME_SGL_DESCR_TYPE_INVALID = 0x0011, NVME_INVALID_USE_OF_CMB = 0x0012, + NVME_INVALID_PRP_OFFSET = 0x0013, NVME_LBA_RANGE = 0x0080, NVME_CAP_EXCEEDED = 0x0081, NVME_NS_NOT_READY = 0x0082, From 482e97fcfad6672d2849f2fe36bd460d70468b0a Mon Sep 17 00:00:00 2001 From: Gollu 
Appalanaidu Date: Thu, 22 Oct 2020 11:58:46 +0530 Subject: [PATCH 29/30] hw/block/nvme: fix create IO SQ/CQ status codes Replace the Invalid Field in Command with the Invalid PRP Offset status code in the nvme_create_{cq,sq} functions. Also, allow PRP1 to be address 0x0. Also replace the Completion Queue Invalid status code returned in nvme_create_cq when the queue identifier is invalid with the Invalid Queue Identifier status code. The Completion Queue Invalid status code is exclusively for indicating that the completion queue identifier given when creating a submission queue is invalid. See NVM Express v1.3d, Section 5.3 ("Create I/O Completion Queue command") and 5.4 ("Create I/O Submission Queue command"). Signed-off-by: Gollu Appalanaidu Signed-off-by: Klaus Jensen Reviewed-by: Keith Busch --- hw/block/nvme.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 2896bb49b9..5dfef0204c 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1151,9 +1151,9 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req) trace_pci_nvme_err_invalid_create_sq_size(qsize); return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR; } - if (unlikely(!prp1 || prp1 & (n->page_size - 1))) { + if (unlikely(prp1 & (n->page_size - 1))) { trace_pci_nvme_err_invalid_create_sq_addr(prp1); - return NVME_INVALID_FIELD | NVME_DNR; + return NVME_INVALID_PRP_OFFSET | NVME_DNR; } if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) { trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags)); @@ -1400,15 +1400,15 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req) if (unlikely(!cqid || !nvme_check_cqid(n, cqid))) { trace_pci_nvme_err_invalid_create_cq_cqid(cqid); - return NVME_INVALID_CQID | NVME_DNR; + return NVME_INVALID_QID | NVME_DNR; } if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) { trace_pci_nvme_err_invalid_create_cq_size(qsize); return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR; } - if (unlikely(!prp1)) { + if 
(unlikely(prp1 & (n->page_size - 1))) { trace_pci_nvme_err_invalid_create_cq_addr(prp1); - return NVME_INVALID_FIELD | NVME_DNR; + return NVME_INVALID_PRP_OFFSET | NVME_DNR; } if (unlikely(!msix_enabled(&n->parent_obj) && vector)) { trace_pci_nvme_err_invalid_create_cq_vector(vector); From 843c8f91a7ad63f8f3e4e564d3f41f3d030ab8a9 Mon Sep 17 00:00:00 2001 From: Gollu Appalanaidu Date: Thu, 22 Oct 2020 14:37:08 +0530 Subject: [PATCH 30/30] hw/block/nvme: fix queue identifier validation The nvme_check_{sq,cq} functions check if the given queue identifier is valid *and* that the queue exists. Thus, the function return value cannot simply be inverted to check if the identifier is valid and that the queue does *not* exist. Replace the call with an OR'ed version of the checks. Signed-off-by: Gollu Appalanaidu Signed-off-by: Klaus Jensen Reviewed-by: Keith Busch --- hw/block/nvme.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 5dfef0204c..fa2cba744b 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1143,7 +1143,8 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req) trace_pci_nvme_err_invalid_create_sq_cqid(cqid); return NVME_INVALID_CQID | NVME_DNR; } - if (unlikely(!sqid || !nvme_check_sqid(n, sqid))) { + if (unlikely(!sqid || sqid > n->params.max_ioqpairs || + n->sq[sqid] != NULL)) { trace_pci_nvme_err_invalid_create_sq_sqid(sqid); return NVME_INVALID_QID | NVME_DNR; } @@ -1398,7 +1399,8 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req) trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags, NVME_CQ_FLAGS_IEN(qflags) != 0); - if (unlikely(!cqid || !nvme_check_cqid(n, cqid))) { + if (unlikely(!cqid || cqid > n->params.max_ioqpairs || + n->cq[cqid] != NULL)) { trace_pci_nvme_err_invalid_create_cq_cqid(cqid); return NVME_INVALID_QID | NVME_DNR; }