hw/block/nvme: try to deal with the iov/qsg duality

Introduce NvmeSg and try to deal with that pesky qsg/iov duality that
haunts all the memory-related functions.
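
Roughly, every path that used to juggle a QEMUSGList and a QEMUIOVector in
parallel now operates on a single NvmeSg. A minimal sketch of the resulting
call pattern (illustrative only: addr, len and the local sg are placeholders;
the real code keeps the NvmeSg embedded in NvmeRequest):

    NvmeSg sg;
    uint16_t status;

    nvme_sg_init(n, &sg, nvme_addr_is_dma(n, addr)); /* pick qsg or iov once */

    status = nvme_map_addr(n, &sg, addr, len);       /* no qsg/iov pair to pass */
    if (status) {
        goto unmap;
    }

    /* ... */

unmap:
    nvme_sg_unmap(&sg); /* destroys whichever of the two was initialized */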

Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Klaus Jensen 2021-02-07 21:06:01 +01:00
parent 569dbe19c4
commit f80a1c331a
2 changed files with 117 additions and 91 deletions

hw/block/nvme.c

@@ -441,15 +441,31 @@ static void nvme_req_clear(NvmeRequest *req)
     req->status = NVME_SUCCESS;
 }
 
-static void nvme_req_exit(NvmeRequest *req)
+static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma)
 {
-    if (req->qsg.sg) {
-        qemu_sglist_destroy(&req->qsg);
+    if (dma) {
+        pci_dma_sglist_init(&sg->qsg, &n->parent_obj, 0);
+        sg->flags = NVME_SG_DMA;
+    } else {
+        qemu_iovec_init(&sg->iov, 0);
     }
 
-    if (req->iov.iov) {
-        qemu_iovec_destroy(&req->iov);
+    sg->flags |= NVME_SG_ALLOC;
+}
+
+static inline void nvme_sg_unmap(NvmeSg *sg)
+{
+    if (!(sg->flags & NVME_SG_ALLOC)) {
+        return;
+    }
+
+    if (sg->flags & NVME_SG_DMA) {
+        qemu_sglist_destroy(&sg->qsg);
+    } else {
+        qemu_iovec_destroy(&sg->iov);
     }
+
+    memset(sg, 0x0, sizeof(*sg));
 }
 
 static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
@@ -486,8 +502,7 @@ static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
     return NVME_SUCCESS;
 }
 
-static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
-                              hwaddr addr, size_t len)
+static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
 {
     bool cmb = false, pmr = false;
@@ -504,38 +519,31 @@ static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
     }
 
     if (cmb || pmr) {
-        if (qsg && qsg->sg) {
+        if (sg->flags & NVME_SG_DMA) {
             return NVME_INVALID_USE_OF_CMB | NVME_DNR;
         }
 
-        assert(iov);
-
-        if (!iov->iov) {
-            qemu_iovec_init(iov, 1);
-        }
-
         if (cmb) {
-            return nvme_map_addr_cmb(n, iov, addr, len);
+            return nvme_map_addr_cmb(n, &sg->iov, addr, len);
         } else {
-            return nvme_map_addr_pmr(n, iov, addr, len);
+            return nvme_map_addr_pmr(n, &sg->iov, addr, len);
         }
     }
 
-    if (iov && iov->iov) {
+    if (!(sg->flags & NVME_SG_DMA)) {
         return NVME_INVALID_USE_OF_CMB | NVME_DNR;
     }
 
-    assert(qsg);
-
-    if (!qsg->sg) {
-        pci_dma_sglist_init(qsg, &n->parent_obj, 1);
-    }
-
-    qemu_sglist_add(qsg, addr, len);
+    qemu_sglist_add(&sg->qsg, addr, len);
 
     return NVME_SUCCESS;
 }
 
+static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr)
+{
+    return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr));
+}
+
 static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2,
                              uint32_t len, NvmeRequest *req)
 {
@@ -545,20 +553,13 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2,
     uint16_t status;
     int ret;
 
-    QEMUSGList *qsg = &req->qsg;
-    QEMUIOVector *iov = &req->iov;
-
     trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps);
 
-    if (nvme_addr_is_cmb(n, prp1) || (nvme_addr_is_pmr(n, prp1))) {
-        qemu_iovec_init(iov, num_prps);
-    } else {
-        pci_dma_sglist_init(qsg, &n->parent_obj, num_prps);
-    }
+    nvme_sg_init(n, &req->sg, nvme_addr_is_dma(n, prp1));
 
-    status = nvme_map_addr(n, qsg, iov, prp1, trans_len);
+    status = nvme_map_addr(n, &req->sg, prp1, trans_len);
     if (status) {
-        return status;
+        goto unmap;
     }
 
     len -= trans_len;
@@ -573,7 +574,8 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2,
         ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
         if (ret) {
             trace_pci_nvme_err_addr_read(prp2);
-            return NVME_DATA_TRAS_ERROR;
+            status = NVME_DATA_TRAS_ERROR;
+            goto unmap;
         }
         while (len != 0) {
             uint64_t prp_ent = le64_to_cpu(prp_list[i]);
@@ -581,7 +583,8 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2,
             if (i == n->max_prp_ents - 1 && len > n->page_size) {
                 if (unlikely(prp_ent & (n->page_size - 1))) {
                     trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
-                    return NVME_INVALID_PRP_OFFSET | NVME_DNR;
+                    status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
+                    goto unmap;
                 }
 
                 i = 0;
@@ -591,20 +594,22 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2,
                                      prp_trans);
                 if (ret) {
                     trace_pci_nvme_err_addr_read(prp_ent);
-                    return NVME_DATA_TRAS_ERROR;
+                    status = NVME_DATA_TRAS_ERROR;
+                    goto unmap;
                 }
                 prp_ent = le64_to_cpu(prp_list[i]);
             }
 
             if (unlikely(prp_ent & (n->page_size - 1))) {
                 trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
-                return NVME_INVALID_PRP_OFFSET | NVME_DNR;
+                status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
+                goto unmap;
             }
 
             trans_len = MIN(len, n->page_size);
-            status = nvme_map_addr(n, qsg, iov, prp_ent, trans_len);
+            status = nvme_map_addr(n, &req->sg, prp_ent, trans_len);
             if (status) {
-                return status;
+                goto unmap;
             }
 
             len -= trans_len;
@@ -613,24 +618,28 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2,
         } else {
             if (unlikely(prp2 & (n->page_size - 1))) {
                 trace_pci_nvme_err_invalid_prp2_align(prp2);
-                return NVME_INVALID_PRP_OFFSET | NVME_DNR;
+                status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
+                goto unmap;
             }
 
-            status = nvme_map_addr(n, qsg, iov, prp2, len);
+            status = nvme_map_addr(n, &req->sg, prp2, len);
             if (status) {
-                return status;
+                goto unmap;
             }
         }
     }
 
     return NVME_SUCCESS;
+
+unmap:
+    nvme_sg_unmap(&req->sg);
+    return status;
 }
 
 /*
  * Map 'nsgld' data descriptors from 'segment'. The function will subtract the
  * number of bytes mapped in len.
  */
-static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList *qsg,
-                                  QEMUIOVector *iov,
+static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg,
                                   NvmeSglDescriptor *segment, uint64_t nsgld,
                                   size_t *len, NvmeRequest *req)
 {
@@ -688,7 +697,7 @@ static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList *qsg,
             return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
         }
 
-        status = nvme_map_addr(n, qsg, iov, addr, trans_len);
+        status = nvme_map_addr(n, sg, addr, trans_len);
         if (status) {
             return status;
         }
@@ -700,9 +709,8 @@ next:
     return NVME_SUCCESS;
 }
 
-static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
-                             NvmeSglDescriptor sgl, size_t len,
-                             NvmeRequest *req)
+static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
+                             size_t len, NvmeRequest *req)
 {
     /*
      * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
@@ -725,12 +733,14 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
 
     trace_pci_nvme_map_sgl(nvme_cid(req), NVME_SGL_TYPE(sgl.type), len);
 
+    nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr));
+
     /*
      * If the entire transfer can be described with a single data block it can
     * be mapped directly.
      */
     if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
-        status = nvme_map_sgl_data(n, qsg, iov, sgld, 1, &len, req);
+        status = nvme_map_sgl_data(n, sg, sgld, 1, &len, req);
         if (status) {
             goto unmap;
         }
@@ -768,7 +778,7 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
                 goto unmap;
             }
 
-            status = nvme_map_sgl_data(n, qsg, iov, segment, SEG_CHUNK_SIZE,
+            status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE,
                                        &len, req);
             if (status) {
                 goto unmap;
@@ -795,7 +805,7 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
         switch (NVME_SGL_TYPE(last_sgld->type)) {
         case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
         case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
-            status = nvme_map_sgl_data(n, qsg, iov, segment, nsgld, &len, req);
+            status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, req);
             if (status) {
                 goto unmap;
             }
@@ -822,7 +832,7 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
          * Do not map the last descriptor; it will be a Segment or Last Segment
          * descriptor and is handled by the next iteration.
          */
-        status = nvme_map_sgl_data(n, qsg, iov, segment, nsgld - 1, &len, req);
+        status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, req);
         if (status) {
             goto unmap;
         }
@@ -838,14 +848,7 @@ out:
     return NVME_SUCCESS;
 
 unmap:
-    if (iov->iov) {
-        qemu_iovec_destroy(iov);
-    }
-
-    if (qsg->sg) {
-        qemu_sglist_destroy(qsg);
-    }
-
+    nvme_sg_unmap(sg);
     return status;
 }
@@ -866,8 +869,7 @@ static uint16_t nvme_map_dptr(NvmeCtrl *n, size_t len, NvmeRequest *req)
             return NVME_INVALID_FIELD | NVME_DNR;
         }
 
-        return nvme_map_sgl(n, &req->qsg, &req->iov, req->cmd.dptr.sgl, len,
-                            req);
+        return nvme_map_sgl(n, &req->sg, req->cmd.dptr.sgl, len, req);
     default:
         return NVME_INVALID_FIELD;
     }
@@ -883,16 +885,13 @@ static uint16_t nvme_dma(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
         return status;
     }
 
-    /* assert that only one of qsg and iov carries data */
-    assert((req->qsg.nsg > 0) != (req->iov.niov > 0));
-
-    if (req->qsg.nsg > 0) {
+    if (req->sg.flags & NVME_SG_DMA) {
         uint64_t residual;
 
         if (dir == DMA_DIRECTION_TO_DEVICE) {
-            residual = dma_buf_write(ptr, len, &req->qsg);
+            residual = dma_buf_write(ptr, len, &req->sg.qsg);
         } else {
-            residual = dma_buf_read(ptr, len, &req->qsg);
+            residual = dma_buf_read(ptr, len, &req->sg.qsg);
         }
 
         if (unlikely(residual)) {
@@ -903,9 +902,9 @@ static uint16_t nvme_dma(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
         size_t bytes;
 
         if (dir == DMA_DIRECTION_TO_DEVICE) {
-            bytes = qemu_iovec_to_buf(&req->iov, 0, ptr, len);
+            bytes = qemu_iovec_to_buf(&req->sg.iov, 0, ptr, len);
         } else {
-            bytes = qemu_iovec_from_buf(&req->iov, 0, ptr, len);
+            bytes = qemu_iovec_from_buf(&req->sg.iov, 0, ptr, len);
         }
 
         if (unlikely(bytes != len)) {
@@ -917,6 +916,32 @@ static uint16_t nvme_dma(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
     return status;
 }
 
+static inline void nvme_blk_read(BlockBackend *blk, int64_t offset,
+                                 BlockCompletionFunc *cb, NvmeRequest *req)
+{
+    assert(req->sg.flags & NVME_SG_ALLOC);
+
+    if (req->sg.flags & NVME_SG_DMA) {
+        req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
+                                  cb, req);
+    } else {
+        req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req);
+    }
+}
+
+static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
+                                  BlockCompletionFunc *cb, NvmeRequest *req)
+{
+    assert(req->sg.flags & NVME_SG_ALLOC);
+
+    if (req->sg.flags & NVME_SG_DMA) {
+        req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
+                                   cb, req);
+    } else {
+        req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req);
+    }
+}
+
 static void nvme_post_cqes(void *opaque)
 {
     NvmeCQueue *cq = opaque;
@@ -947,7 +972,7 @@ static void nvme_post_cqes(void *opaque)
         }
 
         QTAILQ_REMOVE(&cq->req_list, req, entry);
         nvme_inc_cq_tail(cq);
-        nvme_req_exit(req);
+        nvme_sg_unmap(&req->sg);
         QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
     }
     if (cq->tail != cq->head) {
@@ -1644,14 +1669,14 @@ static void nvme_copy_in_complete(NvmeRequest *req)
         zone->w_ptr += ctx->nlb;
     }
 
-    qemu_iovec_init(&req->iov, 1);
-    qemu_iovec_add(&req->iov, ctx->bounce, nvme_l2b(ns, ctx->nlb));
+    qemu_iovec_init(&req->sg.iov, 1);
+    qemu_iovec_add(&req->sg.iov, ctx->bounce, nvme_l2b(ns, ctx->nlb));
 
     block_acct_start(blk_get_stats(ns->blkconf.blk), &req->acct, 0,
                      BLOCK_ACCT_WRITE);
 
     req->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, sdlba),
-                                 &req->iov, 0, nvme_copy_cb, req);
+                                 &req->sg.iov, 0, nvme_copy_cb, req);
 
     return;
@@ -2087,13 +2112,7 @@ static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
 
     block_acct_start(blk_get_stats(blk), &req->acct, data_size,
                      BLOCK_ACCT_READ);
-    if (req->qsg.sg) {
-        req->aiocb = dma_blk_read(blk, &req->qsg, data_offset,
-                                  BDRV_SECTOR_SIZE, nvme_rw_cb, req);
-    } else {
-        req->aiocb = blk_aio_preadv(blk, data_offset, &req->iov, 0,
-                                    nvme_rw_cb, req);
-    }
+    nvme_blk_read(blk, data_offset, nvme_rw_cb, req);
 
     return NVME_NO_COMPLETE;
 
 invalid:
@@ -2173,13 +2192,7 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
 
         block_acct_start(blk_get_stats(blk), &req->acct, data_size,
                          BLOCK_ACCT_WRITE);
-        if (req->qsg.sg) {
-            req->aiocb = dma_blk_write(blk, &req->qsg, data_offset,
-                                       BDRV_SECTOR_SIZE, nvme_rw_cb, req);
-        } else {
-            req->aiocb = blk_aio_pwritev(blk, data_offset, &req->iov, 0,
-                                         nvme_rw_cb, req);
-        }
+        nvme_blk_write(blk, data_offset, nvme_rw_cb, req);
     } else {
         req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
                                            BDRV_REQ_MAY_UNMAP, nvme_rw_cb,

hw/block/nvme.h

@@ -29,6 +29,20 @@ typedef struct NvmeAsyncEvent {
     NvmeAerResult result;
 } NvmeAsyncEvent;
 
+enum {
+    NVME_SG_ALLOC = 1 << 0,
+    NVME_SG_DMA   = 1 << 1,
+};
+
+typedef struct NvmeSg {
+    int flags;
+
+    union {
+        QEMUSGList   qsg;
+        QEMUIOVector iov;
+    };
+} NvmeSg;
+
 typedef struct NvmeRequest {
     struct NvmeSQueue       *sq;
     struct NvmeNamespace    *ns;
@@ -38,8 +52,7 @@ typedef struct NvmeRequest {
     NvmeCqe                 cqe;
     NvmeCmd                 cmd;
     BlockAcctCookie         acct;
-    QEMUSGList              qsg;
-    QEMUIOVector            iov;
+    NvmeSg                  sg;
     QTAILQ_ENTRY(NvmeRequest)entry;
 } NvmeRequest;
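
Because qsg and iov share storage in the union, callers are expected to check
NVME_SG_DMA before touching either member, mirroring what nvme_dma() and the
new nvme_blk_read()/nvme_blk_write() helpers above do. A minimal sketch of that
access rule (the helper below is illustrative only and not part of this
commit):

    static inline size_t nvme_sg_size(NvmeSg *sg)
    {
        assert(sg->flags & NVME_SG_ALLOC);

        /* QEMUSGList and QEMUIOVector both track total mapped bytes in .size */
        return (sg->flags & NVME_SG_DMA) ? sg->qsg.size : sg->iov.size;
    }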