hw/block/nvme: add the dataset management command

Add support for the Dataset Management command and the Deallocate
attribute. Deallocation results in discards being sent to the underlying
block device. Whether or not the blocks are actually deallocated is
affected by the same factors as Write Zeroes (see previous commit).

     format | discard | dsm (512B)  dsm (4KiB)  dsm (64KiB)
    --------------------------------------------------------
      qcow2    ignore   n           n           n
      qcow2    unmap    n           n           y
      raw      ignore   n           n           n
      raw      unmap    n           y           y

Again, a raw format and 4KiB LBAs are preferable.

In order to set the Namespace Preferred Deallocate Granularity and
Alignment fields (NPDG and NPDA), choose a sane minimum discard
granularity of 4KiB. If we are using a passthru device supporting
discard at a 512B granularity, the user should set the discard_granularity
property explicitly. NPDG and NPDA will also account for the
cluster_size of the block driver if required (i.e. for QCOW2).

See NVM Express 1.3d, Section 6.7 ("Dataset Management command").

Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
This commit is contained in:
Klaus Jensen 2020-10-21 14:03:19 +02:00
parent 6fd704a59a
commit 2605257a26
3 changed files with 125 additions and 5 deletions

View File

@ -28,10 +28,14 @@
#include "nvme.h" #include "nvme.h"
#include "nvme-ns.h" #include "nvme-ns.h"
static void nvme_ns_init(NvmeNamespace *ns) #define MIN_DISCARD_GRANULARITY (4 * KiB)
static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
{ {
BlockDriverInfo bdi;
NvmeIdNs *id_ns = &ns->id_ns; NvmeIdNs *id_ns = &ns->id_ns;
int lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas); int lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
int npdg;
ns->id_ns.dlfeat = 0x9; ns->id_ns.dlfeat = 0x9;
@ -43,8 +47,19 @@ static void nvme_ns_init(NvmeNamespace *ns)
id_ns->ncap = id_ns->nsze; id_ns->ncap = id_ns->nsze;
id_ns->nuse = id_ns->ncap; id_ns->nuse = id_ns->ncap;
/* support DULBE */ /* support DULBE and I/O optimization fields */
id_ns->nsfeat |= 0x4; id_ns->nsfeat |= (0x4 | 0x10);
npdg = ns->blkconf.discard_granularity / ns->blkconf.logical_block_size;
if (bdrv_get_info(blk_bs(ns->blkconf.blk), &bdi) >= 0 &&
bdi.cluster_size > ns->blkconf.discard_granularity) {
npdg = bdi.cluster_size / ns->blkconf.logical_block_size;
}
id_ns->npda = id_ns->npdg = npdg - 1;
return 0;
} }
static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
@ -60,6 +75,11 @@ static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
return -1; return -1;
} }
if (ns->blkconf.discard_granularity == -1) {
ns->blkconf.discard_granularity =
MAX(ns->blkconf.logical_block_size, MIN_DISCARD_GRANULARITY);
}
ns->size = blk_getlength(ns->blkconf.blk); ns->size = blk_getlength(ns->blkconf.blk);
if (ns->size < 0) { if (ns->size < 0) {
error_setg_errno(errp, -ns->size, "could not get blockdev size"); error_setg_errno(errp, -ns->size, "could not get blockdev size");
@ -93,7 +113,9 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
return -1; return -1;
} }
nvme_ns_init(ns); if (nvme_ns_init(ns, errp)) {
return -1;
}
if (nvme_register_namespace(n, ns, errp)) { if (nvme_register_namespace(n, ns, errp)) {
return -1; return -1;

View File

@ -242,6 +242,7 @@ static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
static void nvme_req_clear(NvmeRequest *req) static void nvme_req_clear(NvmeRequest *req)
{ {
req->ns = NULL; req->ns = NULL;
req->opaque = NULL;
memset(&req->cqe, 0x0, sizeof(req->cqe)); memset(&req->cqe, 0x0, sizeof(req->cqe));
req->status = NVME_SUCCESS; req->status = NVME_SUCCESS;
} }
@ -978,6 +979,99 @@ static void nvme_rw_cb(void *opaque, int ret)
nvme_enqueue_req_completion(nvme_cq(req), req); nvme_enqueue_req_completion(nvme_cq(req), req);
} }
/*
 * Completion callback for a single discard issued on behalf of a Dataset
 * Management request.
 *
 * @opaque: the NvmeRequest that was handed to blk_aio_pdiscard()
 * @ret:    result of the discard; any non-zero value is forwarded to
 *          nvme_aio_err()
 *
 * The storage of req->opaque is reused as a counter of discards that have
 * not completed yet. Each invocation drops the counter by one, and the
 * request is queued for completion only by the invocation that brings the
 * counter down to zero.
 */
static void nvme_aio_discard_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    uintptr_t *pending = (uintptr_t *)&req->opaque;

    trace_pci_nvme_aio_discard_cb(nvme_cid(req));

    if (ret != 0) {
        nvme_aio_err(req, ret);
    }

    /* Only the last outstanding discard completes the request. */
    if (--(*pending) == 0) {
        nvme_enqueue_req_completion(nvme_cq(req), req);
    }
}
/*
 * Handle the Dataset Management command.
 *
 * Only the Deallocate attribute (NVME_DSMGMT_AD) is acted upon: every range
 * supplied with the command is turned into one or more discards on the
 * namespace's block backend. If the attribute is not set, nothing is done
 * and NVME_SUCCESS is returned.
 *
 * Returns:
 *   - the nvme_dma() status if fetching the range list fails;
 *   - NVME_NO_COMPLETE if at least one discard is still in flight, in which
 *     case nvme_aio_discard_cb() queues the completion later;
 *   - req->status if no discard is outstanding when submission finishes;
 *   - NVME_SUCCESS if the Deallocate attribute is not set.
 */
static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
{
NvmeNamespace *ns = req->ns;
NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
uint32_t attr = le32_to_cpu(dsm->attributes);
/* only the low 8 bits are used; the + 1 treats the field as zero-based */
uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
uint16_t status = NVME_SUCCESS;
trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
if (attr & NVME_DSMGMT_AD) {
int64_t offset;
size_t len;
/* nr is at most 256 because of the 0xff mask above */
NvmeDsmRange range[nr];
/* the storage of req->opaque doubles as the outstanding-discard counter */
uintptr_t *discards = (uintptr_t *)&req->opaque;
/* fetch the range list for this command into the local array */
status = nvme_dma(n, (uint8_t *)range, sizeof(range),
DMA_DIRECTION_TO_DEVICE, req);
if (status) {
return status;
}
/*
* AIO callbacks may be called immediately, so initialize discards to 1
* to make sure the callback does not complete the request before
* all discards have been issued.
*/
*discards = 1;
for (int i = 0; i < nr; i++) {
uint64_t slba = le64_to_cpu(range[i].slba);
uint32_t nlb = le32_to_cpu(range[i].nlb);
/* an out-of-bounds range is traced and skipped; the command goes on */
if (nvme_check_bounds(ns, slba, nlb)) {
trace_pci_nvme_err_invalid_lba_range(slba, nlb,
ns->id_ns.nsze);
continue;
}
trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
nlb);
/* convert the range from logical blocks to a byte offset and length */
offset = nvme_l2b(ns, slba);
len = nvme_l2b(ns, nlb);
/* issue the range in chunks of at most BDRV_REQUEST_MAX_BYTES */
while (len) {
size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
/* count the discard before submitting it; see the note above */
(*discards)++;
blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
nvme_aio_discard_cb, req);
offset += bytes;
len -= bytes;
}
}
/* account for the 1-initialization */
(*discards)--;
/* non-zero: discards are still pending and the callback will complete */
if (*discards) {
status = NVME_NO_COMPLETE;
} else {
status = req->status;
}
}
return status;
}
static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req) static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
{ {
block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0, block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
@ -1107,6 +1201,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
case NVME_CMD_WRITE: case NVME_CMD_WRITE:
case NVME_CMD_READ: case NVME_CMD_READ:
return nvme_rw(n, req); return nvme_rw(n, req);
case NVME_CMD_DSM:
return nvme_dsm(n, req);
default: default:
trace_pci_nvme_err_invalid_opc(req->cmd.opcode); trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
return NVME_INVALID_OPCODE | NVME_DNR; return NVME_INVALID_OPCODE | NVME_DNR;
@ -2829,7 +2925,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
id->cqes = (0x4 << 4) | 0x4; id->cqes = (0x4 << 4) | 0x4;
id->nn = cpu_to_le32(n->num_namespaces); id->nn = cpu_to_le32(n->num_namespaces);
id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP | id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
NVME_ONCS_FEATURES); NVME_ONCS_FEATURES | NVME_ONCS_DSM);
id->vwc = 0x1; id->vwc = 0x1;
id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN | id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |

View File

@ -28,6 +28,7 @@ typedef struct NvmeRequest {
struct NvmeNamespace *ns; struct NvmeNamespace *ns;
BlockAIOCB *aiocb; BlockAIOCB *aiocb;
uint16_t status; uint16_t status;
void *opaque;
NvmeCqe cqe; NvmeCqe cqe;
NvmeCmd cmd; NvmeCmd cmd;
BlockAcctCookie acct; BlockAcctCookie acct;
@ -60,6 +61,7 @@ static inline const char *nvme_io_opc_str(uint8_t opc)
case NVME_CMD_WRITE: return "NVME_NVM_CMD_WRITE"; case NVME_CMD_WRITE: return "NVME_NVM_CMD_WRITE";
case NVME_CMD_READ: return "NVME_NVM_CMD_READ"; case NVME_CMD_READ: return "NVME_NVM_CMD_READ";
case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES"; case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES";
case NVME_CMD_DSM: return "NVME_NVM_CMD_DSM";
default: return "NVME_NVM_CMD_UNKNOWN"; default: return "NVME_NVM_CMD_UNKNOWN";
} }
} }