hw/block/nvme: zero out zones on reset

The zoned command set specification states that "All logical blocks in a
zone *shall* be marked as deallocated when [the zone is reset]". Since
the device guarantees that 0x00 is read from deallocated blocks, we have to
issue a pwrite_zeroes, because we cannot be sure that a discard will do
anything. Typically, though, this will be achieved with an efficient
unmap/discard operation.

Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
Tested-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
This commit is contained in:
Klaus Jensen 2020-12-09 23:43:15 +01:00
parent b05fde2881
commit 5f5dc4c6a9
2 changed files with 113 additions and 38 deletions

View File

@ -1371,6 +1371,53 @@ static void nvme_aio_discard_cb(void *opaque, int ret)
nvme_enqueue_req_completion(nvme_cq(req), req); nvme_enqueue_req_completion(nvme_cq(req), req);
} }
/*
 * Context passed as the opaque argument to nvme_aio_zone_reset_cb(). One
 * instance is allocated per zone being reset and is freed by the callback.
 */
struct nvme_zone_reset_ctx {
/* request on whose behalf the zone is being reset */
NvmeRequest *req;
/* zone that the callback transitions once the AIO has completed */
NvmeZone *zone;
};
/*
 * AIO completion callback for the write-zeroes operation issued when a zone
 * is reset.
 *
 * @opaque: the struct nvme_zone_reset_ctx allocated for this operation; it is
 *          freed here.
 * @ret:    result of the AIO; zero means success.
 *
 * On success, a zone that is open, closed or full has its write pointer
 * rewound to the zone start and is assigned the EMPTY state, with
 * nvme_aor_dec_open()/nvme_aor_dec_active() called as dictated by the state
 * it was in. On failure the error is handed to nvme_aio_err() and the zone is
 * left untouched.
 *
 * req->opaque is used as a counter of outstanding resets; the request is only
 * queued for completion once that counter reaches zero.
 */
static void nvme_aio_zone_reset_cb(void *opaque, int ret)
{
    struct nvme_zone_reset_ctx *reset_ctx = opaque;
    NvmeRequest *req = reset_ctx->req;
    NvmeZone *zone = reset_ctx->zone;
    NvmeNamespace *ns = req->ns;
    uintptr_t *outstanding = (uintptr_t *)&req->opaque;

    /* Everything needed has been copied out of the context already. */
    g_free(reset_ctx);

    trace_pci_nvme_aio_zone_reset_cb(nvme_cid(req), zone->d.zslba);

    if (ret) {
        nvme_aio_err(req, ret);
        goto out;
    }

    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
        nvme_aor_dec_open(ns);
        /* fall through */
    case NVME_ZONE_STATE_CLOSED:
        nvme_aor_dec_active(ns);
        /* fall through */
    case NVME_ZONE_STATE_FULL:
        zone->w_ptr = zone->d.zslba;
        zone->d.wp = zone->w_ptr;
        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
        break;
    default:
        /* Any other state is left as it is. */
        break;
    }

out:
    /* Only the last outstanding reset completes the request. */
    if (--(*outstanding) != 0) {
        return;
    }

    nvme_enqueue_req_completion(nvme_cq(req), req);
}
struct nvme_compare_ctx { struct nvme_compare_ctx {
QEMUIOVector iov; QEMUIOVector iov;
uint8_t *bounce; uint8_t *bounce;
@ -1735,7 +1782,8 @@ static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
return NVME_SUCCESS; return NVME_SUCCESS;
} }
typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState); typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState,
NvmeRequest *);
enum NvmeZoneProcessingMask { enum NvmeZoneProcessingMask {
NVME_PROC_CURRENT_ZONE = 0, NVME_PROC_CURRENT_ZONE = 0,
@ -1746,7 +1794,7 @@ enum NvmeZoneProcessingMask {
}; };
static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone, static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
NvmeZoneState state) NvmeZoneState state, NvmeRequest *req)
{ {
uint16_t status; uint16_t status;
@ -1779,7 +1827,7 @@ static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
} }
static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone, static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
NvmeZoneState state) NvmeZoneState state, NvmeRequest *req)
{ {
switch (state) { switch (state) {
case NVME_ZONE_STATE_EXPLICITLY_OPEN: case NVME_ZONE_STATE_EXPLICITLY_OPEN:
@ -1795,7 +1843,7 @@ static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
} }
static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone, static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
NvmeZoneState state) NvmeZoneState state, NvmeRequest *req)
{ {
switch (state) { switch (state) {
case NVME_ZONE_STATE_EXPLICITLY_OPEN: case NVME_ZONE_STATE_EXPLICITLY_OPEN:
@ -1818,30 +1866,42 @@ static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
} }
/*
 * Zone reset handler (op_handler_t).
 *
 * NOTE(review): this span is a side-by-side diff rendering, so removed and
 * added lines are interleaved; the description below covers the version
 * that takes the additional NvmeRequest parameter.
 *
 * A zone in the EMPTY state needs no work and NVME_SUCCESS is returned. For
 * a zone that is explicitly/implicitly open, closed or full, a
 * nvme_zone_reset_ctx is allocated, the counter kept in req->opaque is
 * incremented and a write-zeroes AIO (with BDRV_REQ_MAY_UNMAP) is issued
 * starting at the zone's zslba for ns->zone_size logical blocks, with
 * nvme_aio_zone_reset_cb as the completion callback; NVME_NO_COMPLETE is
 * then returned. Any other state yields NVME_ZONE_INVAL_TRANSITION.
 *
 * nvme_l2b() presumably converts logical blocks to bytes -- TODO confirm.
 */
static uint16_t nvme_reset_zone(NvmeNamespace *ns, NvmeZone *zone, static uint16_t nvme_reset_zone(NvmeNamespace *ns, NvmeZone *zone,
NvmeZoneState state) NvmeZoneState state, NvmeRequest *req)
{ {
uintptr_t *resets = (uintptr_t *)&req->opaque;
struct nvme_zone_reset_ctx *ctx;
switch (state) { switch (state) {
case NVME_ZONE_STATE_EXPLICITLY_OPEN:
case NVME_ZONE_STATE_IMPLICITLY_OPEN:
nvme_aor_dec_open(ns);
/* fall through */
case NVME_ZONE_STATE_CLOSED:
nvme_aor_dec_active(ns);
/* fall through */
case NVME_ZONE_STATE_FULL:
zone->w_ptr = zone->d.zslba;
zone->d.wp = zone->w_ptr;
nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
/* fall through */
case NVME_ZONE_STATE_EMPTY: case NVME_ZONE_STATE_EMPTY:
return NVME_SUCCESS; return NVME_SUCCESS;
case NVME_ZONE_STATE_EXPLICITLY_OPEN:
case NVME_ZONE_STATE_IMPLICITLY_OPEN:
case NVME_ZONE_STATE_CLOSED:
case NVME_ZONE_STATE_FULL:
break;
default: default:
return NVME_ZONE_INVAL_TRANSITION; return NVME_ZONE_INVAL_TRANSITION;
} }
/*
* The zone reset aio callback needs to know the zone that is being reset
* in order to transition the zone on completion.
*/
ctx = g_new(struct nvme_zone_reset_ctx, 1);
ctx->req = req;
ctx->zone = zone;
(*resets)++;
blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_l2b(ns, zone->d.zslba),
nvme_l2b(ns, ns->zone_size), BDRV_REQ_MAY_UNMAP,
nvme_aio_zone_reset_cb, ctx);
return NVME_NO_COMPLETE;
} }
static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone, static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
NvmeZoneState state) NvmeZoneState state, NvmeRequest *req)
{ {
switch (state) { switch (state) {
case NVME_ZONE_STATE_READ_ONLY: case NVME_ZONE_STATE_READ_ONLY:
@ -1875,7 +1935,7 @@ static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone, static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
enum NvmeZoneProcessingMask proc_mask, enum NvmeZoneProcessingMask proc_mask,
op_handler_t op_hndlr) op_handler_t op_hndlr, NvmeRequest *req)
{ {
uint16_t status = NVME_SUCCESS; uint16_t status = NVME_SUCCESS;
NvmeZoneState zs = nvme_get_zone_state(zone); NvmeZoneState zs = nvme_get_zone_state(zone);
@ -1900,7 +1960,7 @@ static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
} }
if (proc_zone) { if (proc_zone) {
status = op_hndlr(ns, zone, zs); status = op_hndlr(ns, zone, zs, req);
} }
return status; return status;
@ -1908,42 +1968,46 @@ static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone, static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
enum NvmeZoneProcessingMask proc_mask, enum NvmeZoneProcessingMask proc_mask,
op_handler_t op_hndlr) op_handler_t op_hndlr, NvmeRequest *req)
{ {
NvmeZone *next; NvmeZone *next;
uint16_t status = NVME_SUCCESS; uint16_t status = NVME_SUCCESS;
int i; int i;
if (!proc_mask) { if (!proc_mask) {
status = op_hndlr(ns, zone, nvme_get_zone_state(zone)); status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req);
} else { } else {
if (proc_mask & NVME_PROC_CLOSED_ZONES) { if (proc_mask & NVME_PROC_CLOSED_ZONES) {
QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) { QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr); status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
if (status != NVME_SUCCESS) { req);
if (status && status != NVME_NO_COMPLETE) {
goto out; goto out;
} }
} }
} }
if (proc_mask & NVME_PROC_OPENED_ZONES) { if (proc_mask & NVME_PROC_OPENED_ZONES) {
QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) { QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr); status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
if (status != NVME_SUCCESS) { req);
if (status && status != NVME_NO_COMPLETE) {
goto out; goto out;
} }
} }
QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) { QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr); status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
if (status != NVME_SUCCESS) { req);
if (status && status != NVME_NO_COMPLETE) {
goto out; goto out;
} }
} }
} }
if (proc_mask & NVME_PROC_FULL_ZONES) { if (proc_mask & NVME_PROC_FULL_ZONES) {
QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) { QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) {
status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr); status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
if (status != NVME_SUCCESS) { req);
if (status && status != NVME_NO_COMPLETE) {
goto out; goto out;
} }
} }
@ -1951,8 +2015,9 @@ static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
if (proc_mask & NVME_PROC_READ_ONLY_ZONES) { if (proc_mask & NVME_PROC_READ_ONLY_ZONES) {
for (i = 0; i < ns->num_zones; i++, zone++) { for (i = 0; i < ns->num_zones; i++, zone++) {
status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr); status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
if (status != NVME_SUCCESS) { req);
if (status && status != NVME_NO_COMPLETE) {
goto out; goto out;
} }
} }
@ -1968,6 +2033,7 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
NvmeCmd *cmd = (NvmeCmd *)&req->cmd; NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
NvmeNamespace *ns = req->ns; NvmeNamespace *ns = req->ns;
NvmeZone *zone; NvmeZone *zone;
uintptr_t *resets;
uint8_t *zd_ext; uint8_t *zd_ext;
uint32_t dw13 = le32_to_cpu(cmd->cdw13); uint32_t dw13 = le32_to_cpu(cmd->cdw13);
uint64_t slba = 0; uint64_t slba = 0;
@ -2002,7 +2068,7 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
proc_mask = NVME_PROC_CLOSED_ZONES; proc_mask = NVME_PROC_CLOSED_ZONES;
} }
trace_pci_nvme_open_zone(slba, zone_idx, all); trace_pci_nvme_open_zone(slba, zone_idx, all);
status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone); status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req);
break; break;
case NVME_ZONE_ACTION_CLOSE: case NVME_ZONE_ACTION_CLOSE:
@ -2010,7 +2076,7 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
proc_mask = NVME_PROC_OPENED_ZONES; proc_mask = NVME_PROC_OPENED_ZONES;
} }
trace_pci_nvme_close_zone(slba, zone_idx, all); trace_pci_nvme_close_zone(slba, zone_idx, all);
status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone); status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req);
break; break;
case NVME_ZONE_ACTION_FINISH: case NVME_ZONE_ACTION_FINISH:
@ -2018,24 +2084,32 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES; proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES;
} }
trace_pci_nvme_finish_zone(slba, zone_idx, all); trace_pci_nvme_finish_zone(slba, zone_idx, all);
status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone); status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req);
break; break;
case NVME_ZONE_ACTION_RESET: case NVME_ZONE_ACTION_RESET:
resets = (uintptr_t *)&req->opaque;
if (all) { if (all) {
proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES | proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES |
NVME_PROC_FULL_ZONES; NVME_PROC_FULL_ZONES;
} }
trace_pci_nvme_reset_zone(slba, zone_idx, all); trace_pci_nvme_reset_zone(slba, zone_idx, all);
status = nvme_do_zone_op(ns, zone, proc_mask, nvme_reset_zone);
break; *resets = 1;
status = nvme_do_zone_op(ns, zone, proc_mask, nvme_reset_zone, req);
(*resets)--;
return *resets ? NVME_NO_COMPLETE : req->status;
case NVME_ZONE_ACTION_OFFLINE: case NVME_ZONE_ACTION_OFFLINE:
if (all) { if (all) {
proc_mask = NVME_PROC_READ_ONLY_ZONES; proc_mask = NVME_PROC_READ_ONLY_ZONES;
} }
trace_pci_nvme_offline_zone(slba, zone_idx, all); trace_pci_nvme_offline_zone(slba, zone_idx, all);
status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone); status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req);
break; break;
case NVME_ZONE_ACTION_SET_ZD_EXT: case NVME_ZONE_ACTION_SET_ZD_EXT:

View File

@ -49,6 +49,7 @@ pci_nvme_dsm_deallocate(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb
pci_nvme_compare(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba 0x%"PRIx64" nlb %"PRIu32"" pci_nvme_compare(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba 0x%"PRIx64" nlb %"PRIu32""
pci_nvme_compare_cb(uint16_t cid) "cid %"PRIu16"" pci_nvme_compare_cb(uint16_t cid) "cid %"PRIu16""
pci_nvme_aio_discard_cb(uint16_t cid) "cid %"PRIu16"" pci_nvme_aio_discard_cb(uint16_t cid) "cid %"PRIu16""
pci_nvme_aio_zone_reset_cb(uint16_t cid, uint64_t zslba) "cid %"PRIu16" zslba 0x%"PRIx64""
pci_nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16"" pci_nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16""
pci_nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d" pci_nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d"
pci_nvme_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16"" pci_nvme_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16""