NetBSD/sys/dev/ic/nvmevar.h

Fix a performance issue where one busy queue can starve all other queues.

In normal operations with multiple queues, the nvme driver will attempt to
schedule I/O requests on the submitting CPU. This breaks down when any one
of the queues becomes full; the driver returns EAGAIN to the disk layer,
which causes the disk layer to stop submitting more requests until the
blocked request is consumed. When space becomes available in the full queue,
it pulls the next buffer from the bufq and fills the queue again, until
finally hitting EAGAIN and preventing other queues from processing requests.

Two changes here to fix the problem:

 - When processing requests from the bufq, attempt to assign them to the
   queue associated with the CPU that originated the request.
 - If that queue is busy, try to find another queue with available space
   before returning EAGAIN.

This way, only when all queues are full will the disk layer stop submitting
more requests.

Now for some real numbers. On a Rockchip RK3399 board (6 CPUs), with
6 concurrent readers:

Old code:
4294967296 bytes transferred in 52.420 secs (81933752 bytes/sec)
4294967296 bytes transferred in 53.969 secs (79582117 bytes/sec)
4294967296 bytes transferred in 55.391 secs (77539082 bytes/sec)
4294967296 bytes transferred in 55.649 secs (77179595 bytes/sec)
4294967296 bytes transferred in 56.102 secs (76556402 bytes/sec)
4294967296 bytes transferred in 72.901 secs (58915066 bytes/sec)

New code:
4294967296 bytes transferred in 37.171 secs (115546186 bytes/sec)
4294967296 bytes transferred in 37.611 secs (114194445 bytes/sec)
4294967296 bytes transferred in 37.655 secs (114061009 bytes/sec)
4294967296 bytes transferred in 38.247 secs (112295534 bytes/sec)
4294967296 bytes transferred in 38.496 secs (111569183 bytes/sec)
4294967296 bytes transferred in 38.595 secs (111282997 bytes/sec)

/* $NetBSD: nvmevar.h,v 1.20 2019/06/28 15:08:47 jmcneill Exp $ */
/* $OpenBSD: nvmevar.h,v 1.8 2016/04/14 11:18:32 dlg Exp $ */
/*
* Copyright (c) 2014 David Gwynne <dlg@openbsd.org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <sys/bus.h>
#include <sys/cpu.h>
#include <sys/device.h>
#include <sys/mutex.h>
#include <sys/pool.h>
#include <sys/queue.h>
#include <sys/buf.h>
struct nvme_dmamem {
	bus_dmamap_t ndm_map;
	bus_dma_segment_t ndm_seg;
	size_t ndm_size;
	void *ndm_kva;
};
#define NVME_DMA_MAP(_ndm) ((_ndm)->ndm_map)
#define NVME_DMA_LEN(_ndm) ((_ndm)->ndm_map->dm_segs[0].ds_len)
#define NVME_DMA_DVA(_ndm) ((uint64_t)(_ndm)->ndm_map->dm_segs[0].ds_addr)
#define NVME_DMA_KVA(_ndm) ((void *)(_ndm)->ndm_kva)
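
/*
 * Usage sketch for the accessors above (illustrative only, not part of
 * the driver): sync an nvme_dmamem before its bus address is handed to
 * the controller.  bus_dmamap_sync() is the standard bus_dma(9) call;
 * the function name here is hypothetical.
 */
static __inline void
nvme_dmamem_prewrite_example(bus_dma_tag_t dmat, struct nvme_dmamem *ndm)
{
	/* Flush CPU writes to the DMA buffer before the device reads it. */
	bus_dmamap_sync(dmat, NVME_DMA_MAP(ndm), 0, NVME_DMA_LEN(ndm),
	    BUS_DMASYNC_PREWRITE);
	/* NVME_DMA_DVA(ndm) is the address to program into the command. */
}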
struct nvme_softc;
struct nvme_queue;
typedef void (*nvme_nnc_done)(void *, struct buf *, uint16_t, uint32_t);
struct nvme_ccb {
	SIMPLEQ_ENTRY(nvme_ccb) ccb_entry;

	/* DMA handles */
	bus_dmamap_t ccb_dmamap;

	bus_addr_t ccb_prpl_off;
	uint64_t ccb_prpl_dva;
	uint64_t *ccb_prpl;

	/* command context */
	uint16_t ccb_id;
	void *ccb_cookie;
#define NVME_CCB_FREE 0xbeefdeed
	void (*ccb_done)(struct nvme_queue *,
	    struct nvme_ccb *, struct nvme_cqe *);

	/* namespace context */
	void *nnc_cookie;
	nvme_nnc_done nnc_done;
	uint16_t nnc_nsid;
	uint16_t nnc_flags;
#define NVME_NS_CTX_F_READ __BIT(0)
#define NVME_NS_CTX_F_POLL __BIT(1)
#define NVME_NS_CTX_F_FUA __BIT(2)

	struct buf *nnc_buf;
	daddr_t nnc_blkno;
	size_t nnc_datasize;
	int nnc_secsize;
};
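
/*
 * Illustrative sketch of a namespace completion callback with the
 * nvme_nnc_done signature above.  The function name is hypothetical,
 * the status handling is simplified, and EIO needs <sys/errno.h>.
 */
static __inline void
nvme_nnc_done_example(void *cookie, struct buf *bp, uint16_t status,
    uint32_t cdw0)
{
	/* A disk front-end would map the NVMe status to an errno... */
	if (status != 0) {
		bp->b_error = EIO;
		bp->b_resid = bp->b_bcount;	/* nothing transferred */
	}
	/* ...and complete the buf so the disk layer can make progress. */
	biodone(bp);
}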
struct nvme_queue {
	struct nvme_softc *q_sc;
	kmutex_t q_sq_mtx;
	kmutex_t q_cq_mtx;
	struct nvme_dmamem *q_sq_dmamem;
	struct nvme_dmamem *q_cq_dmamem;
	bus_size_t q_sqtdbl; /* submission queue tail doorbell */
	bus_size_t q_cqhdbl; /* completion queue head doorbell */
	uint16_t q_id;
	uint32_t q_entries;
	uint32_t q_sq_tail;
	uint32_t q_cq_head;
	uint16_t q_cq_phase;

	kmutex_t q_ccb_mtx;
	kcondvar_t q_ccb_wait; /* wait for ccb avail/finish */
	bool q_ccb_waiting; /* whether there are waiters */
	uint16_t q_nccbs; /* total number of ccbs */
	struct nvme_ccb *q_ccbs;
	SIMPLEQ_HEAD(, nvme_ccb) q_ccb_list;
	struct nvme_dmamem *q_ccb_prpls;
};
struct nvme_namespace {
	struct nvm_identify_namespace *ident;
	device_t dev;
	uint32_t flags;
#define NVME_NS_F_OPEN __BIT(0)
};
struct nvme_softc {
	device_t sc_dev;

	bus_space_tag_t sc_iot;
	bus_space_handle_t sc_ioh;
	bus_size_t sc_ios;
	bus_dma_tag_t sc_dmat;

	int (*sc_intr_establish)(struct nvme_softc *,
	    uint16_t qid, struct nvme_queue *);
	int (*sc_intr_disestablish)(struct nvme_softc *,
	    uint16_t qid);
	void **sc_ih;		/* interrupt handlers */
	void **sc_softih;	/* softintr handlers */

	u_int sc_rdy_to;	/* RDY timeout */
	size_t sc_mps;		/* memory page size */
	size_t sc_mdts;		/* max data transfer size */
	u_int sc_max_sgl;	/* max S/G segments */

	struct nvm_identify_controller
	    sc_identify;

	u_int sc_nn;		/* namespace count */
	struct nvme_namespace *sc_namespaces;

	bool sc_use_mq;
	u_int sc_nq;		/* # of I/O queues (sc_q) */
	struct nvme_queue *sc_admin_q;
	struct nvme_queue **sc_q;

	uint32_t sc_flags;
#define NVME_F_ATTACHED __BIT(0)
#define NVME_F_OPEN __BIT(1)
	uint32_t sc_quirks;
#define NVME_QUIRK_DELAY_B4_CHK_RDY __BIT(0)

	char sc_modelname[81];
};
#define lemtoh16(p) le16toh(*((uint16_t *)(p)))
#define lemtoh32(p) le32toh(*((uint32_t *)(p)))
#define lemtoh64(p) le64toh(*((uint64_t *)(p)))
#define htolem16(p, x) (*((uint16_t *)(p)) = htole16(x))
#define htolem32(p, x) (*((uint32_t *)(p)) = htole32(x))
#define htolem64(p, x) (*((uint64_t *)(p)) = htole64(x))
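
/*
 * Illustrative sketch (not the driver's actual completion path): how the
 * nvme_queue fields q_cq_head, q_cq_phase and q_cqhdbl, together with the
 * little-endian helpers above, are typically used to consume completion
 * queue entries.  struct nvme_cqe, its "flags" field and NVME_CQE_PHASE
 * are assumed to come from <dev/ic/nvmereg.h> (included before this
 * header), and q_cq_phase is assumed to hold either NVME_CQE_PHASE or 0.
 */
static __inline void
nvme_cq_drain_example(struct nvme_softc *sc, struct nvme_queue *q)
{
	struct nvme_cqe *ring = NVME_DMA_KVA(q->q_cq_dmamem);

	mutex_enter(&q->q_cq_mtx);
	for (;;) {
		struct nvme_cqe *cqe = &ring[q->q_cq_head];

		/* An entry is valid while its phase bit matches ours. */
		if ((lemtoh16(&cqe->flags) & NVME_CQE_PHASE) != q->q_cq_phase)
			break;

		/* ... look up the ccb by cqe->cid and call its ccb_done ... */

		if (++q->q_cq_head >= q->q_entries) {
			q->q_cq_head = 0;
			q->q_cq_phase ^= NVME_CQE_PHASE; /* wrap: flip phase */
		}
	}
	/* Tell the controller how far the host has consumed. */
	bus_space_write_4(sc->sc_iot, sc->sc_ioh, q->q_cqhdbl, q->q_cq_head);
	mutex_exit(&q->q_cq_mtx);
}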
struct nvme_attach_args {
	uint16_t naa_nsid;
	uint32_t naa_qentries;	/* total number of queue slots */
	uint32_t naa_maxphys;	/* maximum device transfer size */
	const char *naa_typename;	/* identifier */
};
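
/*
 * Illustrative sketch: a hypothetical helper filling in nvme_attach_args
 * for one namespace before it is handed to autoconf(9).  All values below
 * are placeholders, not what the driver actually passes.
 */
static __inline void
nvme_attach_args_fill_example(struct nvme_attach_args *naa, uint16_t nsid)
{
	naa->naa_nsid = nsid;		/* namespace to attach */
	naa->naa_qentries = 1024;	/* placeholder queue depth */
	naa->naa_maxphys = 128 * 1024;	/* placeholder transfer limit */
	naa->naa_typename = "nvme";	/* placeholder identifier */
}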
int nvme_attach(struct nvme_softc *);
int nvme_detach(struct nvme_softc *, int flags);
int nvme_rescan(device_t, const char *, const int *);
void nvme_childdet(device_t, device_t);
int nvme_intr(void *);
void nvme_softintr_intx(void *);
int nvme_intr_msi(void *);
void nvme_softintr_msi(void *);
static __inline struct nvme_queue *
nvme_get_q(struct nvme_softc *sc, struct buf *bp, bool waitok)
{
	struct cpu_info *ci = (bp && bp->b_ci) ? bp->b_ci : curcpu();

	/*
	 * Find a queue with available ccbs, preferring the originating
	 * CPU's queue.
	 */
	for (u_int qoff = 0; qoff < sc->sc_nq; qoff++) {
		struct nvme_queue *q =
		    sc->sc_q[(cpu_index(ci) + qoff) % sc->sc_nq];
		if (!SIMPLEQ_EMPTY(&q->q_ccb_list) || waitok)
			return q;
	}

	return NULL;
}
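
/*
 * Illustrative sketch of how an I/O submission path can use nvme_get_q():
 * prefer the submitting CPU's queue, fall back to any queue with a free
 * ccb, and only fail with EAGAIN once every queue is full (the behaviour
 * described in the commit message above).  The ccb allocation itself is
 * private to nvme.c and only hinted at here; EAGAIN needs <sys/errno.h>.
 */
static __inline int
nvme_submit_example(struct nvme_softc *sc, struct buf *bp)
{
	struct nvme_queue *q;

	q = nvme_get_q(sc, bp, false);
	if (q == NULL)
		return EAGAIN;	/* all queues full: disk layer retries later */

	/* ... take a ccb from q->q_ccb_list, build the sqe, ring q->q_sqtdbl ... */
	return 0;
}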
/*
* namespace
*/
static __inline struct nvme_namespace *
nvme_ns_get(struct nvme_softc *sc, uint16_t nsid)
{
	if (nsid == 0 || nsid - 1 >= sc->sc_nn)
		return NULL;

	return &sc->sc_namespaces[nsid - 1];
}
int nvme_ns_identify(struct nvme_softc *, uint16_t);
void nvme_ns_free(struct nvme_softc *, uint16_t);
int nvme_ns_dobio(struct nvme_softc *, uint16_t, void *,
struct buf *, void *, size_t, int, daddr_t, int, nvme_nnc_done);
int nvme_ns_sync(struct nvme_softc *, uint16_t, int);
int nvme_admin_getcache(struct nvme_softc *, int *);
int nvme_admin_setcache(struct nvme_softc *, int);