From f141eafe286c785f7e2c1e312a73f90d66bdfb90 Mon Sep 17 00:00:00 2001 From: aliguori Date: Tue, 7 Apr 2009 18:43:24 +0000 Subject: [PATCH] push down vector linearization to posix-aio-compat.c (Christoph Hellwig) Make all AIO requests vectored and defer linearization until the actual I/O thread. This prepares for using native preadv/pwritev. Also enables asynchronous direct I/O by handling that case in the I/O thread. Qcow and qcow2 probably want to be adapted to directly deal with multi-segment requests, but that can be implemented later. Signed-off-by: Christoph Hellwig Signed-off-by: Anthony Liguori git-svn-id: svn://svn.savannah.nongnu.org/qemu/trunk@7020 c046a42c-6fe2-441c-8c8c-71466251a162 --- block-qcow.c | 84 +++++++++++-------- block-qcow2.c | 93 ++++++++++++--------- block-raw-posix.c | 95 +++++++++------------- block.c | 198 ++++++++++++++++----------------------------- block_int.h | 8 +- posix-aio-compat.c | 116 ++++++++++++++++++++------ posix-aio-compat.h | 9 ++- 7 files changed, 318 insertions(+), 285 deletions(-) diff --git a/block-qcow.c b/block-qcow.c index 2decd13e76..b60f4c1921 100644 --- a/block-qcow.c +++ b/block-qcow.c @@ -525,7 +525,9 @@ static int qcow_write(BlockDriverState *bs, int64_t sector_num, typedef struct QCowAIOCB { BlockDriverAIOCB common; int64_t sector_num; + QEMUIOVector *qiov; uint8_t *buf; + void *orig_buf; int nb_sectors; int n; uint64_t cluster_offset; @@ -543,12 +545,8 @@ static void qcow_aio_read_cb(void *opaque, int ret) int index_in_cluster; acb->hd_aiocb = NULL; - if (ret < 0) { - fail: - acb->common.cb(acb->common.opaque, ret); - qemu_aio_release(acb); - return; - } + if (ret < 0) + goto done; redo: /* post process the read buffer */ @@ -570,9 +568,8 @@ static void qcow_aio_read_cb(void *opaque, int ret) if (acb->nb_sectors == 0) { /* request completed */ - acb->common.cb(acb->common.opaque, 0); - qemu_aio_release(acb); - return; + ret = 0; + goto done; } /* prepare next AIO request */ @@ -592,7 +589,7 @@ static void
qcow_aio_read_cb(void *opaque, int ret) acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num, &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb); if (acb->hd_aiocb == NULL) - goto fail; + goto done; } else { /* Note: in this case, no need to wait */ memset(acb->buf, 0, 512 * acb->n); @@ -601,14 +598,14 @@ static void qcow_aio_read_cb(void *opaque, int ret) } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) { /* add AIO support for compressed blocks ? */ if (decompress_cluster(s, acb->cluster_offset) < 0) - goto fail; + goto done; memcpy(acb->buf, s->cluster_cache + index_in_cluster * 512, 512 * acb->n); goto redo; } else { if ((acb->cluster_offset & 511) != 0) { ret = -EIO; - goto fail; + goto done; } acb->hd_iov.iov_base = acb->buf; acb->hd_iov.iov_len = acb->n * 512; @@ -617,12 +614,22 @@ static void qcow_aio_read_cb(void *opaque, int ret) (acb->cluster_offset >> 9) + index_in_cluster, &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb); if (acb->hd_aiocb == NULL) - goto fail; + goto done; } + + return; + +done: + if (acb->qiov->niov > 1) { + qemu_iovec_from_buffer(acb->qiov, acb->orig_buf, acb->qiov->size); + qemu_vfree(acb->orig_buf); + } + acb->common.cb(acb->common.opaque, ret); + qemu_aio_release(acb); } -static BlockDriverAIOCB *qcow_aio_read(BlockDriverState *bs, - int64_t sector_num, uint8_t *buf, int nb_sectors, +static BlockDriverAIOCB *qcow_aio_readv(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque) { QCowAIOCB *acb; @@ -632,7 +639,11 @@ static BlockDriverAIOCB *qcow_aio_read(BlockDriverState *bs, return NULL; acb->hd_aiocb = NULL; acb->sector_num = sector_num; - acb->buf = buf; + acb->qiov = qiov; + if (qiov->niov > 1) + acb->buf = acb->orig_buf = qemu_memalign(512, qiov->size); + else + acb->buf = qiov->iov->iov_base; acb->nb_sectors = nb_sectors; acb->n = 0; acb->cluster_offset = 0; @@ -652,12 +663,8 @@ static void qcow_aio_write_cb(void *opaque, int ret) acb->hd_aiocb = 
NULL; - if (ret < 0) { - fail: - acb->common.cb(acb->common.opaque, ret); - qemu_aio_release(acb); - return; - } + if (ret < 0) + goto done; acb->nb_sectors -= acb->n; acb->sector_num += acb->n; @@ -665,9 +672,8 @@ static void qcow_aio_write_cb(void *opaque, int ret) if (acb->nb_sectors == 0) { /* request completed */ - acb->common.cb(acb->common.opaque, 0); - qemu_aio_release(acb); - return; + ret = 0; + goto done; } index_in_cluster = acb->sector_num & (s->cluster_sectors - 1); @@ -679,14 +685,14 @@ static void qcow_aio_write_cb(void *opaque, int ret) index_in_cluster + acb->n); if (!cluster_offset || (cluster_offset & 511) != 0) { ret = -EIO; - goto fail; + goto done; } if (s->crypt_method) { if (!acb->cluster_data) { acb->cluster_data = qemu_mallocz(s->cluster_size); if (!acb->cluster_data) { ret = -ENOMEM; - goto fail; + goto done; } } encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf, @@ -704,11 +710,18 @@ static void qcow_aio_write_cb(void *opaque, int ret) &acb->hd_qiov, acb->n, qcow_aio_write_cb, acb); if (acb->hd_aiocb == NULL) - goto fail; + goto done; + return; + +done: + if (acb->qiov->niov > 1) + qemu_vfree(acb->orig_buf); + acb->common.cb(acb->common.opaque, ret); + qemu_aio_release(acb); } -static BlockDriverAIOCB *qcow_aio_write(BlockDriverState *bs, - int64_t sector_num, const uint8_t *buf, int nb_sectors, +static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque) { BDRVQcowState *s = bs->opaque; @@ -721,7 +734,12 @@ static BlockDriverAIOCB *qcow_aio_write(BlockDriverState *bs, return NULL; acb->hd_aiocb = NULL; acb->sector_num = sector_num; - acb->buf = (uint8_t *)buf; + acb->qiov = qiov; + if (qiov->niov > 1) { + acb->buf = acb->orig_buf = qemu_memalign(512, qiov->size); + qemu_iovec_to_buffer(qiov, acb->buf); + } else + acb->buf = qiov->iov->iov_base; acb->nb_sectors = nb_sectors; acb->n = 0; @@ -909,8 +927,8 @@ BlockDriver 
bdrv_qcow = { .bdrv_is_allocated = qcow_is_allocated, .bdrv_set_key = qcow_set_key, .bdrv_make_empty = qcow_make_empty, - .bdrv_aio_read = qcow_aio_read, - .bdrv_aio_write = qcow_aio_write, + .bdrv_aio_readv = qcow_aio_readv, + .bdrv_aio_writev = qcow_aio_writev, .bdrv_aio_cancel = qcow_aio_cancel, .aiocb_size = sizeof(QCowAIOCB), .bdrv_write_compressed = qcow_write_compressed, diff --git a/block-qcow2.c b/block-qcow2.c index dd28c28e5f..3bd38b0d9d 100644 --- a/block-qcow2.c +++ b/block-qcow2.c @@ -1264,7 +1264,9 @@ static int qcow_write(BlockDriverState *bs, int64_t sector_num, typedef struct QCowAIOCB { BlockDriverAIOCB common; int64_t sector_num; + QEMUIOVector *qiov; uint8_t *buf; + void *orig_buf; int nb_sectors; int n; uint64_t cluster_offset; @@ -1307,12 +1309,8 @@ static void qcow_aio_read_cb(void *opaque, int ret) int index_in_cluster, n1; acb->hd_aiocb = NULL; - if (ret < 0) { -fail: - acb->common.cb(acb->common.opaque, ret); - qemu_aio_release(acb); - return; - } + if (ret < 0) + goto done; /* post process the read buffer */ if (!acb->cluster_offset) { @@ -1333,9 +1331,8 @@ fail: if (acb->nb_sectors == 0) { /* request completed */ - acb->common.cb(acb->common.opaque, 0); - qemu_aio_release(acb); - return; + ret = 0; + goto done; } /* prepare next AIO request */ @@ -1356,32 +1353,32 @@ fail: &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb); if (acb->hd_aiocb == NULL) - goto fail; + goto done; } else { ret = qcow_schedule_bh(qcow_aio_read_bh, acb); if (ret < 0) - goto fail; + goto done; } } else { /* Note: in this case, no need to wait */ memset(acb->buf, 0, 512 * acb->n); ret = qcow_schedule_bh(qcow_aio_read_bh, acb); if (ret < 0) - goto fail; + goto done; } } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) { /* add AIO support for compressed blocks ? 
*/ if (decompress_cluster(s, acb->cluster_offset) < 0) - goto fail; + goto done; memcpy(acb->buf, s->cluster_cache + index_in_cluster * 512, 512 * acb->n); ret = qcow_schedule_bh(qcow_aio_read_bh, acb); if (ret < 0) - goto fail; + goto done; } else { if ((acb->cluster_offset & 511) != 0) { ret = -EIO; - goto fail; + goto done; } acb->hd_iov.iov_base = acb->buf; @@ -1391,13 +1388,22 @@ fail: (acb->cluster_offset >> 9) + index_in_cluster, &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb); if (acb->hd_aiocb == NULL) - goto fail; + goto done; } + + return; +done: + if (acb->qiov->niov > 1) { + qemu_iovec_from_buffer(acb->qiov, acb->orig_buf, acb->qiov->size); + qemu_vfree(acb->orig_buf); + } + acb->common.cb(acb->common.opaque, ret); + qemu_aio_release(acb); } static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, - int64_t sector_num, uint8_t *buf, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque) + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque, int is_write) { QCowAIOCB *acb; @@ -1406,7 +1412,13 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, return NULL; acb->hd_aiocb = NULL; acb->sector_num = sector_num; - acb->buf = buf; + acb->qiov = qiov; + if (qiov->niov > 1) { + acb->buf = acb->orig_buf = qemu_memalign(512, qiov->size); + if (is_write) + qemu_iovec_to_buffer(qiov, acb->buf); + } else + acb->buf = qiov->iov->iov_base; acb->nb_sectors = nb_sectors; acb->n = 0; acb->cluster_offset = 0; @@ -1414,13 +1426,13 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, return acb; } -static BlockDriverAIOCB *qcow_aio_read(BlockDriverState *bs, - int64_t sector_num, uint8_t *buf, int nb_sectors, +static BlockDriverAIOCB *qcow_aio_readv(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque) { QCowAIOCB *acb; - acb = qcow_aio_setup(bs, sector_num, buf, nb_sectors, cb, opaque); + acb = qcow_aio_setup(bs, sector_num, qiov, 
nb_sectors, cb, opaque, 0); if (!acb) return NULL; @@ -1439,16 +1451,12 @@ static void qcow_aio_write_cb(void *opaque, int ret) acb->hd_aiocb = NULL; - if (ret < 0) { - fail: - acb->common.cb(acb->common.opaque, ret); - qemu_aio_release(acb); - return; - } + if (ret < 0) + goto done; if (alloc_cluster_link_l2(bs, acb->cluster_offset, &acb->l2meta) < 0) { free_any_clusters(bs, acb->cluster_offset, acb->l2meta.nb_clusters); - goto fail; + goto done; } acb->nb_sectors -= acb->n; @@ -1457,9 +1465,8 @@ static void qcow_aio_write_cb(void *opaque, int ret) if (acb->nb_sectors == 0) { /* request completed */ - acb->common.cb(acb->common.opaque, 0); - qemu_aio_release(acb); - return; + ret = 0; + goto done; } index_in_cluster = acb->sector_num & (s->cluster_sectors - 1); @@ -1473,7 +1480,7 @@ static void qcow_aio_write_cb(void *opaque, int ret) n_end, &acb->n, &acb->l2meta); if (!acb->cluster_offset || (acb->cluster_offset & 511) != 0) { ret = -EIO; - goto fail; + goto done; } if (s->crypt_method) { if (!acb->cluster_data) { @@ -1494,11 +1501,19 @@ static void qcow_aio_write_cb(void *opaque, int ret) &acb->hd_qiov, acb->n, qcow_aio_write_cb, acb); if (acb->hd_aiocb == NULL) - goto fail; + goto done; + + return; + +done: + if (acb->qiov->niov > 1) + qemu_vfree(acb->orig_buf); + acb->common.cb(acb->common.opaque, ret); + qemu_aio_release(acb); } -static BlockDriverAIOCB *qcow_aio_write(BlockDriverState *bs, - int64_t sector_num, const uint8_t *buf, int nb_sectors, +static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque) { BDRVQcowState *s = bs->opaque; @@ -1506,7 +1521,7 @@ static BlockDriverAIOCB *qcow_aio_write(BlockDriverState *bs, s->cluster_cache_offset = -1; /* disable compressed cache */ - acb = qcow_aio_setup(bs, sector_num, (uint8_t*)buf, nb_sectors, cb, opaque); + acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); if (!acb) return NULL; 
@@ -2771,8 +2786,8 @@ BlockDriver bdrv_qcow2 = { .bdrv_set_key = qcow_set_key, .bdrv_make_empty = qcow_make_empty, - .bdrv_aio_read = qcow_aio_read, - .bdrv_aio_write = qcow_aio_write, + .bdrv_aio_readv = qcow_aio_readv, + .bdrv_aio_writev = qcow_aio_writev, .bdrv_aio_cancel = qcow_aio_cancel, .aiocb_size = sizeof(QCowAIOCB), .bdrv_write_compressed = qcow_write_compressed, diff --git a/block-raw-posix.c b/block-raw-posix.c index 4da5ae4397..822839fb8f 100644 --- a/block-raw-posix.c +++ b/block-raw-posix.c @@ -599,8 +599,8 @@ static int posix_aio_init(void) return 0; } -static RawAIOCB *raw_aio_setup(BlockDriverState *bs, - int64_t sector_num, uint8_t *buf, int nb_sectors, +static RawAIOCB *raw_aio_setup(BlockDriverState *bs, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque) { BDRVRawState *s = bs->opaque; @@ -614,24 +614,25 @@ static RawAIOCB *raw_aio_setup(BlockDriverState *bs, return NULL; acb->aiocb.aio_fildes = s->fd; acb->aiocb.ev_signo = SIGUSR2; - acb->aiocb.aio_buf = buf; - if (nb_sectors < 0) - acb->aiocb.aio_nbytes = -nb_sectors; - else - acb->aiocb.aio_nbytes = nb_sectors * 512; + acb->aiocb.aio_iov = qiov->iov; + acb->aiocb.aio_niov = qiov->niov; + acb->aiocb.aio_nbytes = nb_sectors * 512; acb->aiocb.aio_offset = sector_num * 512; + acb->aiocb.aio_flags = 0; + + /* + * If O_DIRECT is used the buffer needs to be aligned on a sector + * boundary. Tell the low level code to ensure that in case it's + * not done yet. 
+ */ + if (s->aligned_buf) + acb->aiocb.aio_flags |= QEMU_AIO_SECTOR_ALIGNED; + acb->next = posix_aio_state->first_aio; posix_aio_state->first_aio = acb; return acb; } -static void raw_aio_em_cb(void* opaque) -{ - RawAIOCB *acb = opaque; - acb->common.cb(acb->common.opaque, acb->ret); - qemu_aio_release(acb); -} - static void raw_aio_remove(RawAIOCB *acb) { RawAIOCB **pacb; @@ -651,28 +652,13 @@ static void raw_aio_remove(RawAIOCB *acb) } } -static BlockDriverAIOCB *raw_aio_read(BlockDriverState *bs, - int64_t sector_num, uint8_t *buf, int nb_sectors, +static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque) { RawAIOCB *acb; - /* - * If O_DIRECT is used and the buffer is not aligned fall back - * to synchronous IO. - */ - BDRVRawState *s = bs->opaque; - - if (unlikely(s->aligned_buf != NULL && ((uintptr_t) buf % 512))) { - QEMUBH *bh; - acb = qemu_aio_get(bs, cb, opaque); - acb->ret = raw_pread(bs, 512 * sector_num, buf, 512 * nb_sectors); - bh = qemu_bh_new(raw_aio_em_cb, acb); - qemu_bh_schedule(bh); - return &acb->common; - } - - acb = raw_aio_setup(bs, sector_num, buf, nb_sectors, cb, opaque); + acb = raw_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque); if (!acb) return NULL; if (qemu_paio_read(&acb->aiocb) < 0) { @@ -682,28 +668,13 @@ static BlockDriverAIOCB *raw_aio_read(BlockDriverState *bs, return &acb->common; } -static BlockDriverAIOCB *raw_aio_write(BlockDriverState *bs, - int64_t sector_num, const uint8_t *buf, int nb_sectors, +static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque) { RawAIOCB *acb; - /* - * If O_DIRECT is used and the buffer is not aligned fall back - * to synchronous IO. 
- */ - BDRVRawState *s = bs->opaque; - - if (unlikely(s->aligned_buf != NULL && ((uintptr_t) buf % 512))) { - QEMUBH *bh; - acb = qemu_aio_get(bs, cb, opaque); - acb->ret = raw_pwrite(bs, 512 * sector_num, buf, 512 * nb_sectors); - bh = qemu_bh_new(raw_aio_em_cb, acb); - qemu_bh_schedule(bh); - return &acb->common; - } - - acb = raw_aio_setup(bs, sector_num, (uint8_t*)buf, nb_sectors, cb, opaque); + acb = raw_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque); if (!acb) return NULL; if (qemu_paio_write(&acb->aiocb) < 0) { @@ -887,8 +858,8 @@ BlockDriver bdrv_raw = { .bdrv_flush = raw_flush, #ifdef CONFIG_AIO - .bdrv_aio_read = raw_aio_read, - .bdrv_aio_write = raw_aio_write, + .bdrv_aio_readv = raw_aio_readv, + .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_cancel = raw_aio_cancel, .aiocb_size = sizeof(RawAIOCB), #endif @@ -1215,12 +1186,24 @@ static BlockDriverAIOCB *raw_aio_ioctl(BlockDriverState *bs, unsigned long int req, void *buf, BlockDriverCompletionFunc *cb, void *opaque) { + BDRVRawState *s = bs->opaque; RawAIOCB *acb; - acb = raw_aio_setup(bs, 0, buf, 0, cb, opaque); - if (!acb) + if (fd_open(bs) < 0) return NULL; + acb = qemu_aio_get(bs, cb, opaque); + if (!acb) + return NULL; + acb->aiocb.aio_fildes = s->fd; + acb->aiocb.ev_signo = SIGUSR2; + acb->aiocb.aio_offset = 0; + acb->aiocb.aio_flags = 0; + + acb->next = posix_aio_state->first_aio; + posix_aio_state->first_aio = acb; + + acb->aiocb.aio_ioctl_buf = buf; acb->aiocb.aio_ioctl_cmd = req; if (qemu_paio_ioctl(&acb->aiocb) < 0) { raw_aio_remove(acb); @@ -1424,8 +1407,8 @@ BlockDriver bdrv_host_device = { .bdrv_flush = raw_flush, #ifdef CONFIG_AIO - .bdrv_aio_read = raw_aio_read, - .bdrv_aio_write = raw_aio_write, + .bdrv_aio_readv = raw_aio_readv, + .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_cancel = raw_aio_cancel, .aiocb_size = sizeof(RawAIOCB), #endif diff --git a/block.c b/block.c index 916f08e76f..8a78f14ed5 100644 --- a/block.c +++ b/block.c @@ -47,25 +47,21 @@ #define SECTOR_BITS 9 #define 
SECTOR_SIZE (1 << SECTOR_BITS) -static AIOPool vectored_aio_pool; - typedef struct BlockDriverAIOCBSync { BlockDriverAIOCB common; QEMUBH *bh; int ret; + /* vector translation state */ + QEMUIOVector *qiov; + uint8_t *bounce; + int is_write; } BlockDriverAIOCBSync; -static BlockDriverAIOCB *bdrv_aio_read(BlockDriverState *bs, - int64_t sector_num, uint8_t *buf, int nb_sectors, +static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque); -static BlockDriverAIOCB *bdrv_aio_write(BlockDriverState *bs, - int64_t sector_num, const uint8_t *buf, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque); -static BlockDriverAIOCB *bdrv_aio_read_em(BlockDriverState *bs, - int64_t sector_num, uint8_t *buf, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque); -static BlockDriverAIOCB *bdrv_aio_write_em(BlockDriverState *bs, - int64_t sector_num, const uint8_t *buf, int nb_sectors, +static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque); static void bdrv_aio_cancel_em(BlockDriverAIOCB *acb); static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num, @@ -144,10 +140,10 @@ void path_combine(char *dest, int dest_size, static void bdrv_register(BlockDriver *bdrv) { - if (!bdrv->bdrv_aio_read) { + if (!bdrv->bdrv_aio_readv) { /* add AIO emulation layer */ - bdrv->bdrv_aio_read = bdrv_aio_read_em; - bdrv->bdrv_aio_write = bdrv_aio_write_em; + bdrv->bdrv_aio_readv = bdrv_aio_readv_em; + bdrv->bdrv_aio_writev = bdrv_aio_writev_em; bdrv->bdrv_aio_cancel = bdrv_aio_cancel_em; bdrv->aiocb_size = sizeof(BlockDriverAIOCBSync); } else if (!bdrv->bdrv_read) { @@ -1295,90 +1291,9 @@ char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn) /**************************************************************/ /* async I/Os */ -typedef struct 
VectorTranslationAIOCB { - BlockDriverAIOCB common; - QEMUIOVector *iov; - uint8_t *bounce; - int is_write; - BlockDriverAIOCB *aiocb; -} VectorTranslationAIOCB; - -static void bdrv_aio_cancel_vector(BlockDriverAIOCB *_acb) -{ - VectorTranslationAIOCB *acb - = container_of(_acb, VectorTranslationAIOCB, common); - - bdrv_aio_cancel(acb->aiocb); -} - -static void bdrv_aio_rw_vector_cb(void *opaque, int ret) -{ - VectorTranslationAIOCB *s = (VectorTranslationAIOCB *)opaque; - - if (!s->is_write) { - qemu_iovec_from_buffer(s->iov, s->bounce, s->iov->size); - } - qemu_vfree(s->bounce); - s->common.cb(s->common.opaque, ret); - qemu_aio_release(s); -} - -static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *iov, - int nb_sectors, - BlockDriverCompletionFunc *cb, - void *opaque, - int is_write) - -{ - VectorTranslationAIOCB *s = qemu_aio_get_pool(&vectored_aio_pool, bs, - cb, opaque); - - s->iov = iov; - s->bounce = qemu_memalign(512, nb_sectors * 512); - s->is_write = is_write; - if (is_write) { - qemu_iovec_to_buffer(s->iov, s->bounce); - s->aiocb = bdrv_aio_write(bs, sector_num, s->bounce, nb_sectors, - bdrv_aio_rw_vector_cb, s); - } else { - s->aiocb = bdrv_aio_read(bs, sector_num, s->bounce, nb_sectors, - bdrv_aio_rw_vector_cb, s); - } - if (!s->aiocb) { - qemu_vfree(s->bounce); - qemu_aio_release(s); - return NULL; - } - return &s->common; -} - BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *iov, int nb_sectors, + QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque) -{ - if (bdrv_check_request(bs, sector_num, nb_sectors)) - return NULL; - - return bdrv_aio_rw_vector(bs, sector_num, iov, nb_sectors, - cb, opaque, 0); -} - -BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *iov, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque) -{ - if (bdrv_check_request(bs, sector_num, nb_sectors)) - return 
NULL; - - return bdrv_aio_rw_vector(bs, sector_num, iov, nb_sectors, - cb, opaque, 1); -} - -static BlockDriverAIOCB *bdrv_aio_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque) { BlockDriver *drv = bs->drv; BlockDriverAIOCB *ret; @@ -1388,7 +1303,8 @@ static BlockDriverAIOCB *bdrv_aio_read(BlockDriverState *bs, int64_t sector_num, if (bdrv_check_request(bs, sector_num, nb_sectors)) return NULL; - ret = drv->bdrv_aio_read(bs, sector_num, buf, nb_sectors, cb, opaque); + ret = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors, + cb, opaque); if (ret) { /* Update stats even though technically transfer has not happened. */ @@ -1399,9 +1315,9 @@ static BlockDriverAIOCB *bdrv_aio_read(BlockDriverState *bs, int64_t sector_num, return ret; } -static BlockDriverAIOCB *bdrv_aio_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque) +BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) { BlockDriver *drv = bs->drv; BlockDriverAIOCB *ret; @@ -1413,7 +1329,8 @@ static BlockDriverAIOCB *bdrv_aio_write(BlockDriverState *bs, int64_t sector_num if (bdrv_check_request(bs, sector_num, nb_sectors)) return NULL; - ret = drv->bdrv_aio_write(bs, sector_num, buf, nb_sectors, cb, opaque); + ret = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors, + cb, opaque); if (ret) { /* Update stats even though technically transfer has not happened. 
*/ @@ -1436,42 +1353,62 @@ void bdrv_aio_cancel(BlockDriverAIOCB *acb) static void bdrv_aio_bh_cb(void *opaque) { BlockDriverAIOCBSync *acb = opaque; + + if (!acb->is_write) + qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size); + + qemu_vfree(acb->bounce); acb->common.cb(acb->common.opaque, acb->ret); + qemu_aio_release(acb); } -static BlockDriverAIOCB *bdrv_aio_read_em(BlockDriverState *bs, - int64_t sector_num, uint8_t *buf, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque) +static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque, + int is_write) + { BlockDriverAIOCBSync *acb; - int ret; acb = qemu_aio_get(bs, cb, opaque); + acb->is_write = is_write; + acb->qiov = qiov; + acb->bounce = qemu_memalign(512, qiov->size); + if (!acb->bh) acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb); - ret = bdrv_read(bs, sector_num, buf, nb_sectors); - acb->ret = ret; + + if (is_write) { + qemu_iovec_to_buffer(acb->qiov, acb->bounce); + acb->ret = bdrv_write(bs, sector_num, acb->bounce, nb_sectors); + } else { + acb->ret = bdrv_read(bs, sector_num, acb->bounce, nb_sectors); + } + qemu_bh_schedule(acb->bh); + return &acb->common; } -static BlockDriverAIOCB *bdrv_aio_write_em(BlockDriverState *bs, - int64_t sector_num, const uint8_t *buf, int nb_sectors, +static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque) { - BlockDriverAIOCBSync *acb; - int ret; - - acb = qemu_aio_get(bs, cb, opaque); - if (!acb->bh) - acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb); - ret = bdrv_write(bs, sector_num, buf, nb_sectors); - acb->ret = ret; - qemu_bh_schedule(acb->bh); - return &acb->common; + return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); } +static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, + int64_t
sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); +} + + static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb) { BlockDriverAIOCBSync *acb = (BlockDriverAIOCBSync *)blockacb; @@ -1494,10 +1431,15 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num, { int async_ret; BlockDriverAIOCB *acb; + struct iovec iov; + QEMUIOVector qiov; async_ret = NOT_DONE; - acb = bdrv_aio_read(bs, sector_num, buf, nb_sectors, - bdrv_rw_em_cb, &async_ret); + iov.iov_base = buf; + iov.iov_len = nb_sectors * 512; + qemu_iovec_init_external(&qiov, &iov, 1); + acb = bdrv_aio_readv(bs, sector_num, &qiov, nb_sectors, + bdrv_rw_em_cb, &async_ret); if (acb == NULL) return -1; @@ -1513,10 +1455,15 @@ static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num, { int async_ret; BlockDriverAIOCB *acb; + struct iovec iov; + QEMUIOVector qiov; async_ret = NOT_DONE; - acb = bdrv_aio_write(bs, sector_num, buf, nb_sectors, - bdrv_rw_em_cb, &async_ret); + iov.iov_base = (void *)buf; + iov.iov_len = nb_sectors * 512; + qemu_iovec_init_external(&qiov, &iov, 1); + acb = bdrv_aio_writev(bs, sector_num, &qiov, nb_sectors, + bdrv_rw_em_cb, &async_ret); if (acb == NULL) return -1; while (async_ret == NOT_DONE) { @@ -1527,9 +1474,6 @@ static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num, void bdrv_init(void) { - aio_pool_init(&vectored_aio_pool, sizeof(VectorTranslationAIOCB), - bdrv_aio_cancel_vector); - bdrv_register(&bdrv_raw); bdrv_register(&bdrv_host_device); #ifndef _WIN32 diff --git a/block_int.h b/block_int.h index dc4335982c..3e78997c71 100644 --- a/block_int.h +++ b/block_int.h @@ -54,11 +54,11 @@ struct BlockDriver { int (*bdrv_set_key)(BlockDriverState *bs, const char *key); int (*bdrv_make_empty)(BlockDriverState *bs); /* aio */ - BlockDriverAIOCB *(*bdrv_aio_read)(BlockDriverState *bs, - int64_t sector_num, uint8_t *buf, int nb_sectors, 
+ BlockDriverAIOCB *(*bdrv_aio_readv)(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque); - BlockDriverAIOCB *(*bdrv_aio_write)(BlockDriverState *bs, - int64_t sector_num, const uint8_t *buf, int nb_sectors, + BlockDriverAIOCB *(*bdrv_aio_writev)(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque); void (*bdrv_aio_cancel)(BlockDriverAIOCB *acb); int aiocb_size; diff --git a/posix-aio-compat.c b/posix-aio-compat.c index 65c80ecc3e..0eb77a50d3 100644 --- a/posix-aio-compat.c +++ b/posix-aio-compat.c @@ -20,6 +20,7 @@ #include #include #include "osdep.h" +#include "qemu-common.h" #include "posix-aio-compat.h" @@ -76,45 +77,110 @@ static void thread_create(pthread_t *thread, pthread_attr_t *attr, if (ret) die2(ret, "pthread_create"); } -static size_t handle_aiocb_readwrite(struct qemu_paiocb *aiocb) +static size_t handle_aiocb_ioctl(struct qemu_paiocb *aiocb) +{ + int ret; + + ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf); + if (ret == -1) + return -errno; + return ret; +} + +/* + * Check if we need to copy the data in the aiocb into a new + * properly aligned buffer. 
+ */ +static int aiocb_needs_copy(struct qemu_paiocb *aiocb) +{ + if (aiocb->aio_flags & QEMU_AIO_SECTOR_ALIGNED) { + int i; + + for (i = 0; i < aiocb->aio_niov; i++) + if ((uintptr_t) aiocb->aio_iov[i].iov_base % 512) + return 1; + } + + return 0; +} + +static size_t handle_aiocb_rw_linear(struct qemu_paiocb *aiocb, char *buf) { size_t offset = 0; - ssize_t len; + size_t len; while (offset < aiocb->aio_nbytes) { - if (aiocb->aio_type == QEMU_PAIO_WRITE) - len = pwrite(aiocb->aio_fildes, - (const char *)aiocb->aio_buf + offset, + if (aiocb->aio_type == QEMU_PAIO_WRITE) + len = pwrite(aiocb->aio_fildes, + (const char *)buf + offset, + aiocb->aio_nbytes - offset, + aiocb->aio_offset + offset); + else + len = pread(aiocb->aio_fildes, + buf + offset, aiocb->aio_nbytes - offset, aiocb->aio_offset + offset); - else - len = pread(aiocb->aio_fildes, - (char *)aiocb->aio_buf + offset, - aiocb->aio_nbytes - offset, - aiocb->aio_offset + offset); - if (len == -1 && errno == EINTR) - continue; - else if (len == -1) { - offset = -errno; - break; - } else if (len == 0) - break; + if (len == -1 && errno == EINTR) + continue; + else if (len == -1) { + offset = -errno; + break; + } else if (len == 0) + break; - offset += len; + offset += len; } return offset; } -static size_t handle_aiocb_ioctl(struct qemu_paiocb *aiocb) +static size_t handle_aiocb_rw(struct qemu_paiocb *aiocb) { - int ret; + size_t nbytes; + char *buf; - ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_buf); - if (ret == -1) - return -errno; - return ret; + if (!aiocb_needs_copy(aiocb) && aiocb->aio_niov == 1) { + /* + * If there is just a single buffer, and it is properly aligned + * we can just use plain pread/pwrite without any problems. + */ + return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base); + } + + /* + * Ok, we have to do it the hard way, copy all segments into + * a single aligned buffer. 
+ */ + buf = qemu_memalign(512, aiocb->aio_nbytes); + if (aiocb->aio_type == QEMU_PAIO_WRITE) { + char *p = buf; + int i; + + for (i = 0; i < aiocb->aio_niov; ++i) { + memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len); + p += aiocb->aio_iov[i].iov_len; + } + } + + nbytes = handle_aiocb_rw_linear(aiocb, buf); + if (aiocb->aio_type != QEMU_PAIO_WRITE) { + char *p = buf; + size_t count = aiocb->aio_nbytes, copy; + int i; + + for (i = 0; i < aiocb->aio_niov && count; ++i) { + copy = count; + if (copy > aiocb->aio_iov[i].iov_len) + copy = aiocb->aio_iov[i].iov_len; + memcpy(aiocb->aio_iov[i].iov_base, p, copy); + p += copy; + count -= copy; + } + } + qemu_vfree(buf); + + return nbytes; } static void *aio_thread(void *unused) @@ -157,7 +223,7 @@ static void *aio_thread(void *unused) switch (aiocb->aio_type) { case QEMU_PAIO_READ: case QEMU_PAIO_WRITE: - ret = handle_aiocb_readwrite(aiocb); + ret = handle_aiocb_rw(aiocb); break; case QEMU_PAIO_IOCTL: ret = handle_aiocb_ioctl(aiocb); diff --git a/posix-aio-compat.h b/posix-aio-compat.h index a1cdfd7f1f..1c5dcbd920 100644 --- a/posix-aio-compat.h +++ b/posix-aio-compat.h @@ -27,11 +27,18 @@ struct qemu_paiocb { int aio_fildes; - void *aio_buf; + union { + struct iovec *aio_iov; + void *aio_ioctl_buf; + }; + int aio_niov; size_t aio_nbytes; #define aio_ioctl_cmd aio_nbytes /* for QEMU_PAIO_IOCTL */ int ev_signo; off_t aio_offset; + unsigned aio_flags; +/* 512 byte alignment required for buffer, offset and length */ +#define QEMU_AIO_SECTOR_ALIGNED 0x01 /* private */ TAILQ_ENTRY(qemu_paiocb) node;