diff --git a/Makefile.objs b/Makefile.objs index f8ae0316b8..35c2355e82 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -45,7 +45,6 @@ block-obj-y = iov.o cache-utils.o qemu-option.o module.o async.o block-obj-y += nbd.o block.o blockjob.o aes.o qemu-config.o block-obj-y += thread-pool.o qemu-progress.o qemu-sockets.o uri.o notify.o block-obj-y += $(coroutine-obj-y) $(qobject-obj-y) $(version-obj-y) -block-obj-$(CONFIG_POSIX) += posix-aio-compat.o block-obj-$(CONFIG_POSIX) += event_notifier-posix.o aio-posix.o block-obj-$(CONFIG_WIN32) += event_notifier-win32.o aio-win32.o block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o diff --git a/block/raw-posix-aio.h b/block/raw-posix-aio.h index 6725135dd4..c714367401 100644 --- a/block/raw-posix-aio.h +++ b/block/raw-posix-aio.h @@ -27,14 +27,6 @@ #define QEMU_AIO_MISALIGNED 0x1000 -/* posix-aio-compat.c - thread pool based implementation */ -BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque, int type); -BlockDriverAIOCB *paio_ioctl(BlockDriverState *bs, int fd, - unsigned long int req, void *buf, - BlockDriverCompletionFunc *cb, void *opaque); - /* linux-aio.c - Linux native implementation */ void *laio_init(void); BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd, diff --git a/block/raw-posix.c b/block/raw-posix.c index 9ae2c505a8..4d6d5df5bc 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -27,6 +27,9 @@ #include "qemu-log.h" #include "block_int.h" #include "module.h" +#include "trace.h" +#include "thread-pool.h" +#include "iov.h" #include "block/raw-posix-aio.h" #if defined(__APPLE__) && (__MACH__) @@ -149,6 +152,20 @@ typedef struct BDRVRawReopenState { static int fd_open(BlockDriverState *bs); static int64_t raw_getlength(BlockDriverState *bs); +typedef struct RawPosixAIOData { + BlockDriverState *bs; + int aio_fildes; + union { + struct iovec *aio_iov; + void *aio_ioctl_buf; + }; + int aio_niov; + size_t aio_nbytes; +#define aio_ioctl_cmd aio_nbytes /* for QEMU_AIO_IOCTL */ + off_t aio_offset; + int aio_type; +} RawPosixAIOData; + #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) static int cdrom_reopen(BlockDriverState *bs); #endif @@ -426,6 +443,283 @@ static int qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) return 1; } +static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb) +{ + int ret; + + ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf); + if (ret == -1) { + return -errno; + } + + /* + * This looks weird, but the aio code only considers a request + * successful if it has written the full number of bytes. + * + * Now we overload aio_nbytes as aio_ioctl_cmd for the ioctl command, + * so in fact we return the ioctl command here to make posix_aio_read() + * happy.. + */ + return aiocb->aio_nbytes; +} + +static ssize_t handle_aiocb_flush(RawPosixAIOData *aiocb) +{ + int ret; + + ret = qemu_fdatasync(aiocb->aio_fildes); + if (ret == -1) { + return -errno; + } + return 0; +} + +#ifdef CONFIG_PREADV + +static bool preadv_present = true; + +static ssize_t +qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset) +{ + return preadv(fd, iov, nr_iov, offset); +} + +static ssize_t +qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset) +{ + return pwritev(fd, iov, nr_iov, offset); +} + +#else + +static bool preadv_present = false; + +static ssize_t +qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset) +{ + return -ENOSYS; +} + +static ssize_t +qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset) +{ + return -ENOSYS; +} + +#endif + +static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb) +{ + ssize_t len; + + do { + if (aiocb->aio_type & QEMU_AIO_WRITE) + len = qemu_pwritev(aiocb->aio_fildes, + aiocb->aio_iov, + aiocb->aio_niov, + aiocb->aio_offset); + else + len = qemu_preadv(aiocb->aio_fildes, + aiocb->aio_iov, + aiocb->aio_niov, + aiocb->aio_offset); + } while (len == -1 && errno == EINTR); + + if (len == -1) { + return -errno; + } + return len; +} + +/* + * Read/writes the data to/from a given linear buffer. + * + * Returns the number of bytes handles or -errno in case of an error. Short + * reads are only returned if the end of the file is reached. + */ +static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf) +{ + ssize_t offset = 0; + ssize_t len; + + while (offset < aiocb->aio_nbytes) { + if (aiocb->aio_type & QEMU_AIO_WRITE) { + len = pwrite(aiocb->aio_fildes, + (const char *)buf + offset, + aiocb->aio_nbytes - offset, + aiocb->aio_offset + offset); + } else { + len = pread(aiocb->aio_fildes, + buf + offset, + aiocb->aio_nbytes - offset, + aiocb->aio_offset + offset); + } + if (len == -1 && errno == EINTR) { + continue; + } else if (len == -1) { + offset = -errno; + break; + } else if (len == 0) { + break; + } + offset += len; + } + + return offset; +} + +static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb) +{ + ssize_t nbytes; + char *buf; + + if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) { + /* + * If there is just a single buffer, and it is properly aligned + * we can just use plain pread/pwrite without any problems. + */ + if (aiocb->aio_niov == 1) { + return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base); + } + /* + * We have more than one iovec, and all are properly aligned. + * + * Try preadv/pwritev first and fall back to linearizing the + * buffer if it's not supported. + */ + if (preadv_present) { + nbytes = handle_aiocb_rw_vector(aiocb); + if (nbytes == aiocb->aio_nbytes || + (nbytes < 0 && nbytes != -ENOSYS)) { + return nbytes; + } + preadv_present = false; + } + + /* + * XXX(hch): short read/write. no easy way to handle the reminder + * using these interfaces. For now retry using plain + * pread/pwrite? + */ + } + + /* + * Ok, we have to do it the hard way, copy all segments into + * a single aligned buffer. + */ + buf = qemu_blockalign(aiocb->bs, aiocb->aio_nbytes); + if (aiocb->aio_type & QEMU_AIO_WRITE) { + char *p = buf; + int i; + + for (i = 0; i < aiocb->aio_niov; ++i) { + memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len); + p += aiocb->aio_iov[i].iov_len; + } + } + + nbytes = handle_aiocb_rw_linear(aiocb, buf); + if (!(aiocb->aio_type & QEMU_AIO_WRITE)) { + char *p = buf; + size_t count = aiocb->aio_nbytes, copy; + int i; + + for (i = 0; i < aiocb->aio_niov && count; ++i) { + copy = count; + if (copy > aiocb->aio_iov[i].iov_len) { + copy = aiocb->aio_iov[i].iov_len; + } + memcpy(aiocb->aio_iov[i].iov_base, p, copy); + p += copy; + count -= copy; + } + } + qemu_vfree(buf); + + return nbytes; +} + +static int aio_worker(void *arg) +{ + RawPosixAIOData *aiocb = arg; + ssize_t ret = 0; + + switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) { + case QEMU_AIO_READ: + ret = handle_aiocb_rw(aiocb); + if (ret >= 0 && ret < aiocb->aio_nbytes && aiocb->bs->growable) { + iov_memset(aiocb->aio_iov, aiocb->aio_niov, ret, + 0, aiocb->aio_nbytes - ret); + + ret = aiocb->aio_nbytes; + } + if (ret == aiocb->aio_nbytes) { + ret = 0; + } else if (ret >= 0 && ret < aiocb->aio_nbytes) { + ret = -EINVAL; + } + break; + case QEMU_AIO_WRITE: + ret = handle_aiocb_rw(aiocb); + if (ret == aiocb->aio_nbytes) { + ret = 0; + } else if (ret >= 0 && ret < aiocb->aio_nbytes) { + ret = -EINVAL; + } + break; + case QEMU_AIO_FLUSH: + ret = handle_aiocb_flush(aiocb); + break; + case QEMU_AIO_IOCTL: + ret = handle_aiocb_ioctl(aiocb); + break; + default: + fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type); + ret = -EINVAL; + break; + } + + g_slice_free(RawPosixAIOData, aiocb); + return ret; +} + +static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque, int type) +{ + RawPosixAIOData *acb = g_slice_new(RawPosixAIOData); + + acb->bs = bs; + acb->aio_type = type; + acb->aio_fildes = fd; + + if (qiov) { + acb->aio_iov = qiov->iov; + acb->aio_niov = qiov->niov; + } + acb->aio_nbytes = nb_sectors * 512; + acb->aio_offset = sector_num * 512; + + trace_paio_submit(acb, opaque, sector_num, nb_sectors, type); + return thread_pool_submit_aio(aio_worker, acb, cb, opaque); +} + +static BlockDriverAIOCB *paio_ioctl(BlockDriverState *bs, int fd, + unsigned long int req, void *buf, + BlockDriverCompletionFunc *cb, void *opaque) +{ + RawPosixAIOData *acb = g_slice_new(RawPosixAIOData); + + acb->bs = bs; + acb->aio_type = QEMU_AIO_IOCTL; + acb->aio_fildes = fd; + acb->aio_offset = 0; + acb->aio_ioctl_buf = buf; + acb->aio_ioctl_cmd = req; + + return thread_pool_submit_aio(aio_worker, acb, cb, opaque); +} + static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque, int type) diff --git a/posix-aio-compat.c b/posix-aio-compat.c deleted file mode 100644 index 4a1e3d3662..0000000000 --- a/posix-aio-compat.c +++ /dev/null @@ -1,330 +0,0 @@ -/* - * QEMU posix-aio emulation - * - * Copyright IBM, Corp. 2008 - * - * Authors: - * Anthony Liguori - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - * Contributions after 2012-01-13 are licensed under the terms of the - * GNU GPL, version 2 or (at your option) any later version. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "qemu-queue.h" -#include "osdep.h" -#include "sysemu.h" -#include "qemu-common.h" -#include "trace.h" -#include "thread-pool.h" -#include "block_int.h" -#include "iov.h" - -#include "block/raw-posix-aio.h" - -struct qemu_paiocb { - BlockDriverAIOCB common; - int aio_fildes; - union { - struct iovec *aio_iov; - void *aio_ioctl_buf; - }; - int aio_niov; - size_t aio_nbytes; -#define aio_ioctl_cmd aio_nbytes /* for QEMU_AIO_IOCTL */ - off_t aio_offset; - int aio_type; -}; - -#ifdef CONFIG_PREADV -static int preadv_present = 1; -#else -static int preadv_present = 0; -#endif - -static ssize_t handle_aiocb_ioctl(struct qemu_paiocb *aiocb) -{ - int ret; - - ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf); - if (ret == -1) - return -errno; - - /* - * This looks weird, but the aio code only considers a request - * successful if it has written the full number of bytes. - * - * Now we overload aio_nbytes as aio_ioctl_cmd for the ioctl command, - * so in fact we return the ioctl command here to make posix_aio_read() - * happy.. - */ - return aiocb->aio_nbytes; -} - -static ssize_t handle_aiocb_flush(struct qemu_paiocb *aiocb) -{ - int ret; - - ret = qemu_fdatasync(aiocb->aio_fildes); - if (ret == -1) - return -errno; - return 0; -} - -#ifdef CONFIG_PREADV - -static ssize_t -qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset) -{ - return preadv(fd, iov, nr_iov, offset); -} - -static ssize_t -qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset) -{ - return pwritev(fd, iov, nr_iov, offset); -} - -#else - -static ssize_t -qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset) -{ - return -ENOSYS; -} - -static ssize_t -qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset) -{ - return -ENOSYS; -} - -#endif - -static ssize_t handle_aiocb_rw_vector(struct qemu_paiocb *aiocb) -{ - ssize_t len; - - do { - if (aiocb->aio_type & QEMU_AIO_WRITE) - len = qemu_pwritev(aiocb->aio_fildes, - aiocb->aio_iov, - aiocb->aio_niov, - aiocb->aio_offset); - else - len = qemu_preadv(aiocb->aio_fildes, - aiocb->aio_iov, - aiocb->aio_niov, - aiocb->aio_offset); - } while (len == -1 && errno == EINTR); - - if (len == -1) - return -errno; - return len; -} - -/* - * Read/writes the data to/from a given linear buffer. - * - * Returns the number of bytes handles or -errno in case of an error. Short - * reads are only returned if the end of the file is reached. - */ -static ssize_t handle_aiocb_rw_linear(struct qemu_paiocb *aiocb, char *buf) -{ - ssize_t offset = 0; - ssize_t len; - - while (offset < aiocb->aio_nbytes) { - if (aiocb->aio_type & QEMU_AIO_WRITE) - len = pwrite(aiocb->aio_fildes, - (const char *)buf + offset, - aiocb->aio_nbytes - offset, - aiocb->aio_offset + offset); - else - len = pread(aiocb->aio_fildes, - buf + offset, - aiocb->aio_nbytes - offset, - aiocb->aio_offset + offset); - - if (len == -1 && errno == EINTR) - continue; - else if (len == -1) { - offset = -errno; - break; - } else if (len == 0) - break; - - offset += len; - } - - return offset; -} - -static ssize_t handle_aiocb_rw(struct qemu_paiocb *aiocb) -{ - ssize_t nbytes; - char *buf; - - if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) { - /* - * If there is just a single buffer, and it is properly aligned - * we can just use plain pread/pwrite without any problems. - */ - if (aiocb->aio_niov == 1) - return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base); - - /* - * We have more than one iovec, and all are properly aligned. - * - * Try preadv/pwritev first and fall back to linearizing the - * buffer if it's not supported. - */ - if (preadv_present) { - nbytes = handle_aiocb_rw_vector(aiocb); - if (nbytes == aiocb->aio_nbytes) - return nbytes; - if (nbytes < 0 && nbytes != -ENOSYS) - return nbytes; - preadv_present = 0; - } - - /* - * XXX(hch): short read/write. no easy way to handle the reminder - * using these interfaces. For now retry using plain - * pread/pwrite? - */ - } - - /* - * Ok, we have to do it the hard way, copy all segments into - * a single aligned buffer. - */ - buf = qemu_blockalign(aiocb->common.bs, aiocb->aio_nbytes); - if (aiocb->aio_type & QEMU_AIO_WRITE) { - char *p = buf; - int i; - - for (i = 0; i < aiocb->aio_niov; ++i) { - memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len); - p += aiocb->aio_iov[i].iov_len; - } - } - - nbytes = handle_aiocb_rw_linear(aiocb, buf); - if (!(aiocb->aio_type & QEMU_AIO_WRITE)) { - char *p = buf; - size_t count = aiocb->aio_nbytes, copy; - int i; - - for (i = 0; i < aiocb->aio_niov && count; ++i) { - copy = count; - if (copy > aiocb->aio_iov[i].iov_len) - copy = aiocb->aio_iov[i].iov_len; - memcpy(aiocb->aio_iov[i].iov_base, p, copy); - p += copy; - count -= copy; - } - } - qemu_vfree(buf); - - return nbytes; -} - -static int aio_worker(void *arg) -{ - struct qemu_paiocb *aiocb = arg; - ssize_t ret = 0; - - switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) { - case QEMU_AIO_READ: - ret = handle_aiocb_rw(aiocb); - if (ret >= 0 && ret < aiocb->aio_nbytes && aiocb->common.bs->growable) { - /* A short read means that we have reached EOF. Pad the buffer - * with zeros for bytes after EOF. */ - iov_memset(aiocb->aio_iov, aiocb->aio_niov, ret, - 0, aiocb->aio_nbytes - ret); - - ret = aiocb->aio_nbytes; - } - if (ret == aiocb->aio_nbytes) { - ret = 0; - } else if (ret >= 0 && ret < aiocb->aio_nbytes) { - ret = -EINVAL; - } - break; - case QEMU_AIO_WRITE: - ret = handle_aiocb_rw(aiocb); - if (ret == aiocb->aio_nbytes) { - ret = 0; - } else if (ret >= 0 && ret < aiocb->aio_nbytes) { - ret = -EINVAL; - } - break; - case QEMU_AIO_FLUSH: - ret = handle_aiocb_flush(aiocb); - break; - case QEMU_AIO_IOCTL: - ret = handle_aiocb_ioctl(aiocb); - break; - default: - fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type); - ret = -EINVAL; - break; - } - - qemu_aio_release(aiocb); - return ret; -} - -static AIOPool raw_aio_pool = { - .aiocb_size = sizeof(struct qemu_paiocb), -}; - -BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque, int type) -{ - struct qemu_paiocb *acb; - - acb = qemu_aio_get(&raw_aio_pool, bs, cb, opaque); - acb->aio_type = type; - acb->aio_fildes = fd; - - if (qiov) { - acb->aio_iov = qiov->iov; - acb->aio_niov = qiov->niov; - } - acb->aio_nbytes = nb_sectors * 512; - acb->aio_offset = sector_num * 512; - - trace_paio_submit(acb, opaque, sector_num, nb_sectors, type); - return thread_pool_submit_aio(aio_worker, acb, cb, opaque); -} - -BlockDriverAIOCB *paio_ioctl(BlockDriverState *bs, int fd, - unsigned long int req, void *buf, - BlockDriverCompletionFunc *cb, void *opaque) -{ - struct qemu_paiocb *acb; - - acb = qemu_aio_get(&raw_aio_pool, bs, cb, opaque); - acb->aio_type = QEMU_AIO_IOCTL; - acb->aio_fildes = fd; - acb->aio_offset = 0; - acb->aio_ioctl_buf = buf; - acb->aio_ioctl_cmd = req; - - return thread_pool_submit_aio(aio_worker, acb, cb, opaque); -}