2006-08-01 20:21:11 +04:00
|
|
|
/*
|
2007-12-15 20:28:36 +03:00
|
|
|
* Block driver for RAW files (posix)
|
2007-09-17 01:08:06 +04:00
|
|
|
*
|
2006-08-01 20:21:11 +04:00
|
|
|
* Copyright (c) 2006 Fabrice Bellard
|
2007-09-17 01:08:06 +04:00
|
|
|
*
|
2006-08-01 20:21:11 +04:00
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
|
|
* of this software and associated documentation files (the "Software"), to deal
|
|
|
|
* in the Software without restriction, including without limitation the rights
|
|
|
|
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
|
|
* copies of the Software, and to permit persons to whom the Software is
|
|
|
|
* furnished to do so, subject to the following conditions:
|
|
|
|
*
|
|
|
|
* The above copyright notice and this permission notice shall be included in
|
|
|
|
* all copies or substantial portions of the Software.
|
|
|
|
*
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
|
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
|
|
* THE SOFTWARE.
|
|
|
|
*/
|
2016-01-18 21:01:42 +03:00
|
|
|
#include "qemu/osdep.h"
|
include/qemu/osdep.h: Don't include qapi/error.h
Commit 57cb38b included qapi/error.h into qemu/osdep.h to get the
Error typedef. Since then, we've moved to include qemu/osdep.h
everywhere. Its file comment explains: "To avoid getting into
possible circular include dependencies, this file should not include
any other QEMU headers, with the exceptions of config-host.h,
compiler.h, os-posix.h and os-win32.h, all of which are doing a
similar job to this file and are under similar constraints."
qapi/error.h doesn't do a similar job, and it doesn't adhere to
similar constraints: it includes qapi-types.h. That's in excess of
100KiB of crap most .c files don't actually need.
Add the typedef to qemu/typedefs.h, and include that instead of
qapi/error.h. Include qapi/error.h in .c files that need it and don't
get it now. Include qapi-types.h in qom/object.h for uint16List.
Update scripts/clean-includes accordingly. Update it further to match
reality: replace config.h by config-target.h, add sysemu/os-posix.h,
sysemu/os-win32.h. Update the list of includes in the qemu/osdep.h
comment quoted above similarly.
This reduces the number of objects depending on qapi/error.h from "all
of them" to less than a third. Unfortunately, the number depending on
qapi-types.h shrinks only a little. More work is needed for that one.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
[Fix compilation without the spice devel packages. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-03-14 11:01:28 +03:00
|
|
|
#include "qapi/error.h"
|
2016-03-20 20:16:19 +03:00
|
|
|
#include "qemu/cutils.h"
|
2015-03-17 20:29:20 +03:00
|
|
|
#include "qemu/error-report.h"
|
2012-12-17 21:19:44 +04:00
|
|
|
#include "block/block_int.h"
|
2012-12-17 21:20:00 +04:00
|
|
|
#include "qemu/module.h"
|
2012-05-25 13:46:27 +04:00
|
|
|
#include "trace.h"
|
2012-12-17 21:19:44 +04:00
|
|
|
#include "block/thread-pool.h"
|
2012-12-17 21:20:00 +04:00
|
|
|
#include "qemu/iov.h"
|
2016-07-04 19:33:20 +03:00
|
|
|
#include "block/raw-aio.h"
|
2015-03-17 20:29:20 +03:00
|
|
|
#include "qapi/qmp/qstring.h"
|
2006-08-01 20:21:11 +04:00
|
|
|
|
scsi, file-posix: add support for persistent reservation management
It is a common requirement for virtual machine to send persistent
reservations, but this currently requires either running QEMU with
CAP_SYS_RAWIO, or using out-of-tree patches that let an unprivileged
QEMU bypass Linux's filter on SG_IO commands.
As an alternative mechanism, the next patches will introduce a
privileged helper to run persistent reservation commands without
expanding QEMU's attack surface unnecessarily.
The helper is invoked through a "pr-manager" QOM object, to which
file-posix.c passes SG_IO requests for PERSISTENT RESERVE OUT and
PERSISTENT RESERVE IN commands. For example:
$ qemu-system-x86_64
-device virtio-scsi \
-object pr-manager-helper,id=helper0,path=/var/run/qemu-pr-helper.sock
-drive if=none,id=hd,driver=raw,file.filename=/dev/sdb,file.pr-manager=helper0
-device scsi-block,drive=hd
or:
$ qemu-system-x86_64
-device virtio-scsi \
-object pr-manager-helper,id=helper0,path=/var/run/qemu-pr-helper.sock
-blockdev node-name=hd,driver=raw,file.driver=host_device,file.filename=/dev/sdb,file.pr-manager=helper0
-device scsi-block,drive=hd
Multiple pr-manager implementations are conceivable and possible, though
only one is implemented right now. For example, a pr-manager could:
- talk directly to the multipath daemon from a privileged QEMU
(i.e. QEMU links to libmpathpersist); this makes reservation work
properly with multipath, but still requires CAP_SYS_RAWIO
- use the Linux IOC_PR_* ioctls (they require CAP_SYS_ADMIN though)
- more interestingly, implement reservations directly in QEMU
through file system locks or a shared database (e.g. sqlite)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2017-08-21 19:58:56 +03:00
|
|
|
#include "scsi/pr-manager.h"
|
|
|
|
#include "scsi/constants.h"
|
|
|
|
|
2011-11-10 22:40:06 +04:00
|
|
|
#if defined(__APPLE__) && (__MACH__)
|
2006-08-01 20:21:11 +04:00
|
|
|
#include <paths.h>
|
|
|
|
#include <sys/param.h>
|
|
|
|
#include <IOKit/IOKitLib.h>
|
|
|
|
#include <IOKit/IOBSD.h>
|
|
|
|
#include <IOKit/storage/IOMediaBSDClient.h>
|
|
|
|
#include <IOKit/storage/IOMedia.h>
|
|
|
|
#include <IOKit/storage/IOCDMedia.h>
|
|
|
|
//#include <IOKit/storage/IOCDTypes.h>
|
2016-03-21 18:41:28 +03:00
|
|
|
#include <IOKit/storage/IODVDMedia.h>
|
2006-08-01 20:21:11 +04:00
|
|
|
#include <CoreFoundation/CoreFoundation.h>
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef __sun__
|
2007-01-05 20:44:41 +03:00
|
|
|
#define _POSIX_PTHREAD_SEMANTICS 1
|
2006-08-01 20:21:11 +04:00
|
|
|
#include <sys/dkio.h>
|
|
|
|
#endif
|
2006-08-19 15:45:59 +04:00
|
|
|
#ifdef __linux__
|
|
|
|
#include <sys/ioctl.h>
|
2010-09-06 19:06:02 +04:00
|
|
|
#include <sys/param.h>
|
2006-08-19 15:45:59 +04:00
|
|
|
#include <linux/cdrom.h>
|
|
|
|
#include <linux/fd.h>
|
2012-05-09 18:49:58 +04:00
|
|
|
#include <linux/fs.h>
|
2015-02-16 14:47:56 +03:00
|
|
|
#include <linux/hdreg.h>
|
2015-06-23 13:45:00 +03:00
|
|
|
#include <scsi/sg.h>
|
2015-02-16 14:47:56 +03:00
|
|
|
#ifdef __s390__
|
|
|
|
#include <asm/dasd.h>
|
|
|
|
#endif
|
qemu-img create: add 'nocow' option
Add 'nocow' option so that users could have a chance to set NOCOW flag to
newly created files. It's useful on btrfs file system to enhance performance.
Btrfs has low performance when hosting VM images, even more when the guest
in those VM are also using btrfs as file system. One way to mitigate this bad
performance is to turn off COW attributes on VM files. Generally, there are
two ways to turn off NOCOW on btrfs: a) by mounting fs with nodatacow, then
all newly created files will be NOCOW. b) per file. Add the NOCOW file
attribute. It could only be done to empty or new files.
This patch tries the second way, according to the option, it could add NOCOW
per file.
For most block drivers, since the create file step is in raw-posix.c, so we
can do setting NOCOW flag ioctl in raw-posix.c only.
But there are some exceptions, like block/vpc.c and block/vdi.c, they are
creating file by calling qemu_open directly. For them, do the same setting
NOCOW flag ioctl work in them separately.
[Fixed up 082.out due to the new 'nocow' creation option
--Stefan]
Signed-off-by: Chunyan Liu <cyliu@suse.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-06-30 10:29:58 +04:00
|
|
|
#ifndef FS_NOCOW_FL
|
|
|
|
#define FS_NOCOW_FL 0x00800000 /* Do not cow file */
|
|
|
|
#endif
|
2012-05-09 18:49:58 +04:00
|
|
|
#endif
|
2015-01-30 11:42:14 +03:00
|
|
|
#if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE)
|
2013-01-14 19:26:52 +04:00
|
|
|
#include <linux/falloc.h>
|
|
|
|
#endif
|
2009-11-29 20:00:41 +03:00
|
|
|
#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
|
2006-12-21 22:14:11 +03:00
|
|
|
#include <sys/disk.h>
|
2009-03-28 11:37:13 +03:00
|
|
|
#include <sys/cdio.h>
|
2006-12-21 22:14:11 +03:00
|
|
|
#endif
|
2006-08-01 20:21:11 +04:00
|
|
|
|
2008-08-15 22:33:42 +04:00
|
|
|
#ifdef __OpenBSD__
|
|
|
|
#include <sys/ioctl.h>
|
|
|
|
#include <sys/disklabel.h>
|
|
|
|
#include <sys/dkio.h>
|
|
|
|
#endif
|
|
|
|
|
2011-05-23 16:31:17 +04:00
|
|
|
#ifdef __NetBSD__
|
|
|
|
#include <sys/ioctl.h>
|
|
|
|
#include <sys/disklabel.h>
|
|
|
|
#include <sys/dkio.h>
|
|
|
|
#include <sys/disk.h>
|
|
|
|
#endif
|
|
|
|
|
2009-03-07 23:06:23 +03:00
|
|
|
#ifdef __DragonFly__
|
|
|
|
#include <sys/ioctl.h>
|
|
|
|
#include <sys/diskslice.h>
|
|
|
|
#endif
|
|
|
|
|
2010-12-17 13:41:15 +03:00
|
|
|
#ifdef CONFIG_XFS
|
|
|
|
#include <xfs/xfs.h>
|
|
|
|
#endif
|
|
|
|
|
2007-11-11 05:51:17 +03:00
|
|
|
//#define DEBUG_BLOCK
|
2015-06-23 13:44:58 +03:00
|
|
|
|
|
|
|
#ifdef DEBUG_BLOCK
|
|
|
|
# define DEBUG_BLOCK_PRINT 1
|
2007-09-13 16:29:23 +04:00
|
|
|
#else
|
2015-06-23 13:44:58 +03:00
|
|
|
# define DEBUG_BLOCK_PRINT 0
|
2007-09-13 16:29:23 +04:00
|
|
|
#endif
|
2015-06-23 13:44:58 +03:00
|
|
|
#define DPRINTF(fmt, ...) \
|
|
|
|
do { \
|
|
|
|
if (DEBUG_BLOCK_PRINT) { \
|
|
|
|
printf(fmt, ## __VA_ARGS__); \
|
|
|
|
} \
|
|
|
|
} while (0)
|
2007-09-13 16:29:23 +04:00
|
|
|
|
2008-10-14 22:00:38 +04:00
|
|
|
/* OS X does not have O_DSYNC */
|
|
|
|
#ifndef O_DSYNC
|
2009-07-01 21:28:32 +04:00
|
|
|
#ifdef O_SYNC
|
2008-10-14 22:14:47 +04:00
|
|
|
#define O_DSYNC O_SYNC
|
2009-07-01 21:28:32 +04:00
|
|
|
#elif defined(O_FSYNC)
|
|
|
|
#define O_DSYNC O_FSYNC
|
|
|
|
#endif
|
2008-10-14 22:00:38 +04:00
|
|
|
#endif
|
|
|
|
|
2008-10-14 18:42:54 +04:00
|
|
|
/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */
|
|
|
|
#ifndef O_DIRECT
|
|
|
|
#define O_DIRECT O_DSYNC
|
|
|
|
#endif
|
|
|
|
|
2006-08-19 15:45:59 +04:00
|
|
|
#define FTYPE_FILE 0
|
|
|
|
#define FTYPE_CD 1
|
2006-08-01 20:21:11 +04:00
|
|
|
|
2010-09-13 01:43:21 +04:00
|
|
|
#define MAX_BLOCKSIZE 4096
|
|
|
|
|
2017-05-02 19:35:56 +03:00
|
|
|
/* Posix file locking bytes. Libvirt takes byte 0, we start from higher bytes,
|
|
|
|
* leaving a few more bytes for its future use. */
|
|
|
|
#define RAW_LOCK_PERM_BASE 100
|
|
|
|
#define RAW_LOCK_SHARED_BASE 200
|
|
|
|
|
2006-08-19 15:45:59 +04:00
|
|
|
typedef struct BDRVRawState {
|
|
|
|
int fd;
|
2017-05-02 19:35:56 +03:00
|
|
|
int lock_fd;
|
|
|
|
bool use_lock;
|
2006-08-19 15:45:59 +04:00
|
|
|
int type;
|
2009-06-15 15:53:26 +04:00
|
|
|
int open_flags;
|
2011-11-29 15:42:20 +04:00
|
|
|
size_t buf_align;
|
|
|
|
|
2017-05-02 19:35:56 +03:00
|
|
|
/* The current permissions. */
|
|
|
|
uint64_t perm;
|
|
|
|
uint64_t shared_perm;
|
|
|
|
|
2010-12-17 13:41:15 +03:00
|
|
|
#ifdef CONFIG_XFS
|
2013-11-22 16:39:55 +04:00
|
|
|
bool is_xfs:1;
|
2010-12-17 13:41:15 +03:00
|
|
|
#endif
|
2013-11-22 16:39:55 +04:00
|
|
|
bool has_discard:1;
|
2013-11-22 16:39:57 +04:00
|
|
|
bool has_write_zeroes:1;
|
2013-11-22 16:39:55 +04:00
|
|
|
bool discard_zeroes:1;
|
2016-09-08 16:09:01 +03:00
|
|
|
bool use_linux_aio:1;
|
2017-03-23 00:00:05 +03:00
|
|
|
bool page_cache_inconsistent:1;
|
2015-01-30 11:42:15 +03:00
|
|
|
bool has_fallocate;
|
2014-10-21 18:03:03 +04:00
|
|
|
bool needs_alignment;
|
scsi, file-posix: add support for persistent reservation management
It is a common requirement for virtual machine to send persistent
reservations, but this currently requires either running QEMU with
CAP_SYS_RAWIO, or using out-of-tree patches that let an unprivileged
QEMU bypass Linux's filter on SG_IO commands.
As an alternative mechanism, the next patches will introduce a
privileged helper to run persistent reservation commands without
expanding QEMU's attack surface unnecessarily.
The helper is invoked through a "pr-manager" QOM object, to which
file-posix.c passes SG_IO requests for PERSISTENT RESERVE OUT and
PERSISTENT RESERVE IN commands. For example:
$ qemu-system-x86_64
-device virtio-scsi \
-object pr-manager-helper,id=helper0,path=/var/run/qemu-pr-helper.sock
-drive if=none,id=hd,driver=raw,file.filename=/dev/sdb,file.pr-manager=helper0
-device scsi-block,drive=hd
or:
$ qemu-system-x86_64
-device virtio-scsi \
-object pr-manager-helper,id=helper0,path=/var/run/qemu-pr-helper.sock
-blockdev node-name=hd,driver=raw,file.driver=host_device,file.filename=/dev/sdb,file.pr-manager=helper0
-device scsi-block,drive=hd
Multiple pr-manager implementations are conceivable and possible, though
only one is implemented right now. For example, a pr-manager could:
- talk directly to the multipath daemon from a privileged QEMU
(i.e. QEMU links to libmpathpersist); this makes reservation work
properly with multipath, but still requires CAP_SYS_RAWIO
- use the Linux IOC_PR_* ioctls (they require CAP_SYS_ADMIN though)
- more interestingly, implement reservations directly in QEMU
through file system locks or a shared database (e.g. sqlite)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2017-08-21 19:58:56 +03:00
|
|
|
|
|
|
|
PRManager *pr_mgr;
|
2006-08-19 15:45:59 +04:00
|
|
|
} BDRVRawState;
|
|
|
|
|
2012-09-20 23:13:25 +04:00
|
|
|
typedef struct BDRVRawReopenState {
|
|
|
|
int fd;
|
|
|
|
int open_flags;
|
|
|
|
} BDRVRawReopenState;
|
|
|
|
|
2006-08-19 15:45:59 +04:00
|
|
|
static int fd_open(BlockDriverState *bs);
|
2009-06-26 21:51:24 +04:00
|
|
|
static int64_t raw_getlength(BlockDriverState *bs);
|
2006-08-01 20:21:11 +04:00
|
|
|
|
2012-05-25 13:46:27 +04:00
|
|
|
typedef struct RawPosixAIOData {
|
|
|
|
BlockDriverState *bs;
|
|
|
|
int aio_fildes;
|
|
|
|
union {
|
|
|
|
struct iovec *aio_iov;
|
|
|
|
void *aio_ioctl_buf;
|
|
|
|
};
|
|
|
|
int aio_niov;
|
2013-01-14 19:26:55 +04:00
|
|
|
uint64_t aio_nbytes;
|
2012-05-25 13:46:27 +04:00
|
|
|
#define aio_ioctl_cmd aio_nbytes /* for QEMU_AIO_IOCTL */
|
|
|
|
off_t aio_offset;
|
|
|
|
int aio_type;
|
|
|
|
} RawPosixAIOData;
|
|
|
|
|
2009-11-29 20:00:41 +03:00
|
|
|
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
|
2009-06-15 15:55:19 +04:00
|
|
|
static int cdrom_reopen(BlockDriverState *bs);
|
2009-03-28 11:37:13 +03:00
|
|
|
#endif
|
|
|
|
|
2011-05-24 13:30:29 +04:00
|
|
|
#if defined(__NetBSD__)
|
|
|
|
static int raw_normalize_devicepath(const char **filename)
|
|
|
|
{
|
|
|
|
static char namebuf[PATH_MAX];
|
|
|
|
const char *dp, *fname;
|
|
|
|
struct stat sb;
|
|
|
|
|
|
|
|
fname = *filename;
|
|
|
|
dp = strrchr(fname, '/');
|
|
|
|
if (lstat(fname, &sb) < 0) {
|
|
|
|
fprintf(stderr, "%s: stat failed: %s\n",
|
|
|
|
fname, strerror(errno));
|
|
|
|
return -errno;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!S_ISBLK(sb.st_mode)) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (dp == NULL) {
|
|
|
|
snprintf(namebuf, PATH_MAX, "r%s", fname);
|
|
|
|
} else {
|
|
|
|
snprintf(namebuf, PATH_MAX, "%.*s/r%s",
|
|
|
|
(int)(dp - fname), fname, dp + 1);
|
|
|
|
}
|
|
|
|
fprintf(stderr, "%s is a block device", fname);
|
|
|
|
*filename = namebuf;
|
|
|
|
fprintf(stderr, ", using %s\n", *filename);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
static int raw_normalize_devicepath(const char **filename)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2015-02-16 14:47:55 +03:00
|
|
|
/*
|
|
|
|
* Get logical block size via ioctl. On success store it in @sector_size_p.
|
|
|
|
*/
|
|
|
|
static int probe_logical_blocksize(int fd, unsigned int *sector_size_p)
|
2011-11-29 15:42:20 +04:00
|
|
|
{
|
|
|
|
unsigned int sector_size;
|
2015-02-16 14:47:55 +03:00
|
|
|
bool success = false;
|
2017-03-23 17:36:28 +03:00
|
|
|
int i;
|
2011-11-29 15:42:20 +04:00
|
|
|
|
2015-02-16 14:47:55 +03:00
|
|
|
errno = ENOTSUP;
|
2017-03-23 17:36:28 +03:00
|
|
|
static const unsigned long ioctl_list[] = {
|
2011-11-29 15:42:20 +04:00
|
|
|
#ifdef BLKSSZGET
|
2017-03-23 17:36:28 +03:00
|
|
|
BLKSSZGET,
|
2011-11-29 15:42:20 +04:00
|
|
|
#endif
|
|
|
|
#ifdef DKIOCGETBLOCKSIZE
|
2017-03-23 17:36:28 +03:00
|
|
|
DKIOCGETBLOCKSIZE,
|
2011-11-29 15:42:20 +04:00
|
|
|
#endif
|
|
|
|
#ifdef DIOCGSECTORSIZE
|
2017-03-23 17:36:28 +03:00
|
|
|
DIOCGSECTORSIZE,
|
2011-11-29 15:42:20 +04:00
|
|
|
#endif
|
2017-03-23 17:36:28 +03:00
|
|
|
};
|
|
|
|
|
|
|
|
/* Try a few ioctls to get the right size */
|
|
|
|
for (i = 0; i < (int)ARRAY_SIZE(ioctl_list); i++) {
|
|
|
|
if (ioctl(fd, ioctl_list[i], §or_size) >= 0) {
|
|
|
|
*sector_size_p = sector_size;
|
|
|
|
success = true;
|
|
|
|
}
|
|
|
|
}
|
2015-02-16 14:47:55 +03:00
|
|
|
|
|
|
|
return success ? 0 : -errno;
|
|
|
|
}
|
|
|
|
|
2015-02-16 14:47:56 +03:00
|
|
|
/**
|
|
|
|
* Get physical block size of @fd.
|
|
|
|
* On success, store it in @blk_size and return 0.
|
|
|
|
* On failure, return -errno.
|
|
|
|
*/
|
|
|
|
static int probe_physical_blocksize(int fd, unsigned int *blk_size)
|
|
|
|
{
|
|
|
|
#ifdef BLKPBSZGET
|
|
|
|
if (ioctl(fd, BLKPBSZGET, blk_size) < 0) {
|
|
|
|
return -errno;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
#else
|
|
|
|
return -ENOTSUP;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2015-03-06 00:38:17 +03:00
|
|
|
/* Check if read is allowed with given memory buffer and length.
|
|
|
|
*
|
|
|
|
* This function is used to check O_DIRECT memory buffer and request alignment.
|
|
|
|
*/
|
|
|
|
static bool raw_is_io_aligned(int fd, void *buf, size_t len)
|
|
|
|
{
|
|
|
|
ssize_t ret = pread(fd, buf, len, 0);
|
|
|
|
|
|
|
|
if (ret >= 0) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef __linux__
|
|
|
|
/* The Linux kernel returns EINVAL for misaligned O_DIRECT reads. Ignore
|
|
|
|
* other errors (e.g. real I/O error), which could happen on a failed
|
|
|
|
* drive, since we only care about probing alignment.
|
|
|
|
*/
|
|
|
|
if (errno != EINVAL) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2015-02-16 14:47:55 +03:00
|
|
|
static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
|
|
|
|
{
|
|
|
|
BDRVRawState *s = bs->opaque;
|
|
|
|
char *buf;
|
block: align bounce buffers to page
The following sequence
int fd = open(argv[1], O_RDWR | O_CREAT | O_DIRECT, 0644);
for (i = 0; i < 100000; i++)
write(fd, buf, 4096);
performs 5% better if buf is aligned to 4096 bytes.
The difference is quite reliable.
On the other hand we do not want at the moment to enforce bounce
buffering if guest request is aligned to 512 bytes.
The patch changes default bounce buffer optimal alignment to
MAX(page size, 4k). 4k is chosen as maximal known sector size on real
HDD.
The justification of the performance improve is quite interesting.
From the kernel point of view each request to the disk was split
by two. This could be seen by blktrace like this:
9,0 11 1 0.000000000 11151 Q WS 312737792 + 1023 [qemu-img]
9,0 11 2 0.000007938 11151 Q WS 312738815 + 8 [qemu-img]
9,0 11 3 0.000030735 11151 Q WS 312738823 + 1016 [qemu-img]
9,0 11 4 0.000032482 11151 Q WS 312739839 + 8 [qemu-img]
9,0 11 5 0.000041379 11151 Q WS 312739847 + 1016 [qemu-img]
9,0 11 6 0.000042818 11151 Q WS 312740863 + 8 [qemu-img]
9,0 11 7 0.000051236 11151 Q WS 312740871 + 1017 [qemu-img]
9,0 5 1 0.169071519 11151 Q WS 312741888 + 1023 [qemu-img]
After the patch the pattern becomes normal:
9,0 6 1 0.000000000 12422 Q WS 314834944 + 1024 [qemu-img]
9,0 6 2 0.000038527 12422 Q WS 314835968 + 1024 [qemu-img]
9,0 6 3 0.000072849 12422 Q WS 314836992 + 1024 [qemu-img]
9,0 6 4 0.000106276 12422 Q WS 314838016 + 1024 [qemu-img]
and the amount of requests sent to disk (could be calculated counting
number of lines in the output of blktrace) is reduced about 2 times.
Both qemu-img and qemu-io are affected while qemu-kvm is not. The guest
does his job well and real requests comes properly aligned (to page).
Signed-off-by: Denis V. Lunev <den@openvz.org>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Message-id: 1431441056-26198-3-git-send-email-den@openvz.org
CC: Paolo Bonzini <pbonzini@redhat.com>
CC: Kevin Wolf <kwolf@redhat.com>
CC: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2015-05-12 17:30:56 +03:00
|
|
|
size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize());
|
2015-02-16 14:47:55 +03:00
|
|
|
|
2015-06-23 13:44:56 +03:00
|
|
|
/* For SCSI generic devices the alignment is not really used.
|
2015-02-16 14:47:55 +03:00
|
|
|
With buffered I/O, we don't have any restrictions. */
|
2015-06-23 13:44:56 +03:00
|
|
|
if (bdrv_is_sg(bs) || !s->needs_alignment) {
|
2016-06-24 01:37:24 +03:00
|
|
|
bs->bl.request_alignment = 1;
|
2015-02-16 14:47:55 +03:00
|
|
|
s->buf_align = 1;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2016-06-24 01:37:24 +03:00
|
|
|
bs->bl.request_alignment = 0;
|
2015-02-16 14:47:55 +03:00
|
|
|
s->buf_align = 0;
|
|
|
|
/* Let's try to use the logical blocksize for the alignment. */
|
2016-06-24 01:37:24 +03:00
|
|
|
if (probe_logical_blocksize(fd, &bs->bl.request_alignment) < 0) {
|
|
|
|
bs->bl.request_alignment = 0;
|
2015-02-16 14:47:55 +03:00
|
|
|
}
|
2011-11-29 15:42:20 +04:00
|
|
|
#ifdef CONFIG_XFS
|
|
|
|
if (s->is_xfs) {
|
|
|
|
struct dioattr da;
|
2014-07-16 19:48:17 +04:00
|
|
|
if (xfsctl(NULL, fd, XFS_IOC_DIOINFO, &da) >= 0) {
|
2016-06-24 01:37:24 +03:00
|
|
|
bs->bl.request_alignment = da.d_miniosz;
|
2011-11-29 15:42:20 +04:00
|
|
|
/* The kernel returns wrong information for d_mem */
|
|
|
|
/* s->buf_align = da.d_mem; */
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* If we could not get the sizes so far, we can only guess them */
|
|
|
|
if (!s->buf_align) {
|
|
|
|
size_t align;
|
block: align bounce buffers to page
The following sequence
int fd = open(argv[1], O_RDWR | O_CREAT | O_DIRECT, 0644);
for (i = 0; i < 100000; i++)
write(fd, buf, 4096);
performs 5% better if buf is aligned to 4096 bytes.
The difference is quite reliable.
On the other hand we do not want at the moment to enforce bounce
buffering if guest request is aligned to 512 bytes.
The patch changes default bounce buffer optimal alignment to
MAX(page size, 4k). 4k is chosen as maximal known sector size on real
HDD.
The justification of the performance improve is quite interesting.
From the kernel point of view each request to the disk was split
by two. This could be seen by blktrace like this:
9,0 11 1 0.000000000 11151 Q WS 312737792 + 1023 [qemu-img]
9,0 11 2 0.000007938 11151 Q WS 312738815 + 8 [qemu-img]
9,0 11 3 0.000030735 11151 Q WS 312738823 + 1016 [qemu-img]
9,0 11 4 0.000032482 11151 Q WS 312739839 + 8 [qemu-img]
9,0 11 5 0.000041379 11151 Q WS 312739847 + 1016 [qemu-img]
9,0 11 6 0.000042818 11151 Q WS 312740863 + 8 [qemu-img]
9,0 11 7 0.000051236 11151 Q WS 312740871 + 1017 [qemu-img]
9,0 5 1 0.169071519 11151 Q WS 312741888 + 1023 [qemu-img]
After the patch the pattern becomes normal:
9,0 6 1 0.000000000 12422 Q WS 314834944 + 1024 [qemu-img]
9,0 6 2 0.000038527 12422 Q WS 314835968 + 1024 [qemu-img]
9,0 6 3 0.000072849 12422 Q WS 314836992 + 1024 [qemu-img]
9,0 6 4 0.000106276 12422 Q WS 314838016 + 1024 [qemu-img]
and the amount of requests sent to disk (could be calculated counting
number of lines in the output of blktrace) is reduced about 2 times.
Both qemu-img and qemu-io are affected while qemu-kvm is not. The guest
does his job well and real requests comes properly aligned (to page).
Signed-off-by: Denis V. Lunev <den@openvz.org>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Message-id: 1431441056-26198-3-git-send-email-den@openvz.org
CC: Paolo Bonzini <pbonzini@redhat.com>
CC: Kevin Wolf <kwolf@redhat.com>
CC: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2015-05-12 17:30:56 +03:00
|
|
|
buf = qemu_memalign(max_align, 2 * max_align);
|
|
|
|
for (align = 512; align <= max_align; align <<= 1) {
|
|
|
|
if (raw_is_io_aligned(fd, buf + align, max_align)) {
|
2011-11-29 15:42:20 +04:00
|
|
|
s->buf_align = align;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
qemu_vfree(buf);
|
|
|
|
}
|
|
|
|
|
2016-06-24 01:37:24 +03:00
|
|
|
if (!bs->bl.request_alignment) {
|
2011-11-29 15:42:20 +04:00
|
|
|
size_t align;
|
block: align bounce buffers to page
The following sequence
int fd = open(argv[1], O_RDWR | O_CREAT | O_DIRECT, 0644);
for (i = 0; i < 100000; i++)
write(fd, buf, 4096);
performs 5% better if buf is aligned to 4096 bytes.
The difference is quite reliable.
On the other hand we do not want at the moment to enforce bounce
buffering if guest request is aligned to 512 bytes.
The patch changes default bounce buffer optimal alignment to
MAX(page size, 4k). 4k is chosen as maximal known sector size on real
HDD.
The justification of the performance improve is quite interesting.
From the kernel point of view each request to the disk was split
by two. This could be seen by blktrace like this:
9,0 11 1 0.000000000 11151 Q WS 312737792 + 1023 [qemu-img]
9,0 11 2 0.000007938 11151 Q WS 312738815 + 8 [qemu-img]
9,0 11 3 0.000030735 11151 Q WS 312738823 + 1016 [qemu-img]
9,0 11 4 0.000032482 11151 Q WS 312739839 + 8 [qemu-img]
9,0 11 5 0.000041379 11151 Q WS 312739847 + 1016 [qemu-img]
9,0 11 6 0.000042818 11151 Q WS 312740863 + 8 [qemu-img]
9,0 11 7 0.000051236 11151 Q WS 312740871 + 1017 [qemu-img]
9,0 5 1 0.169071519 11151 Q WS 312741888 + 1023 [qemu-img]
After the patch the pattern becomes normal:
9,0 6 1 0.000000000 12422 Q WS 314834944 + 1024 [qemu-img]
9,0 6 2 0.000038527 12422 Q WS 314835968 + 1024 [qemu-img]
9,0 6 3 0.000072849 12422 Q WS 314836992 + 1024 [qemu-img]
9,0 6 4 0.000106276 12422 Q WS 314838016 + 1024 [qemu-img]
and the amount of requests sent to disk (could be calculated counting
number of lines in the output of blktrace) is reduced about 2 times.
Both qemu-img and qemu-io are affected while qemu-kvm is not. The guest
does his job well and real requests comes properly aligned (to page).
Signed-off-by: Denis V. Lunev <den@openvz.org>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Message-id: 1431441056-26198-3-git-send-email-den@openvz.org
CC: Paolo Bonzini <pbonzini@redhat.com>
CC: Kevin Wolf <kwolf@redhat.com>
CC: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2015-05-12 17:30:56 +03:00
|
|
|
buf = qemu_memalign(s->buf_align, max_align);
|
|
|
|
for (align = 512; align <= max_align; align <<= 1) {
|
2015-03-06 00:38:17 +03:00
|
|
|
if (raw_is_io_aligned(fd, buf, align)) {
|
2016-06-24 01:37:24 +03:00
|
|
|
bs->bl.request_alignment = align;
|
2011-11-29 15:42:20 +04:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
qemu_vfree(buf);
|
|
|
|
}
|
2014-07-16 19:48:17 +04:00
|
|
|
|
2016-06-24 01:37:24 +03:00
|
|
|
if (!s->buf_align || !bs->bl.request_alignment) {
|
2016-06-24 01:37:25 +03:00
|
|
|
error_setg(errp, "Could not find working O_DIRECT alignment");
|
|
|
|
error_append_hint(errp, "Try cache.direct=off\n");
|
2014-07-16 19:48:17 +04:00
|
|
|
}
|
2011-11-29 15:42:20 +04:00
|
|
|
}
|
|
|
|
|
2012-09-20 23:13:21 +04:00
|
|
|
static void raw_parse_flags(int bdrv_flags, int *open_flags)
|
|
|
|
{
|
|
|
|
assert(open_flags != NULL);
|
|
|
|
|
|
|
|
*open_flags |= O_BINARY;
|
|
|
|
*open_flags &= ~O_ACCMODE;
|
|
|
|
if (bdrv_flags & BDRV_O_RDWR) {
|
|
|
|
*open_flags |= O_RDWR;
|
|
|
|
} else {
|
|
|
|
*open_flags |= O_RDONLY;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Use O_DSYNC for write-through caching, no flags for write-back caching,
|
|
|
|
* and O_DIRECT for no caching. */
|
|
|
|
if ((bdrv_flags & BDRV_O_NOCACHE)) {
|
|
|
|
*open_flags |= O_DIRECT;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-03-06 01:41:37 +04:00
|
|
|
static void raw_parse_filename(const char *filename, QDict *options,
|
|
|
|
Error **errp)
|
|
|
|
{
|
2017-05-22 22:52:16 +03:00
|
|
|
bdrv_parse_filename_strip_prefix(filename, "file:", options);
|
2014-03-06 01:41:37 +04:00
|
|
|
}
|
|
|
|
|
2013-04-02 12:47:40 +04:00
|
|
|
static QemuOptsList raw_runtime_opts = {
|
|
|
|
.name = "raw",
|
|
|
|
.head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head),
|
|
|
|
.desc = {
|
|
|
|
{
|
|
|
|
.name = "filename",
|
|
|
|
.type = QEMU_OPT_STRING,
|
|
|
|
.help = "File name of the image",
|
|
|
|
},
|
2016-09-08 16:09:01 +03:00
|
|
|
{
|
|
|
|
.name = "aio",
|
|
|
|
.type = QEMU_OPT_STRING,
|
|
|
|
.help = "host AIO implementation (threads, native)",
|
|
|
|
},
|
2017-05-02 19:35:50 +03:00
|
|
|
{
|
|
|
|
.name = "locking",
|
|
|
|
.type = QEMU_OPT_STRING,
|
|
|
|
.help = "file locking mode (on/off/auto, default: auto)",
|
|
|
|
},
|
scsi, file-posix: add support for persistent reservation management
It is a common requirement for virtual machine to send persistent
reservations, but this currently requires either running QEMU with
CAP_SYS_RAWIO, or using out-of-tree patches that let an unprivileged
QEMU bypass Linux's filter on SG_IO commands.
As an alternative mechanism, the next patches will introduce a
privileged helper to run persistent reservation commands without
expanding QEMU's attack surface unnecessarily.
The helper is invoked through a "pr-manager" QOM object, to which
file-posix.c passes SG_IO requests for PERSISTENT RESERVE OUT and
PERSISTENT RESERVE IN commands. For example:
$ qemu-system-x86_64
-device virtio-scsi \
-object pr-manager-helper,id=helper0,path=/var/run/qemu-pr-helper.sock
-drive if=none,id=hd,driver=raw,file.filename=/dev/sdb,file.pr-manager=helper0
-device scsi-block,drive=hd
or:
$ qemu-system-x86_64
-device virtio-scsi \
-object pr-manager-helper,id=helper0,path=/var/run/qemu-pr-helper.sock
-blockdev node-name=hd,driver=raw,file.driver=host_device,file.filename=/dev/sdb,file.pr-manager=helper0
-device scsi-block,drive=hd
Multiple pr-manager implementations are conceivable and possible, though
only one is implemented right now. For example, a pr-manager could:
- talk directly to the multipath daemon from a privileged QEMU
(i.e. QEMU links to libmpathpersist); this makes reservation work
properly with multipath, but still requires CAP_SYS_RAWIO
- use the Linux IOC_PR_* ioctls (they require CAP_SYS_ADMIN though)
- more interestingly, implement reservations directly in QEMU
through file system locks or a shared database (e.g. sqlite)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2017-08-21 19:58:56 +03:00
|
|
|
{
|
|
|
|
.name = "pr-manager",
|
|
|
|
.type = QEMU_OPT_STRING,
|
|
|
|
.help = "id of persistent reservation manager object (default: none)",
|
|
|
|
},
|
2013-04-02 12:47:40 +04:00
|
|
|
{ /* end of list */ }
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
|
|
|
static int raw_open_common(BlockDriverState *bs, QDict *options,
|
2013-10-11 13:37:01 +04:00
|
|
|
int bdrv_flags, int open_flags, Error **errp)
|
2006-08-01 20:21:11 +04:00
|
|
|
{
|
|
|
|
BDRVRawState *s = bs->opaque;
|
2013-04-02 12:47:40 +04:00
|
|
|
QemuOpts *opts;
|
|
|
|
Error *local_err = NULL;
|
2014-04-11 21:16:36 +04:00
|
|
|
const char *filename = NULL;
|
scsi, file-posix: add support for persistent reservation management
It is a common requirement for virtual machine to send persistent
reservations, but this currently requires either running QEMU with
CAP_SYS_RAWIO, or using out-of-tree patches that let an unprivileged
QEMU bypass Linux's filter on SG_IO commands.
As an alternative mechanism, the next patches will introduce a
privileged helper to run persistent reservation commands without
expanding QEMU's attack surface unnecessarily.
The helper is invoked through a "pr-manager" QOM object, to which
file-posix.c passes SG_IO requests for PERSISTENT RESERVE OUT and
PERSISTENT RESERVE IN commands. For example:
$ qemu-system-x86_64
-device virtio-scsi \
-object pr-manager-helper,id=helper0,path=/var/run/qemu-pr-helper.sock
-drive if=none,id=hd,driver=raw,file.filename=/dev/sdb,file.pr-manager=helper0
-device scsi-block,drive=hd
or:
$ qemu-system-x86_64
-device virtio-scsi \
-object pr-manager-helper,id=helper0,path=/var/run/qemu-pr-helper.sock
-blockdev node-name=hd,driver=raw,file.driver=host_device,file.filename=/dev/sdb,file.pr-manager=helper0
-device scsi-block,drive=hd
Multiple pr-manager implementations are conceivable and possible, though
only one is implemented right now. For example, a pr-manager could:
- talk directly to the multipath daemon from a privileged QEMU
(i.e. QEMU links to libmpathpersist); this makes reservation work
properly with multipath, but still requires CAP_SYS_RAWIO
- use the Linux IOC_PR_* ioctls (they require CAP_SYS_ADMIN though)
- more interestingly, implement reservations directly in QEMU
through file system locks or a shared database (e.g. sqlite)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2017-08-21 19:58:56 +03:00
|
|
|
const char *str;
|
2016-09-08 16:09:01 +03:00
|
|
|
BlockdevAioOptions aio, aio_default;
|
2009-06-15 15:53:26 +04:00
|
|
|
int fd, ret;
|
2013-11-22 16:39:55 +04:00
|
|
|
struct stat st;
|
2017-05-02 19:35:56 +03:00
|
|
|
OnOffAuto locking;
|
2006-08-01 20:21:11 +04:00
|
|
|
|
2014-01-02 06:49:17 +04:00
|
|
|
opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
|
2013-04-02 12:47:40 +04:00
|
|
|
qemu_opts_absorb_qdict(opts, options, &local_err);
|
2014-01-30 18:07:28 +04:00
|
|
|
if (local_err) {
|
2013-10-11 13:37:01 +04:00
|
|
|
error_propagate(errp, local_err);
|
2013-04-02 12:47:40 +04:00
|
|
|
ret = -EINVAL;
|
|
|
|
goto fail;
|
|
|
|
}
|
|
|
|
|
|
|
|
filename = qemu_opt_get(opts, "filename");
|
|
|
|
|
2011-05-24 13:30:29 +04:00
|
|
|
ret = raw_normalize_devicepath(&filename);
|
|
|
|
if (ret != 0) {
|
2013-10-11 13:37:01 +04:00
|
|
|
error_setg_errno(errp, -ret, "Could not normalize device path");
|
2013-04-02 12:47:40 +04:00
|
|
|
goto fail;
|
2011-05-24 13:30:29 +04:00
|
|
|
}
|
|
|
|
|
2016-09-08 16:09:01 +03:00
|
|
|
aio_default = (bdrv_flags & BDRV_O_NATIVE_AIO)
|
|
|
|
? BLOCKDEV_AIO_OPTIONS_NATIVE
|
|
|
|
: BLOCKDEV_AIO_OPTIONS_THREADS;
|
2017-08-24 11:46:10 +03:00
|
|
|
aio = qapi_enum_parse(&BlockdevAioOptions_lookup,
|
|
|
|
qemu_opt_get(opts, "aio"),
|
2017-08-24 11:45:57 +03:00
|
|
|
aio_default, &local_err);
|
2016-09-08 16:09:01 +03:00
|
|
|
if (local_err) {
|
|
|
|
error_propagate(errp, local_err);
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto fail;
|
|
|
|
}
|
|
|
|
s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE);
|
|
|
|
|
2017-08-24 11:46:10 +03:00
|
|
|
locking = qapi_enum_parse(&OnOffAuto_lookup,
|
|
|
|
qemu_opt_get(opts, "locking"),
|
2017-08-24 11:45:57 +03:00
|
|
|
ON_OFF_AUTO_AUTO, &local_err);
|
2017-05-02 19:35:56 +03:00
|
|
|
if (local_err) {
|
|
|
|
error_propagate(errp, local_err);
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto fail;
|
|
|
|
}
|
|
|
|
switch (locking) {
|
|
|
|
case ON_OFF_AUTO_ON:
|
|
|
|
s->use_lock = true;
|
file-posix: Do runtime check for ofd lock API
It is reported that on Windows Subsystem for Linux, ofd operations fail
with -EINVAL. In other words, QEMU binary built with system headers that
exports F_OFD_SETLK doesn't necessarily run in an environment that
actually supports it:
$ qemu-system-aarch64 ... -drive file=test.vhdx,if=none,id=hd0 \
-device virtio-blk-pci,drive=hd0
qemu-system-aarch64: -drive file=test.vhdx,if=none,id=hd0: Failed to unlock byte 100
qemu-system-aarch64: -drive file=test.vhdx,if=none,id=hd0: Failed to unlock byte 100
qemu-system-aarch64: -drive file=test.vhdx,if=none,id=hd0: Failed to lock byte 100
As a matter of fact this is not WSL specific. It can happen when running
a QEMU compiled against a newer glibc on an older kernel, such as in
a containerized environment.
Let's do a runtime check to cope with that.
Reported-by: Andrew Baumann <Andrew.Baumann@microsoft.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2017-08-11 14:44:47 +03:00
|
|
|
if (!qemu_has_ofd_lock()) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"File lock requested but OFD locking syscall is "
|
|
|
|
"unavailable, falling back to POSIX file locks.\n"
|
|
|
|
"Due to the implementation, locks can be lost "
|
|
|
|
"unexpectedly.\n");
|
|
|
|
}
|
2017-05-02 19:35:56 +03:00
|
|
|
break;
|
|
|
|
case ON_OFF_AUTO_OFF:
|
|
|
|
s->use_lock = false;
|
|
|
|
break;
|
|
|
|
case ON_OFF_AUTO_AUTO:
|
file-posix: Do runtime check for ofd lock API
It is reported that on Windows Subsystem for Linux, ofd operations fail
with -EINVAL. In other words, QEMU binary built with system headers that
exports F_OFD_SETLK doesn't necessarily run in an environment that
actually supports it:
$ qemu-system-aarch64 ... -drive file=test.vhdx,if=none,id=hd0 \
-device virtio-blk-pci,drive=hd0
qemu-system-aarch64: -drive file=test.vhdx,if=none,id=hd0: Failed to unlock byte 100
qemu-system-aarch64: -drive file=test.vhdx,if=none,id=hd0: Failed to unlock byte 100
qemu-system-aarch64: -drive file=test.vhdx,if=none,id=hd0: Failed to lock byte 100
As a matter of fact this is not WSL specific. It can happen when running
a QEMU compiled against a newer glibc on an older kernel, such as in
a containerized environment.
Let's do a runtime check to cope with that.
Reported-by: Andrew Baumann <Andrew.Baumann@microsoft.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2017-08-11 14:44:47 +03:00
|
|
|
s->use_lock = qemu_has_ofd_lock();
|
2017-05-02 19:35:56 +03:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
abort();
|
|
|
|
}
|
|
|
|
|
scsi, file-posix: add support for persistent reservation management
It is a common requirement for virtual machine to send persistent
reservations, but this currently requires either running QEMU with
CAP_SYS_RAWIO, or using out-of-tree patches that let an unprivileged
QEMU bypass Linux's filter on SG_IO commands.
As an alternative mechanism, the next patches will introduce a
privileged helper to run persistent reservation commands without
expanding QEMU's attack surface unnecessarily.
The helper is invoked through a "pr-manager" QOM object, to which
file-posix.c passes SG_IO requests for PERSISTENT RESERVE OUT and
PERSISTENT RESERVE IN commands. For example:
$ qemu-system-x86_64
-device virtio-scsi \
-object pr-manager-helper,id=helper0,path=/var/run/qemu-pr-helper.sock
-drive if=none,id=hd,driver=raw,file.filename=/dev/sdb,file.pr-manager=helper0
-device scsi-block,drive=hd
or:
$ qemu-system-x86_64
-device virtio-scsi \
-object pr-manager-helper,id=helper0,path=/var/run/qemu-pr-helper.sock
-blockdev node-name=hd,driver=raw,file.driver=host_device,file.filename=/dev/sdb,file.pr-manager=helper0
-device scsi-block,drive=hd
Multiple pr-manager implementations are conceivable and possible, though
only one is implemented right now. For example, a pr-manager could:
- talk directly to the multipath daemon from a privileged QEMU
(i.e. QEMU links to libmpathpersist); this makes reservation work
properly with multipath, but still requires CAP_SYS_RAWIO
- use the Linux IOC_PR_* ioctls (they require CAP_SYS_ADMIN though)
- more interestingly, implement reservations directly in QEMU
through file system locks or a shared database (e.g. sqlite)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2017-08-21 19:58:56 +03:00
|
|
|
str = qemu_opt_get(opts, "pr-manager");
|
|
|
|
if (str) {
|
|
|
|
s->pr_mgr = pr_manager_lookup(str, &local_err);
|
|
|
|
if (local_err) {
|
|
|
|
error_propagate(errp, local_err);
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto fail;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-09-20 23:13:21 +04:00
|
|
|
s->open_flags = open_flags;
|
|
|
|
raw_parse_flags(bdrv_flags, &s->open_flags);
|
2006-08-01 20:21:11 +04:00
|
|
|
|
2009-06-15 15:53:38 +04:00
|
|
|
s->fd = -1;
|
2009-12-02 14:24:42 +03:00
|
|
|
fd = qemu_open(filename, s->open_flags, 0644);
|
2006-08-19 15:45:59 +04:00
|
|
|
if (fd < 0) {
|
|
|
|
ret = -errno;
|
2016-10-11 17:12:35 +03:00
|
|
|
error_setg_errno(errp, errno, "Could not open '%s'", filename);
|
2013-04-02 12:47:40 +04:00
|
|
|
if (ret == -EROFS) {
|
2006-08-19 15:45:59 +04:00
|
|
|
ret = -EACCES;
|
2013-04-02 12:47:40 +04:00
|
|
|
}
|
|
|
|
goto fail;
|
2006-08-19 15:45:59 +04:00
|
|
|
}
|
2006-08-01 20:21:11 +04:00
|
|
|
s->fd = fd;
|
2009-08-20 18:58:19 +04:00
|
|
|
|
2017-05-02 19:35:56 +03:00
|
|
|
s->lock_fd = -1;
|
|
|
|
if (s->use_lock) {
|
|
|
|
fd = qemu_open(filename, s->open_flags);
|
|
|
|
if (fd < 0) {
|
|
|
|
ret = -errno;
|
|
|
|
error_setg_errno(errp, errno, "Could not open '%s' for locking",
|
|
|
|
filename);
|
|
|
|
qemu_close(s->fd);
|
|
|
|
goto fail;
|
|
|
|
}
|
|
|
|
s->lock_fd = fd;
|
|
|
|
}
|
|
|
|
s->perm = 0;
|
|
|
|
s->shared_perm = BLK_PERM_ALL;
|
|
|
|
|
2009-08-20 18:58:35 +04:00
|
|
|
#ifdef CONFIG_LINUX_AIO
|
2016-09-08 16:09:01 +03:00
|
|
|
/* Currently Linux does AIO only for files opened with O_DIRECT */
|
|
|
|
if (s->use_linux_aio && !(s->open_flags & O_DIRECT)) {
|
2015-12-15 13:35:36 +03:00
|
|
|
error_setg(errp, "aio=native was specified, but it requires "
|
|
|
|
"cache.direct=on, which was not specified.");
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto fail;
|
2015-03-17 15:45:21 +03:00
|
|
|
}
|
2015-07-23 15:48:34 +03:00
|
|
|
#else
|
2016-09-08 16:09:01 +03:00
|
|
|
if (s->use_linux_aio) {
|
2015-12-15 13:35:36 +03:00
|
|
|
error_setg(errp, "aio=native was specified, but is not supported "
|
|
|
|
"in this build.");
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto fail;
|
2015-07-23 15:48:34 +03:00
|
|
|
}
|
|
|
|
#endif /* !defined(CONFIG_LINUX_AIO) */
|
2009-08-20 18:58:19 +04:00
|
|
|
|
2013-11-22 16:39:47 +04:00
|
|
|
s->has_discard = true;
|
2013-11-22 16:39:57 +04:00
|
|
|
s->has_write_zeroes = true;
|
block: Honor BDRV_REQ_FUA during write_zeroes
The block layer has a couple of cases where it can lose
Force Unit Access semantics when writing a large block of
zeroes, such that the request returns before the zeroes
have been guaranteed to land on underlying media.
SCSI does not support FUA during WRITESAME(10/16); FUA is only
supported if it falls back to WRITE(10/16). But where the
underlying device is new enough to not need a fallback, it
means that any upper layer request with FUA semantics was
silently ignoring BDRV_REQ_FUA.
Conversely, NBD has situations where it can support FUA but not
ZERO_WRITE; when that happens, the generic block layer fallback
to bdrv_driver_pwritev() (or the older bdrv_co_writev() in qemu
2.6) was losing the FUA flag.
The problem of losing flags unrelated to ZERO_WRITE has been
latent in bdrv_co_do_write_zeroes() since commit aa7bfbff, but
back then, it did not matter because there was no FUA flag. It
became observable when commit 93f5e6d8 paved the way for flags
that can impact correctness, when we should have been using
bdrv_co_writev_flags() with modified flags. Compare to commit
9eeb6dd, which got flag manipulation right in
bdrv_co_do_zero_pwritev().
Symptoms: I tested with qemu-io with default writethrough cache
(which is supposed to use FUA semantics on every write), and
targetted an NBD client connected to a server that intentionally
did not advertise NBD_FLAG_SEND_FUA. When doing 'write 0 512',
the NBD client sent two operations (NBD_CMD_WRITE then
NBD_CMD_FLUSH) to get the fallback FUA semantics; but when doing
'write -z 0 512', the NBD client sent only NBD_CMD_WRITE.
The fix is do to a cleanup bdrv_co_flush() at the end of the
operation if any step in the middle relied on a BDS that does
not natively support FUA for that step (note that we don't
need to flush after every operation, if the operation is broken
into chunks based on bounce-buffer sizing). Each BDS gains a
new flag .supported_zero_flags, which parallels the use of
.supported_write_flags but only when accessing a zero write
operation (the flags MUST be different, because of SCSI having
different semantics based on WRITE vs. WRITESAME; and also
because BDRV_REQ_MAY_UNMAP only makes sense on zero writes).
Also fix some documentation to describe -ENOTSUP semantics,
particularly since iscsi depends on those semantics.
Down the road, we may want to add a driver where its
.bdrv_co_pwritev() honors all three of BDRV_REQ_FUA,
BDRV_REQ_ZERO_WRITE, and BDRV_REQ_MAY_UNMAP, and advertise
this via bs->supported_write_flags for blocks opened by that
driver; such a driver should NOT supply .bdrv_co_write_zeroes
nor .supported_zero_flags. But none of the drivers touched
in this patch want to do that (the act of writing zeroes is
different enough from normal writes to deserve a second
callback).
Signed-off-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-04 01:39:07 +03:00
|
|
|
bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP;
|
2014-10-21 18:03:03 +04:00
|
|
|
if ((bs->open_flags & BDRV_O_NOCACHE) != 0) {
|
|
|
|
s->needs_alignment = true;
|
|
|
|
}
|
2013-11-22 16:39:55 +04:00
|
|
|
|
|
|
|
if (fstat(s->fd, &st) < 0) {
|
2014-12-02 20:32:53 +03:00
|
|
|
ret = -errno;
|
2013-11-22 16:39:55 +04:00
|
|
|
error_setg_errno(errp, errno, "Could not stat file");
|
|
|
|
goto fail;
|
|
|
|
}
|
|
|
|
if (S_ISREG(st.st_mode)) {
|
|
|
|
s->discard_zeroes = true;
|
2015-01-30 11:42:15 +03:00
|
|
|
s->has_fallocate = true;
|
2013-11-22 16:39:55 +04:00
|
|
|
}
|
2013-11-22 16:39:56 +04:00
|
|
|
if (S_ISBLK(st.st_mode)) {
|
|
|
|
#ifdef BLKDISCARDZEROES
|
|
|
|
unsigned int arg;
|
|
|
|
if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) {
|
|
|
|
s->discard_zeroes = true;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
#ifdef __linux__
|
|
|
|
/* On Linux 3.10, BLKDISCARD leaves stale data in the page cache. Do
|
|
|
|
* not rely on the contents of discarded blocks unless using O_DIRECT.
|
2013-11-22 16:39:57 +04:00
|
|
|
* Same for BLKZEROOUT.
|
2013-11-22 16:39:56 +04:00
|
|
|
*/
|
|
|
|
if (!(bs->open_flags & BDRV_O_NOCACHE)) {
|
|
|
|
s->discard_zeroes = false;
|
2013-11-22 16:39:57 +04:00
|
|
|
s->has_write_zeroes = false;
|
2013-11-22 16:39:56 +04:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
2014-10-21 18:03:03 +04:00
|
|
|
#ifdef __FreeBSD__
|
|
|
|
if (S_ISCHR(st.st_mode)) {
|
|
|
|
/*
|
|
|
|
* The file is a char device (disk), which on FreeBSD isn't behind
|
|
|
|
* a pager, so force all requests to be aligned. This is needed
|
|
|
|
* so QEMU makes sure all IO operations on the device are aligned
|
|
|
|
* to sector size, or else FreeBSD will reject them with EINVAL.
|
|
|
|
*/
|
|
|
|
s->needs_alignment = true;
|
|
|
|
}
|
|
|
|
#endif
|
2013-11-22 16:39:55 +04:00
|
|
|
|
2010-12-17 13:41:15 +03:00
|
|
|
#ifdef CONFIG_XFS
|
|
|
|
if (platform_test_xfs_fd(s->fd)) {
|
2013-11-22 16:39:47 +04:00
|
|
|
s->is_xfs = true;
|
2010-12-17 13:41:15 +03:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2013-04-02 12:47:40 +04:00
|
|
|
ret = 0;
|
|
|
|
fail:
|
2014-04-11 21:16:36 +04:00
|
|
|
if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) {
|
|
|
|
unlink(filename);
|
|
|
|
}
|
2013-04-02 12:47:40 +04:00
|
|
|
qemu_opts_del(opts);
|
|
|
|
return ret;
|
2006-08-01 20:21:11 +04:00
|
|
|
}
|
|
|
|
|
2013-09-05 16:22:29 +04:00
|
|
|
static int raw_open(BlockDriverState *bs, QDict *options, int flags,
|
|
|
|
Error **errp)
|
2009-06-15 15:53:38 +04:00
|
|
|
{
|
|
|
|
BDRVRawState *s = bs->opaque;
|
|
|
|
|
|
|
|
s->type = FTYPE_FILE;
|
2016-06-14 00:57:58 +03:00
|
|
|
return raw_open_common(bs, options, flags, 0, errp);
|
2009-06-15 15:53:38 +04:00
|
|
|
}
|
|
|
|
|
2017-05-02 19:35:56 +03:00
|
|
|
typedef enum {
|
|
|
|
RAW_PL_PREPARE,
|
|
|
|
RAW_PL_COMMIT,
|
|
|
|
RAW_PL_ABORT,
|
|
|
|
} RawPermLockOp;
|
|
|
|
|
|
|
|
#define PERM_FOREACH(i) \
|
|
|
|
for ((i) = 0; (1ULL << (i)) <= BLK_PERM_ALL; i++)
|
|
|
|
|
|
|
|
/* Lock bytes indicated by @perm_lock_bits and @shared_perm_lock_bits in the
|
|
|
|
* file; if @unlock == true, also unlock the unneeded bytes.
|
|
|
|
* @shared_perm_lock_bits is the mask of all permissions that are NOT shared.
|
|
|
|
*/
|
|
|
|
static int raw_apply_lock_bytes(BDRVRawState *s,
|
|
|
|
uint64_t perm_lock_bits,
|
|
|
|
uint64_t shared_perm_lock_bits,
|
|
|
|
bool unlock, Error **errp)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
PERM_FOREACH(i) {
|
|
|
|
int off = RAW_LOCK_PERM_BASE + i;
|
|
|
|
if (perm_lock_bits & (1ULL << i)) {
|
|
|
|
ret = qemu_lock_fd(s->lock_fd, off, 1, false);
|
|
|
|
if (ret) {
|
|
|
|
error_setg(errp, "Failed to lock byte %d", off);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
} else if (unlock) {
|
|
|
|
ret = qemu_unlock_fd(s->lock_fd, off, 1);
|
|
|
|
if (ret) {
|
|
|
|
error_setg(errp, "Failed to unlock byte %d", off);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
PERM_FOREACH(i) {
|
|
|
|
int off = RAW_LOCK_SHARED_BASE + i;
|
|
|
|
if (shared_perm_lock_bits & (1ULL << i)) {
|
|
|
|
ret = qemu_lock_fd(s->lock_fd, off, 1, false);
|
|
|
|
if (ret) {
|
|
|
|
error_setg(errp, "Failed to lock byte %d", off);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
} else if (unlock) {
|
|
|
|
ret = qemu_unlock_fd(s->lock_fd, off, 1);
|
|
|
|
if (ret) {
|
|
|
|
error_setg(errp, "Failed to unlock byte %d", off);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Check "unshared" bytes implied by @perm and ~@shared_perm in the file. */
|
|
|
|
static int raw_check_lock_bytes(BDRVRawState *s,
|
|
|
|
uint64_t perm, uint64_t shared_perm,
|
|
|
|
Error **errp)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
PERM_FOREACH(i) {
|
|
|
|
int off = RAW_LOCK_SHARED_BASE + i;
|
|
|
|
uint64_t p = 1ULL << i;
|
|
|
|
if (perm & p) {
|
|
|
|
ret = qemu_lock_fd_test(s->lock_fd, off, 1, true);
|
|
|
|
if (ret) {
|
|
|
|
char *perm_name = bdrv_perm_names(p);
|
|
|
|
error_setg(errp,
|
|
|
|
"Failed to get \"%s\" lock",
|
|
|
|
perm_name);
|
|
|
|
g_free(perm_name);
|
|
|
|
error_append_hint(errp,
|
|
|
|
"Is another process using the image?\n");
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
PERM_FOREACH(i) {
|
|
|
|
int off = RAW_LOCK_PERM_BASE + i;
|
|
|
|
uint64_t p = 1ULL << i;
|
|
|
|
if (!(shared_perm & p)) {
|
|
|
|
ret = qemu_lock_fd_test(s->lock_fd, off, 1, true);
|
|
|
|
if (ret) {
|
|
|
|
char *perm_name = bdrv_perm_names(p);
|
|
|
|
error_setg(errp,
|
|
|
|
"Failed to get shared \"%s\" lock",
|
|
|
|
perm_name);
|
|
|
|
g_free(perm_name);
|
|
|
|
error_append_hint(errp,
|
|
|
|
"Is another process using the image?\n");
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int raw_handle_perm_lock(BlockDriverState *bs,
|
|
|
|
RawPermLockOp op,
|
|
|
|
uint64_t new_perm, uint64_t new_shared,
|
|
|
|
Error **errp)
|
|
|
|
{
|
|
|
|
BDRVRawState *s = bs->opaque;
|
|
|
|
int ret = 0;
|
|
|
|
Error *local_err = NULL;
|
|
|
|
|
|
|
|
if (!s->use_lock) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (bdrv_get_flags(bs) & BDRV_O_INACTIVE) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
assert(s->lock_fd > 0);
|
|
|
|
|
|
|
|
switch (op) {
|
|
|
|
case RAW_PL_PREPARE:
|
|
|
|
ret = raw_apply_lock_bytes(s, s->perm | new_perm,
|
|
|
|
~s->shared_perm | ~new_shared,
|
|
|
|
false, errp);
|
|
|
|
if (!ret) {
|
|
|
|
ret = raw_check_lock_bytes(s, new_perm, new_shared, errp);
|
|
|
|
if (!ret) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
op = RAW_PL_ABORT;
|
|
|
|
/* fall through to unlock bytes. */
|
|
|
|
case RAW_PL_ABORT:
|
|
|
|
raw_apply_lock_bytes(s, s->perm, ~s->shared_perm, true, &local_err);
|
|
|
|
if (local_err) {
|
|
|
|
/* Theoretically the above call only unlocks bytes and it cannot
|
|
|
|
* fail. Something weird happened, report it.
|
|
|
|
*/
|
|
|
|
error_report_err(local_err);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case RAW_PL_COMMIT:
|
|
|
|
raw_apply_lock_bytes(s, new_perm, ~new_shared, true, &local_err);
|
|
|
|
if (local_err) {
|
|
|
|
/* Theoretically the above call only unlocks bytes and it cannot
|
|
|
|
* fail. Something weird happened, report it.
|
|
|
|
*/
|
|
|
|
error_report_err(local_err);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-09-20 23:13:25 +04:00
|
|
|
static int raw_reopen_prepare(BDRVReopenState *state,
|
|
|
|
BlockReopenQueue *queue, Error **errp)
|
|
|
|
{
|
|
|
|
BDRVRawState *s;
|
2016-10-27 13:45:17 +03:00
|
|
|
BDRVRawReopenState *rs;
|
2012-09-20 23:13:25 +04:00
|
|
|
int ret = 0;
|
2014-07-16 19:48:17 +04:00
|
|
|
Error *local_err = NULL;
|
2012-09-20 23:13:25 +04:00
|
|
|
|
|
|
|
assert(state != NULL);
|
|
|
|
assert(state->bs != NULL);
|
|
|
|
|
|
|
|
s = state->bs->opaque;
|
|
|
|
|
block: Use g_new() & friends where that makes obvious sense
g_new(T, n) is neater than g_malloc(sizeof(T) * n). It's also safer,
for two reasons. One, it catches multiplication overflowing size_t.
Two, it returns T * rather than void *, which lets the compiler catch
more type errors.
Patch created with Coccinelle, with two manual changes on top:
* Add const to bdrv_iterate_format() to keep the types straight
* Convert the allocation in bdrv_drop_intermediate(), which Coccinelle
inexplicably misses
Coccinelle semantic patch:
@@
type T;
@@
-g_malloc(sizeof(T))
+g_new(T, 1)
@@
type T;
@@
-g_try_malloc(sizeof(T))
+g_try_new(T, 1)
@@
type T;
@@
-g_malloc0(sizeof(T))
+g_new0(T, 1)
@@
type T;
@@
-g_try_malloc0(sizeof(T))
+g_try_new0(T, 1)
@@
type T;
expression n;
@@
-g_malloc(sizeof(T) * (n))
+g_new(T, n)
@@
type T;
expression n;
@@
-g_try_malloc(sizeof(T) * (n))
+g_try_new(T, n)
@@
type T;
expression n;
@@
-g_malloc0(sizeof(T) * (n))
+g_new0(T, n)
@@
type T;
expression n;
@@
-g_try_malloc0(sizeof(T) * (n))
+g_try_new0(T, n)
@@
type T;
expression p, n;
@@
-g_realloc(p, sizeof(T) * (n))
+g_renew(T, p, n)
@@
type T;
expression p, n;
@@
-g_try_realloc(p, sizeof(T) * (n))
+g_try_renew(T, p, n)
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2014-08-19 12:31:08 +04:00
|
|
|
state->opaque = g_new0(BDRVRawReopenState, 1);
|
2016-10-27 13:45:17 +03:00
|
|
|
rs = state->opaque;
|
2012-09-20 23:13:25 +04:00
|
|
|
|
2015-10-19 18:53:07 +03:00
|
|
|
if (s->type == FTYPE_CD) {
|
2016-10-27 13:45:17 +03:00
|
|
|
rs->open_flags |= O_NONBLOCK;
|
2012-11-20 19:21:10 +04:00
|
|
|
}
|
|
|
|
|
2016-10-27 13:45:17 +03:00
|
|
|
raw_parse_flags(state->flags, &rs->open_flags);
|
2012-09-20 23:13:25 +04:00
|
|
|
|
2016-10-27 13:45:17 +03:00
|
|
|
rs->fd = -1;
|
2012-09-20 23:13:25 +04:00
|
|
|
|
2013-01-31 18:40:14 +04:00
|
|
|
int fcntl_flags = O_APPEND | O_NONBLOCK;
|
2012-09-20 23:13:25 +04:00
|
|
|
#ifdef O_NOATIME
|
|
|
|
fcntl_flags |= O_NOATIME;
|
|
|
|
#endif
|
|
|
|
|
2013-01-31 18:40:14 +04:00
|
|
|
#ifdef O_ASYNC
|
|
|
|
/* Not all operating systems have O_ASYNC, and those that don't
|
2016-10-27 13:45:17 +03:00
|
|
|
* will not let us track the state into rs->open_flags (typically
|
2013-01-31 18:40:14 +04:00
|
|
|
* you achieve the same effect with an ioctl, for example I_SETSIG
|
|
|
|
* on Solaris). But we do not use O_ASYNC, so that's fine.
|
|
|
|
*/
|
|
|
|
assert((s->open_flags & O_ASYNC) == 0);
|
|
|
|
#endif
|
|
|
|
|
2016-10-27 13:45:17 +03:00
|
|
|
if ((rs->open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
|
2012-09-20 23:13:25 +04:00
|
|
|
/* dup the original fd */
|
2016-10-27 13:45:17 +03:00
|
|
|
rs->fd = qemu_dup(s->fd);
|
|
|
|
if (rs->fd >= 0) {
|
|
|
|
ret = fcntl_setfl(rs->fd, rs->open_flags);
|
2012-09-20 23:13:25 +04:00
|
|
|
if (ret) {
|
2016-10-27 13:45:17 +03:00
|
|
|
qemu_close(rs->fd);
|
|
|
|
rs->fd = -1;
|
2012-09-20 23:13:25 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
|
2016-10-27 13:45:17 +03:00
|
|
|
if (rs->fd == -1) {
|
2015-08-12 18:33:31 +03:00
|
|
|
const char *normalized_filename = state->bs->filename;
|
|
|
|
ret = raw_normalize_devicepath(&normalized_filename);
|
|
|
|
if (ret < 0) {
|
|
|
|
error_setg_errno(errp, -ret, "Could not normalize device path");
|
|
|
|
} else {
|
2016-10-27 13:45:17 +03:00
|
|
|
assert(!(rs->open_flags & O_CREAT));
|
|
|
|
rs->fd = qemu_open(normalized_filename, rs->open_flags);
|
|
|
|
if (rs->fd == -1) {
|
2015-08-12 18:33:31 +03:00
|
|
|
error_setg_errno(errp, errno, "Could not reopen file");
|
|
|
|
ret = -1;
|
|
|
|
}
|
2012-09-20 23:13:25 +04:00
|
|
|
}
|
|
|
|
}
|
2014-07-16 19:48:17 +04:00
|
|
|
|
|
|
|
/* Fail already reopen_prepare() if we can't get a working O_DIRECT
|
|
|
|
* alignment with the new fd. */
|
2016-10-27 13:45:17 +03:00
|
|
|
if (rs->fd != -1) {
|
|
|
|
raw_probe_alignment(state->bs, rs->fd, &local_err);
|
2014-07-16 19:48:17 +04:00
|
|
|
if (local_err) {
|
2016-10-27 13:45:17 +03:00
|
|
|
qemu_close(rs->fd);
|
|
|
|
rs->fd = -1;
|
2014-07-16 19:48:17 +04:00
|
|
|
error_propagate(errp, local_err);
|
|
|
|
ret = -EINVAL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-09-20 23:13:25 +04:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void raw_reopen_commit(BDRVReopenState *state)
|
|
|
|
{
|
2016-10-27 13:45:17 +03:00
|
|
|
BDRVRawReopenState *rs = state->opaque;
|
2012-09-20 23:13:25 +04:00
|
|
|
BDRVRawState *s = state->bs->opaque;
|
|
|
|
|
2016-10-27 13:45:17 +03:00
|
|
|
s->open_flags = rs->open_flags;
|
2012-09-20 23:13:25 +04:00
|
|
|
|
|
|
|
qemu_close(s->fd);
|
2016-10-27 13:45:17 +03:00
|
|
|
s->fd = rs->fd;
|
2012-09-20 23:13:25 +04:00
|
|
|
|
|
|
|
g_free(state->opaque);
|
|
|
|
state->opaque = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static void raw_reopen_abort(BDRVReopenState *state)
|
|
|
|
{
|
2016-10-27 13:45:17 +03:00
|
|
|
BDRVRawReopenState *rs = state->opaque;
|
2012-09-20 23:13:25 +04:00
|
|
|
|
|
|
|
/* nothing to do if NULL, we didn't get far enough */
|
2016-10-27 13:45:17 +03:00
|
|
|
if (rs == NULL) {
|
2012-09-20 23:13:25 +04:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2016-10-27 13:45:17 +03:00
|
|
|
if (rs->fd >= 0) {
|
|
|
|
qemu_close(rs->fd);
|
|
|
|
rs->fd = -1;
|
2012-09-20 23:13:25 +04:00
|
|
|
}
|
|
|
|
g_free(state->opaque);
|
|
|
|
state->opaque = NULL;
|
|
|
|
}
|
|
|
|
|
2017-01-20 19:25:26 +03:00
|
|
|
static int hdev_get_max_transfer_length(BlockDriverState *bs, int fd)
|
2016-06-03 05:07:02 +03:00
|
|
|
{
|
|
|
|
#ifdef BLKSECTGET
|
2017-01-20 19:25:26 +03:00
|
|
|
int max_bytes = 0;
|
|
|
|
short max_sectors = 0;
|
|
|
|
if (bs->sg && ioctl(fd, BLKSECTGET, &max_bytes) == 0) {
|
|
|
|
return max_bytes;
|
|
|
|
} else if (!bs->sg && ioctl(fd, BLKSECTGET, &max_sectors) == 0) {
|
|
|
|
return max_sectors << BDRV_SECTOR_BITS;
|
2016-06-03 05:07:02 +03:00
|
|
|
} else {
|
|
|
|
return -errno;
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
return -ENOSYS;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
file-posix: Consider max_segments for BlockLimits.max_transfer
BlockLimits.max_transfer can be too high without this fix, guest will
encounter I/O error or even get paused with werror=stop or rerror=stop. The
cause is explained below.
Linux has a separate limit, /sys/block/.../queue/max_segments, which in
the worst case can be more restrictive than the BLKSECTGET which we
already consider (note that they are two different things). So, the
failure scenario before this patch is:
1) host device has max_sectors_kb = 4096 and max_segments = 64;
2) guest learns max_sectors_kb limit from QEMU, but doesn't know
max_segments;
3) guest issues e.g. a 512KB request thinking it's okay, but actually
it's not, because it will be passed through to host device as an
SG_IO req that has niov > 64;
4) host kernel doesn't like the segmenting of the request, and returns
-EINVAL;
This patch checks the max_segments sysfs entry for the host device and
calculates a "conservative" bytes limit using the page size, which is
then merged into the existing max_transfer limit. Guest will discover
this from the usual virtual block device interfaces. (In the case of
scsi-generic, it will be done in the INQUIRY reply interception in
device model.)
The other possibility is to actually propagate it as a separate limit,
but it's not better. On the one hand, there is a big complication: the
limit is per-LUN in QEMU PoV (because we can attach LUNs from different
host HBAs to the same virtio-scsi bus), but the channel to communicate
it in a per-LUN manner is missing down the stack; on the other hand,
two limits versus one doesn't change much about the valid size of I/O
(because guest has no control over host segmenting).
Also, the idea to fall back to bounce buffering in QEMU, upon -EINVAL,
was explored. Unfortunately there is no neat way to ensure the bounce
buffer is less segmented (in terms of DMA addr) than the guest buffer.
Practically, this bug is not very common. It is only reported on a
Emulex (lpfc), so it's okay to get it fixed in the easier way.
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2017-03-08 15:08:14 +03:00
|
|
|
static int hdev_get_max_segments(const struct stat *st)
|
|
|
|
{
|
|
|
|
#ifdef CONFIG_LINUX
|
|
|
|
char buf[32];
|
|
|
|
const char *end;
|
|
|
|
char *sysfspath;
|
|
|
|
int ret;
|
|
|
|
int fd = -1;
|
|
|
|
long max_segments;
|
|
|
|
|
|
|
|
sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
|
|
|
|
major(st->st_rdev), minor(st->st_rdev));
|
|
|
|
fd = open(sysfspath, O_RDONLY);
|
|
|
|
if (fd == -1) {
|
|
|
|
ret = -errno;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
do {
|
2017-03-14 12:09:22 +03:00
|
|
|
ret = read(fd, buf, sizeof(buf) - 1);
|
file-posix: Consider max_segments for BlockLimits.max_transfer
BlockLimits.max_transfer can be too high without this fix, guest will
encounter I/O error or even get paused with werror=stop or rerror=stop. The
cause is explained below.
Linux has a separate limit, /sys/block/.../queue/max_segments, which in
the worst case can be more restrictive than the BLKSECTGET which we
already consider (note that they are two different things). So, the
failure scenario before this patch is:
1) host device has max_sectors_kb = 4096 and max_segments = 64;
2) guest learns max_sectors_kb limit from QEMU, but doesn't know
max_segments;
3) guest issues e.g. a 512KB request thinking it's okay, but actually
it's not, because it will be passed through to host device as an
SG_IO req that has niov > 64;
4) host kernel doesn't like the segmenting of the request, and returns
-EINVAL;
This patch checks the max_segments sysfs entry for the host device and
calculates a "conservative" bytes limit using the page size, which is
then merged into the existing max_transfer limit. Guest will discover
this from the usual virtual block device interfaces. (In the case of
scsi-generic, it will be done in the INQUIRY reply interception in
device model.)
The other possibility is to actually propagate it as a separate limit,
but it's not better. On the one hand, there is a big complication: the
limit is per-LUN in QEMU PoV (because we can attach LUNs from different
host HBAs to the same virtio-scsi bus), but the channel to communicate
it in a per-LUN manner is missing down the stack; on the other hand,
two limits versus one doesn't change much about the valid size of I/O
(because guest has no control over host segmenting).
Also, the idea to fall back to bounce buffering in QEMU, upon -EINVAL,
was explored. Unfortunately there is no neat way to ensure the bounce
buffer is less segmented (in terms of DMA addr) than the guest buffer.
Practically, this bug is not very common. It is only reported on a
Emulex (lpfc), so it's okay to get it fixed in the easier way.
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2017-03-08 15:08:14 +03:00
|
|
|
} while (ret == -1 && errno == EINTR);
|
|
|
|
if (ret < 0) {
|
|
|
|
ret = -errno;
|
|
|
|
goto out;
|
|
|
|
} else if (ret == 0) {
|
|
|
|
ret = -EIO;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
buf[ret] = 0;
|
|
|
|
/* The file is ended with '\n', pass 'end' to accept that. */
|
|
|
|
ret = qemu_strtol(buf, &end, 10, &max_segments);
|
|
|
|
if (ret == 0 && end && *end == '\n') {
|
|
|
|
ret = max_segments;
|
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
2017-03-14 19:12:05 +03:00
|
|
|
if (fd != -1) {
|
|
|
|
close(fd);
|
|
|
|
}
|
file-posix: Consider max_segments for BlockLimits.max_transfer
BlockLimits.max_transfer can be too high without this fix, guest will
encounter I/O error or even get paused with werror=stop or rerror=stop. The
cause is explained below.
Linux has a separate limit, /sys/block/.../queue/max_segments, which in
the worst case can be more restrictive than the BLKSECTGET which we
already consider (note that they are two different things). So, the
failure scenario before this patch is:
1) host device has max_sectors_kb = 4096 and max_segments = 64;
2) guest learns max_sectors_kb limit from QEMU, but doesn't know
max_segments;
3) guest issues e.g. a 512KB request thinking it's okay, but actually
it's not, because it will be passed through to host device as an
SG_IO req that has niov > 64;
4) host kernel doesn't like the segmenting of the request, and returns
-EINVAL;
This patch checks the max_segments sysfs entry for the host device and
calculates a "conservative" bytes limit using the page size, which is
then merged into the existing max_transfer limit. Guest will discover
this from the usual virtual block device interfaces. (In the case of
scsi-generic, it will be done in the INQUIRY reply interception in
device model.)
The other possibility is to actually propagate it as a separate limit,
but it's not better. On the one hand, there is a big complication: the
limit is per-LUN in QEMU PoV (because we can attach LUNs from different
host HBAs to the same virtio-scsi bus), but the channel to communicate
it in a per-LUN manner is missing down the stack; on the other hand,
two limits versus one doesn't change much about the valid size of I/O
(because guest has no control over host segmenting).
Also, the idea to fall back to bounce buffering in QEMU, upon -EINVAL,
was explored. Unfortunately there is no neat way to ensure the bounce
buffer is less segmented (in terms of DMA addr) than the guest buffer.
Practically, this bug is not very common. It is only reported on a
Emulex (lpfc), so it's okay to get it fixed in the easier way.
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2017-03-08 15:08:14 +03:00
|
|
|
g_free(sysfspath);
|
|
|
|
return ret;
|
|
|
|
#else
|
|
|
|
return -ENOTSUP;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2014-07-16 19:48:16 +04:00
|
|
|
static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
|
2011-11-29 15:42:20 +04:00
|
|
|
{
|
|
|
|
BDRVRawState *s = bs->opaque;
|
2016-06-03 05:07:02 +03:00
|
|
|
struct stat st;
|
|
|
|
|
|
|
|
if (!fstat(s->fd, &st)) {
|
2017-01-20 19:25:27 +03:00
|
|
|
if (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)) {
|
2017-01-20 19:25:26 +03:00
|
|
|
int ret = hdev_get_max_transfer_length(bs, s->fd);
|
|
|
|
if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
|
|
|
|
bs->bl.max_transfer = pow2floor(ret);
|
2016-06-03 05:07:02 +03:00
|
|
|
}
|
file-posix: Consider max_segments for BlockLimits.max_transfer
BlockLimits.max_transfer can be too high without this fix, guest will
encounter I/O error or even get paused with werror=stop or rerror=stop. The
cause is explained below.
Linux has a separate limit, /sys/block/.../queue/max_segments, which in
the worst case can be more restrictive than the BLKSECTGET which we
already consider (note that they are two different things). So, the
failure scenario before this patch is:
1) host device has max_sectors_kb = 4096 and max_segments = 64;
2) guest learns max_sectors_kb limit from QEMU, but doesn't know
max_segments;
3) guest issues e.g. a 512KB request thinking it's okay, but actually
it's not, because it will be passed through to host device as an
SG_IO req that has niov > 64;
4) host kernel doesn't like the segmenting of the request, and returns
-EINVAL;
This patch checks the max_segments sysfs entry for the host device and
calculates a "conservative" bytes limit using the page size, which is
then merged into the existing max_transfer limit. Guest will discover
this from the usual virtual block device interfaces. (In the case of
scsi-generic, it will be done in the INQUIRY reply interception in
device model.)
The other possibility is to actually propagate it as a separate limit,
but it's not better. On the one hand, there is a big complication: the
limit is per-LUN in QEMU PoV (because we can attach LUNs from different
host HBAs to the same virtio-scsi bus), but the channel to communicate
it in a per-LUN manner is missing down the stack; on the other hand,
two limits versus one doesn't change much about the valid size of I/O
(because guest has no control over host segmenting).
Also, the idea to fall back to bounce buffering in QEMU, upon -EINVAL,
was explored. Unfortunately there is no neat way to ensure the bounce
buffer is less segmented (in terms of DMA addr) than the guest buffer.
Practically, this bug is not very common. It is only reported on a
Emulex (lpfc), so it's okay to get it fixed in the easier way.
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2017-03-08 15:08:14 +03:00
|
|
|
ret = hdev_get_max_segments(&st);
|
|
|
|
if (ret > 0) {
|
|
|
|
bs->bl.max_transfer = MIN(bs->bl.max_transfer,
|
|
|
|
ret * getpagesize());
|
|
|
|
}
|
2016-06-03 05:07:02 +03:00
|
|
|
}
|
|
|
|
}
|
2012-09-20 23:13:25 +04:00
|
|
|
|
2014-07-16 19:48:17 +04:00
|
|
|
raw_probe_alignment(bs, s->fd, errp);
|
2015-05-12 17:30:55 +03:00
|
|
|
bs->bl.min_mem_alignment = s->buf_align;
|
block: align bounce buffers to page
The following sequence
int fd = open(argv[1], O_RDWR | O_CREAT | O_DIRECT, 0644);
for (i = 0; i < 100000; i++)
write(fd, buf, 4096);
performs 5% better if buf is aligned to 4096 bytes.
The difference is quite reliable.
On the other hand we do not want at the moment to enforce bounce
buffering if guest request is aligned to 512 bytes.
The patch changes default bounce buffer optimal alignment to
MAX(page size, 4k). 4k is chosen as maximal known sector size on real
HDD.
The justification of the performance improve is quite interesting.
From the kernel point of view each request to the disk was split
by two. This could be seen by blktrace like this:
9,0 11 1 0.000000000 11151 Q WS 312737792 + 1023 [qemu-img]
9,0 11 2 0.000007938 11151 Q WS 312738815 + 8 [qemu-img]
9,0 11 3 0.000030735 11151 Q WS 312738823 + 1016 [qemu-img]
9,0 11 4 0.000032482 11151 Q WS 312739839 + 8 [qemu-img]
9,0 11 5 0.000041379 11151 Q WS 312739847 + 1016 [qemu-img]
9,0 11 6 0.000042818 11151 Q WS 312740863 + 8 [qemu-img]
9,0 11 7 0.000051236 11151 Q WS 312740871 + 1017 [qemu-img]
9,0 5 1 0.169071519 11151 Q WS 312741888 + 1023 [qemu-img]
After the patch the pattern becomes normal:
9,0 6 1 0.000000000 12422 Q WS 314834944 + 1024 [qemu-img]
9,0 6 2 0.000038527 12422 Q WS 314835968 + 1024 [qemu-img]
9,0 6 3 0.000072849 12422 Q WS 314836992 + 1024 [qemu-img]
9,0 6 4 0.000106276 12422 Q WS 314838016 + 1024 [qemu-img]
and the amount of requests sent to disk (could be calculated counting
number of lines in the output of blktrace) is reduced about 2 times.
Both qemu-img and qemu-io are affected while qemu-kvm is not. The guest
does his job well and real requests comes properly aligned (to page).
Signed-off-by: Denis V. Lunev <den@openvz.org>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Message-id: 1431441056-26198-3-git-send-email-den@openvz.org
CC: Paolo Bonzini <pbonzini@redhat.com>
CC: Kevin Wolf <kwolf@redhat.com>
CC: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2015-05-12 17:30:56 +03:00
|
|
|
bs->bl.opt_mem_alignment = MAX(s->buf_align, getpagesize());
|
2011-11-29 15:42:20 +04:00
|
|
|
}
|
2006-08-01 20:21:11 +04:00
|
|
|
|
2015-02-16 14:47:56 +03:00
|
|
|
static int check_for_dasd(int fd)
|
|
|
|
{
|
|
|
|
#ifdef BIODASDINFO2
|
|
|
|
struct dasd_information2_t info = {0};
|
|
|
|
|
|
|
|
return ioctl(fd, BIODASDINFO2, &info);
|
|
|
|
#else
|
|
|
|
return -1;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Try to get @bs's logical and physical block size.
|
|
|
|
* On success, store them in @bsz and return zero.
|
|
|
|
* On failure, return negative errno.
|
|
|
|
*/
|
|
|
|
static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
|
|
|
|
{
|
|
|
|
BDRVRawState *s = bs->opaque;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
/* If DASD, get blocksizes */
|
|
|
|
if (check_for_dasd(s->fd) < 0) {
|
|
|
|
return -ENOTSUP;
|
|
|
|
}
|
|
|
|
ret = probe_logical_blocksize(s->fd, &bsz->log);
|
|
|
|
if (ret < 0) {
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
return probe_physical_blocksize(s->fd, &bsz->phys);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Try to get @bs's geometry: cyls, heads, sectors.
|
|
|
|
* On success, store them in @geo and return 0.
|
|
|
|
* On failure return -errno.
|
|
|
|
* (Allows block driver to assign default geometry values that guest sees)
|
|
|
|
*/
|
|
|
|
#ifdef __linux__
|
|
|
|
static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
|
|
|
|
{
|
|
|
|
BDRVRawState *s = bs->opaque;
|
|
|
|
struct hd_geometry ioctl_geo = {0};
|
|
|
|
|
|
|
|
/* If DASD, get its geometry */
|
|
|
|
if (check_for_dasd(s->fd) < 0) {
|
|
|
|
return -ENOTSUP;
|
|
|
|
}
|
|
|
|
if (ioctl(s->fd, HDIO_GETGEO, &ioctl_geo) < 0) {
|
|
|
|
return -errno;
|
|
|
|
}
|
|
|
|
/* HDIO_GETGEO may return success even though geo contains zeros
|
|
|
|
(e.g. certain multipath setups) */
|
|
|
|
if (!ioctl_geo.heads || !ioctl_geo.sectors || !ioctl_geo.cylinders) {
|
|
|
|
return -ENOTSUP;
|
|
|
|
}
|
|
|
|
/* Do not return a geometry for partition */
|
|
|
|
if (ioctl_geo.start != 0) {
|
|
|
|
return -ENOTSUP;
|
|
|
|
}
|
|
|
|
geo->heads = ioctl_geo.heads;
|
|
|
|
geo->sectors = ioctl_geo.sectors;
|
|
|
|
geo->cylinders = ioctl_geo.cylinders;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#else /* __linux__ */
|
|
|
|
static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
|
|
|
|
{
|
|
|
|
return -ENOTSUP;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2012-05-25 13:46:27 +04:00
|
|
|
static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf);
|
|
|
|
if (ret == -1) {
|
|
|
|
return -errno;
|
|
|
|
}
|
|
|
|
|
2013-01-10 18:28:35 +04:00
|
|
|
return 0;
|
2012-05-25 13:46:27 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t handle_aiocb_flush(RawPosixAIOData *aiocb)
|
|
|
|
{
|
2017-03-23 00:00:05 +03:00
|
|
|
BDRVRawState *s = aiocb->bs->opaque;
|
2012-05-25 13:46:27 +04:00
|
|
|
int ret;
|
|
|
|
|
2017-03-23 00:00:05 +03:00
|
|
|
if (s->page_cache_inconsistent) {
|
|
|
|
return -EIO;
|
|
|
|
}
|
|
|
|
|
2012-05-25 13:46:27 +04:00
|
|
|
ret = qemu_fdatasync(aiocb->aio_fildes);
|
|
|
|
if (ret == -1) {
|
2017-03-23 00:00:05 +03:00
|
|
|
/* There is no clear definition of the semantics of a failing fsync(),
|
|
|
|
* so we may have to assume the worst. The sad truth is that this
|
|
|
|
* assumption is correct for Linux. Some pages are now probably marked
|
|
|
|
* clean in the page cache even though they are inconsistent with the
|
|
|
|
* on-disk contents. The next fdatasync() call would succeed, but no
|
|
|
|
* further writeback attempt will be made. We can't get back to a state
|
|
|
|
* in which we know what is on disk (we would have to rewrite
|
|
|
|
* everything that was touched since the last fdatasync() at least), so
|
|
|
|
* make bdrv_flush() fail permanently. Given that the behaviour isn't
|
|
|
|
* really defined, I have little hope that other OSes are doing better.
|
|
|
|
*
|
|
|
|
* Obviously, this doesn't affect O_DIRECT, which bypasses the page
|
|
|
|
* cache. */
|
|
|
|
if ((s->open_flags & O_DIRECT) == 0) {
|
|
|
|
s->page_cache_inconsistent = true;
|
|
|
|
}
|
2012-05-25 13:46:27 +04:00
|
|
|
return -errno;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_PREADV
|
|
|
|
|
|
|
|
static bool preadv_present = true;
|
|
|
|
|
|
|
|
static ssize_t
|
|
|
|
qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
|
|
|
|
{
|
|
|
|
return preadv(fd, iov, nr_iov, offset);
|
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t
|
|
|
|
qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
|
|
|
|
{
|
|
|
|
return pwritev(fd, iov, nr_iov, offset);
|
|
|
|
}
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
|
|
static bool preadv_present = false;
|
|
|
|
|
|
|
|
static ssize_t
|
|
|
|
qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
|
|
|
|
{
|
|
|
|
return -ENOSYS;
|
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t
|
|
|
|
qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
|
|
|
|
{
|
|
|
|
return -ENOSYS;
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
|
|
|
|
{
|
|
|
|
ssize_t len;
|
|
|
|
|
|
|
|
do {
|
|
|
|
if (aiocb->aio_type & QEMU_AIO_WRITE)
|
|
|
|
len = qemu_pwritev(aiocb->aio_fildes,
|
|
|
|
aiocb->aio_iov,
|
|
|
|
aiocb->aio_niov,
|
|
|
|
aiocb->aio_offset);
|
|
|
|
else
|
|
|
|
len = qemu_preadv(aiocb->aio_fildes,
|
|
|
|
aiocb->aio_iov,
|
|
|
|
aiocb->aio_niov,
|
|
|
|
aiocb->aio_offset);
|
|
|
|
} while (len == -1 && errno == EINTR);
|
|
|
|
|
|
|
|
if (len == -1) {
|
|
|
|
return -errno;
|
|
|
|
}
|
|
|
|
return len;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Read/writes the data to/from a given linear buffer.
|
|
|
|
*
|
|
|
|
* Returns the number of bytes handles or -errno in case of an error. Short
|
|
|
|
* reads are only returned if the end of the file is reached.
|
|
|
|
*/
|
|
|
|
static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
|
|
|
|
{
|
|
|
|
ssize_t offset = 0;
|
|
|
|
ssize_t len;
|
|
|
|
|
|
|
|
while (offset < aiocb->aio_nbytes) {
|
|
|
|
if (aiocb->aio_type & QEMU_AIO_WRITE) {
|
|
|
|
len = pwrite(aiocb->aio_fildes,
|
|
|
|
(const char *)buf + offset,
|
|
|
|
aiocb->aio_nbytes - offset,
|
|
|
|
aiocb->aio_offset + offset);
|
|
|
|
} else {
|
|
|
|
len = pread(aiocb->aio_fildes,
|
|
|
|
buf + offset,
|
|
|
|
aiocb->aio_nbytes - offset,
|
|
|
|
aiocb->aio_offset + offset);
|
|
|
|
}
|
|
|
|
if (len == -1 && errno == EINTR) {
|
|
|
|
continue;
|
2014-08-21 16:44:07 +04:00
|
|
|
} else if (len == -1 && errno == EINVAL &&
|
|
|
|
(aiocb->bs->open_flags & BDRV_O_NOCACHE) &&
|
|
|
|
!(aiocb->aio_type & QEMU_AIO_WRITE) &&
|
|
|
|
offset > 0) {
|
|
|
|
/* O_DIRECT pread() may fail with EINVAL when offset is unaligned
|
|
|
|
* after a short read. Assume that O_DIRECT short reads only occur
|
|
|
|
* at EOF. Therefore this is a short read, not an I/O error.
|
|
|
|
*/
|
|
|
|
break;
|
2012-05-25 13:46:27 +04:00
|
|
|
} else if (len == -1) {
|
|
|
|
offset = -errno;
|
|
|
|
break;
|
|
|
|
} else if (len == 0) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
offset += len;
|
|
|
|
}
|
|
|
|
|
|
|
|
return offset;
|
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb)
|
|
|
|
{
|
|
|
|
ssize_t nbytes;
|
|
|
|
char *buf;
|
|
|
|
|
|
|
|
if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
|
|
|
|
/*
|
|
|
|
* If there is just a single buffer, and it is properly aligned
|
|
|
|
* we can just use plain pread/pwrite without any problems.
|
|
|
|
*/
|
|
|
|
if (aiocb->aio_niov == 1) {
|
|
|
|
return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base);
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* We have more than one iovec, and all are properly aligned.
|
|
|
|
*
|
|
|
|
* Try preadv/pwritev first and fall back to linearizing the
|
|
|
|
* buffer if it's not supported.
|
|
|
|
*/
|
|
|
|
if (preadv_present) {
|
|
|
|
nbytes = handle_aiocb_rw_vector(aiocb);
|
|
|
|
if (nbytes == aiocb->aio_nbytes ||
|
|
|
|
(nbytes < 0 && nbytes != -ENOSYS)) {
|
|
|
|
return nbytes;
|
|
|
|
}
|
|
|
|
preadv_present = false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* XXX(hch): short read/write. no easy way to handle the reminder
|
|
|
|
* using these interfaces. For now retry using plain
|
|
|
|
* pread/pwrite?
|
|
|
|
*/
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Ok, we have to do it the hard way, copy all segments into
|
|
|
|
* a single aligned buffer.
|
|
|
|
*/
|
2014-05-21 20:02:42 +04:00
|
|
|
buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes);
|
|
|
|
if (buf == NULL) {
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
2012-05-25 13:46:27 +04:00
|
|
|
if (aiocb->aio_type & QEMU_AIO_WRITE) {
|
|
|
|
char *p = buf;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < aiocb->aio_niov; ++i) {
|
|
|
|
memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len);
|
|
|
|
p += aiocb->aio_iov[i].iov_len;
|
|
|
|
}
|
2014-07-01 18:09:54 +04:00
|
|
|
assert(p - buf == aiocb->aio_nbytes);
|
2012-05-25 13:46:27 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
nbytes = handle_aiocb_rw_linear(aiocb, buf);
|
|
|
|
if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
|
|
|
|
char *p = buf;
|
|
|
|
size_t count = aiocb->aio_nbytes, copy;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < aiocb->aio_niov && count; ++i) {
|
|
|
|
copy = count;
|
|
|
|
if (copy > aiocb->aio_iov[i].iov_len) {
|
|
|
|
copy = aiocb->aio_iov[i].iov_len;
|
|
|
|
}
|
|
|
|
memcpy(aiocb->aio_iov[i].iov_base, p, copy);
|
2014-07-01 18:09:54 +04:00
|
|
|
assert(count >= copy);
|
2012-05-25 13:46:27 +04:00
|
|
|
p += copy;
|
|
|
|
count -= copy;
|
|
|
|
}
|
2014-07-01 18:09:54 +04:00
|
|
|
assert(count == 0);
|
2012-05-25 13:46:27 +04:00
|
|
|
}
|
|
|
|
qemu_vfree(buf);
|
|
|
|
|
|
|
|
return nbytes;
|
|
|
|
}
|
|
|
|
|
2013-01-14 19:26:55 +04:00
|
|
|
#ifdef CONFIG_XFS
|
2013-11-22 16:39:57 +04:00
|
|
|
static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes)
|
|
|
|
{
|
|
|
|
struct xfs_flock64 fl;
|
2015-06-23 13:44:58 +03:00
|
|
|
int err;
|
2013-11-22 16:39:57 +04:00
|
|
|
|
|
|
|
memset(&fl, 0, sizeof(fl));
|
|
|
|
fl.l_whence = SEEK_SET;
|
|
|
|
fl.l_start = offset;
|
|
|
|
fl.l_len = bytes;
|
|
|
|
|
|
|
|
if (xfsctl(NULL, s->fd, XFS_IOC_ZERO_RANGE, &fl) < 0) {
|
2015-06-23 13:44:58 +03:00
|
|
|
err = errno;
|
|
|
|
DPRINTF("cannot write zero range (%s)\n", strerror(errno));
|
|
|
|
return -err;
|
2013-11-22 16:39:57 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-01-14 19:26:55 +04:00
|
|
|
static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes)
|
|
|
|
{
|
|
|
|
struct xfs_flock64 fl;
|
2015-06-23 13:44:58 +03:00
|
|
|
int err;
|
2013-01-14 19:26:55 +04:00
|
|
|
|
|
|
|
memset(&fl, 0, sizeof(fl));
|
|
|
|
fl.l_whence = SEEK_SET;
|
|
|
|
fl.l_start = offset;
|
|
|
|
fl.l_len = bytes;
|
|
|
|
|
|
|
|
if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) {
|
2015-06-23 13:44:58 +03:00
|
|
|
err = errno;
|
|
|
|
DPRINTF("cannot punch hole (%s)\n", strerror(errno));
|
|
|
|
return -err;
|
2013-01-14 19:26:55 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2015-01-30 11:42:11 +03:00
|
|
|
static int translate_err(int err)
|
|
|
|
{
|
|
|
|
if (err == -ENODEV || err == -ENOSYS || err == -EOPNOTSUPP ||
|
|
|
|
err == -ENOTTY) {
|
|
|
|
err = -ENOTSUP;
|
|
|
|
}
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2015-01-30 11:42:15 +03:00
|
|
|
#ifdef CONFIG_FALLOCATE
|
2015-01-30 11:42:12 +03:00
|
|
|
static int do_fallocate(int fd, int mode, off_t offset, off_t len)
|
|
|
|
{
|
|
|
|
do {
|
|
|
|
if (fallocate(fd, mode, offset, len) == 0) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
} while (errno == EINTR);
|
|
|
|
return translate_err(-errno);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2015-01-30 11:42:13 +03:00
|
|
|
static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb)
|
2013-11-22 16:39:57 +04:00
|
|
|
{
|
2015-01-30 11:42:13 +03:00
|
|
|
int ret = -ENOTSUP;
|
2013-11-22 16:39:57 +04:00
|
|
|
BDRVRawState *s = aiocb->bs->opaque;
|
|
|
|
|
2015-01-30 11:42:13 +03:00
|
|
|
if (!s->has_write_zeroes) {
|
2013-11-22 16:39:57 +04:00
|
|
|
return -ENOTSUP;
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef BLKZEROOUT
|
2015-01-30 11:42:13 +03:00
|
|
|
do {
|
|
|
|
uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
|
|
|
|
if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
|
|
|
|
return 0;
|
2013-11-22 16:39:57 +04:00
|
|
|
}
|
2015-01-30 11:42:13 +03:00
|
|
|
} while (errno == EINTR);
|
|
|
|
|
|
|
|
ret = translate_err(-errno);
|
2013-11-22 16:39:57 +04:00
|
|
|
#endif
|
|
|
|
|
2015-01-30 11:42:11 +03:00
|
|
|
if (ret == -ENOTSUP) {
|
2013-11-22 16:39:57 +04:00
|
|
|
s->has_write_zeroes = false;
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2015-01-30 11:42:13 +03:00
|
|
|
static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb)
|
|
|
|
{
|
2015-02-12 08:35:49 +03:00
|
|
|
#if defined(CONFIG_FALLOCATE) || defined(CONFIG_XFS)
|
2015-01-30 11:42:13 +03:00
|
|
|
BDRVRawState *s = aiocb->bs->opaque;
|
2015-02-12 08:35:49 +03:00
|
|
|
#endif
|
2017-08-04 18:10:11 +03:00
|
|
|
#ifdef CONFIG_FALLOCATE
|
|
|
|
int64_t len;
|
|
|
|
#endif
|
2015-01-30 11:42:13 +03:00
|
|
|
|
|
|
|
if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
|
|
|
|
return handle_aiocb_write_zeroes_block(aiocb);
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_XFS
|
|
|
|
if (s->is_xfs) {
|
|
|
|
return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2015-01-30 11:42:14 +03:00
|
|
|
#ifdef CONFIG_FALLOCATE_ZERO_RANGE
|
|
|
|
if (s->has_write_zeroes) {
|
|
|
|
int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE,
|
|
|
|
aiocb->aio_offset, aiocb->aio_nbytes);
|
|
|
|
if (ret == 0 || ret != -ENOTSUP) {
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
s->has_write_zeroes = false;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2015-01-30 11:42:16 +03:00
|
|
|
#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
|
|
|
|
if (s->has_discard && s->has_fallocate) {
|
|
|
|
int ret = do_fallocate(s->fd,
|
|
|
|
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
|
|
|
|
aiocb->aio_offset, aiocb->aio_nbytes);
|
|
|
|
if (ret == 0) {
|
|
|
|
ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
|
|
|
|
if (ret == 0 || ret != -ENOTSUP) {
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
s->has_fallocate = false;
|
|
|
|
} else if (ret != -ENOTSUP) {
|
|
|
|
return ret;
|
|
|
|
} else {
|
|
|
|
s->has_discard = false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2015-01-30 11:42:15 +03:00
|
|
|
#ifdef CONFIG_FALLOCATE
|
2017-08-04 18:10:11 +03:00
|
|
|
/* Last resort: we are trying to extend the file with zeroed data. This
|
|
|
|
* can be done via fallocate(fd, 0) */
|
|
|
|
len = bdrv_getlength(aiocb->bs);
|
|
|
|
if (s->has_fallocate && len >= 0 && aiocb->aio_offset >= len) {
|
2015-01-30 11:42:15 +03:00
|
|
|
int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
|
|
|
|
if (ret == 0 || ret != -ENOTSUP) {
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
s->has_fallocate = false;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2015-01-30 11:42:13 +03:00
|
|
|
return -ENOTSUP;
|
|
|
|
}
|
|
|
|
|
2013-01-14 19:26:55 +04:00
|
|
|
static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb)
|
|
|
|
{
|
|
|
|
int ret = -EOPNOTSUPP;
|
|
|
|
BDRVRawState *s = aiocb->bs->opaque;
|
|
|
|
|
2013-11-22 16:39:47 +04:00
|
|
|
if (!s->has_discard) {
|
|
|
|
return -ENOTSUP;
|
2013-01-14 19:26:55 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
|
|
|
|
#ifdef BLKDISCARD
|
|
|
|
do {
|
|
|
|
uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
|
|
|
|
if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
} while (errno == EINTR);
|
|
|
|
|
|
|
|
ret = -errno;
|
|
|
|
#endif
|
|
|
|
} else {
|
|
|
|
#ifdef CONFIG_XFS
|
|
|
|
if (s->is_xfs) {
|
|
|
|
return xfs_discard(s, aiocb->aio_offset, aiocb->aio_nbytes);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
|
2015-01-30 11:42:12 +03:00
|
|
|
ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
|
|
|
|
aiocb->aio_offset, aiocb->aio_nbytes);
|
2013-01-14 19:26:55 +04:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2015-01-30 11:42:11 +03:00
|
|
|
ret = translate_err(ret);
|
|
|
|
if (ret == -ENOTSUP) {
|
2013-11-22 16:39:47 +04:00
|
|
|
s->has_discard = false;
|
2013-01-14 19:26:55 +04:00
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-05-25 13:46:27 +04:00
|
|
|
static int aio_worker(void *arg)
|
|
|
|
{
|
|
|
|
RawPosixAIOData *aiocb = arg;
|
|
|
|
ssize_t ret = 0;
|
|
|
|
|
|
|
|
switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) {
|
|
|
|
case QEMU_AIO_READ:
|
|
|
|
ret = handle_aiocb_rw(aiocb);
|
2015-02-05 21:58:24 +03:00
|
|
|
if (ret >= 0 && ret < aiocb->aio_nbytes) {
|
2012-05-25 13:46:27 +04:00
|
|
|
iov_memset(aiocb->aio_iov, aiocb->aio_niov, ret,
|
|
|
|
0, aiocb->aio_nbytes - ret);
|
|
|
|
|
|
|
|
ret = aiocb->aio_nbytes;
|
|
|
|
}
|
|
|
|
if (ret == aiocb->aio_nbytes) {
|
|
|
|
ret = 0;
|
|
|
|
} else if (ret >= 0 && ret < aiocb->aio_nbytes) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case QEMU_AIO_WRITE:
|
|
|
|
ret = handle_aiocb_rw(aiocb);
|
|
|
|
if (ret == aiocb->aio_nbytes) {
|
|
|
|
ret = 0;
|
|
|
|
} else if (ret >= 0 && ret < aiocb->aio_nbytes) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case QEMU_AIO_FLUSH:
|
|
|
|
ret = handle_aiocb_flush(aiocb);
|
|
|
|
break;
|
|
|
|
case QEMU_AIO_IOCTL:
|
|
|
|
ret = handle_aiocb_ioctl(aiocb);
|
|
|
|
break;
|
2013-01-14 19:26:55 +04:00
|
|
|
case QEMU_AIO_DISCARD:
|
|
|
|
ret = handle_aiocb_discard(aiocb);
|
|
|
|
break;
|
2013-11-22 16:39:57 +04:00
|
|
|
case QEMU_AIO_WRITE_ZEROES:
|
|
|
|
ret = handle_aiocb_write_zeroes(aiocb);
|
|
|
|
break;
|
2012-05-25 13:46:27 +04:00
|
|
|
default:
|
|
|
|
fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
|
|
|
|
ret = -EINVAL;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2015-10-01 14:04:39 +03:00
|
|
|
g_free(aiocb);
|
2012-05-25 13:46:27 +04:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2013-11-22 16:39:55 +04:00
|
|
|
static int paio_submit_co(BlockDriverState *bs, int fd,
|
2016-06-02 00:10:10 +03:00
|
|
|
int64_t offset, QEMUIOVector *qiov,
|
2017-06-09 13:18:08 +03:00
|
|
|
int bytes, int type)
|
2013-11-22 16:39:55 +04:00
|
|
|
{
|
2015-10-01 14:04:39 +03:00
|
|
|
RawPosixAIOData *acb = g_new(RawPosixAIOData, 1);
|
2013-11-22 16:39:55 +04:00
|
|
|
ThreadPool *pool;
|
|
|
|
|
|
|
|
acb->bs = bs;
|
|
|
|
acb->aio_type = type;
|
|
|
|
acb->aio_fildes = fd;
|
|
|
|
|
2017-06-09 13:18:08 +03:00
|
|
|
acb->aio_nbytes = bytes;
|
2016-06-02 00:10:10 +03:00
|
|
|
acb->aio_offset = offset;
|
2014-07-01 18:09:54 +04:00
|
|
|
|
2013-11-22 16:39:55 +04:00
|
|
|
if (qiov) {
|
|
|
|
acb->aio_iov = qiov->iov;
|
|
|
|
acb->aio_niov = qiov->niov;
|
2017-06-09 13:18:08 +03:00
|
|
|
assert(qiov->size == bytes);
|
2013-11-22 16:39:55 +04:00
|
|
|
}
|
|
|
|
|
2017-06-09 13:18:08 +03:00
|
|
|
trace_paio_submit_co(offset, bytes, type);
|
2013-11-22 16:39:55 +04:00
|
|
|
pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
|
|
|
|
return thread_pool_submit_co(pool, aio_worker, acb);
|
|
|
|
}
|
|
|
|
|
2014-10-07 15:59:14 +04:00
|
|
|
static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd,
|
2017-06-09 13:18:08 +03:00
|
|
|
int64_t offset, QEMUIOVector *qiov, int bytes,
|
2014-10-07 15:59:15 +04:00
|
|
|
BlockCompletionFunc *cb, void *opaque, int type)
|
2012-05-25 13:46:27 +04:00
|
|
|
{
|
2015-10-01 14:04:39 +03:00
|
|
|
RawPosixAIOData *acb = g_new(RawPosixAIOData, 1);
|
2013-03-07 16:41:49 +04:00
|
|
|
ThreadPool *pool;
|
2012-05-25 13:46:27 +04:00
|
|
|
|
|
|
|
acb->bs = bs;
|
|
|
|
acb->aio_type = type;
|
|
|
|
acb->aio_fildes = fd;
|
|
|
|
|
2017-06-09 13:18:08 +03:00
|
|
|
acb->aio_nbytes = bytes;
|
2016-07-16 02:22:55 +03:00
|
|
|
acb->aio_offset = offset;
|
2014-07-01 18:09:54 +04:00
|
|
|
|
2012-05-25 13:46:27 +04:00
|
|
|
if (qiov) {
|
|
|
|
acb->aio_iov = qiov->iov;
|
|
|
|
acb->aio_niov = qiov->niov;
|
2014-07-01 18:09:54 +04:00
|
|
|
assert(qiov->size == acb->aio_nbytes);
|
2012-05-25 13:46:27 +04:00
|
|
|
}
|
|
|
|
|
2017-06-09 13:18:08 +03:00
|
|
|
trace_paio_submit(acb, opaque, offset, bytes, type);
|
2013-03-07 16:41:49 +04:00
|
|
|
pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
|
|
|
|
return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
|
2012-05-25 13:46:27 +04:00
|
|
|
}
|
|
|
|
|
2016-06-03 18:36:27 +03:00
|
|
|
static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
|
|
|
|
uint64_t bytes, QEMUIOVector *qiov, int type)
|
2006-08-01 20:21:11 +04:00
|
|
|
{
|
2006-08-07 06:38:06 +04:00
|
|
|
BDRVRawState *s = bs->opaque;
|
|
|
|
|
2006-08-19 15:45:59 +04:00
|
|
|
if (fd_open(bs) < 0)
|
2014-08-06 19:18:07 +04:00
|
|
|
return -EIO;
|
2006-08-19 15:45:59 +04:00
|
|
|
|
2009-04-07 22:43:24 +04:00
|
|
|
/*
|
2014-10-21 18:03:03 +04:00
|
|
|
* Check if the underlying device requires requests to be aligned,
|
|
|
|
* and if the request we are trying to submit is aligned or not.
|
|
|
|
* If this is the case tell the low-level driver that it needs
|
|
|
|
* to copy the buffer.
|
2009-04-07 22:43:24 +04:00
|
|
|
*/
|
2014-10-21 18:03:03 +04:00
|
|
|
if (s->needs_alignment) {
|
2013-01-11 19:41:27 +04:00
|
|
|
if (!bdrv_qiov_is_aligned(bs, qiov)) {
|
2009-08-20 18:58:35 +04:00
|
|
|
type |= QEMU_AIO_MISALIGNED;
|
2009-08-28 16:39:31 +04:00
|
|
|
#ifdef CONFIG_LINUX_AIO
|
2016-09-08 16:09:01 +03:00
|
|
|
} else if (s->use_linux_aio) {
|
2016-07-04 19:33:20 +03:00
|
|
|
LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
|
2016-06-03 18:36:27 +03:00
|
|
|
assert(qiov->size == bytes);
|
2016-07-04 19:33:20 +03:00
|
|
|
return laio_co_submit(bs, aio, s->fd, offset, qiov, type);
|
2009-08-28 16:39:31 +04:00
|
|
|
#endif
|
2009-08-20 18:58:35 +04:00
|
|
|
}
|
2009-08-20 18:58:19 +04:00
|
|
|
}
|
2009-04-07 22:43:24 +04:00
|
|
|
|
2016-06-03 18:36:27 +03:00
|
|
|
return paio_submit_co(bs, s->fd, offset, qiov, bytes, type);
|
2014-08-06 19:18:07 +04:00
|
|
|
}
|
|
|
|
|
2016-06-03 18:36:27 +03:00
|
|
|
static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
|
|
|
|
uint64_t bytes, QEMUIOVector *qiov,
|
|
|
|
int flags)
|
2014-08-06 19:18:07 +04:00
|
|
|
{
|
2016-06-03 18:36:27 +03:00
|
|
|
return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ);
|
2014-08-06 19:18:07 +04:00
|
|
|
}
|
|
|
|
|
2016-06-03 18:36:27 +03:00
|
|
|
static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
|
|
|
|
uint64_t bytes, QEMUIOVector *qiov,
|
|
|
|
int flags)
|
2014-08-06 19:18:07 +04:00
|
|
|
{
|
2016-06-03 18:36:27 +03:00
|
|
|
assert(flags == 0);
|
|
|
|
return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
|
2006-08-01 20:21:11 +04:00
|
|
|
}
|
|
|
|
|
2014-07-04 14:04:34 +04:00
|
|
|
static void raw_aio_plug(BlockDriverState *bs)
|
|
|
|
{
|
|
|
|
#ifdef CONFIG_LINUX_AIO
|
2016-09-08 16:09:01 +03:00
|
|
|
BDRVRawState *s = bs->opaque;
|
|
|
|
if (s->use_linux_aio) {
|
2016-07-04 19:33:20 +03:00
|
|
|
LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
|
|
|
|
laio_io_plug(bs, aio);
|
2014-07-04 14:04:34 +04:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
static void raw_aio_unplug(BlockDriverState *bs)
|
|
|
|
{
|
|
|
|
#ifdef CONFIG_LINUX_AIO
|
2016-09-08 16:09:01 +03:00
|
|
|
BDRVRawState *s = bs->opaque;
|
|
|
|
if (s->use_linux_aio) {
|
2016-07-04 19:33:20 +03:00
|
|
|
LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
|
|
|
|
laio_io_unplug(bs, aio);
|
2014-07-04 14:04:34 +04:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2014-10-07 15:59:14 +04:00
|
|
|
static BlockAIOCB *raw_aio_flush(BlockDriverState *bs,
|
2014-10-07 15:59:15 +04:00
|
|
|
BlockCompletionFunc *cb, void *opaque)
|
2009-09-04 21:01:49 +04:00
|
|
|
{
|
|
|
|
BDRVRawState *s = bs->opaque;
|
|
|
|
|
|
|
|
if (fd_open(bs) < 0)
|
|
|
|
return NULL;
|
|
|
|
|
2009-10-26 15:03:08 +03:00
|
|
|
return paio_submit(bs, s->fd, 0, NULL, 0, cb, opaque, QEMU_AIO_FLUSH);
|
2009-09-04 21:01:49 +04:00
|
|
|
}
|
|
|
|
|
2006-08-01 20:21:11 +04:00
|
|
|
static void raw_close(BlockDriverState *bs)
|
|
|
|
{
|
|
|
|
BDRVRawState *s = bs->opaque;
|
2014-05-08 18:34:47 +04:00
|
|
|
|
2006-08-19 15:45:59 +04:00
|
|
|
if (s->fd >= 0) {
|
2012-08-15 00:43:46 +04:00
|
|
|
qemu_close(s->fd);
|
2006-08-19 15:45:59 +04:00
|
|
|
s->fd = -1;
|
|
|
|
}
|
2017-05-02 19:35:56 +03:00
|
|
|
if (s->lock_fd >= 0) {
|
|
|
|
qemu_close(s->lock_fd);
|
|
|
|
s->lock_fd = -1;
|
|
|
|
}
|
2006-08-01 20:21:11 +04:00
|
|
|
}
|
|
|
|
|
2017-06-13 23:20:58 +03:00
|
|
|
/**
|
|
|
|
* Truncates the given regular file @fd to @offset and, when growing, fills the
|
|
|
|
* new space according to @prealloc.
|
|
|
|
*
|
|
|
|
* Returns: 0 on success, -errno on failure.
|
|
|
|
*/
|
2017-06-13 23:20:57 +03:00
|
|
|
static int raw_regular_truncate(int fd, int64_t offset, PreallocMode prealloc,
|
|
|
|
Error **errp)
|
|
|
|
{
|
|
|
|
int result = 0;
|
2017-06-13 23:20:58 +03:00
|
|
|
int64_t current_length = 0;
|
|
|
|
char *buf = NULL;
|
|
|
|
struct stat st;
|
|
|
|
|
|
|
|
if (fstat(fd, &st) < 0) {
|
|
|
|
result = -errno;
|
|
|
|
error_setg_errno(errp, -result, "Could not stat file");
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
current_length = st.st_size;
|
|
|
|
if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
|
|
|
|
error_setg(errp, "Cannot use preallocation for shrinking files");
|
|
|
|
return -ENOTSUP;
|
|
|
|
}
|
2017-06-13 23:20:57 +03:00
|
|
|
|
|
|
|
switch (prealloc) {
|
|
|
|
#ifdef CONFIG_POSIX_FALLOCATE
|
|
|
|
case PREALLOC_MODE_FALLOC:
|
|
|
|
/*
|
|
|
|
* Truncating before posix_fallocate() makes it about twice slower on
|
|
|
|
* file systems that do not support fallocate(), trying to check if a
|
|
|
|
* block is allocated before allocating it, so don't do that here.
|
|
|
|
*/
|
2017-06-13 23:20:58 +03:00
|
|
|
result = -posix_fallocate(fd, current_length, offset - current_length);
|
2017-06-13 23:20:57 +03:00
|
|
|
if (result != 0) {
|
|
|
|
/* posix_fallocate() doesn't set errno. */
|
|
|
|
error_setg_errno(errp, -result,
|
2017-06-13 23:20:58 +03:00
|
|
|
"Could not preallocate new data");
|
2017-06-13 23:20:57 +03:00
|
|
|
}
|
2017-06-13 23:20:58 +03:00
|
|
|
goto out;
|
2017-06-13 23:20:57 +03:00
|
|
|
#endif
|
|
|
|
case PREALLOC_MODE_FULL:
|
|
|
|
{
|
2017-06-13 23:20:58 +03:00
|
|
|
int64_t num = 0, left = offset - current_length;
|
2017-06-13 23:20:57 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Knowing the final size from the beginning could allow the file
|
|
|
|
* system driver to do less allocations and possibly avoid
|
|
|
|
* fragmentation of the file.
|
|
|
|
*/
|
|
|
|
if (ftruncate(fd, offset) != 0) {
|
|
|
|
result = -errno;
|
|
|
|
error_setg_errno(errp, -result, "Could not resize file");
|
2017-06-13 23:20:58 +03:00
|
|
|
goto out;
|
2017-06-13 23:20:57 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
buf = g_malloc0(65536);
|
|
|
|
|
2017-06-13 23:20:58 +03:00
|
|
|
result = lseek(fd, current_length, SEEK_SET);
|
|
|
|
if (result < 0) {
|
|
|
|
result = -errno;
|
|
|
|
error_setg_errno(errp, -result,
|
|
|
|
"Failed to seek to the old end of file");
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2017-06-13 23:20:57 +03:00
|
|
|
while (left > 0) {
|
|
|
|
num = MIN(left, 65536);
|
|
|
|
result = write(fd, buf, num);
|
|
|
|
if (result < 0) {
|
|
|
|
result = -errno;
|
|
|
|
error_setg_errno(errp, -result,
|
2017-06-13 23:20:58 +03:00
|
|
|
"Could not write zeros for preallocation");
|
|
|
|
goto out;
|
2017-06-13 23:20:57 +03:00
|
|
|
}
|
|
|
|
left -= result;
|
|
|
|
}
|
|
|
|
if (result >= 0) {
|
|
|
|
result = fsync(fd);
|
|
|
|
if (result < 0) {
|
|
|
|
result = -errno;
|
|
|
|
error_setg_errno(errp, -result,
|
2017-06-13 23:20:58 +03:00
|
|
|
"Could not flush file to disk");
|
|
|
|
goto out;
|
2017-06-13 23:20:57 +03:00
|
|
|
}
|
|
|
|
}
|
2017-06-13 23:20:58 +03:00
|
|
|
goto out;
|
2017-06-13 23:20:57 +03:00
|
|
|
}
|
|
|
|
case PREALLOC_MODE_OFF:
|
|
|
|
if (ftruncate(fd, offset) != 0) {
|
|
|
|
result = -errno;
|
|
|
|
error_setg_errno(errp, -result, "Could not resize file");
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
default:
|
|
|
|
result = -ENOTSUP;
|
|
|
|
error_setg(errp, "Unsupported preallocation mode: %s",
|
2017-08-24 11:46:08 +03:00
|
|
|
PreallocMode_str(prealloc));
|
2017-06-13 23:20:57 +03:00
|
|
|
return result;
|
|
|
|
}
|
2017-06-13 23:20:58 +03:00
|
|
|
|
|
|
|
out:
|
|
|
|
if (result < 0) {
|
|
|
|
if (ftruncate(fd, current_length) < 0) {
|
|
|
|
error_report("Failed to restore old file length: %s",
|
|
|
|
strerror(errno));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
g_free(buf);
|
|
|
|
return result;
|
2017-06-13 23:20:57 +03:00
|
|
|
}
|
|
|
|
|
2017-06-13 23:20:52 +03:00
|
|
|
static int raw_truncate(BlockDriverState *bs, int64_t offset,
|
|
|
|
PreallocMode prealloc, Error **errp)
|
2006-08-01 20:21:11 +04:00
|
|
|
{
|
|
|
|
BDRVRawState *s = bs->opaque;
|
2011-09-21 03:10:37 +04:00
|
|
|
struct stat st;
|
2017-03-28 23:51:29 +03:00
|
|
|
int ret;
|
2011-09-21 03:10:37 +04:00
|
|
|
|
|
|
|
if (fstat(s->fd, &st)) {
|
2017-03-28 23:51:29 +03:00
|
|
|
ret = -errno;
|
|
|
|
error_setg_errno(errp, -ret, "Failed to fstat() the file");
|
|
|
|
return ret;
|
2011-09-21 03:10:37 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
if (S_ISREG(st.st_mode)) {
|
2017-06-13 23:20:59 +03:00
|
|
|
return raw_regular_truncate(s->fd, offset, prealloc, errp);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (prealloc != PREALLOC_MODE_OFF) {
|
|
|
|
error_setg(errp, "Preallocation mode '%s' unsupported for this "
|
2017-08-24 11:46:08 +03:00
|
|
|
"non-regular file", PreallocMode_str(prealloc));
|
2017-06-13 23:20:59 +03:00
|
|
|
return -ENOTSUP;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
|
2017-03-28 23:51:29 +03:00
|
|
|
if (offset > raw_getlength(bs)) {
|
|
|
|
error_setg(errp, "Cannot grow device files");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
2011-09-21 03:10:37 +04:00
|
|
|
} else {
|
2017-03-28 23:51:29 +03:00
|
|
|
error_setg(errp, "Resizing this file is not supported");
|
2011-09-21 03:10:37 +04:00
|
|
|
return -ENOTSUP;
|
|
|
|
}
|
|
|
|
|
2006-08-01 20:21:11 +04:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-08-15 22:33:42 +04:00
|
|
|
#ifdef __OpenBSD__
|
|
|
|
static int64_t raw_getlength(BlockDriverState *bs)
|
|
|
|
{
|
|
|
|
BDRVRawState *s = bs->opaque;
|
|
|
|
int fd = s->fd;
|
|
|
|
struct stat st;
|
|
|
|
|
|
|
|
if (fstat(fd, &st))
|
2014-06-26 15:23:16 +04:00
|
|
|
return -errno;
|
2008-08-15 22:33:42 +04:00
|
|
|
if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
|
|
|
|
struct disklabel dl;
|
|
|
|
|
|
|
|
if (ioctl(fd, DIOCGDINFO, &dl))
|
2014-06-26 15:23:16 +04:00
|
|
|
return -errno;
|
2008-08-15 22:33:42 +04:00
|
|
|
return (uint64_t)dl.d_secsize *
|
|
|
|
dl.d_partitions[DISKPART(st.st_rdev)].p_size;
|
|
|
|
} else
|
|
|
|
return st.st_size;
|
|
|
|
}
|
2011-05-23 16:31:17 +04:00
|
|
|
#elif defined(__NetBSD__)
|
|
|
|
static int64_t raw_getlength(BlockDriverState *bs)
|
|
|
|
{
|
|
|
|
BDRVRawState *s = bs->opaque;
|
|
|
|
int fd = s->fd;
|
|
|
|
struct stat st;
|
|
|
|
|
|
|
|
if (fstat(fd, &st))
|
2014-06-26 15:23:16 +04:00
|
|
|
return -errno;
|
2011-05-23 16:31:17 +04:00
|
|
|
if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
|
|
|
|
struct dkwedge_info dkw;
|
|
|
|
|
|
|
|
if (ioctl(fd, DIOCGWEDGEINFO, &dkw) != -1) {
|
|
|
|
return dkw.dkw_size * 512;
|
|
|
|
} else {
|
|
|
|
struct disklabel dl;
|
|
|
|
|
|
|
|
if (ioctl(fd, DIOCGDINFO, &dl))
|
2014-06-26 15:23:16 +04:00
|
|
|
return -errno;
|
2011-05-23 16:31:17 +04:00
|
|
|
return (uint64_t)dl.d_secsize *
|
|
|
|
dl.d_partitions[DISKPART(st.st_rdev)].p_size;
|
|
|
|
}
|
|
|
|
} else
|
|
|
|
return st.st_size;
|
|
|
|
}
|
2010-04-06 21:13:44 +04:00
|
|
|
#elif defined(__sun__)
|
|
|
|
static int64_t raw_getlength(BlockDriverState *bs)
|
|
|
|
{
|
|
|
|
BDRVRawState *s = bs->opaque;
|
|
|
|
struct dk_minfo minfo;
|
|
|
|
int ret;
|
2014-06-26 15:23:16 +04:00
|
|
|
int64_t size;
|
2010-04-06 21:13:44 +04:00
|
|
|
|
|
|
|
ret = fd_open(bs);
|
|
|
|
if (ret < 0) {
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Use the DKIOCGMEDIAINFO ioctl to read the size.
|
|
|
|
*/
|
|
|
|
ret = ioctl(s->fd, DKIOCGMEDIAINFO, &minfo);
|
|
|
|
if (ret != -1) {
|
|
|
|
return minfo.dki_lbsize * minfo.dki_capacity;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* There are reports that lseek on some devices fails, but
|
|
|
|
* irc discussion said that contingency on contingency was overkill.
|
|
|
|
*/
|
2014-06-26 15:23:16 +04:00
|
|
|
size = lseek(s->fd, 0, SEEK_END);
|
|
|
|
if (size < 0) {
|
|
|
|
return -errno;
|
|
|
|
}
|
|
|
|
return size;
|
2010-04-06 21:13:44 +04:00
|
|
|
}
|
|
|
|
#elif defined(CONFIG_BSD)
|
|
|
|
static int64_t raw_getlength(BlockDriverState *bs)
|
2006-08-01 20:21:11 +04:00
|
|
|
{
|
|
|
|
BDRVRawState *s = bs->opaque;
|
|
|
|
int fd = s->fd;
|
|
|
|
int64_t size;
|
|
|
|
struct stat sb;
|
2009-11-29 20:00:41 +03:00
|
|
|
#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
|
2009-03-28 11:37:13 +03:00
|
|
|
int reopened = 0;
|
2006-08-01 20:21:11 +04:00
|
|
|
#endif
|
2006-08-19 15:45:59 +04:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = fd_open(bs);
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
2006-08-01 20:21:11 +04:00
|
|
|
|
2009-11-29 20:00:41 +03:00
|
|
|
#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
|
2009-03-28 11:37:13 +03:00
|
|
|
again:
|
|
|
|
#endif
|
2006-08-01 20:21:11 +04:00
|
|
|
if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) {
|
|
|
|
#ifdef DIOCGMEDIASIZE
|
|
|
|
if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size))
|
2009-03-07 23:06:23 +03:00
|
|
|
#elif defined(DIOCGPART)
|
|
|
|
{
|
|
|
|
struct partinfo pi;
|
|
|
|
if (ioctl(fd, DIOCGPART, &pi) == 0)
|
|
|
|
size = pi.media_size;
|
|
|
|
else
|
|
|
|
size = 0;
|
|
|
|
}
|
|
|
|
if (size == 0)
|
2006-08-01 20:21:11 +04:00
|
|
|
#endif
|
2011-11-10 22:40:06 +04:00
|
|
|
#if defined(__APPLE__) && defined(__MACH__)
|
2015-01-20 01:12:55 +03:00
|
|
|
{
|
|
|
|
uint64_t sectors = 0;
|
|
|
|
uint32_t sector_size = 0;
|
|
|
|
|
|
|
|
if (ioctl(fd, DKIOCGETBLOCKCOUNT, §ors) == 0
|
|
|
|
&& ioctl(fd, DKIOCGETBLOCKSIZE, §or_size) == 0) {
|
|
|
|
size = sectors * sector_size;
|
|
|
|
} else {
|
|
|
|
size = lseek(fd, 0LL, SEEK_END);
|
|
|
|
if (size < 0) {
|
|
|
|
return -errno;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2006-08-01 20:21:11 +04:00
|
|
|
#else
|
|
|
|
size = lseek(fd, 0LL, SEEK_END);
|
2014-06-26 15:23:16 +04:00
|
|
|
if (size < 0) {
|
|
|
|
return -errno;
|
|
|
|
}
|
2009-03-28 11:37:13 +03:00
|
|
|
#endif
|
2009-11-29 20:00:41 +03:00
|
|
|
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
|
2009-03-28 11:37:13 +03:00
|
|
|
switch(s->type) {
|
|
|
|
case FTYPE_CD:
|
|
|
|
/* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */
|
|
|
|
if (size == 2048LL * (unsigned)-1)
|
|
|
|
size = 0;
|
|
|
|
/* XXX no disc? maybe we need to reopen... */
|
2009-06-15 15:55:19 +04:00
|
|
|
if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) {
|
2009-03-28 11:37:13 +03:00
|
|
|
reopened = 1;
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
}
|
2006-08-01 20:21:11 +04:00
|
|
|
#endif
|
2010-04-06 21:13:44 +04:00
|
|
|
} else {
|
2006-08-01 20:21:11 +04:00
|
|
|
size = lseek(fd, 0, SEEK_END);
|
2014-06-26 15:23:16 +04:00
|
|
|
if (size < 0) {
|
|
|
|
return -errno;
|
|
|
|
}
|
2006-08-01 20:21:11 +04:00
|
|
|
}
|
|
|
|
return size;
|
|
|
|
}
|
2010-04-06 21:13:44 +04:00
|
|
|
#else
|
|
|
|
static int64_t raw_getlength(BlockDriverState *bs)
|
|
|
|
{
|
|
|
|
BDRVRawState *s = bs->opaque;
|
|
|
|
int ret;
|
2014-06-26 15:23:16 +04:00
|
|
|
int64_t size;
|
2010-04-06 21:13:44 +04:00
|
|
|
|
|
|
|
ret = fd_open(bs);
|
|
|
|
if (ret < 0) {
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2014-06-26 15:23:16 +04:00
|
|
|
size = lseek(s->fd, 0, SEEK_END);
|
|
|
|
if (size < 0) {
|
|
|
|
return -errno;
|
|
|
|
}
|
|
|
|
return size;
|
2010-04-06 21:13:44 +04:00
|
|
|
}
|
2008-08-15 22:33:42 +04:00
|
|
|
#endif
|
2006-08-01 20:21:11 +04:00
|
|
|
|
2011-07-12 15:56:39 +04:00
|
|
|
static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
|
|
|
|
{
|
|
|
|
struct stat st;
|
|
|
|
BDRVRawState *s = bs->opaque;
|
|
|
|
|
|
|
|
if (fstat(s->fd, &st) < 0) {
|
|
|
|
return -errno;
|
|
|
|
}
|
|
|
|
return (int64_t)st.st_blocks * 512;
|
|
|
|
}
|
|
|
|
|
2014-06-05 13:21:01 +04:00
|
|
|
static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
|
2006-08-01 20:21:11 +04:00
|
|
|
{
|
|
|
|
int fd;
|
2009-07-11 18:43:37 +04:00
|
|
|
int result = 0;
|
2009-05-18 18:42:10 +04:00
|
|
|
int64_t total_size = 0;
|
qemu-img create: add 'nocow' option
Add 'nocow' option so that users could have a chance to set NOCOW flag to
newly created files. It's useful on btrfs file system to enhance performance.
Btrfs has low performance when hosting VM images, even more when the guest
in those VM are also using btrfs as file system. One way to mitigate this bad
performance is to turn off COW attributes on VM files. Generally, there are
two ways to turn off NOCOW on btrfs: a) by mounting fs with nodatacow, then
all newly created files will be NOCOW. b) per file. Add the NOCOW file
attribute. It could only be done to empty or new files.
This patch tries the second way, according to the option, it could add NOCOW
per file.
For most block drivers, since the create file step is in raw-posix.c, so we
can do setting NOCOW flag ioctl in raw-posix.c only.
But there are some exceptions, like block/vpc.c and block/vdi.c, they are
creating file by calling qemu_open directly. For them, do the same setting
NOCOW flag ioctl work in them separately.
[Fixed up 082.out due to the new 'nocow' creation option
--Stefan]
Signed-off-by: Chunyan Liu <cyliu@suse.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-06-30 10:29:58 +04:00
|
|
|
bool nocow = false;
|
2014-09-10 13:05:48 +04:00
|
|
|
PreallocMode prealloc;
|
|
|
|
char *buf = NULL;
|
|
|
|
Error *local_err = NULL;
|
2006-08-01 20:21:11 +04:00
|
|
|
|
2014-03-06 01:41:38 +04:00
|
|
|
strstart(filename, "file:", &filename);
|
|
|
|
|
2009-05-18 18:42:10 +04:00
|
|
|
/* Read out options */
|
2014-09-10 13:05:46 +04:00
|
|
|
total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
|
|
|
|
BDRV_SECTOR_SIZE);
|
qemu-img create: add 'nocow' option
Add 'nocow' option so that users could have a chance to set NOCOW flag to
newly created files. It's useful on btrfs file system to enhance performance.
Btrfs has low performance when hosting VM images, even more when the guest
in those VM are also using btrfs as file system. One way to mitigate this bad
performance is to turn off COW attributes on VM files. Generally, there are
two ways to turn off NOCOW on btrfs: a) by mounting fs with nodatacow, then
all newly created files will be NOCOW. b) per file. Add the NOCOW file
attribute. It could only be done to empty or new files.
This patch tries the second way, according to the option, it could add NOCOW
per file.
For most block drivers, since the create file step is in raw-posix.c, so we
can do setting NOCOW flag ioctl in raw-posix.c only.
But there are some exceptions, like block/vpc.c and block/vdi.c, they are
creating file by calling qemu_open directly. For them, do the same setting
NOCOW flag ioctl work in them separately.
[Fixed up 082.out due to the new 'nocow' creation option
--Stefan]
Signed-off-by: Chunyan Liu <cyliu@suse.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-06-30 10:29:58 +04:00
|
|
|
nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false);
|
2014-09-10 13:05:48 +04:00
|
|
|
buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
|
2017-08-24 11:46:10 +03:00
|
|
|
prealloc = qapi_enum_parse(&PreallocMode_lookup, buf,
|
2017-08-24 11:45:57 +03:00
|
|
|
PREALLOC_MODE_OFF, &local_err);
|
2014-09-10 13:05:48 +04:00
|
|
|
g_free(buf);
|
|
|
|
if (local_err) {
|
|
|
|
error_propagate(errp, local_err);
|
|
|
|
result = -EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
2006-08-01 20:21:11 +04:00
|
|
|
|
2015-09-29 18:54:10 +03:00
|
|
|
fd = qemu_open(filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY,
|
2012-08-15 00:43:45 +04:00
|
|
|
0644);
|
2009-07-11 18:43:37 +04:00
|
|
|
if (fd < 0) {
|
|
|
|
result = -errno;
|
2013-10-11 13:37:01 +04:00
|
|
|
error_setg_errno(errp, -result, "Could not create file");
|
2014-09-10 13:05:48 +04:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (nocow) {
|
qemu-img create: add 'nocow' option
Add 'nocow' option so that users could have a chance to set NOCOW flag to
newly created files. It's useful on btrfs file system to enhance performance.
Btrfs has low performance when hosting VM images, even more when the guest
in those VM are also using btrfs as file system. One way to mitigate this bad
performance is to turn off COW attributes on VM files. Generally, there are
two ways to turn off NOCOW on btrfs: a) by mounting fs with nodatacow, then
all newly created files will be NOCOW. b) per file. Add the NOCOW file
attribute. It could only be done to empty or new files.
This patch tries the second way, according to the option, it could add NOCOW
per file.
For most block drivers, since the create file step is in raw-posix.c, so we
can do setting NOCOW flag ioctl in raw-posix.c only.
But there are some exceptions, like block/vpc.c and block/vdi.c, they are
creating file by calling qemu_open directly. For them, do the same setting
NOCOW flag ioctl work in them separately.
[Fixed up 082.out due to the new 'nocow' creation option
--Stefan]
Signed-off-by: Chunyan Liu <cyliu@suse.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-06-30 10:29:58 +04:00
|
|
|
#ifdef __linux__
|
2014-09-10 13:05:48 +04:00
|
|
|
/* Set NOCOW flag to solve performance issue on fs like btrfs.
|
|
|
|
* This is an optimisation. The FS_IOC_SETFLAGS ioctl return value
|
|
|
|
* will be ignored since any failure of this operation should not
|
|
|
|
* block the left work.
|
|
|
|
*/
|
|
|
|
int attr;
|
|
|
|
if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
|
|
|
|
attr |= FS_NOCOW_FL;
|
|
|
|
ioctl(fd, FS_IOC_SETFLAGS, &attr);
|
qemu-img create: add 'nocow' option
Add 'nocow' option so that users could have a chance to set NOCOW flag to
newly created files. It's useful on btrfs file system to enhance performance.
Btrfs has low performance when hosting VM images, even more when the guest
in those VM are also using btrfs as file system. One way to mitigate this bad
performance is to turn off COW attributes on VM files. Generally, there are
two ways to turn off NOCOW on btrfs: a) by mounting fs with nodatacow, then
all newly created files will be NOCOW. b) per file. Add the NOCOW file
attribute. It could only be done to empty or new files.
This patch tries the second way, according to the option, it could add NOCOW
per file.
For most block drivers, since the create file step is in raw-posix.c, so we
can do setting NOCOW flag ioctl in raw-posix.c only.
But there are some exceptions, like block/vpc.c and block/vdi.c, they are
creating file by calling qemu_open directly. For them, do the same setting
NOCOW flag ioctl work in them separately.
[Fixed up 082.out due to the new 'nocow' creation option
--Stefan]
Signed-off-by: Chunyan Liu <cyliu@suse.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-06-30 10:29:58 +04:00
|
|
|
}
|
2014-09-10 13:05:48 +04:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2017-06-13 23:20:57 +03:00
|
|
|
result = raw_regular_truncate(fd, total_size, prealloc, errp);
|
|
|
|
if (result < 0) {
|
|
|
|
goto out_close;
|
2009-07-11 18:43:37 +04:00
|
|
|
}
|
2014-09-10 13:05:48 +04:00
|
|
|
|
2017-02-17 03:51:26 +03:00
|
|
|
out_close:
|
2014-09-10 13:05:48 +04:00
|
|
|
if (qemu_close(fd) != 0 && result == 0) {
|
|
|
|
result = -errno;
|
|
|
|
error_setg_errno(errp, -result, "Could not close the new file");
|
|
|
|
}
|
|
|
|
out:
|
2009-07-11 18:43:37 +04:00
|
|
|
return result;
|
2006-08-01 20:21:11 +04:00
|
|
|
}
|
|
|
|
|
raw-posix: The SEEK_HOLE code is flawed, rewrite it
On systems where SEEK_HOLE in a trailing hole seeks to EOF (Solaris,
but not Linux), try_seek_hole() reports trailing data instead.
Additionally, unlikely lseek() failures are treated badly:
* When SEEK_HOLE fails, try_seek_hole() reports trailing data. For
-ENXIO, there's in fact a trailing hole. Can happen only when
something truncated the file since we opened it.
* When SEEK_HOLE succeeds, SEEK_DATA fails, and SEEK_END succeeds,
then try_seek_hole() reports a trailing hole. This is okay only
when SEEK_DATA failed with -ENXIO (which means the non-trailing hole
found by SEEK_HOLE has since become trailing somehow). For other
failures (unlikely), it's wrong.
* When SEEK_HOLE succeeds, SEEK_DATA fails, SEEK_END fails (unlikely),
then try_seek_hole() reports bogus data [-1,start), which its caller
raw_co_get_block_status() turns into zero sectors of data. Could
theoretically lead to infinite loops in code that attempts to scan
data vs. hole forward.
Rewrite from scratch, with very careful comments.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
2014-11-17 13:18:34 +03:00
|
|
|
/*
|
|
|
|
* Find allocation range in @bs around offset @start.
|
|
|
|
* May change underlying file descriptor's file offset.
|
|
|
|
* If @start is not in a hole, store @start in @data, and the
|
|
|
|
* beginning of the next hole in @hole, and return 0.
|
|
|
|
* If @start is in a non-trailing hole, store @start in @hole and the
|
|
|
|
* beginning of the next non-hole in @data, and return 0.
|
|
|
|
* If @start is in a trailing hole or beyond EOF, return -ENXIO.
|
|
|
|
* If we can't find out, return a negative errno other than -ENXIO.
|
|
|
|
*/
|
|
|
|
static int find_allocation(BlockDriverState *bs, off_t start,
|
|
|
|
off_t *data, off_t *hole)
|
2014-05-08 22:57:55 +04:00
|
|
|
{
|
|
|
|
#if defined SEEK_HOLE && defined SEEK_DATA
|
2012-06-20 02:02:51 +04:00
|
|
|
BDRVRawState *s = bs->opaque;
|
raw-posix: The SEEK_HOLE code is flawed, rewrite it
On systems where SEEK_HOLE in a trailing hole seeks to EOF (Solaris,
but not Linux), try_seek_hole() reports trailing data instead.
Additionally, unlikely lseek() failures are treated badly:
* When SEEK_HOLE fails, try_seek_hole() reports trailing data. For
-ENXIO, there's in fact a trailing hole. Can happen only when
something truncated the file since we opened it.
* When SEEK_HOLE succeeds, SEEK_DATA fails, and SEEK_END succeeds,
then try_seek_hole() reports a trailing hole. This is okay only
when SEEK_DATA failed with -ENXIO (which means the non-trailing hole
found by SEEK_HOLE has since become trailing somehow). For other
failures (unlikely), it's wrong.
* When SEEK_HOLE succeeds, SEEK_DATA fails, SEEK_END fails (unlikely),
then try_seek_hole() reports bogus data [-1,start), which its caller
raw_co_get_block_status() turns into zero sectors of data. Could
theoretically lead to infinite loops in code that attempts to scan
data vs. hole forward.
Rewrite from scratch, with very careful comments.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
2014-11-17 13:18:34 +03:00
|
|
|
off_t offs;
|
2012-06-20 02:02:51 +04:00
|
|
|
|
raw-posix: The SEEK_HOLE code is flawed, rewrite it
On systems where SEEK_HOLE in a trailing hole seeks to EOF (Solaris,
but not Linux), try_seek_hole() reports trailing data instead.
Additionally, unlikely lseek() failures are treated badly:
* When SEEK_HOLE fails, try_seek_hole() reports trailing data. For
-ENXIO, there's in fact a trailing hole. Can happen only when
something truncated the file since we opened it.
* When SEEK_HOLE succeeds, SEEK_DATA fails, and SEEK_END succeeds,
then try_seek_hole() reports a trailing hole. This is okay only
when SEEK_DATA failed with -ENXIO (which means the non-trailing hole
found by SEEK_HOLE has since become trailing somehow). For other
failures (unlikely), it's wrong.
* When SEEK_HOLE succeeds, SEEK_DATA fails, SEEK_END fails (unlikely),
then try_seek_hole() reports bogus data [-1,start), which its caller
raw_co_get_block_status() turns into zero sectors of data. Could
theoretically lead to infinite loops in code that attempts to scan
data vs. hole forward.
Rewrite from scratch, with very careful comments.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
2014-11-17 13:18:34 +03:00
|
|
|
/*
|
|
|
|
* SEEK_DATA cases:
|
|
|
|
* D1. offs == start: start is in data
|
|
|
|
* D2. offs > start: start is in a hole, next data at offs
|
|
|
|
* D3. offs < 0, errno = ENXIO: either start is in a trailing hole
|
|
|
|
* or start is beyond EOF
|
|
|
|
* If the latter happens, the file has been truncated behind
|
|
|
|
* our back since we opened it. All bets are off then.
|
|
|
|
* Treating like a trailing hole is simplest.
|
|
|
|
* D4. offs < 0, errno != ENXIO: we learned nothing
|
|
|
|
*/
|
|
|
|
offs = lseek(s->fd, start, SEEK_DATA);
|
|
|
|
if (offs < 0) {
|
|
|
|
return -errno; /* D3 or D4 */
|
|
|
|
}
|
|
|
|
assert(offs >= start);
|
|
|
|
|
|
|
|
if (offs > start) {
|
|
|
|
/* D2: in hole, next data at offs */
|
|
|
|
*hole = start;
|
|
|
|
*data = offs;
|
|
|
|
return 0;
|
2012-05-09 18:49:58 +04:00
|
|
|
}
|
|
|
|
|
raw-posix: The SEEK_HOLE code is flawed, rewrite it
On systems where SEEK_HOLE in a trailing hole seeks to EOF (Solaris,
but not Linux), try_seek_hole() reports trailing data instead.
Additionally, unlikely lseek() failures are treated badly:
* When SEEK_HOLE fails, try_seek_hole() reports trailing data. For
-ENXIO, there's in fact a trailing hole. Can happen only when
something truncated the file since we opened it.
* When SEEK_HOLE succeeds, SEEK_DATA fails, and SEEK_END succeeds,
then try_seek_hole() reports a trailing hole. This is okay only
when SEEK_DATA failed with -ENXIO (which means the non-trailing hole
found by SEEK_HOLE has since become trailing somehow). For other
failures (unlikely), it's wrong.
* When SEEK_HOLE succeeds, SEEK_DATA fails, SEEK_END fails (unlikely),
then try_seek_hole() reports bogus data [-1,start), which its caller
raw_co_get_block_status() turns into zero sectors of data. Could
theoretically lead to infinite loops in code that attempts to scan
data vs. hole forward.
Rewrite from scratch, with very careful comments.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
2014-11-17 13:18:34 +03:00
|
|
|
/* D1: in data, end not yet known */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* SEEK_HOLE cases:
|
|
|
|
* H1. offs == start: start is in a hole
|
|
|
|
* If this happens here, a hole has been dug behind our back
|
|
|
|
* since the previous lseek().
|
|
|
|
* H2. offs > start: either start is in data, next hole at offs,
|
|
|
|
* or start is in trailing hole, EOF at offs
|
|
|
|
* Linux treats trailing holes like any other hole: offs ==
|
|
|
|
* start. Solaris seeks to EOF instead: offs > start (blech).
|
|
|
|
* If that happens here, a hole has been dug behind our back
|
|
|
|
* since the previous lseek().
|
|
|
|
* H3. offs < 0, errno = ENXIO: start is beyond EOF
|
|
|
|
* If this happens, the file has been truncated behind our
|
|
|
|
* back since we opened it. Treat it like a trailing hole.
|
|
|
|
* H4. offs < 0, errno != ENXIO: we learned nothing
|
|
|
|
* Pretend we know nothing at all, i.e. "forget" about D1.
|
|
|
|
*/
|
|
|
|
offs = lseek(s->fd, start, SEEK_HOLE);
|
|
|
|
if (offs < 0) {
|
|
|
|
return -errno; /* D1 and (H3 or H4) */
|
|
|
|
}
|
|
|
|
assert(offs >= start);
|
|
|
|
|
|
|
|
if (offs > start) {
|
|
|
|
/*
|
|
|
|
* D1 and H2: either in data, next hole at offs, or it was in
|
|
|
|
* data but is now in a trailing hole. In the latter case,
|
|
|
|
* all bets are off. Treating it as if it there was data all
|
|
|
|
* the way to EOF is safe, so simply do that.
|
|
|
|
*/
|
2014-05-08 22:57:55 +04:00
|
|
|
*data = start;
|
raw-posix: The SEEK_HOLE code is flawed, rewrite it
On systems where SEEK_HOLE in a trailing hole seeks to EOF (Solaris,
but not Linux), try_seek_hole() reports trailing data instead.
Additionally, unlikely lseek() failures are treated badly:
* When SEEK_HOLE fails, try_seek_hole() reports trailing data. For
-ENXIO, there's in fact a trailing hole. Can happen only when
something truncated the file since we opened it.
* When SEEK_HOLE succeeds, SEEK_DATA fails, and SEEK_END succeeds,
then try_seek_hole() reports a trailing hole. This is okay only
when SEEK_DATA failed with -ENXIO (which means the non-trailing hole
found by SEEK_HOLE has since become trailing somehow). For other
failures (unlikely), it's wrong.
* When SEEK_HOLE succeeds, SEEK_DATA fails, SEEK_END fails (unlikely),
then try_seek_hole() reports bogus data [-1,start), which its caller
raw_co_get_block_status() turns into zero sectors of data. Could
theoretically lead to infinite loops in code that attempts to scan
data vs. hole forward.
Rewrite from scratch, with very careful comments.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
2014-11-17 13:18:34 +03:00
|
|
|
*hole = offs;
|
|
|
|
return 0;
|
2012-05-09 18:49:58 +04:00
|
|
|
}
|
2014-05-08 22:57:55 +04:00
|
|
|
|
raw-posix: The SEEK_HOLE code is flawed, rewrite it
On systems where SEEK_HOLE in a trailing hole seeks to EOF (Solaris,
but not Linux), try_seek_hole() reports trailing data instead.
Additionally, unlikely lseek() failures are treated badly:
* When SEEK_HOLE fails, try_seek_hole() reports trailing data. For
-ENXIO, there's in fact a trailing hole. Can happen only when
something truncated the file since we opened it.
* When SEEK_HOLE succeeds, SEEK_DATA fails, and SEEK_END succeeds,
then try_seek_hole() reports a trailing hole. This is okay only
when SEEK_DATA failed with -ENXIO (which means the non-trailing hole
found by SEEK_HOLE has since become trailing somehow). For other
failures (unlikely), it's wrong.
* When SEEK_HOLE succeeds, SEEK_DATA fails, SEEK_END fails (unlikely),
then try_seek_hole() reports bogus data [-1,start), which its caller
raw_co_get_block_status() turns into zero sectors of data. Could
theoretically lead to infinite loops in code that attempts to scan
data vs. hole forward.
Rewrite from scratch, with very careful comments.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
2014-11-17 13:18:34 +03:00
|
|
|
/* D1 and H1 */
|
|
|
|
return -EBUSY;
|
2012-05-09 18:49:58 +04:00
|
|
|
#else
|
2014-05-08 22:57:55 +04:00
|
|
|
return -ENOTSUP;
|
2012-05-09 18:49:58 +04:00
|
|
|
#endif
|
2014-05-08 22:57:55 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2014-11-17 13:18:32 +03:00
|
|
|
* Returns the allocation status of the specified sectors.
|
2014-05-08 22:57:55 +04:00
|
|
|
*
|
|
|
|
* If 'sector_num' is beyond the end of the disk image the return value is 0
|
|
|
|
* and 'pnum' is set to 0.
|
|
|
|
*
|
|
|
|
* 'pnum' is set to the number of sectors (including and immediately following
|
|
|
|
* the specified sector) that are known to be in the same
|
|
|
|
* allocated/unallocated state.
|
|
|
|
*
|
|
|
|
* 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
|
|
|
|
* beyond the end of the disk image it will be clamped.
|
|
|
|
*/
|
|
|
|
static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
|
|
|
|
int64_t sector_num,
|
2016-01-26 06:58:48 +03:00
|
|
|
int nb_sectors, int *pnum,
|
|
|
|
BlockDriverState **file)
|
2014-05-08 22:57:55 +04:00
|
|
|
{
|
|
|
|
off_t start, data = 0, hole = 0;
|
2014-10-24 14:57:58 +04:00
|
|
|
int64_t total_size;
|
2014-10-24 14:57:59 +04:00
|
|
|
int ret;
|
2014-05-08 22:57:55 +04:00
|
|
|
|
|
|
|
ret = fd_open(bs);
|
|
|
|
if (ret < 0) {
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
start = sector_num * BDRV_SECTOR_SIZE;
|
2014-10-24 14:57:58 +04:00
|
|
|
total_size = bdrv_getlength(bs);
|
|
|
|
if (total_size < 0) {
|
|
|
|
return total_size;
|
|
|
|
} else if (start >= total_size) {
|
|
|
|
*pnum = 0;
|
|
|
|
return 0;
|
|
|
|
} else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) {
|
|
|
|
nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE);
|
|
|
|
}
|
2014-05-08 22:57:55 +04:00
|
|
|
|
raw-posix: The SEEK_HOLE code is flawed, rewrite it
On systems where SEEK_HOLE in a trailing hole seeks to EOF (Solaris,
but not Linux), try_seek_hole() reports trailing data instead.
Additionally, unlikely lseek() failures are treated badly:
* When SEEK_HOLE fails, try_seek_hole() reports trailing data. For
-ENXIO, there's in fact a trailing hole. Can happen only when
something truncated the file since we opened it.
* When SEEK_HOLE succeeds, SEEK_DATA fails, and SEEK_END succeeds,
then try_seek_hole() reports a trailing hole. This is okay only
when SEEK_DATA failed with -ENXIO (which means the non-trailing hole
found by SEEK_HOLE has since become trailing somehow). For other
failures (unlikely), it's wrong.
* When SEEK_HOLE succeeds, SEEK_DATA fails, SEEK_END fails (unlikely),
then try_seek_hole() reports bogus data [-1,start), which its caller
raw_co_get_block_status() turns into zero sectors of data. Could
theoretically lead to infinite loops in code that attempts to scan
data vs. hole forward.
Rewrite from scratch, with very careful comments.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
2014-11-17 13:18:34 +03:00
|
|
|
ret = find_allocation(bs, start, &data, &hole);
|
|
|
|
if (ret == -ENXIO) {
|
|
|
|
/* Trailing hole */
|
|
|
|
*pnum = nb_sectors;
|
|
|
|
ret = BDRV_BLOCK_ZERO;
|
|
|
|
} else if (ret < 0) {
|
|
|
|
/* No info available, so pretend there are no holes */
|
|
|
|
*pnum = nb_sectors;
|
|
|
|
ret = BDRV_BLOCK_DATA;
|
|
|
|
} else if (data == start) {
|
2015-06-09 11:55:08 +03:00
|
|
|
/* On a data extent, compute sectors to the end of the extent,
|
|
|
|
* possibly including a partial sector at EOF. */
|
|
|
|
*pnum = MIN(nb_sectors, DIV_ROUND_UP(hole - start, BDRV_SECTOR_SIZE));
|
raw-posix: The SEEK_HOLE code is flawed, rewrite it
On systems where SEEK_HOLE in a trailing hole seeks to EOF (Solaris,
but not Linux), try_seek_hole() reports trailing data instead.
Additionally, unlikely lseek() failures are treated badly:
* When SEEK_HOLE fails, try_seek_hole() reports trailing data. For
-ENXIO, there's in fact a trailing hole. Can happen only when
something truncated the file since we opened it.
* When SEEK_HOLE succeeds, SEEK_DATA fails, and SEEK_END succeeds,
then try_seek_hole() reports a trailing hole. This is okay only
when SEEK_DATA failed with -ENXIO (which means the non-trailing hole
found by SEEK_HOLE has since become trailing somehow). For other
failures (unlikely), it's wrong.
* When SEEK_HOLE succeeds, SEEK_DATA fails, SEEK_END fails (unlikely),
then try_seek_hole() reports bogus data [-1,start), which its caller
raw_co_get_block_status() turns into zero sectors of data. Could
theoretically lead to infinite loops in code that attempts to scan
data vs. hole forward.
Rewrite from scratch, with very careful comments.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
2014-11-17 13:18:34 +03:00
|
|
|
ret = BDRV_BLOCK_DATA;
|
2012-05-09 18:49:58 +04:00
|
|
|
} else {
|
|
|
|
/* On a hole, compute sectors to the beginning of the next extent. */
|
raw-posix: The SEEK_HOLE code is flawed, rewrite it
On systems where SEEK_HOLE in a trailing hole seeks to EOF (Solaris,
but not Linux), try_seek_hole() reports trailing data instead.
Additionally, unlikely lseek() failures are treated badly:
* When SEEK_HOLE fails, try_seek_hole() reports trailing data. For
-ENXIO, there's in fact a trailing hole. Can happen only when
something truncated the file since we opened it.
* When SEEK_HOLE succeeds, SEEK_DATA fails, and SEEK_END succeeds,
then try_seek_hole() reports a trailing hole. This is okay only
when SEEK_DATA failed with -ENXIO (which means the non-trailing hole
found by SEEK_HOLE has since become trailing somehow). For other
failures (unlikely), it's wrong.
* When SEEK_HOLE succeeds, SEEK_DATA fails, SEEK_END fails (unlikely),
then try_seek_hole() reports bogus data [-1,start), which its caller
raw_co_get_block_status() turns into zero sectors of data. Could
theoretically lead to infinite loops in code that attempts to scan
data vs. hole forward.
Rewrite from scratch, with very careful comments.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
2014-11-17 13:18:34 +03:00
|
|
|
assert(hole == start);
|
2012-05-09 18:49:58 +04:00
|
|
|
*pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE);
|
raw-posix: The SEEK_HOLE code is flawed, rewrite it
On systems where SEEK_HOLE in a trailing hole seeks to EOF (Solaris,
but not Linux), try_seek_hole() reports trailing data instead.
Additionally, unlikely lseek() failures are treated badly:
* When SEEK_HOLE fails, try_seek_hole() reports trailing data. For
-ENXIO, there's in fact a trailing hole. Can happen only when
something truncated the file since we opened it.
* When SEEK_HOLE succeeds, SEEK_DATA fails, and SEEK_END succeeds,
then try_seek_hole() reports a trailing hole. This is okay only
when SEEK_DATA failed with -ENXIO (which means the non-trailing hole
found by SEEK_HOLE has since become trailing somehow). For other
failures (unlikely), it's wrong.
* When SEEK_HOLE succeeds, SEEK_DATA fails, SEEK_END fails (unlikely),
then try_seek_hole() reports bogus data [-1,start), which its caller
raw_co_get_block_status() turns into zero sectors of data. Could
theoretically lead to infinite loops in code that attempts to scan
data vs. hole forward.
Rewrite from scratch, with very careful comments.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
2014-11-17 13:18:34 +03:00
|
|
|
ret = BDRV_BLOCK_ZERO;
|
2012-05-09 18:49:58 +04:00
|
|
|
}
|
2016-01-26 06:58:51 +03:00
|
|
|
*file = bs;
|
raw-posix: The SEEK_HOLE code is flawed, rewrite it
On systems where SEEK_HOLE in a trailing hole seeks to EOF (Solaris,
but not Linux), try_seek_hole() reports trailing data instead.
Additionally, unlikely lseek() failures are treated badly:
* When SEEK_HOLE fails, try_seek_hole() reports trailing data. For
-ENXIO, there's in fact a trailing hole. Can happen only when
something truncated the file since we opened it.
* When SEEK_HOLE succeeds, SEEK_DATA fails, and SEEK_END succeeds,
then try_seek_hole() reports a trailing hole. This is okay only
when SEEK_DATA failed with -ENXIO (which means the non-trailing hole
found by SEEK_HOLE has since become trailing somehow). For other
failures (unlikely), it's wrong.
* When SEEK_HOLE succeeds, SEEK_DATA fails, SEEK_END fails (unlikely),
then try_seek_hole() reports bogus data [-1,start), which its caller
raw_co_get_block_status() turns into zero sectors of data. Could
theoretically lead to infinite loops in code that attempts to scan
data vs. hole forward.
Rewrite from scratch, with very careful comments.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
2014-11-17 13:18:34 +03:00
|
|
|
return ret | BDRV_BLOCK_OFFSET_VALID | start;
|
2012-05-09 18:49:58 +04:00
|
|
|
}
|
|
|
|
|
2016-07-16 02:22:57 +03:00
|
|
|
static coroutine_fn BlockAIOCB *raw_aio_pdiscard(BlockDriverState *bs,
|
2017-06-09 13:18:08 +03:00
|
|
|
int64_t offset, int bytes,
|
2014-10-07 15:59:15 +04:00
|
|
|
BlockCompletionFunc *cb, void *opaque)
|
2010-12-17 13:41:15 +03:00
|
|
|
{
|
|
|
|
BDRVRawState *s = bs->opaque;
|
|
|
|
|
2017-06-09 13:18:08 +03:00
|
|
|
return paio_submit(bs, s->fd, offset, NULL, bytes,
|
2013-01-14 19:26:55 +04:00
|
|
|
cb, opaque, QEMU_AIO_DISCARD);
|
2010-12-17 13:41:15 +03:00
|
|
|
}
|
2009-05-18 18:42:10 +04:00
|
|
|
|
2016-06-02 00:10:10 +03:00
|
|
|
static int coroutine_fn raw_co_pwrite_zeroes(
|
|
|
|
BlockDriverState *bs, int64_t offset,
|
2017-06-09 13:18:08 +03:00
|
|
|
int bytes, BdrvRequestFlags flags)
|
2013-11-22 16:39:55 +04:00
|
|
|
{
|
|
|
|
BDRVRawState *s = bs->opaque;
|
|
|
|
|
|
|
|
if (!(flags & BDRV_REQ_MAY_UNMAP)) {
|
2017-06-09 13:18:08 +03:00
|
|
|
return paio_submit_co(bs, s->fd, offset, NULL, bytes,
|
2013-11-22 16:39:57 +04:00
|
|
|
QEMU_AIO_WRITE_ZEROES);
|
|
|
|
} else if (s->discard_zeroes) {
|
2017-06-09 13:18:08 +03:00
|
|
|
return paio_submit_co(bs, s->fd, offset, NULL, bytes,
|
2013-11-22 16:39:57 +04:00
|
|
|
QEMU_AIO_DISCARD);
|
2013-11-22 16:39:55 +04:00
|
|
|
}
|
2013-11-22 16:39:57 +04:00
|
|
|
return -ENOTSUP;
|
2013-11-22 16:39:55 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
|
|
|
|
{
|
|
|
|
BDRVRawState *s = bs->opaque;
|
|
|
|
|
|
|
|
bdi->unallocated_blocks_are_zero = s->discard_zeroes;
|
|
|
|
bdi->can_write_zeroes_with_unmap = s->discard_zeroes;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2014-06-05 13:21:01 +04:00
|
|
|
static QemuOptsList raw_create_opts = {
|
|
|
|
.name = "raw-create-opts",
|
|
|
|
.head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head),
|
|
|
|
.desc = {
|
|
|
|
{
|
|
|
|
.name = BLOCK_OPT_SIZE,
|
|
|
|
.type = QEMU_OPT_SIZE,
|
|
|
|
.help = "Virtual disk size"
|
|
|
|
},
|
qemu-img create: add 'nocow' option
Add 'nocow' option so that users could have a chance to set NOCOW flag to
newly created files. It's useful on btrfs file system to enhance performance.
Btrfs has low performance when hosting VM images, even more when the guest
in those VM are also using btrfs as file system. One way to mitigate this bad
performance is to turn off COW attributes on VM files. Generally, there are
two ways to turn off NOCOW on btrfs: a) by mounting fs with nodatacow, then
all newly created files will be NOCOW. b) per file. Add the NOCOW file
attribute. It could only be done to empty or new files.
This patch tries the second way, according to the option, it could add NOCOW
per file.
For most block drivers, since the create file step is in raw-posix.c, so we
can do setting NOCOW flag ioctl in raw-posix.c only.
But there are some exceptions, like block/vpc.c and block/vdi.c, they are
creating file by calling qemu_open directly. For them, do the same setting
NOCOW flag ioctl work in them separately.
[Fixed up 082.out due to the new 'nocow' creation option
--Stefan]
Signed-off-by: Chunyan Liu <cyliu@suse.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-06-30 10:29:58 +04:00
|
|
|
{
|
|
|
|
.name = BLOCK_OPT_NOCOW,
|
|
|
|
.type = QEMU_OPT_BOOL,
|
|
|
|
.help = "Turn off copy-on-write (valid only on btrfs)"
|
|
|
|
},
|
2014-09-10 13:05:48 +04:00
|
|
|
{
|
|
|
|
.name = BLOCK_OPT_PREALLOC,
|
|
|
|
.type = QEMU_OPT_STRING,
|
|
|
|
.help = "Preallocation mode (allowed values: off, falloc, full)"
|
|
|
|
},
|
2014-06-05 13:21:01 +04:00
|
|
|
{ /* end of list */ }
|
|
|
|
}
|
2009-05-18 18:42:10 +04:00
|
|
|
};
|
|
|
|
|
2017-05-02 19:35:56 +03:00
|
|
|
static int raw_check_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared,
|
|
|
|
Error **errp)
|
|
|
|
{
|
|
|
|
return raw_handle_perm_lock(bs, RAW_PL_PREPARE, perm, shared, errp);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void raw_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared)
|
|
|
|
{
|
|
|
|
BDRVRawState *s = bs->opaque;
|
|
|
|
raw_handle_perm_lock(bs, RAW_PL_COMMIT, perm, shared, NULL);
|
|
|
|
s->perm = perm;
|
|
|
|
s->shared_perm = shared;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void raw_abort_perm_update(BlockDriverState *bs)
|
|
|
|
{
|
|
|
|
raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
|
|
|
|
}
|
|
|
|
|
2014-12-02 20:32:41 +03:00
|
|
|
BlockDriver bdrv_file = {
|
2010-04-08 00:30:24 +04:00
|
|
|
.format_name = "file",
|
|
|
|
.protocol_name = "file",
|
2009-04-07 21:57:09 +04:00
|
|
|
.instance_size = sizeof(BDRVRawState),
|
2013-09-24 19:07:04 +04:00
|
|
|
.bdrv_needs_filename = true,
|
2009-04-07 21:57:09 +04:00
|
|
|
.bdrv_probe = NULL, /* no probe for protocols */
|
2014-03-06 01:41:37 +04:00
|
|
|
.bdrv_parse_filename = raw_parse_filename,
|
2010-04-14 16:17:38 +04:00
|
|
|
.bdrv_file_open = raw_open,
|
2012-09-20 23:13:25 +04:00
|
|
|
.bdrv_reopen_prepare = raw_reopen_prepare,
|
|
|
|
.bdrv_reopen_commit = raw_reopen_commit,
|
|
|
|
.bdrv_reopen_abort = raw_reopen_abort,
|
2009-04-07 21:57:09 +04:00
|
|
|
.bdrv_close = raw_close,
|
2014-06-05 13:21:11 +04:00
|
|
|
.bdrv_create = raw_create,
|
2013-06-28 14:47:42 +04:00
|
|
|
.bdrv_has_zero_init = bdrv_has_zero_init_1,
|
2013-09-04 21:00:28 +04:00
|
|
|
.bdrv_co_get_block_status = raw_co_get_block_status,
|
2016-06-02 00:10:10 +03:00
|
|
|
.bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,
|
2007-09-17 12:09:54 +04:00
|
|
|
|
2016-06-03 18:36:27 +03:00
|
|
|
.bdrv_co_preadv = raw_co_preadv,
|
|
|
|
.bdrv_co_pwritev = raw_co_pwritev,
|
2009-09-04 21:01:49 +04:00
|
|
|
.bdrv_aio_flush = raw_aio_flush,
|
2016-07-16 02:22:57 +03:00
|
|
|
.bdrv_aio_pdiscard = raw_aio_pdiscard,
|
2011-11-29 15:42:20 +04:00
|
|
|
.bdrv_refresh_limits = raw_refresh_limits,
|
2014-07-04 14:04:34 +04:00
|
|
|
.bdrv_io_plug = raw_aio_plug,
|
|
|
|
.bdrv_io_unplug = raw_aio_unplug,
|
2008-12-12 19:41:40 +03:00
|
|
|
|
2006-08-01 20:21:11 +04:00
|
|
|
.bdrv_truncate = raw_truncate,
|
|
|
|
.bdrv_getlength = raw_getlength,
|
2013-11-22 16:39:55 +04:00
|
|
|
.bdrv_get_info = raw_get_info,
|
2011-07-12 15:56:39 +04:00
|
|
|
.bdrv_get_allocated_file_size
|
|
|
|
= raw_get_allocated_file_size,
|
2017-05-02 19:35:56 +03:00
|
|
|
.bdrv_check_perm = raw_check_perm,
|
|
|
|
.bdrv_set_perm = raw_set_perm,
|
|
|
|
.bdrv_abort_perm_update = raw_abort_perm_update,
|
2014-06-05 13:21:01 +04:00
|
|
|
.create_opts = &raw_create_opts,
|
2006-08-01 20:21:11 +04:00
|
|
|
};
|
|
|
|
|
2006-08-19 15:45:59 +04:00
|
|
|
/***********************************************/
|
|
|
|
/* host device */
|
|
|
|
|
2011-11-10 22:40:06 +04:00
|
|
|
#if defined(__APPLE__) && defined(__MACH__)
|
2015-11-21 03:17:48 +03:00
|
|
|
static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
|
|
|
|
CFIndex maxPathSize, int flags);
|
2016-03-21 18:41:28 +03:00
|
|
|
static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator)
|
2006-08-19 15:45:59 +04:00
|
|
|
{
|
2016-03-21 18:41:28 +03:00
|
|
|
kern_return_t kernResult = KERN_FAILURE;
|
2006-08-19 15:45:59 +04:00
|
|
|
mach_port_t masterPort;
|
|
|
|
CFMutableDictionaryRef classesToMatch;
|
2016-03-21 18:41:28 +03:00
|
|
|
const char *matching_array[] = {kIODVDMediaClass, kIOCDMediaClass};
|
|
|
|
char *mediaType = NULL;
|
2006-08-19 15:45:59 +04:00
|
|
|
|
|
|
|
kernResult = IOMasterPort( MACH_PORT_NULL, &masterPort );
|
|
|
|
if ( KERN_SUCCESS != kernResult ) {
|
|
|
|
printf( "IOMasterPort returned %d\n", kernResult );
|
|
|
|
}
|
2007-09-17 12:09:54 +04:00
|
|
|
|
2016-03-21 18:41:28 +03:00
|
|
|
int index;
|
|
|
|
for (index = 0; index < ARRAY_SIZE(matching_array); index++) {
|
|
|
|
classesToMatch = IOServiceMatching(matching_array[index]);
|
|
|
|
if (classesToMatch == NULL) {
|
|
|
|
error_report("IOServiceMatching returned NULL for %s",
|
|
|
|
matching_array[index]);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
CFDictionarySetValue(classesToMatch, CFSTR(kIOMediaEjectableKey),
|
|
|
|
kCFBooleanTrue);
|
|
|
|
kernResult = IOServiceGetMatchingServices(masterPort, classesToMatch,
|
|
|
|
mediaIterator);
|
|
|
|
if (kernResult != KERN_SUCCESS) {
|
|
|
|
error_report("Note: IOServiceGetMatchingServices returned %d",
|
|
|
|
kernResult);
|
|
|
|
continue;
|
|
|
|
}
|
2007-09-17 12:09:54 +04:00
|
|
|
|
2016-03-21 18:41:28 +03:00
|
|
|
/* If a match was found, leave the loop */
|
|
|
|
if (*mediaIterator != 0) {
|
|
|
|
DPRINTF("Matching using %s\n", matching_array[index]);
|
|
|
|
mediaType = g_strdup(matching_array[index]);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return mediaType;
|
2006-08-19 15:45:59 +04:00
|
|
|
}
|
|
|
|
|
2015-11-21 03:17:48 +03:00
|
|
|
kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
|
|
|
|
CFIndex maxPathSize, int flags)
|
2006-08-19 15:45:59 +04:00
|
|
|
{
|
|
|
|
io_object_t nextMedia;
|
|
|
|
kern_return_t kernResult = KERN_FAILURE;
|
|
|
|
*bsdPath = '\0';
|
|
|
|
nextMedia = IOIteratorNext( mediaIterator );
|
|
|
|
if ( nextMedia )
|
|
|
|
{
|
|
|
|
CFTypeRef bsdPathAsCFString;
|
|
|
|
bsdPathAsCFString = IORegistryEntryCreateCFProperty( nextMedia, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 );
|
|
|
|
if ( bsdPathAsCFString ) {
|
|
|
|
size_t devPathLength;
|
|
|
|
strcpy( bsdPath, _PATH_DEV );
|
2015-11-21 03:17:48 +03:00
|
|
|
if (flags & BDRV_O_NOCACHE) {
|
|
|
|
strcat(bsdPath, "r");
|
|
|
|
}
|
2006-08-19 15:45:59 +04:00
|
|
|
devPathLength = strlen( bsdPath );
|
|
|
|
if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) {
|
|
|
|
kernResult = KERN_SUCCESS;
|
|
|
|
}
|
|
|
|
CFRelease( bsdPathAsCFString );
|
|
|
|
}
|
|
|
|
IOObjectRelease( nextMedia );
|
|
|
|
}
|
2007-09-17 12:09:54 +04:00
|
|
|
|
2006-08-19 15:45:59 +04:00
|
|
|
return kernResult;
|
|
|
|
}
|
|
|
|
|
2016-03-21 18:41:28 +03:00
|
|
|
/* Sets up a real cdrom for use in QEMU */
|
|
|
|
static bool setup_cdrom(char *bsd_path, Error **errp)
|
|
|
|
{
|
|
|
|
int index, num_of_test_partitions = 2, fd;
|
|
|
|
char test_partition[MAXPATHLEN];
|
|
|
|
bool partition_found = false;
|
|
|
|
|
|
|
|
/* look for a working partition */
|
|
|
|
for (index = 0; index < num_of_test_partitions; index++) {
|
|
|
|
snprintf(test_partition, sizeof(test_partition), "%ss%d", bsd_path,
|
|
|
|
index);
|
|
|
|
fd = qemu_open(test_partition, O_RDONLY | O_BINARY | O_LARGEFILE);
|
|
|
|
if (fd >= 0) {
|
|
|
|
partition_found = true;
|
|
|
|
qemu_close(fd);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* if a working partition on the device was not found */
|
|
|
|
if (partition_found == false) {
|
|
|
|
error_setg(errp, "Failed to find a working partition on disc");
|
|
|
|
} else {
|
|
|
|
DPRINTF("Using %s as optical disc\n", test_partition);
|
|
|
|
pstrcpy(bsd_path, MAXPATHLEN, test_partition);
|
|
|
|
}
|
|
|
|
return partition_found;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Prints directions on mounting and unmounting a device */
|
|
|
|
static void print_unmounting_directions(const char *file_name)
|
|
|
|
{
|
|
|
|
error_report("If device %s is mounted on the desktop, unmount"
|
|
|
|
" it first before using it in QEMU", file_name);
|
|
|
|
error_report("Command to unmount device: diskutil unmountDisk %s",
|
|
|
|
file_name);
|
|
|
|
error_report("Command to mount device: diskutil mountDisk %s", file_name);
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif /* defined(__APPLE__) && defined(__MACH__) */
|
2006-08-19 15:45:59 +04:00
|
|
|
|
2009-06-15 16:04:22 +04:00
|
|
|
static int hdev_probe_device(const char *filename)
|
|
|
|
{
|
|
|
|
struct stat st;
|
|
|
|
|
|
|
|
/* allow a dedicated CD-ROM driver to match with a higher priority */
|
|
|
|
if (strstart(filename, "/dev/cdrom", NULL))
|
|
|
|
return 50;
|
|
|
|
|
|
|
|
if (stat(filename, &st) >= 0 &&
|
|
|
|
(S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
|
|
|
|
return 100;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-02-05 15:28:33 +04:00
|
|
|
static int check_hdev_writable(BDRVRawState *s)
|
|
|
|
{
|
|
|
|
#if defined(BLKROGET)
|
|
|
|
/* Linux block devices can be configured "read-only" using blockdev(8).
|
|
|
|
* This is independent of device node permissions and therefore open(2)
|
|
|
|
* with O_RDWR succeeds. Actual writes fail with EPERM.
|
|
|
|
*
|
|
|
|
* bdrv_open() is supposed to fail if the disk is read-only. Explicitly
|
|
|
|
* check for read-only block devices so that Linux block devices behave
|
|
|
|
* properly.
|
|
|
|
*/
|
|
|
|
struct stat st;
|
|
|
|
int readonly = 0;
|
|
|
|
|
|
|
|
if (fstat(s->fd, &st)) {
|
|
|
|
return -errno;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!S_ISBLK(st.st_mode)) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ioctl(s->fd, BLKROGET, &readonly) < 0) {
|
|
|
|
return -errno;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (readonly) {
|
|
|
|
return -EACCES;
|
|
|
|
}
|
|
|
|
#endif /* defined(BLKROGET) */
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2014-03-08 03:39:41 +04:00
|
|
|
static void hdev_parse_filename(const char *filename, QDict *options,
|
|
|
|
Error **errp)
|
|
|
|
{
|
2017-05-22 22:52:16 +03:00
|
|
|
bdrv_parse_filename_strip_prefix(filename, "host_device:", options);
|
2014-03-08 03:39:41 +04:00
|
|
|
}
|
|
|
|
|
2015-06-23 13:45:00 +03:00
|
|
|
static bool hdev_is_sg(BlockDriverState *bs)
|
|
|
|
{
|
|
|
|
|
|
|
|
#if defined(__linux__)
|
|
|
|
|
2016-10-20 15:50:12 +03:00
|
|
|
BDRVRawState *s = bs->opaque;
|
2015-06-23 13:45:00 +03:00
|
|
|
struct stat st;
|
|
|
|
struct sg_scsi_id scsiid;
|
|
|
|
int sg_version;
|
2016-10-20 15:50:12 +03:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (stat(bs->filename, &st) < 0 || !S_ISCHR(st.st_mode)) {
|
|
|
|
return false;
|
|
|
|
}
|
2015-06-23 13:45:00 +03:00
|
|
|
|
2016-10-20 15:50:12 +03:00
|
|
|
ret = ioctl(s->fd, SG_GET_VERSION_NUM, &sg_version);
|
|
|
|
if (ret < 0) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = ioctl(s->fd, SG_GET_SCSI_ID, &scsiid);
|
|
|
|
if (ret >= 0) {
|
2015-06-23 13:45:00 +03:00
|
|
|
DPRINTF("SG device found: type=%d, version=%d\n",
|
|
|
|
scsiid.scsi_type, sg_version);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2013-09-05 16:22:29 +04:00
|
|
|
static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
|
|
|
|
Error **errp)
|
2006-08-19 15:45:59 +04:00
|
|
|
{
|
|
|
|
BDRVRawState *s = bs->opaque;
|
2013-10-11 13:37:01 +04:00
|
|
|
Error *local_err = NULL;
|
2013-02-05 15:28:33 +04:00
|
|
|
int ret;
|
2008-09-22 23:17:18 +04:00
|
|
|
|
2011-11-10 22:40:06 +04:00
|
|
|
#if defined(__APPLE__) && defined(__MACH__)
|
block: Document -drive problematic code and bugs
-blockdev and blockdev_add convert their arguments via QObject to
BlockdevOptions for qmp_blockdev_add(), which converts them back to
QObject, then to a flattened QDict. The QDict's members are typed
according to the QAPI schema.
-drive converts its argument via QemuOpts to a (flat) QDict. This
QDict's members are all QString.
Thus, the QType of a flat QDict member depends on whether it comes
from -drive or -blockdev/blockdev_add, except when the QAPI type maps
to QString, which is the case for 'str' and enumeration types.
The block layer core extracts generic configuration from the flat
QDict, and the block driver extracts driver-specific configuration.
Both commonly do so by converting (parts of) the flat QDict to
QemuOpts, which turns all values into strings. Not exactly elegant,
but correct.
However, A few places access the flat QDict directly:
* Most of them access members that are always QString. Correct.
* bdrv_open_inherit() accesses a boolean, carefully. Correct.
* nfs_config() uses a QObject input visitor. Correct only because the
visited type contains nothing but QStrings.
* nbd_config() and ssh_config() use a QObject input visitor, and the
visited types contain non-QStrings: InetSocketAddress members
@numeric, @to, @ipv4, @ipv6. -drive works as long as you don't try
to use them (they're all optional). @to is ignored anyway.
Reproducer:
-drive driver=ssh,server.host=h,server.port=22,server.ipv4,path=p
-drive driver=nbd,server.type=inet,server.data.host=h,server.data.port=22,server.data.ipv4
both fail with "Invalid parameter type for 'data.ipv4', expected: boolean"
Add suitable comments to all these places. Mark the buggy ones FIXME.
"Fortunately", -drive's driver-specific options are entirely
undocumented.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Message-id: 1490895797-29094-5-git-send-email-armbru@redhat.com
[mreitz: Fixed two typos]
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
2017-03-30 20:43:12 +03:00
|
|
|
/*
|
|
|
|
* Caution: while qdict_get_str() is fine, getting non-string types
|
|
|
|
* would require more care. When @options come from -blockdev or
|
|
|
|
* blockdev_add, its members are typed according to the QAPI
|
|
|
|
* schema, but when they come from -drive, they're all QString.
|
|
|
|
*/
|
2015-06-23 13:45:00 +03:00
|
|
|
const char *filename = qdict_get_str(options, "filename");
|
2016-03-21 18:41:28 +03:00
|
|
|
char bsd_path[MAXPATHLEN] = "";
|
|
|
|
bool error_occurred = false;
|
|
|
|
|
|
|
|
/* If using a real cdrom */
|
|
|
|
if (strcmp(filename, "/dev/cdrom") == 0) {
|
|
|
|
char *mediaType = NULL;
|
|
|
|
kern_return_t ret_val;
|
|
|
|
io_iterator_t mediaIterator = 0;
|
|
|
|
|
|
|
|
mediaType = FindEjectableOpticalMedia(&mediaIterator);
|
|
|
|
if (mediaType == NULL) {
|
|
|
|
error_setg(errp, "Please make sure your CD/DVD is in the optical"
|
|
|
|
" drive");
|
|
|
|
error_occurred = true;
|
|
|
|
goto hdev_open_Mac_error;
|
|
|
|
}
|
2015-06-23 13:45:00 +03:00
|
|
|
|
2016-03-21 18:41:28 +03:00
|
|
|
ret_val = GetBSDPath(mediaIterator, bsd_path, sizeof(bsd_path), flags);
|
|
|
|
if (ret_val != KERN_SUCCESS) {
|
|
|
|
error_setg(errp, "Could not get BSD path for optical drive");
|
|
|
|
error_occurred = true;
|
|
|
|
goto hdev_open_Mac_error;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* If a real optical drive was not found */
|
|
|
|
if (bsd_path[0] == '\0') {
|
|
|
|
error_setg(errp, "Failed to obtain bsd path for optical drive");
|
|
|
|
error_occurred = true;
|
|
|
|
goto hdev_open_Mac_error;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* If using a cdrom disc and finding a partition on the disc failed */
|
|
|
|
if (strncmp(mediaType, kIOCDMediaClass, 9) == 0 &&
|
|
|
|
setup_cdrom(bsd_path, errp) == false) {
|
|
|
|
print_unmounting_directions(bsd_path);
|
|
|
|
error_occurred = true;
|
|
|
|
goto hdev_open_Mac_error;
|
2006-08-19 15:45:59 +04:00
|
|
|
}
|
2007-09-17 12:09:54 +04:00
|
|
|
|
2017-04-28 00:58:17 +03:00
|
|
|
qdict_put_str(options, "filename", bsd_path);
|
2016-03-21 18:41:28 +03:00
|
|
|
|
|
|
|
hdev_open_Mac_error:
|
|
|
|
g_free(mediaType);
|
|
|
|
if (mediaIterator) {
|
|
|
|
IOObjectRelease(mediaIterator);
|
|
|
|
}
|
|
|
|
if (error_occurred) {
|
|
|
|
return -ENOENT;
|
|
|
|
}
|
2006-08-19 15:45:59 +04:00
|
|
|
}
|
2016-03-21 18:41:28 +03:00
|
|
|
#endif /* defined(__APPLE__) && defined(__MACH__) */
|
2006-08-19 15:45:59 +04:00
|
|
|
|
|
|
|
s->type = FTYPE_FILE;
|
2009-06-15 15:53:38 +04:00
|
|
|
|
2013-10-11 13:37:01 +04:00
|
|
|
ret = raw_open_common(bs, options, flags, 0, &local_err);
|
2013-02-05 15:28:33 +04:00
|
|
|
if (ret < 0) {
|
2016-06-14 00:57:56 +03:00
|
|
|
error_propagate(errp, local_err);
|
2016-03-21 18:41:28 +03:00
|
|
|
#if defined(__APPLE__) && defined(__MACH__)
|
|
|
|
if (*bsd_path) {
|
|
|
|
filename = bsd_path;
|
|
|
|
}
|
|
|
|
/* if a physical device experienced an error while being opened */
|
|
|
|
if (strncmp(filename, "/dev/", 5) == 0) {
|
|
|
|
print_unmounting_directions(filename);
|
|
|
|
}
|
|
|
|
#endif /* defined(__APPLE__) && defined(__MACH__) */
|
2013-02-05 15:28:33 +04:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2015-06-23 13:45:00 +03:00
|
|
|
/* Since this does ioctl the device must be already opened */
|
|
|
|
bs->sg = hdev_is_sg(bs);
|
|
|
|
|
2013-02-05 15:28:33 +04:00
|
|
|
if (flags & BDRV_O_RDWR) {
|
|
|
|
ret = check_hdev_writable(s);
|
|
|
|
if (ret < 0) {
|
|
|
|
raw_close(bs);
|
2013-10-11 13:37:01 +04:00
|
|
|
error_setg_errno(errp, -ret, "The device is not writable");
|
2013-02-05 15:28:33 +04:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
2006-08-19 15:45:59 +04:00
|
|
|
}
|
|
|
|
|
2008-09-15 19:51:35 +04:00
|
|
|
#if defined(__linux__)
|
2009-03-28 20:28:41 +03:00
|
|
|
|
2014-10-07 15:59:14 +04:00
|
|
|
static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs,
|
2009-03-28 20:28:41 +03:00
|
|
|
unsigned long int req, void *buf,
|
2014-10-07 15:59:15 +04:00
|
|
|
BlockCompletionFunc *cb, void *opaque)
|
2009-03-28 20:28:41 +03:00
|
|
|
{
|
2009-04-07 22:43:24 +04:00
|
|
|
BDRVRawState *s = bs->opaque;
|
2012-11-02 19:14:20 +04:00
|
|
|
RawPosixAIOData *acb;
|
2013-03-07 16:41:49 +04:00
|
|
|
ThreadPool *pool;
|
2009-03-28 20:28:41 +03:00
|
|
|
|
2009-04-07 22:43:24 +04:00
|
|
|
if (fd_open(bs) < 0)
|
|
|
|
return NULL;
|
2012-11-02 19:14:20 +04:00
|
|
|
|
scsi, file-posix: add support for persistent reservation management
It is a common requirement for virtual machine to send persistent
reservations, but this currently requires either running QEMU with
CAP_SYS_RAWIO, or using out-of-tree patches that let an unprivileged
QEMU bypass Linux's filter on SG_IO commands.
As an alternative mechanism, the next patches will introduce a
privileged helper to run persistent reservation commands without
expanding QEMU's attack surface unnecessarily.
The helper is invoked through a "pr-manager" QOM object, to which
file-posix.c passes SG_IO requests for PERSISTENT RESERVE OUT and
PERSISTENT RESERVE IN commands. For example:
$ qemu-system-x86_64
-device virtio-scsi \
-object pr-manager-helper,id=helper0,path=/var/run/qemu-pr-helper.sock
-drive if=none,id=hd,driver=raw,file.filename=/dev/sdb,file.pr-manager=helper0
-device scsi-block,drive=hd
or:
$ qemu-system-x86_64
-device virtio-scsi \
-object pr-manager-helper,id=helper0,path=/var/run/qemu-pr-helper.sock
-blockdev node-name=hd,driver=raw,file.driver=host_device,file.filename=/dev/sdb,file.pr-manager=helper0
-device scsi-block,drive=hd
Multiple pr-manager implementations are conceivable and possible, though
only one is implemented right now. For example, a pr-manager could:
- talk directly to the multipath daemon from a privileged QEMU
(i.e. QEMU links to libmpathpersist); this makes reservation work
properly with multipath, but still requires CAP_SYS_RAWIO
- use the Linux IOC_PR_* ioctls (they require CAP_SYS_ADMIN though)
- more interestingly, implement reservations directly in QEMU
through file system locks or a shared database (e.g. sqlite)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2017-08-21 19:58:56 +03:00
|
|
|
if (req == SG_IO && s->pr_mgr) {
|
|
|
|
struct sg_io_hdr *io_hdr = buf;
|
|
|
|
if (io_hdr->cmdp[0] == PERSISTENT_RESERVE_OUT ||
|
|
|
|
io_hdr->cmdp[0] == PERSISTENT_RESERVE_IN) {
|
|
|
|
return pr_manager_execute(s->pr_mgr, bdrv_get_aio_context(bs),
|
|
|
|
s->fd, io_hdr, cb, opaque);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-10-01 14:04:39 +03:00
|
|
|
acb = g_new(RawPosixAIOData, 1);
|
2012-11-02 19:14:20 +04:00
|
|
|
acb->bs = bs;
|
|
|
|
acb->aio_type = QEMU_AIO_IOCTL;
|
|
|
|
acb->aio_fildes = s->fd;
|
|
|
|
acb->aio_offset = 0;
|
|
|
|
acb->aio_ioctl_buf = buf;
|
|
|
|
acb->aio_ioctl_cmd = req;
|
2013-03-07 16:41:49 +04:00
|
|
|
pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
|
|
|
|
return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
|
2009-03-28 20:28:41 +03:00
|
|
|
}
|
2015-10-19 18:53:07 +03:00
|
|
|
#endif /* linux */
|
2009-03-28 20:28:41 +03:00
|
|
|
|
2009-03-28 11:37:13 +03:00
|
|
|
static int fd_open(BlockDriverState *bs)
|
|
|
|
{
|
|
|
|
BDRVRawState *s = bs->opaque;
|
|
|
|
|
|
|
|
/* this is just to ensure s->fd is sane (its called by io ops) */
|
|
|
|
if (s->fd >= 0)
|
|
|
|
return 0;
|
|
|
|
return -EIO;
|
|
|
|
}
|
2009-03-12 22:57:12 +03:00
|
|
|
|
2016-07-16 02:22:57 +03:00
|
|
|
static coroutine_fn BlockAIOCB *hdev_aio_pdiscard(BlockDriverState *bs,
|
2017-06-09 13:18:08 +03:00
|
|
|
int64_t offset, int bytes,
|
2014-10-07 15:59:15 +04:00
|
|
|
BlockCompletionFunc *cb, void *opaque)
|
2013-01-18 19:43:35 +04:00
|
|
|
{
|
|
|
|
BDRVRawState *s = bs->opaque;
|
|
|
|
|
|
|
|
if (fd_open(bs) < 0) {
|
|
|
|
return NULL;
|
|
|
|
}
|
2017-06-09 13:18:08 +03:00
|
|
|
return paio_submit(bs, s->fd, offset, NULL, bytes,
|
2013-01-18 19:43:35 +04:00
|
|
|
cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
|
|
|
|
}
|
|
|
|
|
2016-06-02 00:10:10 +03:00
|
|
|
static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs,
|
2017-06-09 13:18:08 +03:00
|
|
|
int64_t offset, int bytes, BdrvRequestFlags flags)
|
2013-11-22 16:39:56 +04:00
|
|
|
{
|
|
|
|
BDRVRawState *s = bs->opaque;
|
|
|
|
int rc;
|
|
|
|
|
|
|
|
rc = fd_open(bs);
|
|
|
|
if (rc < 0) {
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
if (!(flags & BDRV_REQ_MAY_UNMAP)) {
|
2017-06-09 13:18:08 +03:00
|
|
|
return paio_submit_co(bs, s->fd, offset, NULL, bytes,
|
2013-11-22 16:39:57 +04:00
|
|
|
QEMU_AIO_WRITE_ZEROES|QEMU_AIO_BLKDEV);
|
|
|
|
} else if (s->discard_zeroes) {
|
2017-06-09 13:18:08 +03:00
|
|
|
return paio_submit_co(bs, s->fd, offset, NULL, bytes,
|
2013-11-22 16:39:57 +04:00
|
|
|
QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
|
2013-11-22 16:39:56 +04:00
|
|
|
}
|
2013-11-22 16:39:57 +04:00
|
|
|
return -ENOTSUP;
|
2013-11-22 16:39:56 +04:00
|
|
|
}
|
|
|
|
|
2014-06-05 13:21:01 +04:00
|
|
|
static int hdev_create(const char *filename, QemuOpts *opts,
|
2013-09-05 16:26:05 +04:00
|
|
|
Error **errp)
|
2009-04-05 21:40:43 +04:00
|
|
|
{
|
|
|
|
int fd;
|
|
|
|
int ret = 0;
|
|
|
|
struct stat stat_buf;
|
2009-05-18 18:42:10 +04:00
|
|
|
int64_t total_size = 0;
|
2014-03-08 03:39:44 +04:00
|
|
|
bool has_prefix;
|
|
|
|
|
2015-10-19 18:53:07 +03:00
|
|
|
/* This function is used by both protocol block drivers and therefore either
|
|
|
|
* of these prefixes may be given.
|
2014-03-08 03:39:44 +04:00
|
|
|
* The return value has to be stored somewhere, otherwise this is an error
|
|
|
|
* due to -Werror=unused-value. */
|
|
|
|
has_prefix =
|
|
|
|
strstart(filename, "host_device:", &filename) ||
|
2015-10-19 18:53:07 +03:00
|
|
|
strstart(filename, "host_cdrom:" , &filename);
|
2014-03-08 03:39:44 +04:00
|
|
|
|
|
|
|
(void)has_prefix;
|
2009-04-05 21:40:43 +04:00
|
|
|
|
2015-08-12 18:33:31 +03:00
|
|
|
ret = raw_normalize_devicepath(&filename);
|
|
|
|
if (ret < 0) {
|
|
|
|
error_setg_errno(errp, -ret, "Could not normalize device path");
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2009-05-18 18:42:10 +04:00
|
|
|
/* Read out options */
|
2014-09-10 13:05:46 +04:00
|
|
|
total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
|
|
|
|
BDRV_SECTOR_SIZE);
|
2009-04-05 21:40:43 +04:00
|
|
|
|
2012-08-15 00:43:45 +04:00
|
|
|
fd = qemu_open(filename, O_WRONLY | O_BINARY);
|
2013-10-11 13:37:01 +04:00
|
|
|
if (fd < 0) {
|
|
|
|
ret = -errno;
|
|
|
|
error_setg_errno(errp, -ret, "Could not open device");
|
|
|
|
return ret;
|
|
|
|
}
|
2009-04-05 21:40:43 +04:00
|
|
|
|
2013-10-11 13:37:01 +04:00
|
|
|
if (fstat(fd, &stat_buf) < 0) {
|
2010-03-12 15:52:31 +03:00
|
|
|
ret = -errno;
|
2013-10-11 13:37:01 +04:00
|
|
|
error_setg_errno(errp, -ret, "Could not stat device");
|
|
|
|
} else if (!S_ISBLK(stat_buf.st_mode) && !S_ISCHR(stat_buf.st_mode)) {
|
|
|
|
error_setg(errp,
|
|
|
|
"The given file is neither a block nor a character device");
|
2010-03-12 15:52:31 +03:00
|
|
|
ret = -ENODEV;
|
2014-09-10 13:05:46 +04:00
|
|
|
} else if (lseek(fd, 0, SEEK_END) < total_size) {
|
2013-10-11 13:37:01 +04:00
|
|
|
error_setg(errp, "Device is too small");
|
2009-04-05 21:40:43 +04:00
|
|
|
ret = -ENOSPC;
|
2013-10-11 13:37:01 +04:00
|
|
|
}
|
2009-04-05 21:40:43 +04:00
|
|
|
|
2017-09-08 12:44:57 +03:00
|
|
|
if (!ret && total_size) {
|
|
|
|
uint8_t buf[BDRV_SECTOR_SIZE] = { 0 };
|
|
|
|
int64_t zero_size = MIN(BDRV_SECTOR_SIZE, total_size);
|
|
|
|
if (lseek(fd, 0, SEEK_SET) == -1) {
|
|
|
|
ret = -errno;
|
|
|
|
} else {
|
|
|
|
ret = qemu_write_full(fd, buf, zero_size);
|
|
|
|
ret = ret == zero_size ? 0 : -errno;
|
|
|
|
}
|
|
|
|
}
|
2012-08-15 00:43:46 +04:00
|
|
|
qemu_close(fd);
|
2009-04-05 21:40:43 +04:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2009-05-10 02:03:42 +04:00
|
|
|
static BlockDriver bdrv_host_device = {
|
2009-10-01 14:35:49 +04:00
|
|
|
.format_name = "host_device",
|
2010-04-08 00:30:24 +04:00
|
|
|
.protocol_name = "host_device",
|
2009-10-01 14:35:49 +04:00
|
|
|
.instance_size = sizeof(BDRVRawState),
|
2013-09-24 19:07:04 +04:00
|
|
|
.bdrv_needs_filename = true,
|
2009-10-01 14:35:49 +04:00
|
|
|
.bdrv_probe_device = hdev_probe_device,
|
2014-03-08 03:39:41 +04:00
|
|
|
.bdrv_parse_filename = hdev_parse_filename,
|
2010-04-14 16:17:38 +04:00
|
|
|
.bdrv_file_open = hdev_open,
|
2009-10-01 14:35:49 +04:00
|
|
|
.bdrv_close = raw_close,
|
2012-11-20 19:21:10 +04:00
|
|
|
.bdrv_reopen_prepare = raw_reopen_prepare,
|
|
|
|
.bdrv_reopen_commit = raw_reopen_commit,
|
|
|
|
.bdrv_reopen_abort = raw_reopen_abort,
|
2014-06-05 13:21:11 +04:00
|
|
|
.bdrv_create = hdev_create,
|
2014-06-05 13:21:01 +04:00
|
|
|
.create_opts = &raw_create_opts,
|
2016-06-02 00:10:10 +03:00
|
|
|
.bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
|
2007-09-17 12:09:54 +04:00
|
|
|
|
2016-06-03 18:36:27 +03:00
|
|
|
.bdrv_co_preadv = raw_co_preadv,
|
|
|
|
.bdrv_co_pwritev = raw_co_pwritev,
|
2009-09-04 21:01:49 +04:00
|
|
|
.bdrv_aio_flush = raw_aio_flush,
|
2016-07-16 02:22:57 +03:00
|
|
|
.bdrv_aio_pdiscard = hdev_aio_pdiscard,
|
2011-11-29 15:42:20 +04:00
|
|
|
.bdrv_refresh_limits = raw_refresh_limits,
|
2014-07-04 14:04:34 +04:00
|
|
|
.bdrv_io_plug = raw_aio_plug,
|
|
|
|
.bdrv_io_unplug = raw_aio_unplug,
|
2008-12-12 19:41:40 +03:00
|
|
|
|
2011-09-21 03:10:37 +04:00
|
|
|
.bdrv_truncate = raw_truncate,
|
2009-03-08 01:00:29 +03:00
|
|
|
.bdrv_getlength = raw_getlength,
|
2013-11-22 16:39:55 +04:00
|
|
|
.bdrv_get_info = raw_get_info,
|
2011-07-12 15:56:39 +04:00
|
|
|
.bdrv_get_allocated_file_size
|
|
|
|
= raw_get_allocated_file_size,
|
2017-05-02 19:35:56 +03:00
|
|
|
.bdrv_check_perm = raw_check_perm,
|
|
|
|
.bdrv_set_perm = raw_set_perm,
|
|
|
|
.bdrv_abort_perm_update = raw_abort_perm_update,
|
2015-02-16 14:47:56 +03:00
|
|
|
.bdrv_probe_blocksizes = hdev_probe_blocksizes,
|
|
|
|
.bdrv_probe_geometry = hdev_probe_geometry,
|
2006-08-19 15:45:59 +04:00
|
|
|
|
2009-06-15 15:55:19 +04:00
|
|
|
/* generic scsi device */
|
2009-06-15 16:04:34 +04:00
|
|
|
#ifdef __linux__
|
|
|
|
.bdrv_aio_ioctl = hdev_aio_ioctl,
|
|
|
|
#endif
|
2009-06-15 15:55:19 +04:00
|
|
|
};
|
|
|
|
|
2014-03-08 03:39:43 +04:00
|
|
|
#if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
|
|
|
|
static void cdrom_parse_filename(const char *filename, QDict *options,
|
|
|
|
Error **errp)
|
|
|
|
{
|
2017-05-22 22:52:16 +03:00
|
|
|
bdrv_parse_filename_strip_prefix(filename, "host_cdrom:", options);
|
2014-03-08 03:39:43 +04:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef __linux__
|
2013-09-05 16:22:29 +04:00
|
|
|
static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
|
|
|
|
Error **errp)
|
2009-06-15 15:55:19 +04:00
|
|
|
{
|
|
|
|
BDRVRawState *s = bs->opaque;
|
|
|
|
|
|
|
|
s->type = FTYPE_CD;
|
|
|
|
|
2009-06-17 19:27:44 +04:00
|
|
|
/* open will not fail even if no CD is inserted, so add O_NONBLOCK */
|
2016-06-14 00:57:58 +03:00
|
|
|
return raw_open_common(bs, options, flags, O_NONBLOCK, errp);
|
2009-06-15 15:55:19 +04:00
|
|
|
}
|
|
|
|
|
2009-06-15 16:04:22 +04:00
|
|
|
static int cdrom_probe_device(const char *filename)
|
|
|
|
{
|
2010-01-14 19:19:40 +03:00
|
|
|
int fd, ret;
|
|
|
|
int prio = 0;
|
2011-06-29 18:25:17 +04:00
|
|
|
struct stat st;
|
2010-01-14 19:19:40 +03:00
|
|
|
|
2012-08-15 00:43:45 +04:00
|
|
|
fd = qemu_open(filename, O_RDONLY | O_NONBLOCK);
|
2010-01-14 19:19:40 +03:00
|
|
|
if (fd < 0) {
|
|
|
|
goto out;
|
|
|
|
}
|
2011-06-29 18:25:17 +04:00
|
|
|
ret = fstat(fd, &st);
|
|
|
|
if (ret == -1 || !S_ISBLK(st.st_mode)) {
|
|
|
|
goto outc;
|
|
|
|
}
|
2010-01-14 19:19:40 +03:00
|
|
|
|
|
|
|
/* Attempt to detect via a CDROM specific ioctl */
|
|
|
|
ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
|
|
|
|
if (ret >= 0)
|
|
|
|
prio = 100;
|
|
|
|
|
2011-06-29 18:25:17 +04:00
|
|
|
outc:
|
2012-08-15 00:43:46 +04:00
|
|
|
qemu_close(fd);
|
2010-01-14 19:19:40 +03:00
|
|
|
out:
|
|
|
|
return prio;
|
2009-06-15 16:04:22 +04:00
|
|
|
}
|
|
|
|
|
2015-10-19 18:53:11 +03:00
|
|
|
static bool cdrom_is_inserted(BlockDriverState *bs)
|
2009-06-15 15:55:19 +04:00
|
|
|
{
|
|
|
|
BDRVRawState *s = bs->opaque;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
|
2015-10-19 18:53:11 +03:00
|
|
|
return ret == CDS_DISC_OK;
|
2009-06-15 15:55:19 +04:00
|
|
|
}
|
|
|
|
|
2012-02-03 22:24:53 +04:00
|
|
|
static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
|
2009-06-15 15:55:19 +04:00
|
|
|
{
|
|
|
|
BDRVRawState *s = bs->opaque;
|
|
|
|
|
|
|
|
if (eject_flag) {
|
|
|
|
if (ioctl(s->fd, CDROMEJECT, NULL) < 0)
|
|
|
|
perror("CDROMEJECT");
|
|
|
|
} else {
|
|
|
|
if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0)
|
|
|
|
perror("CDROMEJECT");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-09-06 20:58:47 +04:00
|
|
|
static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
|
2009-06-15 15:55:19 +04:00
|
|
|
{
|
|
|
|
BDRVRawState *s = bs->opaque;
|
|
|
|
|
|
|
|
if (ioctl(s->fd, CDROM_LOCKDOOR, locked) < 0) {
|
|
|
|
/*
|
|
|
|
* Note: an error can happen if the distribution automatically
|
|
|
|
* mounts the CD-ROM
|
|
|
|
*/
|
|
|
|
/* perror("CDROM_LOCKDOOR"); */
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static BlockDriver bdrv_host_cdrom = {
|
|
|
|
.format_name = "host_cdrom",
|
2010-04-08 00:30:24 +04:00
|
|
|
.protocol_name = "host_cdrom",
|
2009-06-15 15:55:19 +04:00
|
|
|
.instance_size = sizeof(BDRVRawState),
|
2013-09-24 19:07:04 +04:00
|
|
|
.bdrv_needs_filename = true,
|
2009-06-15 16:04:22 +04:00
|
|
|
.bdrv_probe_device = cdrom_probe_device,
|
2014-03-08 03:39:43 +04:00
|
|
|
.bdrv_parse_filename = cdrom_parse_filename,
|
2010-04-14 16:17:38 +04:00
|
|
|
.bdrv_file_open = cdrom_open,
|
2009-06-15 15:55:19 +04:00
|
|
|
.bdrv_close = raw_close,
|
2012-11-20 19:21:10 +04:00
|
|
|
.bdrv_reopen_prepare = raw_reopen_prepare,
|
|
|
|
.bdrv_reopen_commit = raw_reopen_commit,
|
|
|
|
.bdrv_reopen_abort = raw_reopen_abort,
|
2014-06-05 13:21:11 +04:00
|
|
|
.bdrv_create = hdev_create,
|
2014-06-05 13:21:01 +04:00
|
|
|
.create_opts = &raw_create_opts,
|
2009-06-15 15:55:19 +04:00
|
|
|
|
2016-06-03 18:36:27 +03:00
|
|
|
|
|
|
|
.bdrv_co_preadv = raw_co_preadv,
|
|
|
|
.bdrv_co_pwritev = raw_co_pwritev,
|
2009-09-04 21:01:49 +04:00
|
|
|
.bdrv_aio_flush = raw_aio_flush,
|
2011-11-29 15:42:20 +04:00
|
|
|
.bdrv_refresh_limits = raw_refresh_limits,
|
2014-07-04 14:04:34 +04:00
|
|
|
.bdrv_io_plug = raw_aio_plug,
|
|
|
|
.bdrv_io_unplug = raw_aio_unplug,
|
2009-06-15 15:55:19 +04:00
|
|
|
|
2011-09-21 03:10:37 +04:00
|
|
|
.bdrv_truncate = raw_truncate,
|
block: Avoid unecessary drv->bdrv_getlength() calls
The block layer generally keeps the size of an image cached in
bs->total_sectors so that it doesn't have to perform expensive
operations to get the size whenever it needs it.
This doesn't work however when using a backend that can change its size
without qemu being aware of it, i.e. passthrough of removable media like
CD-ROMs or floppy disks. For this reason, the caching is disabled when a
removable device is used.
It is obvious that checking whether the _guest_ device has removable
media isn't the right thing to do when we want to know whether the size
of the host backend can change. To make things worse, non-top-level
BlockDriverStates never have any device attached, which makes qemu
assume they are removable, so drv->bdrv_getlength() is always called on
the protocol layer. In the case of raw-posix, this causes unnecessary
lseek() system calls, which turned out to be rather expensive.
This patch completely changes the logic and disables bs->total_sectors
caching only for certain block driver types, for which a size change is
expected: host_cdrom and host_floppy on POSIX, host_device on win32; also
the raw format in case it sits on top of one of these protocols, but in
the common case the nested bdrv_getlength() call on the protocol driver
will use the cache again and avoid an expensive drv->bdrv_getlength()
call.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
2013-10-29 15:18:58 +04:00
|
|
|
.bdrv_getlength = raw_getlength,
|
|
|
|
.has_variable_length = true,
|
2011-07-12 15:56:39 +04:00
|
|
|
.bdrv_get_allocated_file_size
|
|
|
|
= raw_get_allocated_file_size,
|
2009-06-15 15:55:19 +04:00
|
|
|
|
|
|
|
/* removable device support */
|
|
|
|
.bdrv_is_inserted = cdrom_is_inserted,
|
|
|
|
.bdrv_eject = cdrom_eject,
|
2011-09-06 20:58:47 +04:00
|
|
|
.bdrv_lock_medium = cdrom_lock_medium,
|
2009-06-15 15:55:19 +04:00
|
|
|
|
|
|
|
/* generic scsi device */
|
2009-06-15 16:04:34 +04:00
|
|
|
.bdrv_aio_ioctl = hdev_aio_ioctl,
|
2009-06-15 15:55:19 +04:00
|
|
|
};
|
|
|
|
#endif /* __linux__ */
|
|
|
|
|
2009-11-29 20:00:41 +03:00
|
|
|
#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
|
2013-11-01 01:41:46 +04:00
|
|
|
static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
|
|
|
|
Error **errp)
|
2009-06-15 15:55:19 +04:00
|
|
|
{
|
|
|
|
BDRVRawState *s = bs->opaque;
|
2013-10-11 13:37:01 +04:00
|
|
|
Error *local_err = NULL;
|
2009-06-15 15:55:19 +04:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
s->type = FTYPE_CD;
|
|
|
|
|
2013-10-11 13:37:01 +04:00
|
|
|
ret = raw_open_common(bs, options, flags, 0, &local_err);
|
|
|
|
if (ret) {
|
2016-06-14 00:57:56 +03:00
|
|
|
error_propagate(errp, local_err);
|
2009-06-15 15:55:19 +04:00
|
|
|
return ret;
|
2013-10-11 13:37:01 +04:00
|
|
|
}
|
2009-06-15 15:55:19 +04:00
|
|
|
|
2011-11-22 14:06:25 +04:00
|
|
|
/* make sure the door isn't locked at this time */
|
2009-06-15 15:55:19 +04:00
|
|
|
ioctl(s->fd, CDIOCALLOW);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-06-15 16:04:22 +04:00
|
|
|
static int cdrom_probe_device(const char *filename)
|
|
|
|
{
|
|
|
|
if (strstart(filename, "/dev/cd", NULL) ||
|
|
|
|
strstart(filename, "/dev/acd", NULL))
|
|
|
|
return 100;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-06-15 15:55:19 +04:00
|
|
|
static int cdrom_reopen(BlockDriverState *bs)
|
|
|
|
{
|
|
|
|
BDRVRawState *s = bs->opaque;
|
|
|
|
int fd;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Force reread of possibly changed/newly loaded disc,
|
|
|
|
* FreeBSD seems to not notice sometimes...
|
|
|
|
*/
|
|
|
|
if (s->fd >= 0)
|
2012-08-15 00:43:46 +04:00
|
|
|
qemu_close(s->fd);
|
2012-08-15 00:43:45 +04:00
|
|
|
fd = qemu_open(bs->filename, s->open_flags, 0644);
|
2009-06-15 15:55:19 +04:00
|
|
|
if (fd < 0) {
|
|
|
|
s->fd = -1;
|
|
|
|
return -EIO;
|
|
|
|
}
|
|
|
|
s->fd = fd;
|
|
|
|
|
2011-11-22 14:06:25 +04:00
|
|
|
/* make sure the door isn't locked at this time */
|
2009-06-15 15:55:19 +04:00
|
|
|
ioctl(s->fd, CDIOCALLOW);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-10-19 18:53:11 +03:00
|
|
|
static bool cdrom_is_inserted(BlockDriverState *bs)
|
2009-06-15 15:55:19 +04:00
|
|
|
{
|
|
|
|
return raw_getlength(bs) > 0;
|
|
|
|
}
|
|
|
|
|
2012-02-03 22:24:53 +04:00
|
|
|
static void cdrom_eject(BlockDriverState *bs, bool eject_flag)
|
2009-06-15 15:55:19 +04:00
|
|
|
{
|
|
|
|
BDRVRawState *s = bs->opaque;
|
|
|
|
|
|
|
|
if (s->fd < 0)
|
2011-07-20 20:23:42 +04:00
|
|
|
return;
|
2009-06-15 15:55:19 +04:00
|
|
|
|
|
|
|
(void) ioctl(s->fd, CDIOCALLOW);
|
|
|
|
|
|
|
|
if (eject_flag) {
|
|
|
|
if (ioctl(s->fd, CDIOCEJECT) < 0)
|
|
|
|
perror("CDIOCEJECT");
|
|
|
|
} else {
|
|
|
|
if (ioctl(s->fd, CDIOCCLOSE) < 0)
|
|
|
|
perror("CDIOCCLOSE");
|
|
|
|
}
|
|
|
|
|
2011-07-20 20:23:42 +04:00
|
|
|
cdrom_reopen(bs);
|
2009-06-15 15:55:19 +04:00
|
|
|
}
|
|
|
|
|
2011-09-06 20:58:47 +04:00
|
|
|
static void cdrom_lock_medium(BlockDriverState *bs, bool locked)
|
2009-06-15 15:55:19 +04:00
|
|
|
{
|
|
|
|
BDRVRawState *s = bs->opaque;
|
|
|
|
|
|
|
|
if (s->fd < 0)
|
2011-07-20 20:23:41 +04:00
|
|
|
return;
|
2009-06-15 15:55:19 +04:00
|
|
|
if (ioctl(s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW)) < 0) {
|
|
|
|
/*
|
|
|
|
* Note: an error can happen if the distribution automatically
|
|
|
|
* mounts the CD-ROM
|
|
|
|
*/
|
|
|
|
/* perror("CDROM_LOCKDOOR"); */
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static BlockDriver bdrv_host_cdrom = {
|
|
|
|
.format_name = "host_cdrom",
|
2010-04-08 00:30:24 +04:00
|
|
|
.protocol_name = "host_cdrom",
|
2009-06-15 15:55:19 +04:00
|
|
|
.instance_size = sizeof(BDRVRawState),
|
2013-09-24 19:07:04 +04:00
|
|
|
.bdrv_needs_filename = true,
|
2009-06-15 16:04:22 +04:00
|
|
|
.bdrv_probe_device = cdrom_probe_device,
|
2014-03-08 03:39:43 +04:00
|
|
|
.bdrv_parse_filename = cdrom_parse_filename,
|
2010-04-14 16:17:38 +04:00
|
|
|
.bdrv_file_open = cdrom_open,
|
2009-06-15 15:55:19 +04:00
|
|
|
.bdrv_close = raw_close,
|
2012-11-20 19:21:10 +04:00
|
|
|
.bdrv_reopen_prepare = raw_reopen_prepare,
|
|
|
|
.bdrv_reopen_commit = raw_reopen_commit,
|
|
|
|
.bdrv_reopen_abort = raw_reopen_abort,
|
2014-06-05 13:21:11 +04:00
|
|
|
.bdrv_create = hdev_create,
|
2014-06-05 13:21:01 +04:00
|
|
|
.create_opts = &raw_create_opts,
|
2009-06-15 15:55:19 +04:00
|
|
|
|
2016-06-03 18:36:27 +03:00
|
|
|
.bdrv_co_preadv = raw_co_preadv,
|
|
|
|
.bdrv_co_pwritev = raw_co_pwritev,
|
2009-09-04 21:01:49 +04:00
|
|
|
.bdrv_aio_flush = raw_aio_flush,
|
2011-11-29 15:42:20 +04:00
|
|
|
.bdrv_refresh_limits = raw_refresh_limits,
|
2014-07-04 14:04:34 +04:00
|
|
|
.bdrv_io_plug = raw_aio_plug,
|
|
|
|
.bdrv_io_unplug = raw_aio_unplug,
|
2009-06-15 15:55:19 +04:00
|
|
|
|
2011-09-21 03:10:37 +04:00
|
|
|
.bdrv_truncate = raw_truncate,
|
block: Avoid unecessary drv->bdrv_getlength() calls
The block layer generally keeps the size of an image cached in
bs->total_sectors so that it doesn't have to perform expensive
operations to get the size whenever it needs it.
This doesn't work however when using a backend that can change its size
without qemu being aware of it, i.e. passthrough of removable media like
CD-ROMs or floppy disks. For this reason, the caching is disabled when a
removable device is used.
It is obvious that checking whether the _guest_ device has removable
media isn't the right thing to do when we want to know whether the size
of the host backend can change. To make things worse, non-top-level
BlockDriverStates never have any device attached, which makes qemu
assume they are removable, so drv->bdrv_getlength() is always called on
the protocol layer. In the case of raw-posix, this causes unnecessary
lseek() system calls, which turned out to be rather expensive.
This patch completely changes the logic and disables bs->total_sectors
caching only for certain block driver types, for which a size change is
expected: host_cdrom and host_floppy on POSIX, host_device on win32; also
the raw format in case it sits on top of one of these protocols, but in
the common case the nested bdrv_getlength() call on the protocol driver
will use the cache again and avoid an expensive drv->bdrv_getlength()
call.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
2013-10-29 15:18:58 +04:00
|
|
|
.bdrv_getlength = raw_getlength,
|
|
|
|
.has_variable_length = true,
|
2011-07-12 15:56:39 +04:00
|
|
|
.bdrv_get_allocated_file_size
|
|
|
|
= raw_get_allocated_file_size,
|
2009-06-15 15:55:19 +04:00
|
|
|
|
2006-08-19 15:45:59 +04:00
|
|
|
/* removable device support */
|
2009-06-15 15:55:19 +04:00
|
|
|
.bdrv_is_inserted = cdrom_is_inserted,
|
|
|
|
.bdrv_eject = cdrom_eject,
|
2011-09-06 20:58:47 +04:00
|
|
|
.bdrv_lock_medium = cdrom_lock_medium,
|
2006-08-19 15:45:59 +04:00
|
|
|
};
|
2009-06-15 15:55:19 +04:00
|
|
|
#endif /* __FreeBSD__ */
|
2009-05-10 02:03:42 +04:00
|
|
|
|
2010-04-08 00:30:24 +04:00
|
|
|
static void bdrv_file_init(void)
|
2009-05-10 02:03:42 +04:00
|
|
|
{
|
2009-06-15 16:04:22 +04:00
|
|
|
/*
|
|
|
|
* Register all the drivers. Note that order is important, the driver
|
|
|
|
* registered last will get probed first.
|
|
|
|
*/
|
2010-04-08 00:30:24 +04:00
|
|
|
bdrv_register(&bdrv_file);
|
2009-05-10 02:03:42 +04:00
|
|
|
bdrv_register(&bdrv_host_device);
|
2009-06-15 15:55:19 +04:00
|
|
|
#ifdef __linux__
|
|
|
|
bdrv_register(&bdrv_host_cdrom);
|
|
|
|
#endif
|
2009-11-29 20:00:41 +03:00
|
|
|
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
|
2009-06-15 15:55:19 +04:00
|
|
|
bdrv_register(&bdrv_host_cdrom);
|
|
|
|
#endif
|
2009-05-10 02:03:42 +04:00
|
|
|
}
|
|
|
|
|
2010-04-08 00:30:24 +04:00
|
|
|
block_init(bdrv_file_init);
|