/* qemu/include/block/block_int.h */

/*
* QEMU System Emulator block driver
*
* Copyright (c) 2003 Fabrice Bellard
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef BLOCK_INT_H
#define BLOCK_INT_H
#include "block/accounting.h"
#include "block/block.h"
#include "qemu/option.h"
#include "qemu/queue.h"
#include "qemu/coroutine.h"
#include "qemu/timer.h"
#include "qapi-types.h"
#include "qemu/hbitmap.h"
#include "block/snapshot.h"
#include "qemu/main-loop.h"
#include "qemu/throttle.h"
/* Image flag bits.
 * NOTE(review): the consumers of these bits are not visible in this header;
 * confirm where they are set and tested before changing the values. */
#define BLOCK_FLAG_ENCRYPT 1
#define BLOCK_FLAG_LAZY_REFCOUNTS 8
/* String names of image options.
 * NOTE(review): presumably these are the keys looked up in the QemuOpts
 * passed to .bdrv_create()/.bdrv_amend_options() and listed in a driver's
 * create_opts -- confirm against the drivers. The strings are user-visible
 * option names, so they must not be changed casually. */
#define BLOCK_OPT_SIZE "size"
#define BLOCK_OPT_ENCRYPT "encryption"
#define BLOCK_OPT_COMPAT6 "compat6"
#define BLOCK_OPT_HWVERSION "hwversion"
#define BLOCK_OPT_BACKING_FILE "backing_file"
#define BLOCK_OPT_BACKING_FMT "backing_fmt"
#define BLOCK_OPT_CLUSTER_SIZE "cluster_size"
#define BLOCK_OPT_TABLE_SIZE "table_size"
#define BLOCK_OPT_PREALLOC "preallocation"
#define BLOCK_OPT_SUBFMT "subformat"
#define BLOCK_OPT_COMPAT_LEVEL "compat"
#define BLOCK_OPT_LAZY_REFCOUNTS "lazy_refcounts"
#define BLOCK_OPT_ADAPTER_TYPE "adapter_type"
#define BLOCK_OPT_REDUNDANCY "redundancy"
#define BLOCK_OPT_NOCOW "nocow"
#define BLOCK_OPT_OBJECT_SIZE "object_size"
#define BLOCK_OPT_REFCOUNT_BITS "refcount_bits"
/* NOTE(review): presumably the size in bytes of the buffer handed to a
 * driver's .bdrv_probe() callback -- confirm against the probing code. */
#define BLOCK_PROBE_BUF_SIZE 512
/*
 * Kind of operation a tracked request stands for; stored in
 * BdrvTrackedRequest.type.  The numeric values are spelled out and match
 * the implicit numbering the enumerators would get anyway.
 */
enum BdrvTrackedRequestType {
    BDRV_TRACKED_READ    = 0,
    BDRV_TRACKED_WRITE   = 1,
    BDRV_TRACKED_FLUSH   = 2,
    BDRV_TRACKED_IOCTL   = 3,
    BDRV_TRACKED_DISCARD = 4,
};
/*
 * Bookkeeping record for a single request that is being tracked on a
 * BlockDriverState.  Records are chained through @list; the matching list
 * head is BlockDriverState.tracked_requests.
 */
typedef struct BdrvTrackedRequest {
/* NOTE(review): presumably the node the request was issued on -- confirm */
BlockDriverState *bs;
/* NOTE(review): presumably start and length of the request; the units are
 * not stated here -- confirm against the code that fills them in */
int64_t offset;
unsigned int bytes;
/* kind of operation, see enum BdrvTrackedRequestType */
enum BdrvTrackedRequestType type;
/* NOTE(review): presumably set for requests counted in
 * BlockDriverState.serialising_in_flight -- confirm */
bool serialising;
/* NOTE(review): presumably the range used when checking for overlap with
 * other tracked requests; may differ from offset/bytes -- confirm */
int64_t overlap_offset;
unsigned int overlap_bytes;
/* linkage in the per-node list of tracked requests */
QLIST_ENTRY(BdrvTrackedRequest) list;
Coroutine *co; /* owner, used for deadlock detection */
CoQueue wait_queue; /* coroutines blocked on this request */
/* NOTE(review): presumably the request this one is currently blocked on
 * (used together with @co for deadlock detection) -- confirm */
struct BdrvTrackedRequest *waiting_for;
} BdrvTrackedRequest;
/*
 * Description of a block driver: its name, static capabilities and the
 * callbacks through which the generic block layer drives it.  Registered
 * drivers are chained through @list.  See the per-member comments for the
 * contract of individual callbacks.
 */
struct BlockDriver {
    const char *format_name;
    int instance_size;

    /* set to true if the BlockDriver is a block filter */
    bool is_filter;
    /* for snapshots block filter like Quorum can implement the
     * following recursive callback.
     * Its purpose is to recurse on the filter children while calling
     * bdrv_recurse_is_first_non_filter on them.
     * For a sample implementation look in the future Quorum block filter.
     */
    bool (*bdrv_recurse_is_first_non_filter)(BlockDriverState *bs,
                                             BlockDriverState *candidate);

    int (*bdrv_probe)(const uint8_t *buf, int buf_size, const char *filename);
    int (*bdrv_probe_device)(const char *filename);

    /* Any driver implementing this callback is expected to be able to handle
     * NULL file names in its .bdrv_open() implementation */
    void (*bdrv_parse_filename)(const char *filename, QDict *options,
                                Error **errp);
    /* Drivers not implementing bdrv_parse_filename nor bdrv_open should have
     * this field set to true, except ones that are defined only by their
     * child's bs.
     * An example of the last type will be the quorum block driver.
     */
    bool bdrv_needs_filename;

    /* Set if a driver can support backing files */
    bool supports_backing;

    /* For handling image reopen for split or non-split files */
    int (*bdrv_reopen_prepare)(BDRVReopenState *reopen_state,
                               BlockReopenQueue *queue, Error **errp);
    void (*bdrv_reopen_commit)(BDRVReopenState *reopen_state);
    void (*bdrv_reopen_abort)(BDRVReopenState *reopen_state);
    void (*bdrv_join_options)(QDict *options, QDict *old_options);

    int (*bdrv_open)(BlockDriverState *bs, QDict *options, int flags,
                     Error **errp);
    int (*bdrv_file_open)(BlockDriverState *bs, QDict *options, int flags,
                          Error **errp);
    void (*bdrv_close)(BlockDriverState *bs);
    int (*bdrv_create)(const char *filename, QemuOpts *opts, Error **errp);
    int (*bdrv_set_key)(BlockDriverState *bs, const char *key);
    int (*bdrv_make_empty)(BlockDriverState *bs);

    void (*bdrv_refresh_filename)(BlockDriverState *bs, QDict *options);

    /* aio */
    BlockAIOCB *(*bdrv_aio_readv)(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
    BlockAIOCB *(*bdrv_aio_writev)(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);
    BlockAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs,
        BlockCompletionFunc *cb, void *opaque);
    BlockAIOCB *(*bdrv_aio_discard)(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors,
        BlockCompletionFunc *cb, void *opaque);

    /* coroutine-based I/O; the *_readv/_writev variants are sector based,
     * the p-prefixed variants take a byte offset and byte count */
    int coroutine_fn (*bdrv_co_readv)(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
    int coroutine_fn (*bdrv_co_preadv)(BlockDriverState *bs,
        uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags);
    int coroutine_fn (*bdrv_co_writev)(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
    int coroutine_fn (*bdrv_co_writev_flags)(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int flags);
    int coroutine_fn (*bdrv_co_pwritev)(BlockDriverState *bs,
        uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags);

    /*
     * Efficiently zero a region of the disk image.  Typically an image format
     * would use a compact metadata representation to implement this.  This
     * function pointer may be NULL or return -ENOTSUP and .bdrv_co_writev()
     * will be called instead.
     */
    int coroutine_fn (*bdrv_co_pwrite_zeroes)(BlockDriverState *bs,
        int64_t offset, int count, BdrvRequestFlags flags);
    int coroutine_fn (*bdrv_co_discard)(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors);
    int64_t coroutine_fn (*bdrv_co_get_block_status)(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, int *pnum,
        BlockDriverState **file);

    /*
     * Invalidate any cached meta-data.
     */
    void (*bdrv_invalidate_cache)(BlockDriverState *bs, Error **errp);
    /*
     * Inactivate the image: bring it into the state it would have if it had
     * just been live migrated to this qemu instance (i.e. BDRV_O_INACTIVE is
     * set).  Used on the migration source before the migration completes, so
     * that the same image is not written to by both instances.
     */
    int (*bdrv_inactivate)(BlockDriverState *bs);

    /*
     * Flushes all data for all layers by calling bdrv_co_flush for underlying
     * layers, if needed. This function is needed for deterministic
     * synchronization of the flush finishing callback.
     */
    int coroutine_fn (*bdrv_co_flush)(BlockDriverState *bs);

    /*
     * Flushes all data that was already written to the OS all the way down to
     * the disk (for example raw-posix calls fsync()).
     */
    int coroutine_fn (*bdrv_co_flush_to_disk)(BlockDriverState *bs);

    /*
     * Flushes all internal caches to the OS. The data may still sit in a
     * writeback cache of the host OS, but it will survive a crash of the qemu
     * process.
     */
    int coroutine_fn (*bdrv_co_flush_to_os)(BlockDriverState *bs);

    const char *protocol_name;
    int (*bdrv_truncate)(BlockDriverState *bs, int64_t offset);

    int64_t (*bdrv_getlength)(BlockDriverState *bs);
    bool has_variable_length;
    int64_t (*bdrv_get_allocated_file_size)(BlockDriverState *bs);

    int (*bdrv_write_compressed)(BlockDriverState *bs, int64_t sector_num,
                                 const uint8_t *buf, int nb_sectors);

    /* snapshot handling */
    int (*bdrv_snapshot_create)(BlockDriverState *bs,
                                QEMUSnapshotInfo *sn_info);
    int (*bdrv_snapshot_goto)(BlockDriverState *bs,
                              const char *snapshot_id);
    int (*bdrv_snapshot_delete)(BlockDriverState *bs,
                                const char *snapshot_id,
                                const char *name,
                                Error **errp);
    int (*bdrv_snapshot_list)(BlockDriverState *bs,
                              QEMUSnapshotInfo **psn_info);
    int (*bdrv_snapshot_load_tmp)(BlockDriverState *bs,
                                  const char *snapshot_id,
                                  const char *name,
                                  Error **errp);
    int (*bdrv_get_info)(BlockDriverState *bs, BlockDriverInfo *bdi);
    ImageInfoSpecific *(*bdrv_get_specific_info)(BlockDriverState *bs);

    int coroutine_fn (*bdrv_save_vmstate)(BlockDriverState *bs,
                                          QEMUIOVector *qiov,
                                          int64_t pos);
    int coroutine_fn (*bdrv_load_vmstate)(BlockDriverState *bs,
                                          QEMUIOVector *qiov,
                                          int64_t pos);

    int (*bdrv_change_backing_file)(BlockDriverState *bs,
        const char *backing_file, const char *backing_fmt);

    /* removable device specific */
    bool (*bdrv_is_inserted)(BlockDriverState *bs);
    int (*bdrv_media_changed)(BlockDriverState *bs);
    void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);
    void (*bdrv_lock_medium)(BlockDriverState *bs, bool locked);

    /* to control generic scsi devices */
    BlockAIOCB *(*bdrv_aio_ioctl)(BlockDriverState *bs,
        unsigned long int req, void *buf,
        BlockCompletionFunc *cb, void *opaque);

    /* List of options for creating images, terminated by name == NULL */
    QemuOptsList *create_opts;

    /*
     * Returns 0 for completed check, -errno for internal errors.
     * The check results are stored in result.
     */
    int (*bdrv_check)(BlockDriverState *bs, BdrvCheckResult *result,
                      BdrvCheckMode fix);

    int (*bdrv_amend_options)(BlockDriverState *bs, QemuOpts *opts,
                              BlockDriverAmendStatusCB *status_cb,
                              void *cb_opaque);

    void (*bdrv_debug_event)(BlockDriverState *bs, BlkdebugEvent event);

    /* TODO Better pass a option string/QDict/QemuOpts to add any rule? */
    int (*bdrv_debug_breakpoint)(BlockDriverState *bs, const char *event,
                                 const char *tag);
    int (*bdrv_debug_remove_breakpoint)(BlockDriverState *bs,
                                        const char *tag);
    int (*bdrv_debug_resume)(BlockDriverState *bs, const char *tag);
    bool (*bdrv_debug_is_suspended)(BlockDriverState *bs, const char *tag);

    void (*bdrv_refresh_limits)(BlockDriverState *bs, Error **errp);

    /*
     * Returns 1 if newly created images are guaranteed to contain only
     * zeros, 0 otherwise.
     */
    int (*bdrv_has_zero_init)(BlockDriverState *bs);

    /* Remove fd handlers, timers, and other event loop callbacks so the event
     * loop is no longer in use.  Called with no in-flight requests and in
     * depth-first traversal order with parents before child nodes.
     */
    void (*bdrv_detach_aio_context)(BlockDriverState *bs);

    /* Add fd handlers, timers, and other event loop callbacks so I/O requests
     * can be processed again.  Called with no in-flight requests and in
     * depth-first traversal order with child nodes before parent nodes.
     */
    void (*bdrv_attach_aio_context)(BlockDriverState *bs,
                                    AioContext *new_context);

    /* io queue for linux-aio */
    void (*bdrv_io_plug)(BlockDriverState *bs);
    void (*bdrv_io_unplug)(BlockDriverState *bs);

    /**
     * Try to get @bs's logical and physical block size.
     * On success, store them in @bsz and return zero.
     * On failure, return negative errno.
     */
    int (*bdrv_probe_blocksizes)(BlockDriverState *bs, BlockSizes *bsz);
    /**
     * Try to get @bs's geometry (cyls, heads, sectors)
     * On success, store them in @geo and return 0.
     * On failure return -errno.
     * Only drivers that want to override guest geometry implement this
     * callback; see hd_geometry_guess().
     */
    int (*bdrv_probe_geometry)(BlockDriverState *bs, HDGeometry *geo);

    /**
     * Drain and stop any internal sources of requests in the driver, and
     * remain so until next I/O callback (e.g. bdrv_co_writev) is called.
     */
    void (*bdrv_drain)(BlockDriverState *bs);

    void (*bdrv_add_child)(BlockDriverState *parent, BlockDriverState *child,
                           Error **errp);
    void (*bdrv_del_child)(BlockDriverState *parent, BdrvChild *child,
                           Error **errp);

    /* linkage in the list of block drivers */
    QLIST_ENTRY(BlockDriver) list;
};
/*
 * I/O limits of a node; embedded in BlockDriverState as the member "bl".
 * Note the mixed units: some limits are expressed in sectors, others in
 * bytes, as stated on each member.
 */
typedef struct BlockLimits {
/* maximum number of sectors that can be discarded at once */
int max_discard;
/* optimal alignment for discard requests in sectors */
int64_t discard_alignment;
/* maximum number of bytes that can be zeroized at once (since it is
 * signed, it must be < 2G, if set) */
int32_t max_pwrite_zeroes;
/* optimal alignment for write zeroes requests in bytes, must be
 * power of 2, and less than max_pwrite_zeroes if that is set */
uint32_t pwrite_zeroes_alignment;
/* optimal transfer length in sectors */
int opt_transfer_length;
/* maximal transfer length in sectors */
int max_transfer_length;
/* memory alignment so that no bounce buffer is needed */
size_t min_mem_alignment;
/* memory alignment for bounce buffer */
size_t opt_mem_alignment;
/* maximum number of iovec elements */
int max_iov;
} BlockLimits;
/* Forward declaration only; the struct body is not defined in the visible
 * part of this header.  Used as the element type of
 * BlockDriverState.op_blockers. */
typedef struct BdrvOpBlocker BdrvOpBlocker;
/*
 * Entry of BlockDriverState.aio_notifiers: a pair of callbacks through which
 * a registered party is told about changes regarding the node's AioContext.
 */
typedef struct BdrvAioNotifier {
/* called with the new AioContext and the registered @opaque */
void (*attached_aio_context)(AioContext *new_context, void *opaque);
/* called with the registered @opaque */
void (*detach_aio_context)(void *opaque);
/* caller-supplied value passed back to both callbacks */
void *opaque;
/* NOTE(review): presumably marks an entry whose removal was requested while
 * the list was being walked (see BlockDriverState.walking_aio_notifiers)
 * -- confirm */
bool deleted;
QLIST_ENTRY(BdrvAioNotifier) list;
} BdrvAioNotifier;
/*
 * Set of callbacks attached to a parent/child edge (see BdrvChild.role).
 * NOTE(review): which callbacks may be left NULL is not stated here, apart
 * from the drained_begin/drained_end pair described below -- confirm against
 * the callers.
 */
struct BdrvChildRole {
/* Fills in the child's flags and options from those of the parent.
 * NOTE(review): exact inheritance rules are role specific and not visible
 * here -- confirm */
void (*inherit_options)(int *child_flags, QDict *child_options,
int parent_flags, QDict *parent_options);
/* NOTE(review): presumably notifies the parent of a medium being loaded
 * (@load true) or removed -- confirm */
void (*change_media)(BdrvChild *child, bool load);
/* NOTE(review): presumably notifies the parent that the child changed size
 * -- confirm */
void (*resize)(BdrvChild *child);
/* Returns a name that is supposedly more useful for human users than the
 * node name for identifying the node in question (in particular, a BB
 * name), or NULL if the parent can't provide a better name. */
const char* (*get_name)(BdrvChild *child);
/*
 * If this pair of functions is implemented, the parent doesn't issue new
 * requests after returning from .drained_begin() until .drained_end() is
 * called.
 *
 * Note that this can be nested. If drained_begin() was called twice, new
 * I/O is allowed only after drained_end() was called twice, too.
 */
void (*drained_begin)(BdrvChild *child);
void (*drained_end)(BdrvChild *child);
};
/* Child roles defined outside this header.
 * NOTE(review): presumably child_file is the role of a node's "file" child
 * and child_format the role used for format-level children -- confirm
 * against the definitions. */
extern const BdrvChildRole child_file;
extern const BdrvChildRole child_format;
/*
 * One edge of the node graph, linking a parent to a child node.  An edge is
 * a member of two lists (see BlockDriverState.children and
 * BlockDriverState.parents).
 */
struct BdrvChild {
/* NOTE(review): presumably the child node this edge points to -- confirm */
BlockDriverState *bs;
/* name of the edge; NOTE(review): ownership of the string is not visible
 * here -- confirm who frees it */
char *name;
/* callbacks describing how the parent uses this child */
const BdrvChildRole *role;
/* NOTE(review): presumably parent-private data for the role callbacks
 * -- confirm */
void *opaque;
/* NOTE(review): presumably linkage in the parent's list of children */
QLIST_ENTRY(BdrvChild) next;
/* NOTE(review): presumably linkage in the child's list of parents */
QLIST_ENTRY(BdrvChild) next_parent;
};
/*
* Note: the function bdrv_append() copies and swaps contents of
* BlockDriverStates, so if you add new fields to this struct, please
* inspect bdrv_append() to determine if the new fields need to be
* copied as well.
*/
struct BlockDriverState {
int64_t total_sectors; /* if we are reading a disk image, give its
size in sectors */
int read_only; /* if true, the media is read only */
int open_flags; /* flags used to open the file, re-used for re-open */
int encrypted; /* if true, the media is encrypted */
int valid_key; /* if true, a valid encryption key has been set */
int sg; /* if true, the device is a /dev/sg* */
int copy_on_read; /* if true, copy read backing sectors into image
note this is a reference count */
raw: Prohibit dangerous writes for probed images If the user neglects to specify the image format, QEMU probes the image to guess it automatically, for convenience. Relying on format probing is insecure for raw images (CVE-2008-2004). If the guest writes a suitable header to the device, the next probe will recognize a format chosen by the guest. A malicious guest can abuse this to gain access to host files, e.g. by crafting a QCOW2 header with backing file /etc/shadow. Commit 1e72d3b (April 2008) provided -drive parameter format to let users disable probing. Commit f965509 (March 2009) extended QCOW2 to optionally store the backing file format, to let users disable backing file probing. QED has had a flag to suppress probing since the beginning (2010), set whenever a raw backing file is assigned. All of these additions that allow to avoid format probing have to be specified explicitly. The default still allows the attack. In order to fix this, commit 79368c8 (July 2010) put probed raw images in a restricted mode, in which they wouldn't be able to overwrite the first few bytes of the image so that they would identify as a different image. If a write to the first sector would write one of the signatures of another driver, qemu would instead zero out the first four bytes. This patch was later reverted in commit 8b33d9e (September 2010) because it didn't get the handling of unaligned qiov members right. Today's block layer that is based on coroutines and has qiov utility functions makes it much easier to get this functionality right, so this patch implements it. The other differences of this patch to the old one are that it doesn't silently write something different than the guest requested by zeroing out some bytes (it fails the request instead) and that it doesn't maintain a list of signatures in the raw driver (it calls the usual probe function instead). 
Note that this change doesn't introduce new breakage for false positive cases where the guest legitimately writes data into the first sector that matches the signatures of an image format (e.g. for nested virt): These cases were broken before, only the failure mode changes from corruption after the next restart (when the wrong format is probed) to failing the problematic write request. Also note that like in the original patch, the restrictions only apply if the image format has been guessed by probing. Explicitly specifying a format allows guests to write anything they like. Signed-off-by: Kevin Wolf <kwolf@redhat.com> Reviewed-by: Eric Blake <eblake@redhat.com> Reviewed-by: Max Reitz <mreitz@redhat.com> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> Message-id: 1416497234-29880-8-git-send-email-kwolf@redhat.com Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2014-11-20 18:27:12 +03:00
bool probed;
BlockDriver *drv; /* NULL means no media */
void *opaque;
AioContext *aio_context; /* event loop used for fd handlers, timers, etc */
/* long-running tasks intended to always use the same AioContext as this
* BDS may register themselves in this list to be notified of changes
* regarding this BDS's context */
QLIST_HEAD(, BdrvAioNotifier) aio_notifiers;
bool walking_aio_notifiers; /* to make removal during iteration safe */
char filename[PATH_MAX];
char backing_file[PATH_MAX]; /* if non zero, the image is a diff of
this file image */
char backing_format[16]; /* if non-zero and backing_file exists */
QDict *full_open_options;
char exact_filename[PATH_MAX];
BdrvChild *backing;
BdrvChild *file;
/* Callback before write request is processed */
NotifierWithReturnList before_write_notifiers;
/* number of in-flight serialising requests */
unsigned int serialising_in_flight;
/* Offset after the highest byte written to */
uint64_t wr_highest_offset;
/* I/O Limits */
BlockLimits bl;
/* Alignment requirement for offset/length of I/O requests */
unsigned int request_alignment;
/* Flags honored during pwrite (so far: BDRV_REQ_FUA) */
unsigned int supported_write_flags;
/* Flags honored during pwrite_zeroes (so far: BDRV_REQ_FUA,
block: Honor BDRV_REQ_FUA during write_zeroes The block layer has a couple of cases where it can lose Force Unit Access semantics when writing a large block of zeroes, such that the request returns before the zeroes have been guaranteed to land on underlying media. SCSI does not support FUA during WRITESAME(10/16); FUA is only supported if it falls back to WRITE(10/16). But where the underlying device is new enough to not need a fallback, it means that any upper layer request with FUA semantics was silently ignoring BDRV_REQ_FUA. Conversely, NBD has situations where it can support FUA but not ZERO_WRITE; when that happens, the generic block layer fallback to bdrv_driver_pwritev() (or the older bdrv_co_writev() in qemu 2.6) was losing the FUA flag. The problem of losing flags unrelated to ZERO_WRITE has been latent in bdrv_co_do_write_zeroes() since commit aa7bfbff, but back then, it did not matter because there was no FUA flag. It became observable when commit 93f5e6d8 paved the way for flags that can impact correctness, when we should have been using bdrv_co_writev_flags() with modified flags. Compare to commit 9eeb6dd, which got flag manipulation right in bdrv_co_do_zero_pwritev(). Symptoms: I tested with qemu-io with default writethrough cache (which is supposed to use FUA semantics on every write), and targetted an NBD client connected to a server that intentionally did not advertise NBD_FLAG_SEND_FUA. When doing 'write 0 512', the NBD client sent two operations (NBD_CMD_WRITE then NBD_CMD_FLUSH) to get the fallback FUA semantics; but when doing 'write -z 0 512', the NBD client sent only NBD_CMD_WRITE. The fix is do to a cleanup bdrv_co_flush() at the end of the operation if any step in the middle relied on a BDS that does not natively support FUA for that step (note that we don't need to flush after every operation, if the operation is broken into chunks based on bounce-buffer sizing). 
Each BDS gains a new flag .supported_zero_flags, which parallels the use of .supported_write_flags but only when accessing a zero write operation (the flags MUST be different, because of SCSI having different semantics based on WRITE vs. WRITESAME; and also because BDRV_REQ_MAY_UNMAP only makes sense on zero writes). Also fix some documentation to describe -ENOTSUP semantics, particularly since iscsi depends on those semantics. Down the road, we may want to add a driver where its .bdrv_co_pwritev() honors all three of BDRV_REQ_FUA, BDRV_REQ_ZERO_WRITE, and BDRV_REQ_MAY_UNMAP, and advertise this via bs->supported_write_flags for blocks opened by that driver; such a driver should NOT supply .bdrv_co_write_zeroes nor .supported_zero_flags. But none of the drivers touched in this patch want to do that (the act of writing zeroes is different enough from normal writes to deserve a second callback). Signed-off-by: Eric Blake <eblake@redhat.com> Reviewed-by: Fam Zheng <famz@redhat.com> Acked-by: Stefan Hajnoczi <stefanha@redhat.com> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2016-05-04 01:39:07 +03:00
* BDRV_REQ_MAY_UNMAP) */
unsigned int supported_zero_flags;
/* the following member gives a name to every node on the bs graph. */
char node_name[32];
/* element of the list of named nodes building the graph */
QTAILQ_ENTRY(BlockDriverState) node_list;
/* element of the list of all BlockDriverStates (all_bdrv_states) */
QTAILQ_ENTRY(BlockDriverState) bs_list;
/* element of the list of monitor-owned BDS */
QTAILQ_ENTRY(BlockDriverState) monitor_list;
QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps;
int refcnt;
QLIST_HEAD(, BdrvTrackedRequest) tracked_requests;
/* operation blockers */
QLIST_HEAD(, BdrvOpBlocker) op_blockers[BLOCK_OP_TYPE_MAX];
/* long-running background operation */
BlockJob *job;
/* The node that this node inherited default options from (and a reopen on
* which can affect this node by changing these defaults). This is always a
* parent node of this node. */
BlockDriverState *inherits_from;
QLIST_HEAD(, BdrvChild) children;
QLIST_HEAD(, BdrvChild) parents;
QDict *options;
QDict *explicit_options;
BlockdevDetectZeroesOptions detect_zeroes;
/* The error object in use for blocking operations on backing_hd */
Error *backing_blocker;
block: add event when disk usage exceeds threshold Managing applications, like oVirt (http://www.ovirt.org), make extensive use of thin-provisioned disk images. To let the guest run smoothly and be not unnecessarily paused, oVirt sets a disk usage threshold (so called 'high water mark') based on the occupation of the device, and automatically extends the image once the threshold is reached or exceeded. In order to detect the crossing of the threshold, oVirt has no choice but aggressively polling the QEMU monitor using the query-blockstats command. This lead to unnecessary system load, and is made even worse under scale: deployments with hundreds of VMs are no longer rare. To fix this, this patch adds: * A new monitor command `block-set-write-threshold', to set a mark for a given block device. * A new event `BLOCK_WRITE_THRESHOLD', to report if a block device usage exceeds the threshold. * A new `write_threshold' field into the `BlockDeviceInfo' structure, to report the configured threshold. This will allow the managing application to use smarter and more efficient monitoring, greatly reducing the need of polling. [Updated qemu-iotests 067 output to add the new 'write_threshold' property. --Stefan] [Changed g_assert_false() to !g_assert() to fix the build on older glib versions. --Kevin] Signed-off-by: Francesco Romani <fromani@redhat.com> Reviewed-by: Eric Blake <eblake@redhat.com> Message-id: 1421068273-692-1-git-send-email-fromani@redhat.com Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2015-01-12 16:11:13 +03:00
/* threshold limit for writes, in bytes. "High water mark". */
uint64_t write_threshold_offset;
NotifierWithReturn write_threshold_notifier;
/* counters for nested bdrv_io_plug and bdrv_io_unplugged_begin */
unsigned io_plugged;
unsigned io_plug_disabled;
int quiesce_counter;
};
/*
 * Small set of root-node properties; each member has a same-named
 * counterpart in BlockDriverState.
 * NOTE(review): presumably kept by a BlockBackend so the settings survive
 * while no root node is attached -- confirm against the BlockBackend code.
 */
struct BlockBackendRootState {
/* open flags, cf. BlockDriverState.open_flags */
int open_flags;
/* read-only setting, cf. BlockDriverState.read_only */
bool read_only;
/* zero-detection mode, cf. BlockDriverState.detect_zeroes */
BlockdevDetectZeroesOptions detect_zeroes;
};
block/mirror: Fix target backing BDS Currently, we are trying to move the backing BDS from the source to the target in bdrv_replace_in_backing_chain() which is called from mirror_exit(). However, mirror_complete() already tries to open the target's backing chain with a call to bdrv_open_backing_file(). First, we should only set the target's backing BDS once. Second, the mirroring block job has a better idea of what to set it to than the generic code in bdrv_replace_in_backing_chain() (in fact, the latter's conditions on when to move the backing BDS from source to target are not really correct). Therefore, remove that code from bdrv_replace_in_backing_chain() and leave it to mirror_complete(). Depending on what kind of mirroring is performed, we furthermore want to use different strategies to open the target's backing chain: - If blockdev-mirror is used, we can assume the user made sure that the target already has the correct backing chain. In particular, we should not try to open a backing file if the target does not have any yet. - If drive-mirror with mode=absolute-paths is used, we can and should reuse the already existing chain of nodes that the source BDS is in. In case of sync=full, no backing BDS is required; with sync=top, we just link the source's backing BDS to the target, and with sync=none, we use the source BDS as the target's backing BDS. We should not try to open these backing files anew because this would lead to two BDSs existing per physical file in the backing chain, and we would like to avoid such concurrent access. - If drive-mirror with mode=existing is used, we have to use the information provided in the physical image file which means opening the target's backing chain completely anew, just as it has been done already. If the target's backing chain shares images with the source, this may lead to multiple BDSs per physical image file. But since we cannot reliably ascertain this case, there is nothing we can do about it. 
Signed-off-by: Max Reitz <mreitz@redhat.com> Message-id: 20160610185750.30956-3-mreitz@redhat.com Reviewed-by: Kevin Wolf <kwolf@redhat.com> Reviewed-by: Fam Zheng <famz@redhat.com> Signed-off-by: Max Reitz <mreitz@redhat.com>
2016-06-10 21:57:47 +03:00
/* Selects how the target's backing chain is established after a mirror job
* completes (passed to mirror_start() as its backing_mode argument). */
typedef enum BlockMirrorBackingMode {
/* Reuse the existing backing chain from the source for the target.
* - sync=full: Set backing BDS to NULL.
* - sync=top: Use source's backing BDS.
* - sync=none: Use source as the backing BDS. */
MIRROR_SOURCE_BACKING_CHAIN,
/* Open the target's backing chain completely anew */
MIRROR_OPEN_BACKING_CHAIN,
/* Do not change the target's backing BDS after job completion */
MIRROR_LEAVE_BACKING_CHAIN,
} BlockMirrorBackingMode;
/* Return the BlockDriverState reached through @bs's backing link, or NULL
 * when @bs has no backing link. */
static inline BlockDriverState *backing_bs(BlockDriverState *bs)
{
    if (!bs->backing) {
        return NULL;
    }
    return bs->backing->bs;
}
/* Essential block drivers which must always be statically linked into qemu, and
* which therefore can be accessed without using bdrv_find_format().
* These are declarations only; the driver objects are defined elsewhere. */
extern BlockDriver bdrv_file;
extern BlockDriver bdrv_raw;
extern BlockDriver bdrv_qcow2;
/**
* bdrv_setup_io_funcs:
* @bdrv: The driver whose unimplemented interfaces are filled in.
*
* Prepare a #BlockDriver for I/O request processing by populating
* unimplemented coroutine and AIO interfaces with generic wrapper functions
* that fall back to implemented interfaces.
*/
void bdrv_setup_io_funcs(BlockDriver *bdrv);
/**
* bdrv_co_preadv:
* @bs: Block device to operate on.
* @offset: Position of the request. NOTE(review): presumably a byte offset,
* matching @bytes -- confirm against the implementation.
* @bytes: Length of the request.
* @qiov: I/O vector describing the request's buffers.
* @flags: Request flags.
*
* Read request entry point, annotated coroutine_fn.
* NOTE(review): the return convention is not visible here; presumably 0 on
* success and a negative errno value on failure -- confirm.
*/
int coroutine_fn bdrv_co_preadv(BlockDriverState *bs,
int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
BdrvRequestFlags flags);
/**
* bdrv_co_pwritev:
* @bs: Block device to operate on.
* @offset: Position of the request. NOTE(review): presumably a byte offset,
* matching @bytes -- confirm against the implementation.
* @bytes: Length of the request.
* @qiov: I/O vector describing the request's buffers.
* @flags: Request flags.
*
* Write request entry point, annotated coroutine_fn.
* NOTE(review): the return convention is not visible here; presumably 0 on
* success and a negative errno value on failure -- confirm.
*/
int coroutine_fn bdrv_co_pwritev(BlockDriverState *bs,
int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
BdrvRequestFlags flags);
/* NOTE(review): presumably stores a temporary file name in @filename, a
* buffer of @size bytes, and returns a status code -- confirm buffer and
* return semantics against the implementation. */
int get_tmp_filename(char *filename, int size);
raw: Prohibit dangerous writes for probed images If the user neglects to specify the image format, QEMU probes the image to guess it automatically, for convenience. Relying on format probing is insecure for raw images (CVE-2008-2004). If the guest writes a suitable header to the device, the next probe will recognize a format chosen by the guest. A malicious guest can abuse this to gain access to host files, e.g. by crafting a QCOW2 header with backing file /etc/shadow. Commit 1e72d3b (April 2008) provided -drive parameter format to let users disable probing. Commit f965509 (March 2009) extended QCOW2 to optionally store the backing file format, to let users disable backing file probing. QED has had a flag to suppress probing since the beginning (2010), set whenever a raw backing file is assigned. All of these additions that allow to avoid format probing have to be specified explicitly. The default still allows the attack. In order to fix this, commit 79368c8 (July 2010) put probed raw images in a restricted mode, in which they wouldn't be able to overwrite the first few bytes of the image so that they would identify as a different image. If a write to the first sector would write one of the signatures of another driver, qemu would instead zero out the first four bytes. This patch was later reverted in commit 8b33d9e (September 2010) because it didn't get the handling of unaligned qiov members right. Today's block layer that is based on coroutines and has qiov utility functions makes it much easier to get this functionality right, so this patch implements it. The other differences of this patch to the old one are that it doesn't silently write something different than the guest requested by zeroing out some bytes (it fails the request instead) and that it doesn't maintain a list of signatures in the raw driver (it calls the usual probe function instead). 
Note that this change doesn't introduce new breakage for false positive cases where the guest legitimately writes data into the first sector that matches the signatures of an image format (e.g. for nested virt): These cases were broken before, only the failure mode changes from corruption after the next restart (when the wrong format is probed) to failing the problematic write request. Also note that like in the original patch, the restrictions only apply if the image format has been guessed by probing. Explicitly specifying a format allows guests to write anything they like. Signed-off-by: Kevin Wolf <kwolf@redhat.com> Reviewed-by: Eric Blake <eblake@redhat.com> Reviewed-by: Max Reitz <mreitz@redhat.com> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> Message-id: 1416497234-29880-8-git-send-email-kwolf@redhat.com Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2014-11-20 18:27:12 +03:00
/* Format probing over @buf_size bytes of image data in @buf, with @filename
* supplied alongside.
* NOTE(review): presumably returns the driver that matches best, and may
* return NULL when none matches -- confirm against the implementation. */
BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
const char *filename);
/**
* bdrv_add_before_write_notifier:
* @bs: Block device whose write requests trigger the callback.
* @notifier: The notifier to register.
*
* Register a callback that is invoked before write requests are processed but
* after any throttling or waiting for overlapping requests.
*/
void bdrv_add_before_write_notifier(BlockDriverState *bs,
NotifierWithReturn *notifier);
/**
* bdrv_detach_aio_context:
* @bs: The child node to detach.
*
* May be called from .bdrv_detach_aio_context() to detach children from the
* current #AioContext. This is only needed by block drivers that manage their
* own children. Both ->file and ->backing are automatically handled and
* block drivers should not call this function on them explicitly.
*/
void bdrv_detach_aio_context(BlockDriverState *bs);
/**
* bdrv_attach_aio_context:
* @bs: The child node to attach.
* @new_context: The #AioContext to attach it to.
*
* May be called from .bdrv_attach_aio_context() to attach children to the new
* #AioContext. This is only needed by block drivers that manage their own
* children. Both ->file and ->backing are automatically handled and block
* drivers should not call this function on them explicitly.
*/
void bdrv_attach_aio_context(BlockDriverState *bs,
AioContext *new_context);
/**
* bdrv_add_aio_context_notifier:
* @bs: The BDS whose AioContext association is watched.
* @attached_aio_context: Callback receiving the new AioContext and @opaque.
* @detach_aio_context: Callback receiving @opaque.
* @opaque: Opaque pointer value passed to both callbacks.
*
* If a long-running job intends to be always run in the same AioContext as a
* certain BDS, it may use this function to be notified of changes regarding the
* association of the BDS to an AioContext.
*
* attached_aio_context() is called after the target BDS has been attached to a
* new AioContext; detach_aio_context() is called before the target BDS is being
* detached from its old AioContext.
*/
void bdrv_add_aio_context_notifier(BlockDriverState *bs,
void (*attached_aio_context)(AioContext *new_context, void *opaque),
void (*detach_aio_context)(void *opaque), void *opaque);
/**
* bdrv_remove_aio_context_notifier:
* @bs: The BDS the notifier was registered on.
* @aio_context_attached: The attach callback that was registered.
* @aio_context_detached: The detach callback that was registered.
* @opaque: The opaque pointer that was registered.
*
* Unsubscribe from change notifications regarding the BDS's AioContext. The
* parameters given here have to be the same as those given to
* bdrv_add_aio_context_notifier().
*/
void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
void (*aio_context_attached)(AioContext *,
void *),
void (*aio_context_detached)(void *),
void *opaque);
#ifdef _WIN32
/* Only declared for Windows builds.
* NOTE(review): presumably returns non-zero when @filename names a Windows
* drive -- confirm the accepted forms against the implementation. */
int is_windows_drive(const char *filename);
#endif
/**
* stream_start:
* @bs: Block device to operate on.
* @base: Block device that will become the new base, or %NULL to
* flatten the whole backing file chain onto @bs.
* @base_id: The file name that will be written to @bs as the new
* backing file if the job completes. Ignored if @base is %NULL.
* @speed: The maximum speed, in bytes per second, or 0 for unlimited.
* @on_error: The action to take upon error.
* @cb: Completion function for the job.
* @opaque: Opaque pointer value passed to @cb.
* @errp: Error object.
*
* Start a streaming operation on @bs. Clusters that are unallocated
* in @bs, but allocated in any image between @base and @bs (both
* exclusive) will be written to @bs. At the end of a successful
* streaming job, the backing file of @bs will be changed to
* @base_id in the written image and to @base in the live BlockDriverState.
*
* The function itself returns no value. NOTE(review): a failure to start
* the job is presumably reported through @errp only -- confirm.
*/
void stream_start(BlockDriverState *bs, BlockDriverState *base,
const char *base_id, int64_t speed, BlockdevOnError on_error,
BlockCompletionFunc *cb,
void *opaque, Error **errp);
/**
* commit_start:
* @bs: Active block device.
* @base: Block device that will be written into, and become the new top.
* @top: Top block device to be committed.
* @speed: The maximum speed, in bytes per second, or 0 for unlimited.
* @on_error: The action to take upon error.
* @cb: Completion function for the job.
* @opaque: Opaque pointer value passed to @cb.
* @backing_file_str: String to use as the backing file in @top's overlay
* @errp: Error object.
*
* Start a commit job. Take care with the argument order: @base is passed
* before @top.
*/
void commit_start(BlockDriverState *bs, BlockDriverState *base,
BlockDriverState *top, int64_t speed,
BlockdevOnError on_error, BlockCompletionFunc *cb,
void *opaque, const char *backing_file_str, Error **errp);
/**
* commit_active_start:
* @bs: Active block device to be committed.
* @base: Block device that will be written into, and become the new top.
* @speed: The maximum speed, in bytes per second, or 0 for unlimited.
* @on_error: The action to take upon error.
* @cb: Completion function for the job.
* @opaque: Opaque pointer value passed to @cb.
* @errp: Error object.
*
* Start a job that commits the active block device @bs into @base.
*/
void commit_active_start(BlockDriverState *bs, BlockDriverState *base,
int64_t speed,
BlockdevOnError on_error,
BlockCompletionFunc *cb,
void *opaque, Error **errp);
mirror: introduce mirror job This patch adds the implementation of a new job that mirrors a disk to a new image while letting the guest continue using the old image. The target is treated as a "black box" and data is copied from the source to the target in the background. This can be used for several purposes, including storage migration, continuous replication, and observation of the guest I/O in an external program. It is also a first step in replacing the inefficient block migration code that is part of QEMU. The job is possibly never-ending, but it is logically structured into two phases: 1) copy all data as fast as possible until the target first gets in sync with the source; 2) keep target in sync and ensure that reopening to the target gets a correct (full) copy of the source data. The second phase is indicated by the progress in "info block-jobs" reporting the current offset to be equal to the length of the file. When the job is cancelled in the second phase, QEMU will run the job until the source is clean and quiescent, then it will report successful completion of the job. In other words, the BLOCK_JOB_CANCELLED event means that the target may _not_ be consistent with a past state of the source; the BLOCK_JOB_COMPLETED event means that the target is consistent with a past state of the source. (Note that it could already happen that management lost the race against QEMU and got a completion event instead of cancellation). It is not yet possible to complete the job and switch over to the target disk. The next patches will fix this and add many refinements to the basic idea introduced here. These include improved error management, some tunable knobs and performance optimizations. Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2012-10-18 18:49:23 +04:00
/*
* mirror_start:
* @bs: Block device to operate on.
* @target: Block device to write to.
* @replaces: Block graph node name to replace once the mirror is done. Can
* only be used when full mirroring is selected.
mirror: introduce mirror job This patch adds the implementation of a new job that mirrors a disk to a new image while letting the guest continue using the old image. The target is treated as a "black box" and data is copied from the source to the target in the background. This can be used for several purposes, including storage migration, continuous replication, and observation of the guest I/O in an external program. It is also a first step in replacing the inefficient block migration code that is part of QEMU. The job is possibly never-ending, but it is logically structured into two phases: 1) copy all data as fast as possible until the target first gets in sync with the source; 2) keep target in sync and ensure that reopening to the target gets a correct (full) copy of the source data. The second phase is indicated by the progress in "info block-jobs" reporting the current offset to be equal to the length of the file. When the job is cancelled in the second phase, QEMU will run the job until the source is clean and quiescent, then it will report successful completion of the job. In other words, the BLOCK_JOB_CANCELLED event means that the target may _not_ be consistent with a past state of the source; the BLOCK_JOB_COMPLETED event means that the target is consistent with a past state of the source. (Note that it could already happen that management lost the race against QEMU and got a completion event instead of cancellation). It is not yet possible to complete the job and switch over to the target disk. The next patches will fix this and add many refinements to the basic idea introduced here. These include improved error management, some tunable knobs and performance optimizations. Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2012-10-18 18:49:23 +04:00
* @speed: The maximum speed, in bytes per second, or 0 for unlimited.
* @granularity: The chosen granularity for the dirty bitmap.
* @buf_size: The amount of data that can be in flight at one time.
mirror: introduce mirror job This patch adds the implementation of a new job that mirrors a disk to a new image while letting the guest continue using the old image. The target is treated as a "black box" and data is copied from the source to the target in the background. This can be used for several purposes, including storage migration, continuous replication, and observation of the guest I/O in an external program. It is also a first step in replacing the inefficient block migration code that is part of QEMU. The job is possibly never-ending, but it is logically structured into two phases: 1) copy all data as fast as possible until the target first gets in sync with the source; 2) keep target in sync and ensure that reopening to the target gets a correct (full) copy of the source data. The second phase is indicated by the progress in "info block-jobs" reporting the current offset to be equal to the length of the file. When the job is cancelled in the second phase, QEMU will run the job until the source is clean and quiescent, then it will report successful completion of the job. In other words, the BLOCK_JOB_CANCELLED event means that the target may _not_ be consistent with a past state of the source; the BLOCK_JOB_COMPLETED event means that the target is consistent with a past state of the source. (Note that it could already happen that management lost the race against QEMU and got a completion event instead of cancellation). It is not yet possible to complete the job and switch over to the target disk. The next patches will fix this and add many refinements to the basic idea introduced here. These include improved error management, some tunable knobs and performance optimizations. Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2012-10-18 18:49:23 +04:00
* @mode: Whether to collapse all images in the chain to the target.
block/mirror: Fix target backing BDS Currently, we are trying to move the backing BDS from the source to the target in bdrv_replace_in_backing_chain() which is called from mirror_exit(). However, mirror_complete() already tries to open the target's backing chain with a call to bdrv_open_backing_file(). First, we should only set the target's backing BDS once. Second, the mirroring block job has a better idea of what to set it to than the generic code in bdrv_replace_in_backing_chain() (in fact, the latter's conditions on when to move the backing BDS from source to target are not really correct). Therefore, remove that code from bdrv_replace_in_backing_chain() and leave it to mirror_complete(). Depending on what kind of mirroring is performed, we furthermore want to use different strategies to open the target's backing chain: - If blockdev-mirror is used, we can assume the user made sure that the target already has the correct backing chain. In particular, we should not try to open a backing file if the target does not have any yet. - If drive-mirror with mode=absolute-paths is used, we can and should reuse the already existing chain of nodes that the source BDS is in. In case of sync=full, no backing BDS is required; with sync=top, we just link the source's backing BDS to the target, and with sync=none, we use the source BDS as the target's backing BDS. We should not try to open these backing files anew because this would lead to two BDSs existing per physical file in the backing chain, and we would like to avoid such concurrent access. - If drive-mirror with mode=existing is used, we have to use the information provided in the physical image file which means opening the target's backing chain completely anew, just as it has been done already. If the target's backing chain shares images with the source, this may lead to multiple BDSs per physical image file. But since we cannot reliably ascertain this case, there is nothing we can do about it. 
Signed-off-by: Max Reitz <mreitz@redhat.com> Message-id: 20160610185750.30956-3-mreitz@redhat.com Reviewed-by: Kevin Wolf <kwolf@redhat.com> Reviewed-by: Fam Zheng <famz@redhat.com> Signed-off-by: Max Reitz <mreitz@redhat.com>
2016-06-10 21:57:47 +03:00
* @backing_mode: How to establish the target's backing chain after completion.
* @on_source_error: The action to take upon error reading from the source.
* @on_target_error: The action to take upon error writing to the target.
* @unmap: Whether to unmap target where source sectors only contain zeroes.
mirror: introduce mirror job This patch adds the implementation of a new job that mirrors a disk to a new image while letting the guest continue using the old image. The target is treated as a "black box" and data is copied from the source to the target in the background. This can be used for several purposes, including storage migration, continuous replication, and observation of the guest I/O in an external program. It is also a first step in replacing the inefficient block migration code that is part of QEMU. The job is possibly never-ending, but it is logically structured into two phases: 1) copy all data as fast as possible until the target first gets in sync with the source; 2) keep target in sync and ensure that reopening to the target gets a correct (full) copy of the source data. The second phase is indicated by the progress in "info block-jobs" reporting the current offset to be equal to the length of the file. When the job is cancelled in the second phase, QEMU will run the job until the source is clean and quiescent, then it will report successful completion of the job. In other words, the BLOCK_JOB_CANCELLED event means that the target may _not_ be consistent with a past state of the source; the BLOCK_JOB_COMPLETED event means that the target is consistent with a past state of the source. (Note that it could already happen that management lost the race against QEMU and got a completion event instead of cancellation). It is not yet possible to complete the job and switch over to the target disk. The next patches will fix this and add many refinements to the basic idea introduced here. These include improved error management, some tunable knobs and performance optimizations. Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2012-10-18 18:49:23 +04:00
* @cb: Completion function for the job.
* @opaque: Opaque pointer value passed to @cb.
* @errp: Error object.
*
* Start a mirroring operation on @bs. Clusters that are allocated
* in @bs will be written to @target until the job is cancelled or
* manually completed. At the end of a successful mirroring job,
* @bs will be switched to read from @target.
*/
void mirror_start(BlockDriverState *bs, BlockDriverState *target,
const char *replaces,
int64_t speed, uint32_t granularity, int64_t buf_size,
block/mirror: Fix target backing BDS Currently, we are trying to move the backing BDS from the source to the target in bdrv_replace_in_backing_chain() which is called from mirror_exit(). However, mirror_complete() already tries to open the target's backing chain with a call to bdrv_open_backing_file(). First, we should only set the target's backing BDS once. Second, the mirroring block job has a better idea of what to set it to than the generic code in bdrv_replace_in_backing_chain() (in fact, the latter's conditions on when to move the backing BDS from source to target are not really correct). Therefore, remove that code from bdrv_replace_in_backing_chain() and leave it to mirror_complete(). Depending on what kind of mirroring is performed, we furthermore want to use different strategies to open the target's backing chain: - If blockdev-mirror is used, we can assume the user made sure that the target already has the correct backing chain. In particular, we should not try to open a backing file if the target does not have any yet. - If drive-mirror with mode=absolute-paths is used, we can and should reuse the already existing chain of nodes that the source BDS is in. In case of sync=full, no backing BDS is required; with sync=top, we just link the source's backing BDS to the target, and with sync=none, we use the source BDS as the target's backing BDS. We should not try to open these backing files anew because this would lead to two BDSs existing per physical file in the backing chain, and we would like to avoid such concurrent access. - If drive-mirror with mode=existing is used, we have to use the information provided in the physical image file which means opening the target's backing chain completely anew, just as it has been done already. If the target's backing chain shares images with the source, this may lead to multiple BDSs per physical image file. But since we cannot reliably ascertain this case, there is nothing we can do about it. 
Signed-off-by: Max Reitz <mreitz@redhat.com> Message-id: 20160610185750.30956-3-mreitz@redhat.com Reviewed-by: Kevin Wolf <kwolf@redhat.com> Reviewed-by: Fam Zheng <famz@redhat.com> Signed-off-by: Max Reitz <mreitz@redhat.com>
2016-06-10 21:57:47 +03:00
MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
BlockdevOnError on_source_error,
BlockdevOnError on_target_error,
bool unmap,
BlockCompletionFunc *cb,
mirror: introduce mirror job This patch adds the implementation of a new job that mirrors a disk to a new image while letting the guest continue using the old image. The target is treated as a "black box" and data is copied from the source to the target in the background. This can be used for several purposes, including storage migration, continuous replication, and observation of the guest I/O in an external program. It is also a first step in replacing the inefficient block migration code that is part of QEMU. The job is possibly never-ending, but it is logically structured into two phases: 1) copy all data as fast as possible until the target first gets in sync with the source; 2) keep target in sync and ensure that reopening to the target gets a correct (full) copy of the source data. The second phase is indicated by the progress in "info block-jobs" reporting the current offset to be equal to the length of the file. When the job is cancelled in the second phase, QEMU will run the job until the source is clean and quiescent, then it will report successful completion of the job. In other words, the BLOCK_JOB_CANCELLED event means that the target may _not_ be consistent with a past state of the source; the BLOCK_JOB_COMPLETED event means that the target is consistent with a past state of the source. (Note that it could already happen that management lost the race against QEMU and got a completion event instead of cancellation). It is not yet possible to complete the job and switch over to the target disk. The next patches will fix this and add many refinements to the basic idea introduced here. These include improved error management, some tunable knobs and performance optimizations. Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2012-10-18 18:49:23 +04:00
void *opaque, Error **errp);
block: add basic backup support to block driver backup_start() creates a block job that copies a point-in-time snapshot of a block device to a target block device. We call backup_do_cow() for each write during backup. That function reads the original data from the block device before it gets overwritten. The data is then written to the target device. Currently backup cluster size is hardcoded to 65536 bytes. [I made a number of changes to Dietmar's original patch and folded them in to make code review easy. Here is the full list: * Drop BackupDumpFunc interface in favor of a target block device * Detect zero clusters with buffer_is_zero() and use bdrv_co_write_zeroes() * Use 0 delay instead of 1us, like other block jobs * Unify creation/start functions into backup_start() * Simplify cleanup, free bitmap in backup_run() instead of cb * function * Use HBitmap to avoid duplicating bitmap code * Use bdrv_getlength() instead of accessing ->total_sectors * directly * Delete the backup.h header file, it is no longer necessary * Move ./backup.c to block/backup.c * Remove #ifdefed out code * Coding style and whitespace cleanups * Use bdrv_add_before_write_notifier() instead of blockjob-specific hooks * Keep our own in-flight CowRequest list instead of using block.c tracked requests. This means a little code duplication but is much simpler than trying to share the tracked requests list and use the backup block size. * Add on_source_error and on_target_error error handling. * Use trace events instead of DPRINTF() -- stefanha] Signed-off-by: Dietmar Maurer <dietmar@proxmox.com> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2013-06-24 19:13:11 +04:00
/*
* backup_start:
* @bs: Block device to operate on.
* @target: Block device to write to.
* @speed: The maximum speed, in bytes per second, or 0 for unlimited.
Implement sync modes for drive-backup. This patch adds sync-modes to the drive-backup interface and implements the FULL, NONE and TOP modes of synchronization. FULL performs as before copying the entire contents of the drive while preserving the point-in-time using CoW. NONE only copies new writes to the target drive. TOP copies changes to the topmost drive image and preserves the point-in-time using CoW. For sync mode TOP are creating a new target image using the same backing file as the original disk image. Then any new data that has been laid on top of it since creation is copied in the main backup_run() loop. There is an extra check in the 'TOP' case so that we don't bother to copy all the data of the backing file as it already exists in the target. This is where the bdrv_co_is_allocated() is used to determine if the data exists in the topmost layer or below. Also any new data being written is intercepted via the write_notifier hook which ends up calling backup_do_cow() to copy old data out before it gets overwritten. For mode 'NONE' we create the new target image and only copy in the original data from the disk image starting from the time the call was made. This preserves the point in time data by only copying the parts that are *going to change* to the target image. This way we can reconstruct the final image by checking to see if the given block exists in the new target image first, and if it does not, you can get it from the original image. This is basically an optimization allowing you to do point-in-time snapshots with low overhead vs the 'FULL' version. Since there is no old data to copy out the loop in backup_run() for the NONE case just calls qemu_coroutine_yield() which only wakes up after an event (usually cancel in this case). The rest is handled by the before_write notifier which again calls backup_do_cow() to write out the old data so it can be preserved. Signed-off-by: Ian Main <imain@redhat.com> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2013-07-26 22:39:04 +04:00
* @sync_mode: What parts of the disk image should be copied to the destination.
* @sync_bitmap: The dirty bitmap if sync_mode is MIRROR_SYNC_MODE_INCREMENTAL.
block: add basic backup support to block driver backup_start() creates a block job that copies a point-in-time snapshot of a block device to a target block device. We call backup_do_cow() for each write during backup. That function reads the original data from the block device before it gets overwritten. The data is then written to the target device. Currently backup cluster size is hardcoded to 65536 bytes. [I made a number of changes to Dietmar's original patch and folded them in to make code review easy. Here is the full list: * Drop BackupDumpFunc interface in favor of a target block device * Detect zero clusters with buffer_is_zero() and use bdrv_co_write_zeroes() * Use 0 delay instead of 1us, like other block jobs * Unify creation/start functions into backup_start() * Simplify cleanup, free bitmap in backup_run() instead of cb * function * Use HBitmap to avoid duplicating bitmap code * Use bdrv_getlength() instead of accessing ->total_sectors * directly * Delete the backup.h header file, it is no longer necessary * Move ./backup.c to block/backup.c * Remove #ifdefed out code * Coding style and whitespace cleanups * Use bdrv_add_before_write_notifier() instead of blockjob-specific hooks * Keep our own in-flight CowRequest list instead of using block.c tracked requests. This means a little code duplication but is much simpler than trying to share the tracked requests list and use the backup block size. * Add on_source_error and on_target_error error handling. * Use trace events instead of DPRINTF() -- stefanha] Signed-off-by: Dietmar Maurer <dietmar@proxmox.com> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2013-06-24 19:13:11 +04:00
* @on_source_error: The action to take upon error reading from the source.
* @on_target_error: The action to take upon error writing to the target.
* @cb: Completion function for the job.
* @opaque: Opaque pointer value passed to @cb.
* @txn: Transaction that this job is part of (may be NULL).
block: add basic backup support to block driver backup_start() creates a block job that copies a point-in-time snapshot of a block device to a target block device. We call backup_do_cow() for each write during backup. That function reads the original data from the block device before it gets overwritten. The data is then written to the target device. Currently backup cluster size is hardcoded to 65536 bytes. [I made a number of changes to Dietmar's original patch and folded them in to make code review easy. Here is the full list: * Drop BackupDumpFunc interface in favor of a target block device * Detect zero clusters with buffer_is_zero() and use bdrv_co_write_zeroes() * Use 0 delay instead of 1us, like other block jobs * Unify creation/start functions into backup_start() * Simplify cleanup, free bitmap in backup_run() instead of cb * function * Use HBitmap to avoid duplicating bitmap code * Use bdrv_getlength() instead of accessing ->total_sectors * directly * Delete the backup.h header file, it is no longer necessary * Move ./backup.c to block/backup.c * Remove #ifdefed out code * Coding style and whitespace cleanups * Use bdrv_add_before_write_notifier() instead of blockjob-specific hooks * Keep our own in-flight CowRequest list instead of using block.c tracked requests. This means a little code duplication but is much simpler than trying to share the tracked requests list and use the backup block size. * Add on_source_error and on_target_error error handling. * Use trace events instead of DPRINTF() -- stefanha] Signed-off-by: Dietmar Maurer <dietmar@proxmox.com> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2013-06-24 19:13:11 +04:00
*
* Start a backup operation on @bs. Clusters in @bs are written to @target
* until the job is cancelled or manually completed.
*/
void backup_start(BlockDriverState *bs, BlockDriverState *target,
Implement sync modes for drive-backup. This patch adds sync-modes to the drive-backup interface and implements the FULL, NONE and TOP modes of synchronization. FULL performs as before copying the entire contents of the drive while preserving the point-in-time using CoW. NONE only copies new writes to the target drive. TOP copies changes to the topmost drive image and preserves the point-in-time using CoW. For sync mode TOP are creating a new target image using the same backing file as the original disk image. Then any new data that has been laid on top of it since creation is copied in the main backup_run() loop. There is an extra check in the 'TOP' case so that we don't bother to copy all the data of the backing file as it already exists in the target. This is where the bdrv_co_is_allocated() is used to determine if the data exists in the topmost layer or below. Also any new data being written is intercepted via the write_notifier hook which ends up calling backup_do_cow() to copy old data out before it gets overwritten. For mode 'NONE' we create the new target image and only copy in the original data from the disk image starting from the time the call was made. This preserves the point in time data by only copying the parts that are *going to change* to the target image. This way we can reconstruct the final image by checking to see if the given block exists in the new target image first, and if it does not, you can get it from the original image. This is basically an optimization allowing you to do point-in-time snapshots with low overhead vs the 'FULL' version. Since there is no old data to copy out the loop in backup_run() for the NONE case just calls qemu_coroutine_yield() which only wakes up after an event (usually cancel in this case). The rest is handled by the before_write notifier which again calls backup_do_cow() to write out the old data so it can be preserved. Signed-off-by: Ian Main <imain@redhat.com> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2013-07-26 22:39:04 +04:00
int64_t speed, MirrorSyncMode sync_mode,
BdrvDirtyBitmap *sync_bitmap,
Implement sync modes for drive-backup. This patch adds sync-modes to the drive-backup interface and implements the FULL, NONE and TOP modes of synchronization. FULL performs as before copying the entire contents of the drive while preserving the point-in-time using CoW. NONE only copies new writes to the target drive. TOP copies changes to the topmost drive image and preserves the point-in-time using CoW. For sync mode TOP are creating a new target image using the same backing file as the original disk image. Then any new data that has been laid on top of it since creation is copied in the main backup_run() loop. There is an extra check in the 'TOP' case so that we don't bother to copy all the data of the backing file as it already exists in the target. This is where the bdrv_co_is_allocated() is used to determine if the data exists in the topmost layer or below. Also any new data being written is intercepted via the write_notifier hook which ends up calling backup_do_cow() to copy old data out before it gets overwritten. For mode 'NONE' we create the new target image and only copy in the original data from the disk image starting from the time the call was made. This preserves the point in time data by only copying the parts that are *going to change* to the target image. This way we can reconstruct the final image by checking to see if the given block exists in the new target image first, and if it does not, you can get it from the original image. This is basically an optimization allowing you to do point-in-time snapshots with low overhead vs the 'FULL' version. Since there is no old data to copy out the loop in backup_run() for the NONE case just calls qemu_coroutine_yield() which only wakes up after an event (usually cancel in this case). The rest is handled by the before_write notifier which again calls backup_do_cow() to write out the old data so it can be preserved. Signed-off-by: Ian Main <imain@redhat.com> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2013-07-26 22:39:04 +04:00
BlockdevOnError on_source_error,
block: add basic backup support to block driver backup_start() creates a block job that copies a point-in-time snapshot of a block device to a target block device. We call backup_do_cow() for each write during backup. That function reads the original data from the block device before it gets overwritten. The data is then written to the target device. Currently backup cluster size is hardcoded to 65536 bytes. [I made a number of changes to Dietmar's original patch and folded them in to make code review easy. Here is the full list: * Drop BackupDumpFunc interface in favor of a target block device * Detect zero clusters with buffer_is_zero() and use bdrv_co_write_zeroes() * Use 0 delay instead of 1us, like other block jobs * Unify creation/start functions into backup_start() * Simplify cleanup, free bitmap in backup_run() instead of cb * function * Use HBitmap to avoid duplicating bitmap code * Use bdrv_getlength() instead of accessing ->total_sectors * directly * Delete the backup.h header file, it is no longer necessary * Move ./backup.c to block/backup.c * Remove #ifdefed out code * Coding style and whitespace cleanups * Use bdrv_add_before_write_notifier() instead of blockjob-specific hooks * Keep our own in-flight CowRequest list instead of using block.c tracked requests. This means a little code duplication but is much simpler than trying to share the tracked requests list and use the backup block size. * Add on_source_error and on_target_error error handling. * Use trace events instead of DPRINTF() -- stefanha] Signed-off-by: Dietmar Maurer <dietmar@proxmox.com> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2013-06-24 19:13:11 +04:00
BlockdevOnError on_target_error,
BlockCompletionFunc *cb, void *opaque,
BlockJobTxn *txn, Error **errp);
block: add basic backup support to block driver backup_start() creates a block job that copies a point-in-time snapshot of a block device to a target block device. We call backup_do_cow() for each write during backup. That function reads the original data from the block device before it gets overwritten. The data is then written to the target device. Currently backup cluster size is hardcoded to 65536 bytes. [I made a number of changes to Dietmar's original patch and folded them in to make code review easy. Here is the full list: * Drop BackupDumpFunc interface in favor of a target block device * Detect zero clusters with buffer_is_zero() and use bdrv_co_write_zeroes() * Use 0 delay instead of 1us, like other block jobs * Unify creation/start functions into backup_start() * Simplify cleanup, free bitmap in backup_run() instead of cb * function * Use HBitmap to avoid duplicating bitmap code * Use bdrv_getlength() instead of accessing ->total_sectors * directly * Delete the backup.h header file, it is no longer necessary * Move ./backup.c to block/backup.c * Remove #ifdefed out code * Coding style and whitespace cleanups * Use bdrv_add_before_write_notifier() instead of blockjob-specific hooks * Keep our own in-flight CowRequest list instead of using block.c tracked requests. This means a little code duplication but is much simpler than trying to share the tracked requests list and use the backup block size. * Add on_source_error and on_target_error error handling. * Use trace events instead of DPRINTF() -- stefanha] Signed-off-by: Dietmar Maurer <dietmar@proxmox.com> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2013-06-24 19:13:11 +04:00
/*
 * Declaration only; the definition is outside this header.
 * Takes a monitor handle and a read-only option string and returns nothing,
 * so no status is reported through the return value.
 * NOTE(review): presumably backs a monitor command that adds a block node
 * described by @optstr -- confirm against the definition.
 */
void hmp_drive_add_node(Monitor *mon, const char *optstr);
/*
 * Declaration only; the definition is outside this header.
 * Takes a node (@child_bs), a read-only name and role for it, and an untyped
 * caller-supplied pointer (@opaque); returns a BdrvChild pointer.
 * NOTE(review): presumably creates the BdrvChild link for a parent that is
 * not itself a BlockDriverState, with @opaque identifying that parent.
 * Whether NULL can be returned is not visible here -- confirm.
 */
BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
const char *child_name,
const BdrvChildRole *child_role,
void *opaque);
/*
 * Declaration only; takes a BdrvChild pointer and returns nothing.
 * NOTE(review): by name this looks like the counterpart of
 * bdrv_root_attach_child(); whether @child is still valid after the call
 * cannot be determined from this header -- confirm.
 */
void bdrv_root_unref_child(BdrvChild *child);
/*
 * Declaration only; returns a read-only string for @bs, which is itself
 * taken as a pointer to const.
 * NOTE(review): lifetime/ownership of the returned string, and whether it can
 * be NULL or empty, are not visible in this header -- confirm before use.
 */
const char *bdrv_get_parent_name(const BlockDriverState *bs);
/*
 * blk_dev_* declarations: every function takes the BlockBackend it acts on.
 * The definitions are outside this header.
 * NOTE(review): the per-function notes below are inferred from the names and
 * parameter lists only -- confirm against the definitions.
 */
/* Presumably signals a media change; @load selects load vs. unload. */
void blk_dev_change_media_cb(BlockBackend *blk, bool load);
/* Query: presumably true when the attached device has removable media. */
bool blk_dev_has_removable_media(BlockBackend *blk);
/* Query: presumably true when the attached device has a tray. */
bool blk_dev_has_tray(BlockBackend *blk);
/* Presumably asks the attached device to eject; @force meaning unconfirmed. */
void blk_dev_eject_request(BlockBackend *blk, bool force);
/* Query: presumably true when the tray is open. */
bool blk_dev_is_tray_open(BlockBackend *blk);
/* Query: presumably true when the medium is locked in place. */
bool blk_dev_is_medium_locked(BlockBackend *blk);
/*
 * Declaration only; takes a node, a 64-bit starting sector and an int sector
 * count, and returns nothing.
 * NOTE(review): presumably marks the @nr_sectors sectors starting at
 * @cur_sector as dirty for @bs -- confirm against the definition.
 */
void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors);
/*
 * Declaration only; boolean query on @bs.
 * NOTE(review): presumably true while requests are still outstanding on @bs;
 * whether related nodes are also considered is not visible here -- confirm.
 */
bool bdrv_requests_pending(BlockDriverState *bs);
/*
 * Declarations only. The first takes a dirty bitmap and an HBitmap
 * out-parameter (pointer to pointer); the second takes a dirty bitmap and an
 * HBitmap pointer as input.
 * NOTE(review): presumably the clear call hands the previous contents back
 * through @out so that the undo call can restore them from @in. Ownership of
 * the HBitmap and whether @out may be NULL are not visible here -- confirm.
 */
void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out);
void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in);
/*
 * Declaration only; takes no arguments and returns nothing.
 * NOTE(review): by name this closes every BlockDriverState tracked by the
 * blockdev layer; which states are tracked is not visible here -- confirm.
 */
void blockdev_close_all_bdrv_states(void);
#endif /* BLOCK_INT_H */