2010-12-06 19:08:00 +03:00
|
|
|
/*
|
|
|
|
* QEMU Enhanced Disk Format
|
|
|
|
*
|
|
|
|
* Copyright IBM, Corp. 2010
|
|
|
|
*
|
|
|
|
* Authors:
|
|
|
|
* Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
|
|
|
|
* Anthony Liguori <aliguori@us.ibm.com>
|
|
|
|
*
|
|
|
|
* This work is licensed under the terms of the GNU LGPL, version 2 or later.
|
|
|
|
* See the COPYING.LIB file in the top-level directory.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
2016-01-18 21:01:42 +03:00
|
|
|
#include "qemu/osdep.h"
|
2018-06-14 22:14:28 +03:00
|
|
|
#include "block/qdict.h"
|
include/qemu/osdep.h: Don't include qapi/error.h
Commit 57cb38b included qapi/error.h into qemu/osdep.h to get the
Error typedef. Since then, we've moved to include qemu/osdep.h
everywhere. Its file comment explains: "To avoid getting into
possible circular include dependencies, this file should not include
any other QEMU headers, with the exceptions of config-host.h,
compiler.h, os-posix.h and os-win32.h, all of which are doing a
similar job to this file and are under similar constraints."
qapi/error.h doesn't do a similar job, and it doesn't adhere to
similar constraints: it includes qapi-types.h. That's in excess of
100KiB of crap most .c files don't actually need.
Add the typedef to qemu/typedefs.h, and include that instead of
qapi/error.h. Include qapi/error.h in .c files that need it and don't
get it now. Include qapi-types.h in qom/object.h for uint16List.
Update scripts/clean-includes accordingly. Update it further to match
reality: replace config.h by config-target.h, add sysemu/os-posix.h,
sysemu/os-win32.h. Update the list of includes in the qemu/osdep.h
comment quoted above similarly.
This reduces the number of objects depending on qapi/error.h from "all
of them" to less than a third. Unfortunately, the number depending on
qapi-types.h shrinks only a little. More work is needed for that one.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
[Fix compilation without the spice devel packages. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-03-14 11:01:28 +03:00
|
|
|
#include "qapi/error.h"
|
2012-12-17 21:20:00 +04:00
|
|
|
#include "qemu/timer.h"
|
2016-03-15 19:22:36 +03:00
|
|
|
#include "qemu/bswap.h"
|
Include qemu/main-loop.h less
In my "build everything" tree, changing qemu/main-loop.h triggers a
recompile of some 5600 out of 6600 objects (not counting tests and
objects that don't depend on qemu/osdep.h). It includes block/aio.h,
which in turn includes qemu/event_notifier.h, qemu/notify.h,
qemu/processor.h, qemu/qsp.h, qemu/queue.h, qemu/thread-posix.h,
qemu/thread.h, qemu/timer.h, and a few more.
Include qemu/main-loop.h only where it's needed. Touching it now
recompiles only some 1700 objects. For block/aio.h and
qemu/event_notifier.h, these numbers drop from 5600 to 2800. For the
others, they shrink only slightly.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Message-Id: <20190812052359.30071-21-armbru@redhat.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
2019-08-12 08:23:50 +03:00
|
|
|
#include "qemu/main-loop.h"
|
2019-05-23 17:35:07 +03:00
|
|
|
#include "qemu/module.h"
|
2018-02-01 14:18:46 +03:00
|
|
|
#include "qemu/option.h"
|
2010-12-06 19:08:02 +03:00
|
|
|
#include "trace.h"
|
2010-12-06 19:08:00 +03:00
|
|
|
#include "qed.h"
|
2016-03-08 17:57:05 +03:00
|
|
|
#include "sysemu/block-backend.h"
|
2018-03-09 21:53:19 +03:00
|
|
|
#include "qapi/qmp/qdict.h"
|
|
|
|
#include "qapi/qobject-input-visitor.h"
|
|
|
|
#include "qapi/qapi-visit-block-core.h"
|
|
|
|
|
|
|
|
static QemuOptsList qed_create_opts;
|
2010-12-06 19:08:00 +03:00
|
|
|
|
|
|
|
static int bdrv_qed_probe(const uint8_t *buf, int buf_size,
|
|
|
|
const char *filename)
|
|
|
|
{
|
|
|
|
const QEDHeader *header = (const QEDHeader *)buf;
|
|
|
|
|
|
|
|
if (buf_size < sizeof(*header)) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
if (le32_to_cpu(header->magic) != QED_MAGIC) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
return 100;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
 * Tell whether a backing file format name denotes the raw format
 *
 * @fmt:        Backing file format name, NULL is allowed
 * @ret:        true only when @fmt is non-NULL and equal to "raw"
 */
static bool qed_fmt_is_raw(const char *fmt)
{
    if (fmt == NULL) {
        return false;
    }
    return !strcmp(fmt, "raw");
}
|
|
|
|
|
|
|
|
/**
 * Convert an on-disk (little-endian) header to host byte order
 *
 * @le:         Source header, fields in little-endian byte order
 * @cpu:        Destination header, filled with host byte order values
 *
 * Every field is converted independently, so the order below is arbitrary;
 * fields are simply grouped by width.
 */
static void qed_header_le_to_cpu(const QEDHeader *le, QEDHeader *cpu)
{
    /* 32-bit fields */
    cpu->magic = le32_to_cpu(le->magic);
    cpu->cluster_size = le32_to_cpu(le->cluster_size);
    cpu->table_size = le32_to_cpu(le->table_size);
    cpu->header_size = le32_to_cpu(le->header_size);
    cpu->backing_filename_offset = le32_to_cpu(le->backing_filename_offset);
    cpu->backing_filename_size = le32_to_cpu(le->backing_filename_size);

    /* 64-bit fields */
    cpu->features = le64_to_cpu(le->features);
    cpu->compat_features = le64_to_cpu(le->compat_features);
    cpu->autoclear_features = le64_to_cpu(le->autoclear_features);
    cpu->l1_table_offset = le64_to_cpu(le->l1_table_offset);
    cpu->image_size = le64_to_cpu(le->image_size);
}
|
|
|
|
|
|
|
|
/**
 * Convert a host byte order header to its on-disk (little-endian) form
 *
 * @cpu:        Source header, fields in host byte order
 * @le:         Destination header, filled with little-endian values
 *
 * Every field is converted independently, so the order below is arbitrary;
 * fields are simply grouped by width.
 */
static void qed_header_cpu_to_le(const QEDHeader *cpu, QEDHeader *le)
{
    /* 32-bit fields */
    le->magic = cpu_to_le32(cpu->magic);
    le->cluster_size = cpu_to_le32(cpu->cluster_size);
    le->table_size = cpu_to_le32(cpu->table_size);
    le->header_size = cpu_to_le32(cpu->header_size);
    le->backing_filename_offset = cpu_to_le32(cpu->backing_filename_offset);
    le->backing_filename_size = cpu_to_le32(cpu->backing_filename_size);

    /* 64-bit fields */
    le->features = cpu_to_le64(cpu->features);
    le->compat_features = cpu_to_le64(cpu->compat_features);
    le->autoclear_features = cpu_to_le64(cpu->autoclear_features);
    le->l1_table_offset = cpu_to_le64(cpu->l1_table_offset);
    le->image_size = cpu_to_le64(cpu->image_size);
}
|
|
|
|
|
2012-08-09 16:05:54 +04:00
|
|
|
/**
 * Write the in-memory header to offset 0 of the image file
 *
 * @s:          QED state
 * @ret:        0 if bdrv_pwrite() reported exactly sizeof(QEDHeader),
 *              otherwise whatever bdrv_pwrite() returned
 *
 * Only the header structure itself is written; nothing after it is touched.
 */
int qed_write_header_sync(BDRVQEDState *s)
{
    QEDHeader disk_header;
    int ret;

    qed_header_cpu_to_le(&s->header, &disk_header);
    ret = bdrv_pwrite(s->bs->file, 0, &disk_header, sizeof(disk_header));

    return ret == sizeof(disk_header) ? 0 : ret;
}
|
|
|
|
|
2010-12-06 19:08:03 +03:00
|
|
|
/**
|
|
|
|
* Update header in-place (does not rewrite backing filename or other strings)
|
|
|
|
*
|
|
|
|
* This function only updates known header fields in-place and does not affect
|
|
|
|
* extra data after the QED header.
|
2017-06-29 16:27:47 +03:00
|
|
|
*
|
|
|
|
* No new allocating reqs can start while this function runs.
|
2010-12-06 19:08:03 +03:00
|
|
|
*/
|
2017-06-12 12:12:41 +03:00
|
|
|
static int coroutine_fn qed_write_header(BDRVQEDState *s)
|
2010-12-06 19:08:03 +03:00
|
|
|
{
|
|
|
|
/* We must write full sectors for O_DIRECT but cannot necessarily generate
|
|
|
|
* the data following the header if an unrecognized compat feature is
|
|
|
|
* active. Therefore, first read the sectors containing the header, update
|
|
|
|
* them, and write back.
|
|
|
|
*/
|
|
|
|
|
2016-05-31 19:35:53 +03:00
|
|
|
int nsectors = DIV_ROUND_UP(sizeof(QEDHeader), BDRV_SECTOR_SIZE);
|
2010-12-06 19:08:03 +03:00
|
|
|
size_t len = nsectors * BDRV_SECTOR_SIZE;
|
2016-11-14 16:56:32 +03:00
|
|
|
uint8_t *buf;
|
|
|
|
int ret;
|
|
|
|
|
2017-06-29 16:27:47 +03:00
|
|
|
assert(s->allocating_acb || s->allocating_write_reqs_plugged);
|
|
|
|
|
2016-11-14 16:56:32 +03:00
|
|
|
buf = qemu_blockalign(s->bs, len);
|
|
|
|
|
2019-04-22 17:58:33 +03:00
|
|
|
ret = bdrv_co_pread(s->bs->file, 0, len, buf, 0);
|
2016-11-14 16:56:32 +03:00
|
|
|
if (ret < 0) {
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Update header */
|
|
|
|
qed_header_cpu_to_le(&s->header, (QEDHeader *) buf);
|
|
|
|
|
2019-04-22 17:58:33 +03:00
|
|
|
ret = bdrv_co_pwrite(s->bs->file, 0, len, buf, 0);
|
2016-11-14 16:56:32 +03:00
|
|
|
if (ret < 0) {
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = 0;
|
|
|
|
out:
|
|
|
|
qemu_vfree(buf);
|
2016-11-15 13:14:01 +03:00
|
|
|
return ret;
|
2010-12-06 19:08:03 +03:00
|
|
|
}
|
|
|
|
|
2010-12-06 19:08:00 +03:00
|
|
|
/**
 * Compute the largest image size addressable by the L1/L2 table layout
 *
 * @cluster_size:   Cluster size in bytes
 * @table_size:     Table size in clusters
 * @ret:            Maximum image size in bytes, saturated at UINT64_MAX when
 *                  the exact value does not fit in 64 bits
 *
 * A table holds (table_size * cluster_size) / 8 entries.  One L2 table maps
 * that many clusters and the L1 table points to that many L2 tables.
 */
static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size)
{
    uint64_t table_entries;
    uint64_t l2_size;

    /* Widen first so the byte size of a table cannot wrap in 32 bits */
    table_entries = ((uint64_t)table_size * cluster_size) / sizeof(uint64_t);

    /* Bytes mapped by one L2 table */
    if (cluster_size != 0 && table_entries > UINT64_MAX / cluster_size) {
        return UINT64_MAX;
    }
    l2_size = table_entries * cluster_size;

    /*
     * Bytes mapped by a full L1 table.  Large cluster/table combinations
     * exceed 64 bits here; saturate instead of wrapping to a small value.
     */
    if (table_entries != 0 && l2_size > UINT64_MAX / table_entries) {
        return UINT64_MAX;
    }
    return l2_size * table_entries;
}
|
|
|
|
|
|
|
|
static bool qed_is_cluster_size_valid(uint32_t cluster_size)
|
|
|
|
{
|
|
|
|
if (cluster_size < QED_MIN_CLUSTER_SIZE ||
|
|
|
|
cluster_size > QED_MAX_CLUSTER_SIZE) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
if (cluster_size & (cluster_size - 1)) {
|
|
|
|
return false; /* not power of 2 */
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool qed_is_table_size_valid(uint32_t table_size)
|
|
|
|
{
|
|
|
|
if (table_size < QED_MIN_TABLE_SIZE ||
|
|
|
|
table_size > QED_MAX_TABLE_SIZE) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
if (table_size & (table_size - 1)) {
|
|
|
|
return false; /* not power of 2 */
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size,
|
|
|
|
uint32_t table_size)
|
|
|
|
{
|
|
|
|
if (image_size % BDRV_SECTOR_SIZE != 0) {
|
|
|
|
return false; /* not multiple of sector size */
|
|
|
|
}
|
|
|
|
if (image_size > qed_max_image_size(cluster_size, table_size)) {
|
|
|
|
return false; /* image is too large */
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Read a string of known length from the image file
|
|
|
|
*
|
|
|
|
* @file: Image file
|
|
|
|
* @offset: File offset to start of string, in bytes
|
|
|
|
* @n: String length in bytes
|
|
|
|
* @buf: Destination buffer
|
|
|
|
* @buflen: Destination buffer length in bytes
|
|
|
|
* @ret: 0 on success, -errno on failure
|
|
|
|
*
|
|
|
|
* The string is NUL-terminated.
|
|
|
|
*/
|
2016-06-20 19:24:02 +03:00
|
|
|
static int qed_read_string(BdrvChild *file, uint64_t offset, size_t n,
|
2010-12-06 19:08:00 +03:00
|
|
|
char *buf, size_t buflen)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
if (n >= buflen) {
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
ret = bdrv_pread(file, offset, buf, n);
|
|
|
|
if (ret < 0) {
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
buf[n] = '\0';
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2010-12-06 19:08:02 +03:00
|
|
|
/**
|
|
|
|
* Allocate new clusters
|
|
|
|
*
|
|
|
|
* @s: QED state
|
|
|
|
* @n: Number of contiguous clusters to allocate
|
|
|
|
* @ret: Offset of first allocated cluster
|
|
|
|
*
|
|
|
|
* This function only produces the offset where the new clusters should be
|
|
|
|
* written. It updates BDRVQEDState but does not make any changes to the image
|
|
|
|
* file.
|
2017-06-29 16:27:47 +03:00
|
|
|
*
|
|
|
|
* Called with table_lock held.
|
2010-12-06 19:08:02 +03:00
|
|
|
*/
|
|
|
|
static uint64_t qed_alloc_clusters(BDRVQEDState *s, unsigned int n)
|
|
|
|
{
|
|
|
|
uint64_t offset = s->file_size;
|
|
|
|
s->file_size += n * s->header.cluster_size;
|
|
|
|
return offset;
|
|
|
|
}
|
|
|
|
|
2010-12-06 19:08:01 +03:00
|
|
|
/**
 * Allocate memory for one table
 *
 * @s:          QED state
 * @ret:        Buffer of cluster_size * table_size bytes obtained from
 *              qemu_blockalign(); its contents are not initialized here
 */
QEDTable *qed_alloc_table(BDRVQEDState *s)
{
    size_t table_bytes = s->header.cluster_size * s->header.table_size;

    /* Honor O_DIRECT memory alignment requirements */
    return qemu_blockalign(s->bs, table_bytes);
}
|
|
|
|
|
2010-12-06 19:08:02 +03:00
|
|
|
/**
|
|
|
|
* Allocate a new zeroed L2 table
|
2017-06-29 16:27:47 +03:00
|
|
|
*
|
|
|
|
* Called with table_lock held.
|
2010-12-06 19:08:02 +03:00
|
|
|
*/
|
|
|
|
static CachedL2Table *qed_new_l2_table(BDRVQEDState *s)
|
|
|
|
{
|
|
|
|
CachedL2Table *l2_table = qed_alloc_l2_cache_entry(&s->l2_cache);
|
|
|
|
|
|
|
|
l2_table->table = qed_alloc_table(s);
|
|
|
|
l2_table->offset = qed_alloc_clusters(s, s->header.table_size);
|
|
|
|
|
|
|
|
memset(l2_table->table->offsets, 0,
|
|
|
|
s->header.cluster_size * s->header.table_size);
|
|
|
|
return l2_table;
|
|
|
|
}
|
|
|
|
|
2017-06-29 16:27:47 +03:00
|
|
|
/**
 * Try to block new allocating write requests
 *
 * @s:          QED state
 * @ret:        true if the plug was set, false if an allocating write is
 *              already in flight and nothing was changed
 *
 * Takes and releases table_lock internally.
 */
static bool qed_plug_allocating_write_reqs(BDRVQEDState *s)
{
    bool plugged;

    qemu_co_mutex_lock(&s->table_lock);

    /* No reentrancy is allowed.  */
    assert(!s->allocating_write_reqs_plugged);

    /*
     * A non-NULL allocating_acb means another allocating write came
     * concurrently.  This cannot happen from bdrv_qed_co_drain_begin, but
     * it can happen when the timer runs.
     */
    plugged = (s->allocating_acb == NULL);
    if (plugged) {
        s->allocating_write_reqs_plugged = true;
    }

    qemu_co_mutex_unlock(&s->table_lock);
    return plugged;
}
|
|
|
|
|
|
|
|
/**
 * Allow allocating write requests again
 *
 * @s:          QED state
 *
 * The plug must currently be set.  After clearing it, the next entry of the
 * allocating_write_reqs queue is kicked with qemu_co_queue_next().  Takes
 * and releases table_lock internally.
 */
static void qed_unplug_allocating_write_reqs(BDRVQEDState *s)
{
    qemu_co_mutex_lock(&s->table_lock);

    assert(s->allocating_write_reqs_plugged);
    s->allocating_write_reqs_plugged = false;

    qemu_co_queue_next(&s->allocating_write_reqs);

    qemu_co_mutex_unlock(&s->table_lock);
}
|
|
|
|
|
2017-06-12 12:12:41 +03:00
|
|
|
/**
 * Coroutine body of the need-check timer: clear QED_F_NEED_CHECK on disk
 *
 * @opaque:     BDRVQEDState of the image
 *
 * Allocating writes are plugged for the duration of the header update.  If
 * plugging fails, or the initial flush fails, the flag is left alone.
 * Failures of the header write and of the final flush are ignored.
 */
static void coroutine_fn qed_need_check_timer_entry(void *opaque)
{
    BDRVQEDState *s = opaque;
    int ret;

    trace_qed_need_check_timer_cb(s);

    if (!qed_plug_allocating_write_reqs(s)) {
        return;
    }

    /* Ensure writes are on disk before clearing flag */
    ret = bdrv_co_flush(s->bs->file->bs);
    if (ret >= 0) {
        s->header.features &= ~QED_F_NEED_CHECK;
        ret = qed_write_header(s);
        (void) ret;

        qed_unplug_allocating_write_reqs(s);

        ret = bdrv_co_flush(s->bs);
        (void) ret;
    } else {
        qed_unplug_allocating_write_reqs(s);
    }
}
|
|
|
|
|
|
|
|
static void qed_need_check_timer_cb(void *opaque)
|
|
|
|
{
|
2016-11-18 18:04:59 +03:00
|
|
|
Coroutine *co = qemu_coroutine_create(qed_need_check_timer_entry, opaque);
|
|
|
|
qemu_coroutine_enter(co);
|
2017-02-13 16:52:29 +03:00
|
|
|
}
|
|
|
|
|
2011-05-09 19:45:40 +04:00
|
|
|
/**
 * Arm the need-check timer to expire QED_NEED_CHECK_TIMEOUT seconds from now
 *
 * @s:          QED state
 */
static void qed_start_need_check_timer(BDRVQEDState *s)
{
    int64_t expire_ns;

    trace_qed_start_need_check_timer(s);

    /*
     * QEMU_CLOCK_VIRTUAL is used so that the image file is not altered
     * while suspended for migration.
     */
    expire_ns = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
                NANOSECONDS_PER_SECOND * QED_NEED_CHECK_TIMEOUT;
    timer_mod(s->need_check_timer, expire_ns);
}
|
|
|
|
|
|
|
|
/* It's okay to call this multiple times or when no timer is started */
|
|
|
|
static void qed_cancel_need_check_timer(BDRVQEDState *s)
|
|
|
|
{
|
|
|
|
trace_qed_cancel_need_check_timer(s);
|
2013-08-21 19:03:08 +04:00
|
|
|
timer_del(s->need_check_timer);
|
2011-05-09 19:45:40 +04:00
|
|
|
}
|
|
|
|
|
2014-05-08 18:34:45 +04:00
|
|
|
/**
 * Detach from the current AioContext: cancel and free the need-check timer
 *
 * @bs:         Block driver state of the QED image
 */
static void bdrv_qed_detach_aio_context(BlockDriverState *bs)
{
    BDRVQEDState *qed = bs->opaque;

    /* Disarm first, then release the timer object itself */
    qed_cancel_need_check_timer(qed);
    timer_free(qed->need_check_timer);
}
|
|
|
|
|
|
|
|
static void bdrv_qed_attach_aio_context(BlockDriverState *bs,
|
|
|
|
AioContext *new_context)
|
|
|
|
{
|
|
|
|
BDRVQEDState *s = bs->opaque;
|
|
|
|
|
|
|
|
s->need_check_timer = aio_timer_new(new_context,
|
|
|
|
QEMU_CLOCK_VIRTUAL, SCALE_NS,
|
|
|
|
qed_need_check_timer_cb, s);
|
|
|
|
if (s->header.features & QED_F_NEED_CHECK) {
|
|
|
|
qed_start_need_check_timer(s);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-09-23 14:14:10 +03:00
|
|
|
/**
 * Drain hook: run a pending need-check timer immediately
 *
 * @bs:         Block driver state of the QED image
 *
 * Nothing happens when no timer exists or when it is not pending.
 */
static void coroutine_fn bdrv_qed_co_drain_begin(BlockDriverState *bs)
{
    BDRVQEDState *s = bs->opaque;

    if (!s->need_check_timer || !timer_pending(s->need_check_timer)) {
        return;
    }

    /*
     * Fire the timer immediately in order to start doing I/O as soon as the
     * header is flushed.
     */
    qed_cancel_need_check_timer(s);
    qed_need_check_timer_entry(s);
}
|
|
|
|
|
2017-06-29 16:27:46 +03:00
|
|
|
/**
 * Reset the driver state to its initial values
 *
 * @bs:         Block driver state whose opaque BDRVQEDState is initialized
 *
 * The whole state is zeroed, then the back pointer, table_lock and the
 * allocating_write_reqs queue are set up.
 */
static void bdrv_qed_init_state(BlockDriverState *bs)
{
    BDRVQEDState *qed = bs->opaque;

    memset(qed, 0, sizeof(*qed));

    qed->bs = bs;
    qemu_co_mutex_init(&qed->table_lock);
    qemu_co_queue_init(&qed->allocating_write_reqs);
}
|
|
|
|
|
2018-03-01 19:36:17 +03:00
|
|
|
/* Called with table_lock held. */
|
|
|
|
static int coroutine_fn bdrv_qed_do_open(BlockDriverState *bs, QDict *options,
|
|
|
|
int flags, Error **errp)
|
2010-12-06 19:08:00 +03:00
|
|
|
{
|
|
|
|
BDRVQEDState *s = bs->opaque;
|
|
|
|
QEDHeader le_header;
|
|
|
|
int64_t file_size;
|
|
|
|
int ret;
|
|
|
|
|
2016-06-20 19:24:02 +03:00
|
|
|
ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header));
|
2010-12-06 19:08:00 +03:00
|
|
|
if (ret < 0) {
|
2021-02-02 15:49:55 +03:00
|
|
|
error_setg(errp, "Failed to read QED header");
|
2010-12-06 19:08:00 +03:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
qed_header_le_to_cpu(&le_header, &s->header);
|
|
|
|
|
|
|
|
if (s->header.magic != QED_MAGIC) {
|
2014-02-17 17:44:06 +04:00
|
|
|
error_setg(errp, "Image not in QED format");
|
|
|
|
return -EINVAL;
|
2010-12-06 19:08:00 +03:00
|
|
|
}
|
|
|
|
if (s->header.features & ~QED_FEATURE_MASK) {
|
2011-02-09 13:13:26 +03:00
|
|
|
/* image uses unsupported feature bits */
|
2016-03-16 21:54:33 +03:00
|
|
|
error_setg(errp, "Unsupported QED features: %" PRIx64,
|
|
|
|
s->header.features & ~QED_FEATURE_MASK);
|
2011-02-09 13:13:26 +03:00
|
|
|
return -ENOTSUP;
|
2010-12-06 19:08:00 +03:00
|
|
|
}
|
|
|
|
if (!qed_is_cluster_size_valid(s->header.cluster_size)) {
|
2021-02-02 15:49:55 +03:00
|
|
|
error_setg(errp, "QED cluster size is invalid");
|
2010-12-06 19:08:00 +03:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Round down file size to the last cluster */
|
2015-06-16 15:19:22 +03:00
|
|
|
file_size = bdrv_getlength(bs->file->bs);
|
2010-12-06 19:08:00 +03:00
|
|
|
if (file_size < 0) {
|
2021-02-02 15:49:55 +03:00
|
|
|
error_setg(errp, "Failed to get file length");
|
2010-12-06 19:08:00 +03:00
|
|
|
return file_size;
|
|
|
|
}
|
|
|
|
s->file_size = qed_start_of_cluster(s, file_size);
|
|
|
|
|
|
|
|
if (!qed_is_table_size_valid(s->header.table_size)) {
|
2021-02-02 15:49:55 +03:00
|
|
|
error_setg(errp, "QED table size is invalid");
|
2010-12-06 19:08:00 +03:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
if (!qed_is_image_size_valid(s->header.image_size,
|
|
|
|
s->header.cluster_size,
|
|
|
|
s->header.table_size)) {
|
2021-02-02 15:49:55 +03:00
|
|
|
error_setg(errp, "QED image size is invalid");
|
2010-12-06 19:08:00 +03:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
if (!qed_check_table_offset(s, s->header.l1_table_offset)) {
|
2021-02-02 15:49:55 +03:00
|
|
|
error_setg(errp, "QED table offset is invalid");
|
2010-12-06 19:08:00 +03:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
s->table_nelems = (s->header.cluster_size * s->header.table_size) /
|
|
|
|
sizeof(uint64_t);
|
2015-03-23 18:29:26 +03:00
|
|
|
s->l2_shift = ctz32(s->header.cluster_size);
|
2010-12-06 19:08:00 +03:00
|
|
|
s->l2_mask = s->table_nelems - 1;
|
2015-03-23 18:29:26 +03:00
|
|
|
s->l1_shift = s->l2_shift + ctz32(s->table_nelems);
|
2010-12-06 19:08:00 +03:00
|
|
|
|
2015-01-12 15:31:32 +03:00
|
|
|
/* Header size calculation must not overflow uint32_t */
|
|
|
|
if (s->header.header_size > UINT32_MAX / s->header.cluster_size) {
|
2021-02-02 15:49:55 +03:00
|
|
|
error_setg(errp, "QED header size is too large");
|
2015-01-12 15:31:32 +03:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2010-12-06 19:08:00 +03:00
|
|
|
if ((s->header.features & QED_F_BACKING_FILE)) {
|
|
|
|
if ((uint64_t)s->header.backing_filename_offset +
|
|
|
|
s->header.backing_filename_size >
|
|
|
|
s->header.cluster_size * s->header.header_size) {
|
2021-02-02 15:49:55 +03:00
|
|
|
error_setg(errp, "QED backing filename offset is invalid");
|
2010-12-06 19:08:00 +03:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2016-06-20 19:24:02 +03:00
|
|
|
ret = qed_read_string(bs->file, s->header.backing_filename_offset,
|
block: Add BDS.auto_backing_file
If the backing file is overridden, this most probably does change the
guest-visible data of a BDS. Therefore, we will need to consider this
in bdrv_refresh_filename().
To see whether it has been overridden, we might want to compare
bs->backing_file and bs->backing->bs->filename. However,
bs->backing_file is changed by bdrv_set_backing_hd() (which is just used
to change the backing child at runtime, without modifying the image
header), so bs->backing_file most of the time simply contains a copy of
bs->backing->bs->filename anyway, so it is useless for such a
comparison.
This patch adds an auto_backing_file BDS field which contains the
backing file path as indicated by the image header, which is not changed
by bdrv_set_backing_hd().
Because of bdrv_refresh_filename() magic, however, a BDS's filename may
differ from what has been specified during bdrv_open(). Then, the
comparison between bs->auto_backing_file and bs->backing->bs->filename
may fail even though bs->backing was opened from bs->auto_backing_file.
To mitigate this, we can copy the real BDS's filename (after the whole
bdrv_open() and bdrv_refresh_filename() process) into
bs->auto_backing_file, if we know the former has been opened based on
the latter. This is only possible if no options modifying the backing
file's behavior have been specified, though. To simplify things, this
patch only copies the filename from the backing file if no options have
been specified for it at all.
Furthermore, there are cases where an overlay is created by qemu which
already contains a BDS's filename (e.g. in blockdev-snapshot-sync). We
do not need to worry about updating the overlay's bs->auto_backing_file
there, because we actually wrote a post-bdrv_refresh_filename() filename
into the image header.
So all in all, there will be false negatives where (as of a future
patch) bdrv_refresh_filename() will assume that the backing file differs
from what was specified in the image header, even though it really does
not. However, these cases should be limited to where (1) the user
actually did override something in the backing chain (e.g. by specifying
options for the backing file), or (2) the user executed a QMP command to
change some node's backing file (e.g. change-backing-file or
block-commit with @backing-file given) where the given filename does not
happen to coincide with qemu's idea of the backing BDS's filename.
Then again, (1) really is limited to -drive. With -blockdev or
blockdev-add, you have to adhere to the schema, so a user cannot give
partial "unimportant" options (e.g. by just setting backing.node-name
and leaving the rest to the image header). Therefore, trying to fix
this would mean trying to fix something for -drive only.
To improve on (2), we would need a full infrastructure to "canonicalize"
an arbitrary filename (+ options), so it can be compared against
another. That seems a bit over the top, considering that filenames
nowadays are there mostly for the user's entertainment.
Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Message-id: 20190201192935.18394-5-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2019-02-01 22:29:08 +03:00
|
|
|
s->header.backing_filename_size,
|
|
|
|
bs->auto_backing_file,
|
|
|
|
sizeof(bs->auto_backing_file));
|
2010-12-06 19:08:00 +03:00
|
|
|
if (ret < 0) {
|
2021-02-02 15:49:55 +03:00
|
|
|
error_setg(errp, "Failed to read backing filename");
|
2010-12-06 19:08:00 +03:00
|
|
|
return ret;
|
|
|
|
}
|
block: Add BDS.auto_backing_file
If the backing file is overridden, this most probably does change the
guest-visible data of a BDS. Therefore, we will need to consider this
in bdrv_refresh_filename().
To see whether it has been overridden, we might want to compare
bs->backing_file and bs->backing->bs->filename. However,
bs->backing_file is changed by bdrv_set_backing_hd() (which is just used
to change the backing child at runtime, without modifying the image
header), so bs->backing_file most of the time simply contains a copy of
bs->backing->bs->filename anyway, so it is useless for such a
comparison.
This patch adds an auto_backing_file BDS field which contains the
backing file path as indicated by the image header, which is not changed
by bdrv_set_backing_hd().
Because of bdrv_refresh_filename() magic, however, a BDS's filename may
differ from what has been specified during bdrv_open(). Then, the
comparison between bs->auto_backing_file and bs->backing->bs->filename
may fail even though bs->backing was opened from bs->auto_backing_file.
To mitigate this, we can copy the real BDS's filename (after the whole
bdrv_open() and bdrv_refresh_filename() process) into
bs->auto_backing_file, if we know the former has been opened based on
the latter. This is only possible if no options modifying the backing
file's behavior have been specified, though. To simplify things, this
patch only copies the filename from the backing file if no options have
been specified for it at all.
Furthermore, there are cases where an overlay is created by qemu which
already contains a BDS's filename (e.g. in blockdev-snapshot-sync). We
do not need to worry about updating the overlay's bs->auto_backing_file
there, because we actually wrote a post-bdrv_refresh_filename() filename
into the image header.
So all in all, there will be false negatives where (as of a future
patch) bdrv_refresh_filename() will assume that the backing file differs
from what was specified in the image header, even though it really does
not. However, these cases should be limited to where (1) the user
actually did override something in the backing chain (e.g. by specifying
options for the backing file), or (2) the user executed a QMP command to
change some node's backing file (e.g. change-backing-file or
block-commit with @backing-file given) where the given filename does not
happen to coincide with qemu's idea of the backing BDS's filename.
Then again, (1) really is limited to -drive. With -blockdev or
blockdev-add, you have to adhere to the schema, so a user cannot give
partial "unimportant" options (e.g. by just setting backing.node-name
and leaving the rest to the image header). Therefore, trying to fix
this would mean trying to fix something for -drive only.
To improve on (2), we would need a full infrastructure to "canonicalize"
an arbitrary filename (+ options), so it can be compared against
another. That seems a bit over the top, considering that filenames
nowadays are there mostly for the user's entertainment.
Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
Message-id: 20190201192935.18394-5-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
2019-02-01 22:29:08 +03:00
|
|
|
pstrcpy(bs->backing_file, sizeof(bs->backing_file),
|
|
|
|
bs->auto_backing_file);
|
2010-12-06 19:08:00 +03:00
|
|
|
|
|
|
|
if (s->header.features & QED_F_BACKING_FORMAT_NO_PROBE) {
|
|
|
|
pstrcpy(bs->backing_format, sizeof(bs->backing_format), "raw");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Reset unknown autoclear feature bits. This is a backwards
|
|
|
|
* compatibility mechanism that allows images to be opened by older
|
|
|
|
* programs, which "knock out" unknown feature bits. When an image is
|
|
|
|
* opened by a newer program again it can detect that the autoclear
|
|
|
|
* feature is no longer valid.
|
|
|
|
*/
|
|
|
|
if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != 0 &&
|
2016-01-13 17:56:06 +03:00
|
|
|
!bdrv_is_read_only(bs->file->bs) && !(flags & BDRV_O_INACTIVE)) {
|
2010-12-06 19:08:00 +03:00
|
|
|
s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK;
|
|
|
|
|
|
|
|
ret = qed_write_header_sync(s);
|
|
|
|
if (ret) {
|
2021-02-02 15:49:55 +03:00
|
|
|
error_setg(errp, "Failed to update header");
|
2010-12-06 19:08:00 +03:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* From here on only known autoclear feature bits are valid */
|
2015-06-16 15:19:22 +03:00
|
|
|
bdrv_flush(bs->file->bs);
|
2010-12-06 19:08:00 +03:00
|
|
|
}
|
|
|
|
|
2010-12-06 19:08:01 +03:00
|
|
|
s->l1_table = qed_alloc_table(s);
|
|
|
|
qed_init_l2_cache(&s->l2_cache);
|
|
|
|
|
|
|
|
ret = qed_read_l1_table_sync(s);
|
2010-12-06 19:08:03 +03:00
|
|
|
if (ret) {
|
2021-02-02 15:49:55 +03:00
|
|
|
error_setg(errp, "Failed to read L1 table");
|
2010-12-06 19:08:03 +03:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* If image was not closed cleanly, check consistency */
|
2012-08-09 16:05:56 +04:00
|
|
|
if (!(flags & BDRV_O_CHECK) && (s->header.features & QED_F_NEED_CHECK)) {
|
2010-12-06 19:08:03 +03:00
|
|
|
/* Read-only images cannot be fixed. There is no risk of corruption
|
|
|
|
* since write operations are not possible. Therefore, allow
|
|
|
|
* potentially inconsistent images to be opened read-only. This can
|
|
|
|
* aid data recovery from an otherwise inconsistent image.
|
|
|
|
*/
|
2015-06-16 15:19:22 +03:00
|
|
|
if (!bdrv_is_read_only(bs->file->bs) &&
|
2016-01-13 17:56:06 +03:00
|
|
|
!(flags & BDRV_O_INACTIVE)) {
|
2010-12-06 19:08:03 +03:00
|
|
|
BdrvCheckResult result = {0};
|
|
|
|
|
|
|
|
ret = qed_check(s, &result, true);
|
2011-05-09 19:45:40 +04:00
|
|
|
if (ret) {
|
2021-02-02 15:49:55 +03:00
|
|
|
error_setg(errp, "Image corrupted");
|
2011-05-09 19:45:40 +04:00
|
|
|
goto out;
|
|
|
|
}
|
2010-12-06 19:08:03 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-05-08 18:34:45 +04:00
|
|
|
bdrv_qed_attach_aio_context(bs, bdrv_get_aio_context(bs));
|
2011-05-09 19:45:40 +04:00
|
|
|
|
2010-12-06 19:08:03 +03:00
|
|
|
out:
|
2010-12-06 19:08:01 +03:00
|
|
|
if (ret) {
|
|
|
|
qed_free_l2_cache(&s->l2_cache);
|
|
|
|
qemu_vfree(s->l1_table);
|
|
|
|
}
|
2010-12-06 19:08:00 +03:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-03-01 19:36:17 +03:00
|
|
|
typedef struct QEDOpenCo {
|
|
|
|
BlockDriverState *bs;
|
|
|
|
QDict *options;
|
|
|
|
int flags;
|
|
|
|
Error **errp;
|
|
|
|
int ret;
|
|
|
|
} QEDOpenCo;
|
|
|
|
|
|
|
|
/**
 * Coroutine entry point that opens the image under table_lock
 *
 * @opaque:     QEDOpenCo carrying the arguments; its ret field receives the
 *              result of bdrv_qed_do_open()
 */
static void coroutine_fn bdrv_qed_open_entry(void *opaque)
{
    QEDOpenCo *args = opaque;
    BDRVQEDState *qed = args->bs->opaque;

    qemu_co_mutex_lock(&qed->table_lock);
    args->ret = bdrv_qed_do_open(args->bs, args->options, args->flags,
                                 args->errp);
    qemu_co_mutex_unlock(&qed->table_lock);
}
|
|
|
|
|
2016-12-16 20:52:37 +03:00
|
|
|
static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
|
|
|
|
Error **errp)
|
|
|
|
{
|
2018-03-01 19:36:17 +03:00
|
|
|
QEDOpenCo qoc = {
|
|
|
|
.bs = bs,
|
|
|
|
.options = options,
|
|
|
|
.flags = flags,
|
|
|
|
.errp = errp,
|
|
|
|
.ret = -EINPROGRESS
|
|
|
|
};
|
|
|
|
|
2020-05-13 14:05:35 +03:00
|
|
|
bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
|
|
|
|
BDRV_CHILD_IMAGE, false, errp);
|
2016-12-16 20:52:37 +03:00
|
|
|
if (!bs->file) {
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2017-06-29 16:27:46 +03:00
|
|
|
bdrv_qed_init_state(bs);
|
2018-03-01 19:36:17 +03:00
|
|
|
if (qemu_in_coroutine()) {
|
|
|
|
bdrv_qed_open_entry(&qoc);
|
|
|
|
} else {
|
block: Fix hangs in synchronous APIs with iothreads
In the block layer, synchronous APIs are often implemented by creating a
coroutine that calls the asynchronous coroutine-based implementation and
then waiting for completion with BDRV_POLL_WHILE().
For this to work with iothreads (more specifically, when the synchronous
API is called in a thread that is not the home thread of the block
device, so that the coroutine will run in a different thread), we must
make sure to call aio_wait_kick() at the end of the operation. Many
places are missing this, so that BDRV_POLL_WHILE() keeps hanging even if
the condition has long become false.
Note that bdrv_dec_in_flight() involves an aio_wait_kick() call. This
corresponds to the BDRV_POLL_WHILE() in the drain functions, but it is
generally not enough for most other operations because they haven't set
the return value in the coroutine entry stub yet. To avoid race
conditions there, we need to kick after setting the return value.
The race window is small enough that the problem doesn't usually surface
in the common path. However, it does surface and causes easily
reproducible hangs if the operation can return early before even calling
bdrv_inc/dec_in_flight, which many of them do (trivial error or no-op
success paths).
The bug in bdrv_truncate(), bdrv_check() and bdrv_invalidate_cache() is
slightly different: These functions even neglected to schedule the
coroutine in the home thread of the node. This avoids the hang, but is
obviously wrong, too. Fix those to schedule the coroutine in the right
AioContext in addition to adding aio_wait_kick() calls.
Cc: qemu-stable@nongnu.org
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2019-01-07 15:02:48 +03:00
|
|
|
assert(qemu_get_current_aio_context() == qemu_get_aio_context());
|
2018-03-01 19:36:17 +03:00
|
|
|
qemu_coroutine_enter(qemu_coroutine_create(bdrv_qed_open_entry, &qoc));
|
|
|
|
BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS);
|
|
|
|
}
|
|
|
|
BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS);
|
|
|
|
return qoc.ret;
|
2016-12-16 20:52:37 +03:00
|
|
|
}
|
|
|
|
|
2014-07-16 19:48:16 +04:00
|
|
|
/**
 * Publish the write-zeroes limits of a QED image
 *
 * @bs:         Node whose limits are refreshed
 * @errp:       Error object (not used)
 *
 * Write-zeroes requests are aligned to the cluster size.  Their maximum
 * length is INT_MAX rounded down to a cluster boundary because the QED
 * request path still works with int/size_t lengths.
 */
static void bdrv_qed_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BDRVQEDState *state = bs->opaque;

    bs->bl.max_pwrite_zeroes =
        QEMU_ALIGN_DOWN(INT_MAX, state->header.cluster_size);
    bs->bl.pwrite_zeroes_alignment = state->header.cluster_size;
}
|
|
|
|
|
2012-09-20 23:13:27 +04:00
|
|
|
/* We have nothing to do for QED reopen, stubs just return
|
|
|
|
* success */
|
|
|
|
static int bdrv_qed_reopen_prepare(BDRVReopenState *state,
|
|
|
|
BlockReopenQueue *queue, Error **errp)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2010-12-06 19:08:00 +03:00
|
|
|
/**
 * Close a QED image
 *
 * @bs:         Node to close
 *
 * Detaches from the AioContext, flushes the image file, clears the
 * QED_F_NEED_CHECK header bit if it is set, and frees the cached tables.
 */
static void bdrv_qed_close(BlockDriverState *bs)
{
    BDRVQEDState *state = bs->opaque;

    bdrv_qed_detach_aio_context(bs);

    /* Push outstanding writes to stable storage first */
    bdrv_flush(bs->file->bs);

    /* A clean shutdown means the next open does not need a check */
    if (state->header.features & QED_F_NEED_CHECK) {
        state->header.features &= ~QED_F_NEED_CHECK;
        qed_write_header_sync(state);
    }

    /* Release the in-memory L2 cache and L1 table */
    qed_free_l2_cache(&state->l2_cache);
    qemu_vfree(state->l1_table);
}
|
|
|
|
|
2018-03-09 21:53:19 +03:00
|
|
|
/**
 * Create a QED image on an already existing protocol node
 *
 * @opts:       Creation options; opts->driver must be BLOCKDEV_DRIVER_QED
 * @errp:       Error object
 *
 * Missing cluster/table sizes are replaced by their defaults, the options are
 * validated, and then the header, the optional backing filename and a zeroed
 * L1 table are written to the image file.
 *
 * Returns 0 on success or a negative errno value on failure.
 */
static int coroutine_fn bdrv_qed_co_create(BlockdevCreateOptions *opts,
                                           Error **errp)
{
    BlockdevCreateOptionsQed *qed_opts;
    BlockDriverState *bs = NULL;
    BlockBackend *blk = NULL;
    uint8_t *l1_table = NULL;
    QEDHeader header;
    QEDHeader le_header;
    size_t l1_size;
    int ret = 0;

    assert(opts->driver == BLOCKDEV_DRIVER_QED);
    qed_opts = &opts->u.qed;

    /* Fall back to defaults for anything the caller did not specify */
    if (!qed_opts->has_cluster_size) {
        qed_opts->cluster_size = QED_DEFAULT_CLUSTER_SIZE;
    }
    if (!qed_opts->has_table_size) {
        qed_opts->table_size = QED_DEFAULT_TABLE_SIZE;
    }

    /* Reject invalid geometry before touching the image file */
    if (!qed_is_cluster_size_valid(qed_opts->cluster_size)) {
        error_setg(errp, "QED cluster size must be within range [%u, %u] "
                         "and power of 2",
                   QED_MIN_CLUSTER_SIZE, QED_MAX_CLUSTER_SIZE);
        return -EINVAL;
    }
    if (!qed_is_table_size_valid(qed_opts->table_size)) {
        error_setg(errp, "QED table size must be within range [%u, %u] "
                         "and power of 2",
                   QED_MIN_TABLE_SIZE, QED_MAX_TABLE_SIZE);
        return -EINVAL;
    }
    if (!qed_is_image_size_valid(qed_opts->size, qed_opts->cluster_size,
                                 qed_opts->table_size))
    {
        error_setg(errp, "QED image size must be a non-zero multiple of "
                         "cluster size and less than %" PRIu64 " bytes",
                   qed_max_image_size(qed_opts->cluster_size,
                                      qed_opts->table_size));
        return -EINVAL;
    }

    /* Set up a BlockBackend through which the image is written */
    bs = bdrv_open_blockdev_ref(qed_opts->file, errp);
    if (bs == NULL) {
        return -EIO;
    }

    blk = blk_new_with_bs(bs, BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL,
                          errp);
    if (!blk) {
        ret = -EPERM;
        goto cleanup;
    }
    blk_set_allow_write_beyond_eof(blk, true);

    /* Describe the new image; the L1 table follows the header cluster */
    header = (QEDHeader) {
        .magic = QED_MAGIC,
        .cluster_size = qed_opts->cluster_size,
        .table_size = qed_opts->table_size,
        .header_size = 1,
        .features = 0,
        .compat_features = 0,
        .l1_table_offset = qed_opts->cluster_size,
        .image_size = qed_opts->size,
    };

    l1_size = header.cluster_size * header.table_size;

    /*
     * QED derives allocation status from the file length, so the (still
     * empty) image file has to start out with a length of 0.
     */
    ret = blk_truncate(blk, 0, true, PREALLOC_MODE_OFF, 0, errp);
    if (ret < 0) {
        goto cleanup;
    }

    if (qed_opts->has_backing_file) {
        header.features |= QED_F_BACKING_FILE;
        /* The backing filename is stored directly behind the header */
        header.backing_filename_offset = sizeof(le_header);
        header.backing_filename_size = strlen(qed_opts->backing_file);

        if (qed_opts->has_backing_fmt &&
            qed_fmt_is_raw(BlockdevDriver_str(qed_opts->backing_fmt))) {
            header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
        }
    }

    /* Header goes to disk in little-endian form */
    qed_header_cpu_to_le(&header, &le_header);
    ret = blk_pwrite(blk, 0, &le_header, sizeof(le_header), 0);
    if (ret < 0) {
        goto cleanup;
    }

    /* backing_filename_size is 0 when there is no backing file */
    ret = blk_pwrite(blk, sizeof(le_header), qed_opts->backing_file,
                     header.backing_filename_size, 0);
    if (ret < 0) {
        goto cleanup;
    }

    /* An all-zero L1 table completes the empty image */
    l1_table = g_malloc0(l1_size);
    ret = blk_pwrite(blk, header.l1_table_offset, l1_table, l1_size, 0);
    if (ret < 0) {
        goto cleanup;
    }

    ret = 0; /* success */

cleanup:
    g_free(l1_table);
    blk_unref(blk);
    bdrv_unref(bs);
    return ret;
}
|
|
|
|
|
2020-03-26 04:12:17 +03:00
|
|
|
/**
 * Create a QED image from legacy QemuOpts
 *
 * @drv:        Block driver (not used here)
 * @filename:   Name of the image file to create and open
 * @opts:       Creation options in QemuOpts form
 * @errp:       Error object
 *
 * Converts @opts into a BlockdevCreateOptions object, creates and opens the
 * protocol-level file, and hands over to bdrv_qed_co_create().
 *
 * Returns 0 on success or a negative errno value on failure.
 */
static int coroutine_fn bdrv_qed_co_create_opts(BlockDriver *drv,
                                                const char *filename,
                                                QemuOpts *opts,
                                                Error **errp)
{
    /* Legacy option names and the QAPI member names replacing them */
    static const QDictRenames opt_renames[] = {
        { BLOCK_OPT_BACKING_FILE,       "backing-file" },
        { BLOCK_OPT_BACKING_FMT,        "backing-fmt" },
        { BLOCK_OPT_CLUSTER_SIZE,       "cluster-size" },
        { BLOCK_OPT_TABLE_SIZE,         "table-size" },
        { NULL, NULL },
    };

    BlockdevCreateOptions *create_options = NULL;
    BlockDriverState *bs = NULL;
    Visitor *visitor;
    QDict *qdict;
    int ret;

    /* Parse options and translate the legacy key names */
    qdict = qemu_opts_to_qdict_filtered(opts, NULL, &qed_create_opts, true);

    if (!qdict_rename_keys(qdict, opt_renames, errp)) {
        ret = -EINVAL;
        goto done;
    }

    /* Protocol layer: create the file, then open it */
    ret = bdrv_create_file(filename, opts, errp);
    if (ret < 0) {
        goto done;
    }

    bs = bdrv_open(filename, NULL, NULL,
                   BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
    if (bs == NULL) {
        ret = -EIO;
        goto done;
    }

    /* Build the QAPI BlockdevCreateOptions from the dictionary */
    qdict_put_str(qdict, "driver", "qed");
    qdict_put_str(qdict, "file", bs->node_name);

    visitor = qobject_input_visitor_new_flat_confused(qdict, errp);
    if (!visitor) {
        ret = -EINVAL;
        goto done;
    }

    visit_type_BlockdevCreateOptions(visitor, NULL, &create_options, errp);
    visit_free(visitor);
    if (!create_options) {
        ret = -EINVAL;
        goto done;
    }

    /* The requested size is silently rounded up to a whole sector */
    assert(create_options->driver == BLOCKDEV_DRIVER_QED);
    create_options->u.qed.size =
        ROUND_UP(create_options->u.qed.size, BDRV_SECTOR_SIZE);

    /* Format layer: write the actual QED image */
    ret = bdrv_qed_co_create(create_options, errp);

done:
    qobject_unref(qdict);
    bdrv_unref(bs);
    qapi_free_BlockdevCreateOptions(create_options);
    return ret;
}
|
|
|
|
|
2018-02-13 23:26:53 +03:00
|
|
|
/**
 * Report the allocation status of a range of the image
 *
 * @bs:         Node to query
 * @want_zero:  Not used by this driver
 * @pos:        Byte position in device
 * @bytes:      Number of bytes to query
 * @pnum:       Set to the length reported back by qed_find_cluster()
 * @map:        Set to the byte offset in the image file when data was found
 * @file:       Set to the image file node when data was found
 *
 * The lookup is done with table_lock held.
 *
 * Returns a combination of BDRV_BLOCK_* flags, or the negative value that
 * qed_find_cluster() returned.
 */
static int coroutine_fn bdrv_qed_co_block_status(BlockDriverState *bs,
                                                 bool want_zero,
                                                 int64_t pos, int64_t bytes,
                                                 int64_t *pnum, int64_t *map,
                                                 BlockDriverState **file)
{
    BDRVQEDState *s = bs->opaque;
    QEDRequest request = { .l2_table = NULL };
    size_t len = MIN(bytes, SIZE_MAX);
    uint64_t offset;
    int status;
    int ret;

    qemu_co_mutex_lock(&s->table_lock);
    ret = qed_find_cluster(s, &request, pos, &len, &offset);

    *pnum = len;
    if (ret == QED_CLUSTER_FOUND) {
        /* Allocated cluster: report where the data lives in the file */
        *map = offset | qed_offset_into_cluster(s, pos);
        status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
        *file = bs->file->bs;
    } else if (ret == QED_CLUSTER_ZERO) {
        status = BDRV_BLOCK_ZERO;
    } else if (ret == QED_CLUSTER_L2 || ret == QED_CLUSTER_L1) {
        /* No entry in the L2 or L1 table: nothing to report */
        status = 0;
    } else {
        /* Anything else must be an error code */
        assert(ret < 0);
        status = ret;
    }

    qed_unref_l2_cache_entry(request.l2_table);
    qemu_co_mutex_unlock(&s->table_lock);

    return status;
}
|
|
|
|
|
2010-12-06 19:08:02 +03:00
|
|
|
/* Look up the QED driver state (bs->opaque) of the node a request runs on */
static BDRVQEDState *acb_to_s(QEDAIOCB *acb)
{
    return acb->bs->opaque;
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Read from the backing file or zero-fill if no backing file
|
|
|
|
*
|
2014-07-04 19:11:28 +04:00
|
|
|
* @s: QED state
|
|
|
|
* @pos: Byte position in device
|
|
|
|
* @qiov: Destination I/O vector
|
2010-12-06 19:08:02 +03:00
|
|
|
*
|
|
|
|
* This function reads qiov->size bytes starting at pos from the backing file.
|
|
|
|
* If there is no backing file then zeroes are read.
|
|
|
|
*/
|
2017-06-12 12:12:41 +03:00
|
|
|
static int coroutine_fn qed_read_backing_file(BDRVQEDState *s, uint64_t pos,
|
2020-05-28 12:44:05 +03:00
|
|
|
QEMUIOVector *qiov)
|
2010-12-06 19:08:02 +03:00
|
|
|
{
|
2015-06-17 15:55:21 +03:00
|
|
|
if (s->bs->backing) {
|
2020-05-28 12:44:05 +03:00
|
|
|
BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO);
|
|
|
|
return bdrv_co_preadv(s->bs->backing, pos, qiov->size, qiov, 0);
|
2016-11-14 16:56:32 +03:00
|
|
|
}
|
2020-05-28 12:44:05 +03:00
|
|
|
qemu_iovec_memset(qiov, 0, 0, qiov->size);
|
2016-11-14 16:56:32 +03:00
|
|
|
return 0;
|
2010-12-06 19:08:02 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Copy data from backing file into the image
|
|
|
|
*
|
|
|
|
* @s: QED state
|
|
|
|
* @pos: Byte position in device
|
|
|
|
* @len: Number of bytes
|
|
|
|
* @offset: Byte offset in image file
|
|
|
|
*/
|
2017-06-12 12:12:41 +03:00
|
|
|
static int coroutine_fn qed_copy_from_backing_file(BDRVQEDState *s,
|
|
|
|
uint64_t pos, uint64_t len,
|
|
|
|
uint64_t offset)
|
2010-12-06 19:08:02 +03:00
|
|
|
{
|
2016-11-14 16:56:32 +03:00
|
|
|
QEMUIOVector qiov;
|
2016-11-14 16:56:32 +03:00
|
|
|
int ret;
|
2010-12-06 19:08:02 +03:00
|
|
|
|
|
|
|
/* Skip copy entirely if there is no work to do */
|
|
|
|
if (len == 0) {
|
2016-11-15 13:14:01 +03:00
|
|
|
return 0;
|
2010-12-06 19:08:02 +03:00
|
|
|
}
|
|
|
|
|
2019-02-18 17:09:19 +03:00
|
|
|
qemu_iovec_init_buf(&qiov, qemu_blockalign(s->bs, len), len);
|
2016-11-14 16:56:32 +03:00
|
|
|
|
2020-05-28 12:44:05 +03:00
|
|
|
ret = qed_read_backing_file(s, pos, &qiov);
|
2016-11-14 16:56:32 +03:00
|
|
|
|
|
|
|
if (ret) {
|
|
|
|
goto out;
|
|
|
|
}
|
2010-12-06 19:08:02 +03:00
|
|
|
|
2016-11-14 16:56:32 +03:00
|
|
|
BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE);
|
2017-06-16 15:43:19 +03:00
|
|
|
ret = bdrv_co_pwritev(s->bs->file, offset, qiov.size, &qiov, 0);
|
2016-11-14 16:56:32 +03:00
|
|
|
if (ret < 0) {
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
ret = 0;
|
|
|
|
out:
|
2019-02-18 17:09:19 +03:00
|
|
|
qemu_vfree(qemu_iovec_buf(&qiov));
|
2016-11-15 13:14:01 +03:00
|
|
|
return ret;
|
2010-12-06 19:08:02 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Link one or more contiguous clusters into a table
|
|
|
|
*
|
|
|
|
* @s: QED state
|
|
|
|
* @table: L2 table
|
|
|
|
* @index: First cluster index
|
|
|
|
* @n: Number of contiguous clusters
|
2010-12-17 18:58:22 +03:00
|
|
|
* @cluster: First cluster offset
|
|
|
|
*
|
|
|
|
* The cluster offset may be an allocated byte offset in the image file, the
|
|
|
|
* zero cluster marker, or the unallocated cluster marker.
|
2017-06-29 16:27:47 +03:00
|
|
|
*
|
|
|
|
* Called with table_lock held.
|
2010-12-06 19:08:02 +03:00
|
|
|
*/
|
2017-06-12 12:12:41 +03:00
|
|
|
static void coroutine_fn qed_update_l2_table(BDRVQEDState *s, QEDTable *table,
|
|
|
|
int index, unsigned int n,
|
|
|
|
uint64_t cluster)
|
2010-12-06 19:08:02 +03:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
for (i = index; i < index + n; i++) {
|
|
|
|
table->offsets[i] = cluster;
|
2010-12-17 18:58:22 +03:00
|
|
|
if (!qed_offset_is_unalloc_cluster(cluster) &&
|
|
|
|
!qed_offset_is_zero_cluster(cluster)) {
|
|
|
|
cluster += s->header.cluster_size;
|
|
|
|
}
|
2010-12-06 19:08:02 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-06-29 16:27:47 +03:00
|
|
|
/* Called with table_lock held. */
|
2017-06-12 12:12:41 +03:00
|
|
|
static void coroutine_fn qed_aio_complete(QEDAIOCB *acb)
|
2010-12-06 19:08:02 +03:00
|
|
|
{
|
2017-02-13 16:52:31 +03:00
|
|
|
BDRVQEDState *s = acb_to_s(acb);
|
2010-12-06 19:08:02 +03:00
|
|
|
|
|
|
|
/* Free resources */
|
|
|
|
qemu_iovec_destroy(&acb->cur_qiov);
|
|
|
|
qed_unref_l2_cache_entry(acb->request.l2_table);
|
|
|
|
|
2012-02-07 17:27:28 +04:00
|
|
|
/* Free the buffer we may have allocated for zero writes */
|
|
|
|
if (acb->flags & QED_AIOCB_ZERO) {
|
|
|
|
qemu_vfree(acb->qiov->iov[0].iov_base);
|
|
|
|
acb->qiov->iov[0].iov_base = NULL;
|
|
|
|
}
|
|
|
|
|
2010-12-06 19:08:02 +03:00
|
|
|
/* Start next allocating write request waiting behind this one. Note that
|
|
|
|
* requests enqueue themselves when they first hit an unallocated cluster
|
|
|
|
* but they wait until the entire request is finished before waking up the
|
|
|
|
* next request in the queue. This ensures that we don't cycle through
|
|
|
|
* requests multiple times but rather finish one at a time completely.
|
|
|
|
*/
|
2016-11-18 17:32:17 +03:00
|
|
|
if (acb == s->allocating_acb) {
|
|
|
|
s->allocating_acb = NULL;
|
|
|
|
if (!qemu_co_queue_empty(&s->allocating_write_reqs)) {
|
2017-06-29 16:27:47 +03:00
|
|
|
qemu_co_queue_next(&s->allocating_write_reqs);
|
2011-05-09 19:45:40 +04:00
|
|
|
} else if (s->header.features & QED_F_NEED_CHECK) {
|
|
|
|
qed_start_need_check_timer(s);
|
2010-12-06 19:08:02 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2016-11-17 14:51:21 +03:00
|
|
|
* Update L1 table with new L2 table offset and write it out
|
2017-06-29 16:27:47 +03:00
|
|
|
*
|
|
|
|
* Called with table_lock held.
|
2010-12-06 19:08:02 +03:00
|
|
|
*/
|
2017-06-12 12:12:41 +03:00
|
|
|
static int coroutine_fn qed_aio_write_l1_update(QEDAIOCB *acb)
|
2010-12-06 19:08:02 +03:00
|
|
|
{
|
|
|
|
BDRVQEDState *s = acb_to_s(acb);
|
|
|
|
CachedL2Table *l2_table = acb->request.l2_table;
|
2011-09-30 14:39:11 +04:00
|
|
|
uint64_t l2_offset = l2_table->offset;
|
2016-11-17 17:40:41 +03:00
|
|
|
int index, ret;
|
2010-12-06 19:08:02 +03:00
|
|
|
|
2016-11-17 14:51:21 +03:00
|
|
|
index = qed_l1_index(s, acb->cur_pos);
|
|
|
|
s->l1_table->offsets[index] = l2_table->offset;
|
|
|
|
|
|
|
|
ret = qed_write_l1_table(s, index, 1);
|
|
|
|
|
|
|
|
/* Commit the current L2 table to the cache */
|
2010-12-06 19:08:02 +03:00
|
|
|
qed_commit_l2_cache_entry(&s->l2_cache, l2_table);
|
|
|
|
|
|
|
|
/* This is guaranteed to succeed because we just committed the entry to the
|
|
|
|
* cache.
|
|
|
|
*/
|
2011-09-30 14:39:11 +04:00
|
|
|
acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
|
2010-12-06 19:08:02 +03:00
|
|
|
assert(acb->request.l2_table != NULL);
|
|
|
|
|
2016-11-17 17:40:41 +03:00
|
|
|
return ret;
|
2010-12-06 19:08:02 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Update L2 table with new cluster offsets and write them out
|
2017-06-29 16:27:47 +03:00
|
|
|
*
|
|
|
|
* Called with table_lock held.
|
2010-12-06 19:08:02 +03:00
|
|
|
*/
|
2017-06-12 12:12:41 +03:00
|
|
|
static int coroutine_fn qed_aio_write_l2_update(QEDAIOCB *acb, uint64_t offset)
|
2010-12-06 19:08:02 +03:00
|
|
|
{
|
|
|
|
BDRVQEDState *s = acb_to_s(acb);
|
|
|
|
bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1;
|
2016-11-17 17:40:41 +03:00
|
|
|
int index, ret;
|
2010-12-06 19:08:02 +03:00
|
|
|
|
|
|
|
if (need_alloc) {
|
|
|
|
qed_unref_l2_cache_entry(acb->request.l2_table);
|
|
|
|
acb->request.l2_table = qed_new_l2_table(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
index = qed_l2_index(s, acb->cur_pos);
|
|
|
|
qed_update_l2_table(s, acb->request.l2_table->table, index, acb->cur_nclusters,
|
2012-02-07 17:27:28 +04:00
|
|
|
offset);
|
2010-12-06 19:08:02 +03:00
|
|
|
|
|
|
|
if (need_alloc) {
|
|
|
|
/* Write out the whole new L2 table */
|
2016-11-15 13:14:01 +03:00
|
|
|
ret = qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true);
|
2016-11-17 17:40:41 +03:00
|
|
|
if (ret) {
|
2016-11-17 17:40:41 +03:00
|
|
|
return ret;
|
2016-11-17 17:40:41 +03:00
|
|
|
}
|
2016-11-17 17:40:41 +03:00
|
|
|
return qed_aio_write_l1_update(acb);
|
2010-12-06 19:08:02 +03:00
|
|
|
} else {
|
|
|
|
/* Write out only the updated part of the L2 table */
|
2016-11-15 13:14:01 +03:00
|
|
|
ret = qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters,
|
|
|
|
false);
|
2016-11-17 17:40:41 +03:00
|
|
|
if (ret) {
|
|
|
|
return ret;
|
|
|
|
}
|
2010-12-06 19:08:02 +03:00
|
|
|
}
|
2016-11-17 17:40:41 +03:00
|
|
|
return 0;
|
2010-12-06 19:08:02 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Write data to the image file
|
2017-06-29 16:27:47 +03:00
|
|
|
*
|
|
|
|
* Called with table_lock *not* held.
|
2010-12-06 19:08:02 +03:00
|
|
|
*/
|
2017-06-12 12:12:41 +03:00
|
|
|
static int coroutine_fn qed_aio_write_main(QEDAIOCB *acb)
|
2010-12-06 19:08:02 +03:00
|
|
|
{
|
|
|
|
BDRVQEDState *s = acb_to_s(acb);
|
|
|
|
uint64_t offset = acb->cur_cluster +
|
|
|
|
qed_offset_into_cluster(s, acb->cur_pos);
|
|
|
|
|
2016-11-17 17:40:41 +03:00
|
|
|
trace_qed_aio_write_main(s, acb, 0, offset, acb->cur_qiov.size);
|
2010-12-06 19:08:02 +03:00
|
|
|
|
2016-11-14 16:56:32 +03:00
|
|
|
BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
|
2017-06-29 16:27:44 +03:00
|
|
|
return bdrv_co_pwritev(s->bs->file, offset, acb->cur_qiov.size,
|
|
|
|
&acb->cur_qiov, 0);
|
2010-12-06 19:08:02 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2016-11-15 13:14:01 +03:00
|
|
|
* Populate untouched regions of new data cluster
|
2017-06-29 16:27:47 +03:00
|
|
|
*
|
|
|
|
* Called with table_lock held.
|
2010-12-06 19:08:02 +03:00
|
|
|
*/
|
2017-06-12 12:12:41 +03:00
|
|
|
static int coroutine_fn qed_aio_write_cow(QEDAIOCB *acb)
|
2010-12-06 19:08:02 +03:00
|
|
|
{
|
|
|
|
BDRVQEDState *s = acb_to_s(acb);
|
2016-11-15 13:14:01 +03:00
|
|
|
uint64_t start, len, offset;
|
2016-11-17 17:40:41 +03:00
|
|
|
int ret;
|
2010-12-06 19:08:02 +03:00
|
|
|
|
2017-06-29 16:27:47 +03:00
|
|
|
qemu_co_mutex_unlock(&s->table_lock);
|
|
|
|
|
2016-11-15 13:14:01 +03:00
|
|
|
/* Populate front untouched region of new data cluster */
|
|
|
|
start = qed_start_of_cluster(s, acb->cur_pos);
|
|
|
|
len = qed_offset_into_cluster(s, acb->cur_pos);
|
|
|
|
|
|
|
|
trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster);
|
|
|
|
ret = qed_copy_from_backing_file(s, start, len, acb->cur_cluster);
|
2016-11-17 17:40:41 +03:00
|
|
|
if (ret < 0) {
|
2017-06-29 16:27:47 +03:00
|
|
|
goto out;
|
2010-12-06 19:08:02 +03:00
|
|
|
}
|
|
|
|
|
2016-11-15 13:14:01 +03:00
|
|
|
/* Populate back untouched region of new data cluster */
|
|
|
|
start = acb->cur_pos + acb->cur_qiov.size;
|
|
|
|
len = qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start;
|
|
|
|
offset = acb->cur_cluster +
|
|
|
|
qed_offset_into_cluster(s, acb->cur_pos) +
|
|
|
|
acb->cur_qiov.size;
|
2010-12-06 19:08:02 +03:00
|
|
|
|
2016-11-15 13:14:01 +03:00
|
|
|
trace_qed_aio_write_postfill(s, acb, start, len, offset);
|
|
|
|
ret = qed_copy_from_backing_file(s, start, len, offset);
|
2016-11-17 17:40:41 +03:00
|
|
|
if (ret < 0) {
|
2017-06-29 16:27:47 +03:00
|
|
|
goto out;
|
2016-11-17 17:40:41 +03:00
|
|
|
}
|
2016-11-17 17:40:41 +03:00
|
|
|
|
2017-06-29 16:27:44 +03:00
|
|
|
ret = qed_aio_write_main(acb);
|
|
|
|
if (ret < 0) {
|
2017-06-29 16:27:47 +03:00
|
|
|
goto out;
|
2017-06-29 16:27:44 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
if (s->bs->backing) {
|
|
|
|
/*
|
|
|
|
* Flush new data clusters before updating the L2 table
|
|
|
|
*
|
|
|
|
* This flush is necessary when a backing file is in use. A crash
|
|
|
|
* during an allocating write could result in empty clusters in the
|
|
|
|
* image. If the write only touched a subregion of the cluster,
|
|
|
|
* then backing image sectors have been lost in the untouched
|
|
|
|
* region. The solution is to flush after writing a new data
|
|
|
|
* cluster and before updating the L2 table.
|
|
|
|
*/
|
|
|
|
ret = bdrv_co_flush(s->bs->file->bs);
|
|
|
|
}
|
|
|
|
|
2017-06-29 16:27:47 +03:00
|
|
|
out:
|
|
|
|
qemu_co_mutex_lock(&s->table_lock);
|
|
|
|
return ret;
|
2010-12-06 19:08:02 +03:00
|
|
|
}
|
|
|
|
|
2011-01-28 20:11:59 +03:00
|
|
|
/**
|
|
|
|
* Check if the QED_F_NEED_CHECK bit should be set during allocating write
|
|
|
|
*/
|
|
|
|
static bool qed_should_set_need_check(BDRVQEDState *s)
|
|
|
|
{
|
|
|
|
/* The flush before L2 update path ensures consistency */
|
2015-06-17 15:55:21 +03:00
|
|
|
if (s->bs->backing) {
|
2011-01-28 20:11:59 +03:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
return !(s->header.features & QED_F_NEED_CHECK);
|
|
|
|
}
|
|
|
|
|
2010-12-06 19:08:02 +03:00
|
|
|
/**
|
|
|
|
* Write new data cluster
|
|
|
|
*
|
|
|
|
* @acb: Write request
|
|
|
|
* @len: Length in bytes
|
|
|
|
*
|
|
|
|
* This path is taken when writing to previously unallocated clusters.
|
2017-06-29 16:27:47 +03:00
|
|
|
*
|
|
|
|
* Called with table_lock held.
|
2010-12-06 19:08:02 +03:00
|
|
|
*/
|
2017-06-12 12:12:41 +03:00
|
|
|
static int coroutine_fn qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
|
2010-12-06 19:08:02 +03:00
|
|
|
{
|
|
|
|
BDRVQEDState *s = acb_to_s(acb);
|
2016-11-15 13:14:01 +03:00
|
|
|
int ret;
|
2010-12-06 19:08:02 +03:00
|
|
|
|
2011-05-09 19:45:40 +04:00
|
|
|
/* Cancel timer when the first allocating request comes in */
|
2016-11-18 17:32:17 +03:00
|
|
|
if (s->allocating_acb == NULL) {
|
2011-05-09 19:45:40 +04:00
|
|
|
qed_cancel_need_check_timer(s);
|
|
|
|
}
|
|
|
|
|
2010-12-06 19:08:02 +03:00
|
|
|
/* Freeze this request if another allocating write is in progress */
|
2016-11-18 17:32:17 +03:00
|
|
|
if (s->allocating_acb != acb || s->allocating_write_reqs_plugged) {
|
|
|
|
if (s->allocating_acb != NULL) {
|
2017-06-29 16:27:47 +03:00
|
|
|
qemu_co_queue_wait(&s->allocating_write_reqs, &s->table_lock);
|
2016-11-18 17:32:17 +03:00
|
|
|
assert(s->allocating_acb == NULL);
|
|
|
|
}
|
|
|
|
s->allocating_acb = acb;
|
|
|
|
return -EAGAIN; /* start over with looking up table entries */
|
2010-12-06 19:08:02 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
acb->cur_nclusters = qed_bytes_to_clusters(s,
|
|
|
|
qed_offset_into_cluster(s, acb->cur_pos) + len);
|
consolidate qemu_iovec_copy() and qemu_iovec_concat() and make them consistent
qemu_iovec_concat() is currently a wrapper for
qemu_iovec_copy(), use the former (with extra
"0" arg) in a few places where it is used.
Change skip argument of qemu_iovec_copy() from
uint64_t to size_t, since size of qiov itself
is size_t, so there's no way to skip larger
sizes. Rename it to soffset, to make it clear
that the offset is applied to src.
Also change the only usage of uint64_t in
hw/9pfs/virtio-9p.c, in v9fs_init_qiov_from_pdu() -
all callers of it actually uses size_t too,
not uint64_t.
One added restriction: as for all other iovec-related
functions, soffset must point inside src.
Order of argumens is already good:
qemu_iovec_memset(QEMUIOVector *qiov, size_t offset,
int c, size_t bytes)
vs:
qemu_iovec_concat(QEMUIOVector *dst,
QEMUIOVector *src,
size_t soffset, size_t sbytes)
(note soffset is after _src_ not dst, since it applies to src;
for memset it applies to qiov).
Note that in many places where this function is used,
the previous call is qemu_iovec_reset(), which means
many callers actually want copy (replacing dst content),
not concat. So we may want to add a wrapper like
qemu_iovec_copy() with the same arguments but which
calls qemu_iovec_reset() before _concat().
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
2012-03-12 21:28:06 +04:00
|
|
|
qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
|
2010-12-06 19:08:02 +03:00
|
|
|
|
2012-02-07 17:27:28 +04:00
|
|
|
if (acb->flags & QED_AIOCB_ZERO) {
|
|
|
|
/* Skip ahead if the clusters are already zero */
|
|
|
|
if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
|
2016-11-17 17:40:41 +03:00
|
|
|
return 0;
|
2012-02-07 17:27:28 +04:00
|
|
|
}
|
2017-06-29 16:27:44 +03:00
|
|
|
acb->cur_cluster = 1;
|
2012-02-07 17:27:28 +04:00
|
|
|
} else {
|
|
|
|
acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
|
|
|
|
}
|
|
|
|
|
2011-01-28 20:11:59 +03:00
|
|
|
if (qed_should_set_need_check(s)) {
|
|
|
|
s->header.features |= QED_F_NEED_CHECK;
|
2016-11-15 13:14:01 +03:00
|
|
|
ret = qed_write_header(s);
|
2016-11-17 17:40:41 +03:00
|
|
|
if (ret < 0) {
|
2016-11-17 17:40:41 +03:00
|
|
|
return ret;
|
2016-11-17 17:40:41 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-06-29 16:27:44 +03:00
|
|
|
if (!(acb->flags & QED_AIOCB_ZERO)) {
|
2016-11-17 17:40:41 +03:00
|
|
|
ret = qed_aio_write_cow(acb);
|
2017-06-29 16:27:44 +03:00
|
|
|
if (ret < 0) {
|
|
|
|
return ret;
|
|
|
|
}
|
2010-12-06 19:08:03 +03:00
|
|
|
}
|
2017-06-29 16:27:44 +03:00
|
|
|
|
|
|
|
return qed_aio_write_l2_update(acb, acb->cur_cluster);
|
2010-12-06 19:08:02 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Write data cluster in place
|
|
|
|
*
|
|
|
|
* @acb: Write request
|
|
|
|
* @offset: Cluster offset in bytes
|
|
|
|
* @len: Length in bytes
|
|
|
|
*
|
|
|
|
* This path is taken when writing to already allocated clusters.
|
2017-06-29 16:27:47 +03:00
|
|
|
*
|
|
|
|
* Called with table_lock held.
|
2010-12-06 19:08:02 +03:00
|
|
|
*/
|
2017-06-12 12:12:41 +03:00
|
|
|
static int coroutine_fn qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset,
|
|
|
|
size_t len)
|
2010-12-06 19:08:02 +03:00
|
|
|
{
|
2017-06-29 16:27:47 +03:00
|
|
|
BDRVQEDState *s = acb_to_s(acb);
|
|
|
|
int r;
|
|
|
|
|
|
|
|
qemu_co_mutex_unlock(&s->table_lock);
|
|
|
|
|
2012-02-07 17:27:28 +04:00
|
|
|
/* Allocate buffer for zero writes */
|
|
|
|
if (acb->flags & QED_AIOCB_ZERO) {
|
|
|
|
struct iovec *iov = acb->qiov->iov;
|
|
|
|
|
|
|
|
if (!iov->iov_base) {
|
2016-11-18 16:47:36 +03:00
|
|
|
iov->iov_base = qemu_try_blockalign(acb->bs, iov->iov_len);
|
2014-05-20 15:39:57 +04:00
|
|
|
if (iov->iov_base == NULL) {
|
2017-06-29 16:27:47 +03:00
|
|
|
r = -ENOMEM;
|
|
|
|
goto out;
|
2014-05-20 15:39:57 +04:00
|
|
|
}
|
2012-02-07 17:27:28 +04:00
|
|
|
memset(iov->iov_base, 0, iov->iov_len);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-12-06 19:08:02 +03:00
|
|
|
/* Calculate the I/O vector */
|
|
|
|
acb->cur_cluster = offset;
|
consolidate qemu_iovec_copy() and qemu_iovec_concat() and make them consistent
qemu_iovec_concat() is currently a wrapper for
qemu_iovec_copy(), use the former (with extra
"0" arg) in a few places where it is used.
Change skip argument of qemu_iovec_copy() from
uint64_t to size_t, since size of qiov itself
is size_t, so there's no way to skip larger
sizes. Rename it to soffset, to make it clear
that the offset is applied to src.
Also change the only usage of uint64_t in
hw/9pfs/virtio-9p.c, in v9fs_init_qiov_from_pdu() -
all callers of it actually uses size_t too,
not uint64_t.
One added restriction: as for all other iovec-related
functions, soffset must point inside src.
Order of argumens is already good:
qemu_iovec_memset(QEMUIOVector *qiov, size_t offset,
int c, size_t bytes)
vs:
qemu_iovec_concat(QEMUIOVector *dst,
QEMUIOVector *src,
size_t soffset, size_t sbytes)
(note soffset is after _src_ not dst, since it applies to src;
for memset it applies to qiov).
Note that in many places where this function is used,
the previous call is qemu_iovec_reset(), which means
many callers actually want copy (replacing dst content),
not concat. So we may want to add a wrapper like
qemu_iovec_copy() with the same arguments but which
calls qemu_iovec_reset() before _concat().
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
2012-03-12 21:28:06 +04:00
|
|
|
qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
|
2010-12-06 19:08:02 +03:00
|
|
|
|
2017-06-29 16:27:47 +03:00
|
|
|
/* Do the actual write. */
|
|
|
|
r = qed_aio_write_main(acb);
|
|
|
|
out:
|
|
|
|
qemu_co_mutex_lock(&s->table_lock);
|
|
|
|
return r;
|
2010-12-06 19:08:02 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Write data cluster
|
|
|
|
*
|
|
|
|
* @opaque: Write request
|
2016-11-17 17:40:41 +03:00
|
|
|
* @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2 or QED_CLUSTER_L1
|
2010-12-06 19:08:02 +03:00
|
|
|
* @offset: Cluster offset in bytes
|
|
|
|
* @len: Length in bytes
|
2017-06-29 16:27:47 +03:00
|
|
|
*
|
|
|
|
* Called with table_lock held.
|
2010-12-06 19:08:02 +03:00
|
|
|
*/
|
2017-06-12 12:12:41 +03:00
|
|
|
static int coroutine_fn qed_aio_write_data(void *opaque, int ret,
|
|
|
|
uint64_t offset, size_t len)
|
2010-12-06 19:08:02 +03:00
|
|
|
{
|
|
|
|
QEDAIOCB *acb = opaque;
|
|
|
|
|
|
|
|
trace_qed_aio_write_data(acb_to_s(acb), acb, ret, offset, len);
|
|
|
|
|
|
|
|
acb->find_cluster_ret = ret;
|
|
|
|
|
|
|
|
switch (ret) {
|
|
|
|
case QED_CLUSTER_FOUND:
|
2016-11-17 17:40:41 +03:00
|
|
|
return qed_aio_write_inplace(acb, offset, len);
|
2010-12-06 19:08:02 +03:00
|
|
|
|
|
|
|
case QED_CLUSTER_L2:
|
|
|
|
case QED_CLUSTER_L1:
|
2010-12-17 18:58:22 +03:00
|
|
|
case QED_CLUSTER_ZERO:
|
2016-11-17 17:40:41 +03:00
|
|
|
return qed_aio_write_alloc(acb, len);
|
2010-12-06 19:08:02 +03:00
|
|
|
|
|
|
|
default:
|
2016-11-17 17:40:41 +03:00
|
|
|
g_assert_not_reached();
|
2016-11-17 17:40:41 +03:00
|
|
|
}
|
2010-12-06 19:08:02 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Read data cluster
|
|
|
|
*
|
|
|
|
* @opaque: Read request
|
2016-11-17 17:40:41 +03:00
|
|
|
* @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2 or QED_CLUSTER_L1
|
2010-12-06 19:08:02 +03:00
|
|
|
* @offset: Cluster offset in bytes
|
|
|
|
* @len: Length in bytes
|
2017-06-29 16:27:47 +03:00
|
|
|
*
|
|
|
|
* Called with table_lock held.
|
2010-12-06 19:08:02 +03:00
|
|
|
*/
|
2017-06-12 12:12:41 +03:00
|
|
|
static int coroutine_fn qed_aio_read_data(void *opaque, int ret,
|
|
|
|
uint64_t offset, size_t len)
|
2010-12-06 19:08:02 +03:00
|
|
|
{
|
|
|
|
QEDAIOCB *acb = opaque;
|
|
|
|
BDRVQEDState *s = acb_to_s(acb);
|
2016-11-18 16:47:36 +03:00
|
|
|
BlockDriverState *bs = acb->bs;
|
2017-06-29 16:27:47 +03:00
|
|
|
int r;
|
|
|
|
|
|
|
|
qemu_co_mutex_unlock(&s->table_lock);
|
2010-12-06 19:08:02 +03:00
|
|
|
|
|
|
|
/* Adjust offset into cluster */
|
|
|
|
offset += qed_offset_into_cluster(s, acb->cur_pos);
|
|
|
|
|
|
|
|
trace_qed_aio_read_data(s, acb, ret, offset, len);
|
|
|
|
|
consolidate qemu_iovec_copy() and qemu_iovec_concat() and make them consistent
qemu_iovec_concat() is currently a wrapper for
qemu_iovec_copy(), use the former (with extra
"0" arg) in a few places where it is used.
Change skip argument of qemu_iovec_copy() from
uint64_t to size_t, since size of qiov itself
is size_t, so there's no way to skip larger
sizes. Rename it to soffset, to make it clear
that the offset is applied to src.
Also change the only usage of uint64_t in
hw/9pfs/virtio-9p.c, in v9fs_init_qiov_from_pdu() -
all callers of it actually uses size_t too,
not uint64_t.
One added restriction: as for all other iovec-related
functions, soffset must point inside src.
Order of argumens is already good:
qemu_iovec_memset(QEMUIOVector *qiov, size_t offset,
int c, size_t bytes)
vs:
qemu_iovec_concat(QEMUIOVector *dst,
QEMUIOVector *src,
size_t soffset, size_t sbytes)
(note soffset is after _src_ not dst, since it applies to src;
for memset it applies to qiov).
Note that in many places where this function is used,
the previous call is qemu_iovec_reset(), which means
many callers actually want copy (replacing dst content),
not concat. So we may want to add a wrapper like
qemu_iovec_copy() with the same arguments but which
calls qemu_iovec_reset() before _concat().
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
2012-03-12 21:28:06 +04:00
|
|
|
qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
|
2010-12-06 19:08:02 +03:00
|
|
|
|
2017-06-29 16:27:47 +03:00
|
|
|
/* Handle zero cluster and backing file reads, otherwise read
|
|
|
|
* data cluster directly.
|
|
|
|
*/
|
2010-12-17 18:58:22 +03:00
|
|
|
if (ret == QED_CLUSTER_ZERO) {
|
consolidate qemu_iovec_memset{,_skip}() into single function and use existing iov_memset()
This patch combines two functions into one, and replaces
the implementation with already existing iov_memset() from
iov.c.
The new prototype of qemu_iovec_memset():
size_t qemu_iovec_memset(qiov, size_t offset, int fillc, size_t bytes)
It is different from former qemu_iovec_memset_skip(), and
I want to make other functions to be consistent with it
too: first how much to skip, second what, and 3rd how many
of it. It also returns actual number of bytes filled in,
which may be less than the requested `bytes' if qiov is
smaller than offset+bytes, in the same way iov_memset()
does.
While at it, use utility function iov_memset() from
iov.h in posix-aio-compat.c, where qiov was used.
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
2012-03-10 16:54:23 +04:00
|
|
|
qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size);
|
2017-06-29 16:27:47 +03:00
|
|
|
r = 0;
|
2010-12-17 18:58:22 +03:00
|
|
|
} else if (ret != QED_CLUSTER_FOUND) {
|
2020-05-28 12:44:05 +03:00
|
|
|
r = qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov);
|
2017-06-29 16:27:47 +03:00
|
|
|
} else {
|
|
|
|
BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
|
|
|
|
r = bdrv_co_preadv(bs->file, offset, acb->cur_qiov.size,
|
|
|
|
&acb->cur_qiov, 0);
|
2010-12-06 19:08:02 +03:00
|
|
|
}
|
|
|
|
|
2017-06-29 16:27:47 +03:00
|
|
|
qemu_co_mutex_lock(&s->table_lock);
|
|
|
|
return r;
|
2010-12-06 19:08:02 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Begin next I/O or complete the request
|
|
|
|
*/
|
2017-06-12 12:12:41 +03:00
|
|
|
static int coroutine_fn qed_aio_next_io(QEDAIOCB *acb)
|
2010-12-06 19:08:02 +03:00
|
|
|
{
|
|
|
|
BDRVQEDState *s = acb_to_s(acb);
|
2016-11-14 18:56:10 +03:00
|
|
|
uint64_t offset;
|
|
|
|
size_t len;
|
2016-11-18 15:40:13 +03:00
|
|
|
int ret;
|
2010-12-06 19:08:02 +03:00
|
|
|
|
2017-06-29 16:27:47 +03:00
|
|
|
qemu_co_mutex_lock(&s->table_lock);
|
2016-11-18 16:16:42 +03:00
|
|
|
while (1) {
|
|
|
|
trace_qed_aio_next_io(s, acb, 0, acb->cur_pos + acb->cur_qiov.size);
|
2010-12-06 19:08:02 +03:00
|
|
|
|
2016-11-18 16:16:42 +03:00
|
|
|
acb->qiov_offset += acb->cur_qiov.size;
|
|
|
|
acb->cur_pos += acb->cur_qiov.size;
|
|
|
|
qemu_iovec_reset(&acb->cur_qiov);
|
2010-12-06 19:08:02 +03:00
|
|
|
|
2016-11-18 16:16:42 +03:00
|
|
|
/* Complete request */
|
|
|
|
if (acb->cur_pos >= acb->end_pos) {
|
2016-11-18 16:47:36 +03:00
|
|
|
ret = 0;
|
|
|
|
break;
|
2016-11-18 16:16:42 +03:00
|
|
|
}
|
2010-12-06 19:08:02 +03:00
|
|
|
|
2016-11-18 16:16:42 +03:00
|
|
|
/* Find next cluster and start I/O */
|
|
|
|
len = acb->end_pos - acb->cur_pos;
|
|
|
|
ret = qed_find_cluster(s, &acb->request, acb->cur_pos, &len, &offset);
|
|
|
|
if (ret < 0) {
|
2016-11-18 16:47:36 +03:00
|
|
|
break;
|
2016-11-18 16:16:42 +03:00
|
|
|
}
|
2016-11-17 17:40:41 +03:00
|
|
|
|
2016-11-18 16:16:42 +03:00
|
|
|
if (acb->flags & QED_AIOCB_WRITE) {
|
|
|
|
ret = qed_aio_write_data(acb, ret, offset, len);
|
|
|
|
} else {
|
|
|
|
ret = qed_aio_read_data(acb, ret, offset, len);
|
|
|
|
}
|
2016-11-17 17:40:41 +03:00
|
|
|
|
2016-11-18 17:32:17 +03:00
|
|
|
if (ret < 0 && ret != -EAGAIN) {
|
2016-11-18 16:47:36 +03:00
|
|
|
break;
|
2016-11-17 17:40:41 +03:00
|
|
|
}
|
|
|
|
}
|
2010-12-06 19:08:02 +03:00
|
|
|
|
2016-11-18 16:47:36 +03:00
|
|
|
trace_qed_aio_complete(s, acb, ret);
|
|
|
|
qed_aio_complete(acb);
|
2017-06-29 16:27:47 +03:00
|
|
|
qemu_co_mutex_unlock(&s->table_lock);
|
2016-11-18 16:47:36 +03:00
|
|
|
return ret;
|
2016-11-14 16:20:00 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
 * Build a request for a sector range and run it to completion
 *
 * @bs:         Block device the request is issued on
 * @sector_num: First sector of the request
 * @qiov:       I/O vector holding the data for the request
 * @nb_sectors: Number of sectors
 * @flags:      QED_AIOCB_* flags describing the request
 *
 * Returns the result of qed_aio_next_io().
 */
static int coroutine_fn qed_co_request(BlockDriverState *bs, int64_t sector_num,
                                       QEMUIOVector *qiov, int nb_sectors,
                                       int flags)
{
    /* Members not named here start out zero-initialized */
    QEDAIOCB acb = {
        .bs      = bs,
        .qiov    = qiov,
        .flags   = flags,
        .cur_pos = (uint64_t) sector_num * BDRV_SECTOR_SIZE,
        .end_pos = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE,
    };

    /* The per-step vector can need as many entries as the caller's one */
    qemu_iovec_init(&acb.cur_qiov, qiov->niov);

    trace_qed_aio_setup(bs->opaque, &acb, sector_num, nb_sectors, NULL, flags);

    /* Kick off processing */
    return qed_aio_next_io(&acb);
}
|
|
|
|
|
2016-11-14 16:20:00 +03:00
|
|
|
/**
 * Sector-based read: a request with no QED_AIOCB_* flags set
 */
static int coroutine_fn bdrv_qed_co_readv(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *qiov)
{
    const int read_flags = 0;

    return qed_co_request(bs, sector_num, qiov, nb_sectors, read_flags);
}
|
|
|
|
|
2016-11-14 16:20:00 +03:00
|
|
|
/**
 * Sector-based write: a request flagged QED_AIOCB_WRITE
 *
 * No request flags are accepted; @flags must be zero.
 */
static int coroutine_fn bdrv_qed_co_writev(BlockDriverState *bs,
                                           int64_t sector_num, int nb_sectors,
                                           QEMUIOVector *qiov, int flags)
{
    const int write_flags = QED_AIOCB_WRITE;

    assert(!flags);

    return qed_co_request(bs, sector_num, qiov, nb_sectors, write_flags);
}
|
|
|
|
|
2016-06-02 00:10:09 +03:00
|
|
|
/**
 * Write zeroes to a cluster-aligned byte range
 *
 * @bs:         Block device
 * @offset:     Start of the range in bytes
 * @bytes:      Length of the range in bytes; must not exceed INT_MAX
 * @flags:      Request flags (not consulted here)
 *
 * Returns -ENOTSUP when @offset or @bytes is not a multiple of the cluster
 * size, otherwise the result of the zero write request.
 */
static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs,
                                                  int64_t offset,
                                                  int64_t bytes,
                                                  BdrvRequestFlags flags)
{
    BDRVQEDState *s = bs->opaque;

    /*
     * The request starts out with a NULL buffer; one is allocated during
     * request processing only if it turns out to be needed.
     */
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, bytes);
    bool unaligned;

    /*
     * QED is not prepared for 63bit write-zero requests, so rely on
     * max_pwrite_zeroes.
     */
    assert(bytes <= INT_MAX);

    /* Both ends of the range have to sit on a cluster boundary */
    unaligned = qed_offset_into_cluster(s, offset) ||
                qed_offset_into_cluster(s, bytes);
    if (unaligned) {
        return -ENOTSUP;
    }

    return qed_co_request(bs,
                          offset >> BDRV_SECTOR_BITS,
                          &qiov,
                          bytes >> BDRV_SECTOR_BITS,
                          QED_AIOCB_WRITE | QED_AIOCB_ZERO);
}
|
|
|
|
|
block: Convert .bdrv_truncate callback to coroutine_fn
bdrv_truncate() is an operation that can block (even for a quite long
time, depending on the PreallocMode) in I/O paths that shouldn't block.
Convert it to a coroutine_fn so that we have the infrastructure for
drivers to make their .bdrv_co_truncate implementation asynchronous.
This change could potentially introduce new race conditions because
bdrv_truncate() isn't necessarily executed atomically any more. Whether
this is a problem needs to be evaluated for each block driver that
supports truncate:
* file-posix/win32, gluster, iscsi, nfs, rbd, ssh, sheepdog: The
protocol drivers are trivially safe because they don't actually yield
yet, so there is no change in behaviour.
* copy-on-read, crypto, raw-format: Essentially just filter drivers that
pass the request to a child node, no problem.
* qcow2: The implementation modifies metadata, so it needs to hold
s->lock to be safe with concurrent I/O requests. In order to avoid
double locking, this requires pulling the locking out into
preallocate_co() and using qcow2_write_caches() instead of
bdrv_flush().
* qed: Does a single header update, this is fine without locking.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2018-06-21 18:54:35 +03:00
|
|
|
/**
 * Grow the image to a new virtual size
 *
 * @bs:         Block device
 * @offset:     New image size in bytes
 * @exact:      Not consulted by this implementation
 * @prealloc:   Preallocation mode; only PREALLOC_MODE_OFF is accepted
 * @flags:      Not consulted by this implementation
 * @errp:       Error object filled in on failure
 *
 * Returns -ENOTSUP for any preallocation mode other than off and for
 * shrinking, -EINVAL for a size that fails qed_is_image_size_valid(), and
 * otherwise the result of writing the updated header.  If the header write
 * fails, the in-memory image size is put back to its previous value.
 */
static int coroutine_fn bdrv_qed_co_truncate(BlockDriverState *bs,
                                             int64_t offset,
                                             bool exact,
                                             PreallocMode prealloc,
                                             BdrvRequestFlags flags,
                                             Error **errp)
{
    BDRVQEDState *s = bs->opaque;
    uint64_t previous_size;
    int ret;

    if (prealloc != PREALLOC_MODE_OFF) {
        error_setg(errp, "Unsupported preallocation mode '%s'",
                   PreallocMode_str(prealloc));
        return -ENOTSUP;
    }

    if (!qed_is_image_size_valid(offset, s->header.cluster_size,
                                 s->header.table_size)) {
        error_setg(errp, "Invalid image size specified");
        return -EINVAL;
    }

    if ((uint64_t)offset < s->header.image_size) {
        error_setg(errp, "Shrinking images is currently not supported");
        return -ENOTSUP;
    }

    /* Remember the current size so a failed header write can be undone */
    previous_size = s->header.image_size;
    s->header.image_size = offset;

    ret = qed_write_header_sync(s);
    if (ret >= 0) {
        return ret;
    }

    s->header.image_size = previous_size;
    error_setg_errno(errp, -ret, "Failed to update the image size");
    return ret;
}
|
|
|
|
|
|
|
|
/*
 * Return the image size stored in the header kept in the driver state.
 */
static int64_t bdrv_qed_getlength(BlockDriverState *bs)
{
    const BDRVQEDState *qed = bs->opaque;

    return qed->header.image_size;
}
|
|
|
|
|
|
|
|
/*
 * Fill in @bdi for this image.
 *
 * @bdi is cleared first; afterwards the cluster size is copied from the
 * header and the image is flagged dirty when QED_F_NEED_CHECK is set in the
 * header's feature bits.
 *
 * Always returns 0.
 */
static int bdrv_qed_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    const BDRVQEDState *qed = bs->opaque;

    /* Start from a clean slate so fields not set below read as zero */
    memset(bdi, 0, sizeof(*bdi));

    bdi->is_dirty = qed->header.features & QED_F_NEED_CHECK;
    bdi->cluster_size = qed->header.cluster_size;

    return 0;
}
|
|
|
|
|
|
|
|
/*
 * Rewrite the image header so that it refers to @backing_file, or to no
 * backing file at all when @backing_file is NULL.
 *
 * The new header is assembled in a private copy and written, together with
 * the backing filename that directly follows it, at offset 0 of bs->file.
 * The header kept in the driver state is only replaced once the write has
 * returned 0.
 *
 * @backing_fmt is only consulted via qed_fmt_is_raw() to decide whether
 * QED_F_BACKING_FORMAT_NO_PROBE must be set.
 *
 * Returns 0 on success, -ENOTSUP if a backing file is requested while
 * unknown compat feature bits are active, -ENOSPC if header plus filename
 * do not fit into the header area, otherwise the return value of
 * bdrv_pwrite_sync().
 */
static int bdrv_qed_change_backing_file(BlockDriverState *bs,
                                        const char *backing_file,
                                        const char *backing_fmt)
{
    BDRVQEDState *s = bs->opaque;
    QEDHeader new_header, le_header;
    uint8_t *buffer;
    size_t buffer_len, backing_file_len;
    uint64_t header_area_size;
    int ret;

    /* Refuse to set backing filename if unknown compat feature bits are
     * active.  If the image uses an unknown compat feature then we may not
     * know the layout of data following the header structure and cannot safely
     * add a new string.
     */
    if (backing_file && (s->header.compat_features &
                         ~QED_COMPAT_FEATURE_MASK)) {
        return -ENOTSUP;
    }

    memcpy(&new_header, &s->header, sizeof(new_header));

    /* Adjust feature flags: clear both, then set what the new state needs */
    new_header.features &= ~(QED_F_BACKING_FILE |
                             QED_F_BACKING_FORMAT_NO_PROBE);
    if (backing_file) {
        new_header.features |= QED_F_BACKING_FILE;

        if (qed_fmt_is_raw(backing_fmt)) {
            new_header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
        }
    }

    /* Calculate new header size; the filename is stored without a NUL */
    backing_file_len = backing_file ? strlen(backing_file) : 0;

    buffer_len = sizeof(new_header);
    new_header.backing_filename_offset = buffer_len;
    new_header.backing_filename_size = backing_file_len;
    if (new_header.backing_filename_size != backing_file_len) {
        /* The length does not survive being stored in the header field */
        return -ENOSPC;
    }
    buffer_len += backing_file_len;

    /*
     * Make sure we can rewrite header without failing.  The product is
     * computed in 64 bits so that it cannot wrap around in case both
     * operands are narrower types, which would make the check reject
     * requests that actually fit.
     */
    header_area_size = (uint64_t)new_header.header_size *
                       new_header.cluster_size;
    if (buffer_len > header_area_size) {
        return -ENOSPC;
    }

    /* Prepare new header: little-endian header followed by the filename */
    buffer = g_malloc(buffer_len);

    qed_header_cpu_to_le(&new_header, &le_header);
    memcpy(buffer, &le_header, sizeof(le_header));
    buffer_len = sizeof(le_header);

    if (backing_file) {
        memcpy(buffer + buffer_len, backing_file, backing_file_len);
        buffer_len += backing_file_len;
    }

    /* Write new header */
    ret = bdrv_pwrite_sync(bs->file, 0, buffer, buffer_len);
    g_free(buffer);
    if (ret == 0) {
        /* Only now is it safe to make the new header the current one */
        memcpy(&s->header, &new_header, sizeof(new_header));
    }
    return ret;
}
|
|
|
|
|
2018-03-01 19:36:18 +03:00
|
|
|
/*
 * Discard the current driver state of @bs and open the image again.
 *
 * The node is closed, its state is re-initialised and it is then opened
 * once more with the node's existing open flags and no options.  The table
 * lock is held for the duration of the open call.  If opening fails, the
 * error stored in @errp is prefixed with a short description.
 */
static void coroutine_fn bdrv_qed_co_invalidate_cache(BlockDriverState *bs,
                                                      Error **errp)
{
    BDRVQEDState *s = bs->opaque;
    int rc;

    /* Tear down and reset the state before opening again */
    bdrv_qed_close(bs);
    bdrv_qed_init_state(bs);

    qemu_co_mutex_lock(&s->table_lock);
    rc = bdrv_qed_do_open(bs, NULL, bs->open_flags, errp);
    qemu_co_mutex_unlock(&s->table_lock);

    if (rc >= 0) {
        return;
    }

    error_prepend(errp, "Could not reopen qed layer: ");
}
|
|
|
|
|
2019-04-30 15:36:11 +03:00
|
|
|
/*
 * Run qed_check() on the image while holding the table lock.
 *
 * @result is handed through to qed_check().  @fix is collapsed to a plain
 * yes/no value: any non-zero mode requests fixing.
 *
 * Returns whatever qed_check() returns.
 */
static int coroutine_fn bdrv_qed_co_check(BlockDriverState *bs,
                                          BdrvCheckResult *result,
                                          BdrvCheckMode fix)
{
    BDRVQEDState *s = bs->opaque;
    int status;

    qemu_co_mutex_lock(&s->table_lock);
    status = qed_check(s, result, fix != 0);
    qemu_co_mutex_unlock(&s->table_lock);

    return status;
}
|
|
|
|
|
2014-06-05 13:21:00 +04:00
|
|
|
static QemuOptsList qed_create_opts = {
|
|
|
|
.name = "qed-create-opts",
|
|
|
|
.head = QTAILQ_HEAD_INITIALIZER(qed_create_opts.head),
|
|
|
|
.desc = {
|
|
|
|
{
|
|
|
|
.name = BLOCK_OPT_SIZE,
|
|
|
|
.type = QEMU_OPT_SIZE,
|
|
|
|
.help = "Virtual disk size"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = BLOCK_OPT_BACKING_FILE,
|
|
|
|
.type = QEMU_OPT_STRING,
|
|
|
|
.help = "File name of a base image"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = BLOCK_OPT_BACKING_FMT,
|
|
|
|
.type = QEMU_OPT_STRING,
|
|
|
|
.help = "Image format of the base image"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = BLOCK_OPT_CLUSTER_SIZE,
|
|
|
|
.type = QEMU_OPT_SIZE,
|
|
|
|
.help = "Cluster size (in bytes)",
|
|
|
|
.def_value_str = stringify(QED_DEFAULT_CLUSTER_SIZE)
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = BLOCK_OPT_TABLE_SIZE,
|
|
|
|
.type = QEMU_OPT_SIZE,
|
|
|
|
.help = "L1/L2 table size (in clusters)"
|
|
|
|
},
|
|
|
|
{ /* end of list */ }
|
|
|
|
}
|
2010-12-06 19:08:00 +03:00
|
|
|
};
|
|
|
|
|
|
|
|
static BlockDriver bdrv_qed = {
|
|
|
|
.format_name = "qed",
|
|
|
|
.instance_size = sizeof(BDRVQEDState),
|
2014-06-05 13:21:00 +04:00
|
|
|
.create_opts = &qed_create_opts,
|
2020-05-13 14:05:12 +03:00
|
|
|
.is_format = true,
|
2014-06-04 17:09:35 +04:00
|
|
|
.supports_backing = true,
|
2010-12-06 19:08:00 +03:00
|
|
|
|
|
|
|
.bdrv_probe = bdrv_qed_probe,
|
|
|
|
.bdrv_open = bdrv_qed_open,
|
|
|
|
.bdrv_close = bdrv_qed_close,
|
2012-09-20 23:13:27 +04:00
|
|
|
.bdrv_reopen_prepare = bdrv_qed_reopen_prepare,
|
2020-05-13 14:05:39 +03:00
|
|
|
.bdrv_child_perm = bdrv_default_perms,
|
2018-03-09 21:53:19 +03:00
|
|
|
.bdrv_co_create = bdrv_qed_co_create,
|
2018-01-18 15:43:45 +03:00
|
|
|
.bdrv_co_create_opts = bdrv_qed_co_create_opts,
|
2013-06-28 14:47:42 +04:00
|
|
|
.bdrv_has_zero_init = bdrv_has_zero_init_1,
|
2018-02-13 23:26:53 +03:00
|
|
|
.bdrv_co_block_status = bdrv_qed_co_block_status,
|
2016-11-14 16:20:00 +03:00
|
|
|
.bdrv_co_readv = bdrv_qed_co_readv,
|
|
|
|
.bdrv_co_writev = bdrv_qed_co_writev,
|
2016-06-02 00:10:09 +03:00
|
|
|
.bdrv_co_pwrite_zeroes = bdrv_qed_co_pwrite_zeroes,
|
block: Convert .bdrv_truncate callback to coroutine_fn
bdrv_truncate() is an operation that can block (even for a quite long
time, depending on the PreallocMode) in I/O paths that shouldn't block.
Convert it to a coroutine_fn so that we have the infrastructure for
drivers to make their .bdrv_co_truncate implementation asynchronous.
This change could potentially introduce new race conditions because
bdrv_truncate() isn't necessarily executed atomically any more. Whether
this is a problem needs to be evaluated for each block driver that
supports truncate:
* file-posix/win32, gluster, iscsi, nfs, rbd, ssh, sheepdog: The
protocol drivers are trivially safe because they don't actually yield
yet, so there is no change in behaviour.
* copy-on-read, crypto, raw-format: Essentially just filter drivers that
pass the request to a child node, no problem.
* qcow2: The implementation modifies metadata, so it needs to hold
s->lock to be safe with concurrent I/O requests. In order to avoid
double locking, this requires pulling the locking out into
preallocate_co() and using qcow2_write_caches() instead of
bdrv_flush().
* qed: Does a single header update, this is fine without locking.
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2018-06-21 18:54:35 +03:00
|
|
|
.bdrv_co_truncate = bdrv_qed_co_truncate,
|
2010-12-06 19:08:00 +03:00
|
|
|
.bdrv_getlength = bdrv_qed_getlength,
|
|
|
|
.bdrv_get_info = bdrv_qed_get_info,
|
2013-12-11 22:26:16 +04:00
|
|
|
.bdrv_refresh_limits = bdrv_qed_refresh_limits,
|
2010-12-06 19:08:00 +03:00
|
|
|
.bdrv_change_backing_file = bdrv_qed_change_backing_file,
|
2018-03-01 19:36:18 +03:00
|
|
|
.bdrv_co_invalidate_cache = bdrv_qed_co_invalidate_cache,
|
2018-03-01 19:36:19 +03:00
|
|
|
.bdrv_co_check = bdrv_qed_co_check,
|
2014-05-08 18:34:45 +04:00
|
|
|
.bdrv_detach_aio_context = bdrv_qed_detach_aio_context,
|
|
|
|
.bdrv_attach_aio_context = bdrv_qed_attach_aio_context,
|
2017-09-23 14:14:10 +03:00
|
|
|
.bdrv_co_drain_begin = bdrv_qed_co_drain_begin,
|
2010-12-06 19:08:00 +03:00
|
|
|
};
|
|
|
|
|
|
|
|
static void bdrv_qed_init(void)
|
|
|
|
{
|
|
|
|
bdrv_register(&bdrv_qed);
|
|
|
|
}
|
|
|
|
|
|
|
|
block_init(bdrv_qed_init);
|