2004-08-02 01:59:26 +04:00
|
|
|
/*
|
|
|
|
* Block driver for the VMDK format
|
2007-09-17 01:08:06 +04:00
|
|
|
*
|
2004-08-02 01:59:26 +04:00
|
|
|
* Copyright (c) 2004 Fabrice Bellard
|
2005-04-27 01:08:00 +04:00
|
|
|
* Copyright (c) 2005 Filip Navara
|
2007-09-17 01:08:06 +04:00
|
|
|
*
|
2004-08-02 01:59:26 +04:00
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
|
|
* of this software and associated documentation files (the "Software"), to deal
|
|
|
|
* in the Software without restriction, including without limitation the rights
|
|
|
|
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
|
|
* copies of the Software, and to permit persons to whom the Software is
|
|
|
|
* furnished to do so, subject to the following conditions:
|
|
|
|
*
|
|
|
|
* The above copyright notice and this permission notice shall be included in
|
|
|
|
* all copies or substantial portions of the Software.
|
|
|
|
*
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
|
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
|
|
* THE SOFTWARE.
|
|
|
|
*/
|
2007-01-25 00:05:24 +03:00
|
|
|
|
2007-11-11 05:51:17 +03:00
|
|
|
#include "qemu-common.h"
|
2012-12-17 21:19:44 +04:00
|
|
|
#include "block/block_int.h"
|
2012-12-17 21:20:00 +04:00
|
|
|
#include "qemu/module.h"
|
2012-12-17 21:19:50 +04:00
|
|
|
#include "migration/migration.h"
|
2011-11-20 15:34:30 +04:00
|
|
|
#include <zlib.h>
|
2014-12-04 02:28:29 +03:00
|
|
|
#include <glib.h>
|
2004-08-02 01:59:26 +04:00
|
|
|
|
|
|
|
#define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
|
|
|
|
#define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')
|
2011-08-12 19:19:30 +04:00
|
|
|
#define VMDK4_COMPRESSION_DEFLATE 1
|
2013-05-02 06:25:25 +04:00
|
|
|
#define VMDK4_FLAG_NL_DETECT (1 << 0)
|
2011-08-12 19:19:34 +04:00
|
|
|
#define VMDK4_FLAG_RGD (1 << 1)
|
2013-05-02 06:25:23 +04:00
|
|
|
/* Zeroed-grain enable bit */
|
|
|
|
#define VMDK4_FLAG_ZERO_GRAIN (1 << 2)
|
2011-08-12 19:19:30 +04:00
|
|
|
#define VMDK4_FLAG_COMPRESS (1 << 16)
|
|
|
|
#define VMDK4_FLAG_MARKER (1 << 17)
|
2012-08-16 12:39:33 +04:00
|
|
|
#define VMDK4_GD_AT_END 0xffffffffffffffffULL
|
2004-08-02 01:59:26 +04:00
|
|
|
|
2013-05-02 06:25:23 +04:00
|
|
|
#define VMDK_GTE_ZEROED 0x1
|
2013-05-02 06:25:22 +04:00
|
|
|
|
|
|
|
/* VMDK internal error codes */
|
|
|
|
#define VMDK_OK 0
|
|
|
|
#define VMDK_ERROR (-1)
|
|
|
|
/* Cluster not allocated */
|
|
|
|
#define VMDK_UNALLOC (-2)
|
|
|
|
#define VMDK_ZEROED (-3)
|
|
|
|
|
2013-05-02 06:25:24 +04:00
|
|
|
#define BLOCK_OPT_ZEROED_GRAIN "zeroed_grain"
|
|
|
|
|
2004-08-02 01:59:26 +04:00
|
|
|
typedef struct {
|
|
|
|
uint32_t version;
|
|
|
|
uint32_t flags;
|
|
|
|
uint32_t disk_sectors;
|
|
|
|
uint32_t granularity;
|
|
|
|
uint32_t l1dir_offset;
|
|
|
|
uint32_t l1dir_size;
|
|
|
|
uint32_t file_sectors;
|
|
|
|
uint32_t cylinders;
|
|
|
|
uint32_t heads;
|
|
|
|
uint32_t sectors_per_track;
|
2013-08-06 11:44:47 +04:00
|
|
|
} QEMU_PACKED VMDK3Header;
|
2004-08-02 01:59:26 +04:00
|
|
|
|
|
|
|
typedef struct {
|
|
|
|
uint32_t version;
|
|
|
|
uint32_t flags;
|
2013-08-06 11:44:48 +04:00
|
|
|
uint64_t capacity;
|
|
|
|
uint64_t granularity;
|
|
|
|
uint64_t desc_offset;
|
|
|
|
uint64_t desc_size;
|
2013-08-06 11:44:55 +04:00
|
|
|
/* Number of GrainTableEntries per GrainTable */
|
|
|
|
uint32_t num_gtes_per_gt;
|
2013-08-06 11:44:48 +04:00
|
|
|
uint64_t rgd_offset;
|
|
|
|
uint64_t gd_offset;
|
|
|
|
uint64_t grain_offset;
|
2004-08-02 01:59:26 +04:00
|
|
|
char filler[1];
|
|
|
|
char check_bytes[4];
|
2011-08-12 19:19:30 +04:00
|
|
|
uint16_t compressAlgorithm;
|
2011-08-31 14:38:01 +04:00
|
|
|
} QEMU_PACKED VMDK4Header;
|
2004-08-02 01:59:26 +04:00
|
|
|
|
|
|
|
#define L2_CACHE_SIZE 16
|
|
|
|
|
2011-07-12 15:56:28 +04:00
|
|
|
typedef struct VmdkExtent {
|
|
|
|
BlockDriverState *file;
|
|
|
|
bool flat;
|
2011-08-12 19:19:30 +04:00
|
|
|
bool compressed;
|
|
|
|
bool has_marker;
|
2013-05-02 06:25:23 +04:00
|
|
|
bool has_zero_grain;
|
|
|
|
int version;
|
2011-07-12 15:56:28 +04:00
|
|
|
int64_t sectors;
|
|
|
|
int64_t end_sector;
|
2011-07-19 04:38:22 +04:00
|
|
|
int64_t flat_start_offset;
|
2004-08-02 01:59:26 +04:00
|
|
|
int64_t l1_table_offset;
|
2005-04-27 01:08:00 +04:00
|
|
|
int64_t l1_backup_table_offset;
|
2004-08-02 01:59:26 +04:00
|
|
|
uint32_t *l1_table;
|
2005-04-27 01:08:00 +04:00
|
|
|
uint32_t *l1_backup_table;
|
2004-08-02 01:59:26 +04:00
|
|
|
unsigned int l1_size;
|
|
|
|
uint32_t l1_entry_sectors;
|
|
|
|
|
|
|
|
unsigned int l2_size;
|
|
|
|
uint32_t *l2_cache;
|
|
|
|
uint32_t l2_cache_offsets[L2_CACHE_SIZE];
|
|
|
|
uint32_t l2_cache_counts[L2_CACHE_SIZE];
|
|
|
|
|
2013-09-23 13:18:29 +04:00
|
|
|
int64_t cluster_sectors;
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 10:39:10 +04:00
|
|
|
int64_t next_cluster_sector;
|
2013-10-31 06:06:23 +04:00
|
|
|
char *type;
|
2011-07-12 15:56:28 +04:00
|
|
|
} VmdkExtent;
|
|
|
|
|
|
|
|
typedef struct BDRVVmdkState {
|
2011-10-20 15:16:21 +04:00
|
|
|
CoMutex lock;
|
2013-08-06 11:44:48 +04:00
|
|
|
uint64_t desc_offset;
|
2011-07-12 15:56:34 +04:00
|
|
|
bool cid_updated;
|
2013-10-18 09:17:19 +04:00
|
|
|
bool cid_checked;
|
2013-10-31 06:06:23 +04:00
|
|
|
uint32_t cid;
|
2007-01-25 00:05:24 +03:00
|
|
|
uint32_t parent_cid;
|
2011-07-12 15:56:28 +04:00
|
|
|
int num_extents;
|
|
|
|
/* Extent array with num_extents entries, ascend ordered by address */
|
|
|
|
VmdkExtent *extents;
|
2011-11-22 19:50:27 +04:00
|
|
|
Error *migration_blocker;
|
2013-10-31 06:06:23 +04:00
|
|
|
char *create_type;
|
2004-08-02 01:59:26 +04:00
|
|
|
} BDRVVmdkState;
|
|
|
|
|
2007-06-18 19:01:30 +04:00
|
|
|
typedef struct VmdkMetaData {
|
|
|
|
unsigned int l1_index;
|
|
|
|
unsigned int l2_index;
|
|
|
|
unsigned int l2_offset;
|
|
|
|
int valid;
|
2013-05-02 06:25:27 +04:00
|
|
|
uint32_t *l2_cache_entry;
|
2007-06-18 19:01:30 +04:00
|
|
|
} VmdkMetaData;
|
|
|
|
|
2011-08-12 19:19:30 +04:00
|
|
|
typedef struct VmdkGrainMarker {
|
|
|
|
uint64_t lba;
|
|
|
|
uint32_t size;
|
|
|
|
uint8_t data[0];
|
2013-08-06 11:44:47 +04:00
|
|
|
} QEMU_PACKED VmdkGrainMarker;
|
2011-08-12 19:19:30 +04:00
|
|
|
|
2012-08-16 12:39:33 +04:00
|
|
|
enum {
|
|
|
|
MARKER_END_OF_STREAM = 0,
|
|
|
|
MARKER_GRAIN_TABLE = 1,
|
|
|
|
MARKER_GRAIN_DIRECTORY = 2,
|
|
|
|
MARKER_FOOTER = 3,
|
|
|
|
};
|
|
|
|
|
2004-08-02 01:59:26 +04:00
|
|
|
static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename)
|
|
|
|
{
|
|
|
|
uint32_t magic;
|
|
|
|
|
2011-07-12 15:56:38 +04:00
|
|
|
if (buf_size < 4) {
|
2004-08-02 01:59:26 +04:00
|
|
|
return 0;
|
2011-07-12 15:56:38 +04:00
|
|
|
}
|
2004-08-02 01:59:26 +04:00
|
|
|
magic = be32_to_cpu(*(uint32_t *)buf);
|
|
|
|
if (magic == VMDK3_MAGIC ||
|
2011-07-12 15:56:30 +04:00
|
|
|
magic == VMDK4_MAGIC) {
|
2004-08-02 01:59:26 +04:00
|
|
|
return 100;
|
2011-07-12 15:56:30 +04:00
|
|
|
} else {
|
|
|
|
const char *p = (const char *)buf;
|
|
|
|
const char *end = p + buf_size;
|
|
|
|
while (p < end) {
|
|
|
|
if (*p == '#') {
|
|
|
|
/* skip comment line */
|
|
|
|
while (p < end && *p != '\n') {
|
|
|
|
p++;
|
|
|
|
}
|
|
|
|
p++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (*p == ' ') {
|
|
|
|
while (p < end && *p == ' ') {
|
|
|
|
p++;
|
|
|
|
}
|
|
|
|
/* skip '\r' if windows line endings used. */
|
|
|
|
if (p < end && *p == '\r') {
|
|
|
|
p++;
|
|
|
|
}
|
|
|
|
/* only accept blank lines before 'version=' line */
|
|
|
|
if (p == end || *p != '\n') {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
p++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (end - p >= strlen("version=X\n")) {
|
|
|
|
if (strncmp("version=1\n", p, strlen("version=1\n")) == 0 ||
|
|
|
|
strncmp("version=2\n", p, strlen("version=2\n")) == 0) {
|
|
|
|
return 100;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (end - p >= strlen("version=X\r\n")) {
|
|
|
|
if (strncmp("version=1\r\n", p, strlen("version=1\r\n")) == 0 ||
|
|
|
|
strncmp("version=2\r\n", p, strlen("version=2\r\n")) == 0) {
|
|
|
|
return 100;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
2004-08-02 01:59:26 +04:00
|
|
|
return 0;
|
2011-07-12 15:56:30 +04:00
|
|
|
}
|
2004-08-02 01:59:26 +04:00
|
|
|
}
|
|
|
|
|
2007-09-17 12:09:54 +04:00
|
|
|
#define SECTOR_SIZE 512
|
2011-07-19 04:45:23 +04:00
|
|
|
#define DESC_SIZE (20 * SECTOR_SIZE) /* 20 sectors of 512 bytes each */
|
|
|
|
#define BUF_SIZE 4096
|
|
|
|
#define HEADER_SIZE 512 /* first sector of 512 bytes */
|
2007-01-25 00:05:24 +03:00
|
|
|
|
2011-07-12 15:56:28 +04:00
|
|
|
static void vmdk_free_extents(BlockDriverState *bs)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
BDRVVmdkState *s = bs->opaque;
|
2011-09-19 06:26:42 +04:00
|
|
|
VmdkExtent *e;
|
2011-07-12 15:56:28 +04:00
|
|
|
|
|
|
|
for (i = 0; i < s->num_extents; i++) {
|
2011-09-19 06:26:42 +04:00
|
|
|
e = &s->extents[i];
|
|
|
|
g_free(e->l1_table);
|
|
|
|
g_free(e->l2_cache);
|
|
|
|
g_free(e->l1_backup_table);
|
2013-10-31 06:06:23 +04:00
|
|
|
g_free(e->type);
|
2011-09-19 06:26:42 +04:00
|
|
|
if (e->file != bs->file) {
|
2013-08-23 05:14:47 +04:00
|
|
|
bdrv_unref(e->file);
|
2011-09-19 06:26:42 +04:00
|
|
|
}
|
2011-07-12 15:56:28 +04:00
|
|
|
}
|
2011-08-21 07:09:37 +04:00
|
|
|
g_free(s->extents);
|
2011-07-12 15:56:28 +04:00
|
|
|
}
|
|
|
|
|
2011-08-12 19:19:28 +04:00
|
|
|
static void vmdk_free_last_extent(BlockDriverState *bs)
|
|
|
|
{
|
|
|
|
BDRVVmdkState *s = bs->opaque;
|
|
|
|
|
|
|
|
if (s->num_extents == 0) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
s->num_extents--;
|
block: Use g_new() & friends where that makes obvious sense
g_new(T, n) is neater than g_malloc(sizeof(T) * n). It's also safer,
for two reasons. One, it catches multiplication overflowing size_t.
Two, it returns T * rather than void *, which lets the compiler catch
more type errors.
Patch created with Coccinelle, with two manual changes on top:
* Add const to bdrv_iterate_format() to keep the types straight
* Convert the allocation in bdrv_drop_intermediate(), which Coccinelle
inexplicably misses
Coccinelle semantic patch:
@@
type T;
@@
-g_malloc(sizeof(T))
+g_new(T, 1)
@@
type T;
@@
-g_try_malloc(sizeof(T))
+g_try_new(T, 1)
@@
type T;
@@
-g_malloc0(sizeof(T))
+g_new0(T, 1)
@@
type T;
@@
-g_try_malloc0(sizeof(T))
+g_try_new0(T, 1)
@@
type T;
expression n;
@@
-g_malloc(sizeof(T) * (n))
+g_new(T, n)
@@
type T;
expression n;
@@
-g_try_malloc(sizeof(T) * (n))
+g_try_new(T, n)
@@
type T;
expression n;
@@
-g_malloc0(sizeof(T) * (n))
+g_new0(T, n)
@@
type T;
expression n;
@@
-g_try_malloc0(sizeof(T) * (n))
+g_try_new0(T, n)
@@
type T;
expression p, n;
@@
-g_realloc(p, sizeof(T) * (n))
+g_renew(T, p, n)
@@
type T;
expression p, n;
@@
-g_try_realloc(p, sizeof(T) * (n))
+g_try_renew(T, p, n)
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2014-08-19 12:31:08 +04:00
|
|
|
s->extents = g_renew(VmdkExtent, s->extents, s->num_extents);
|
2011-08-12 19:19:28 +04:00
|
|
|
}
|
|
|
|
|
2007-01-25 00:05:24 +03:00
|
|
|
static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent)
|
2004-08-02 01:59:26 +04:00
|
|
|
{
|
2007-01-25 00:05:24 +03:00
|
|
|
char desc[DESC_SIZE];
|
2011-10-18 21:19:03 +04:00
|
|
|
uint32_t cid = 0xffffffff;
|
2008-09-14 10:45:34 +04:00
|
|
|
const char *p_name, *cid_str;
|
2007-01-25 00:05:24 +03:00
|
|
|
size_t cid_str_size;
|
2011-07-12 15:56:32 +04:00
|
|
|
BDRVVmdkState *s = bs->opaque;
|
2011-10-26 14:25:25 +04:00
|
|
|
int ret;
|
2007-01-25 00:05:24 +03:00
|
|
|
|
2011-10-26 14:25:25 +04:00
|
|
|
ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
|
|
|
|
if (ret < 0) {
|
2007-01-25 00:05:24 +03:00
|
|
|
return 0;
|
2011-07-12 15:56:32 +04:00
|
|
|
}
|
2007-01-25 00:05:24 +03:00
|
|
|
|
|
|
|
if (parent) {
|
|
|
|
cid_str = "parentCID";
|
|
|
|
cid_str_size = sizeof("parentCID");
|
|
|
|
} else {
|
|
|
|
cid_str = "CID";
|
|
|
|
cid_str_size = sizeof("CID");
|
|
|
|
}
|
|
|
|
|
2011-10-26 14:25:52 +04:00
|
|
|
desc[DESC_SIZE - 1] = '\0';
|
2011-07-12 15:56:38 +04:00
|
|
|
p_name = strstr(desc, cid_str);
|
|
|
|
if (p_name != NULL) {
|
2007-01-25 00:05:24 +03:00
|
|
|
p_name += cid_str_size;
|
2014-04-17 14:43:53 +04:00
|
|
|
sscanf(p_name, "%" SCNx32, &cid);
|
2007-01-25 00:05:24 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
return cid;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
|
|
|
|
{
|
|
|
|
char desc[DESC_SIZE], tmp_desc[DESC_SIZE];
|
|
|
|
char *p_name, *tmp_str;
|
2011-07-12 15:56:32 +04:00
|
|
|
BDRVVmdkState *s = bs->opaque;
|
2011-10-26 14:25:25 +04:00
|
|
|
int ret;
|
2007-01-25 00:05:24 +03:00
|
|
|
|
2011-10-26 14:25:25 +04:00
|
|
|
ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
|
|
|
|
if (ret < 0) {
|
|
|
|
return ret;
|
2011-07-12 15:56:32 +04:00
|
|
|
}
|
2007-01-25 00:05:24 +03:00
|
|
|
|
2011-10-26 14:25:52 +04:00
|
|
|
desc[DESC_SIZE - 1] = '\0';
|
2011-07-12 15:56:38 +04:00
|
|
|
tmp_str = strstr(desc, "parentCID");
|
2011-10-26 14:25:52 +04:00
|
|
|
if (tmp_str == NULL) {
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2008-08-21 21:58:08 +04:00
|
|
|
pstrcpy(tmp_desc, sizeof(tmp_desc), tmp_str);
|
2011-07-12 15:56:38 +04:00
|
|
|
p_name = strstr(desc, "CID");
|
|
|
|
if (p_name != NULL) {
|
2007-01-25 00:05:24 +03:00
|
|
|
p_name += sizeof("CID");
|
2014-04-17 14:43:53 +04:00
|
|
|
snprintf(p_name, sizeof(desc) - (p_name - desc), "%" PRIx32 "\n", cid);
|
2008-08-21 21:58:08 +04:00
|
|
|
pstrcat(desc, sizeof(desc), tmp_desc);
|
2007-01-25 00:05:24 +03:00
|
|
|
}
|
|
|
|
|
2011-10-26 14:25:25 +04:00
|
|
|
ret = bdrv_pwrite_sync(bs->file, s->desc_offset, desc, DESC_SIZE);
|
|
|
|
if (ret < 0) {
|
|
|
|
return ret;
|
2011-07-12 15:56:32 +04:00
|
|
|
}
|
2011-10-26 14:25:25 +04:00
|
|
|
|
2007-01-25 00:05:24 +03:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int vmdk_is_cid_valid(BlockDriverState *bs)
|
|
|
|
{
|
|
|
|
BDRVVmdkState *s = bs->opaque;
|
2009-07-17 10:20:41 +04:00
|
|
|
BlockDriverState *p_bs = bs->backing_hd;
|
2007-01-25 00:05:24 +03:00
|
|
|
uint32_t cur_pcid;
|
|
|
|
|
2013-10-18 09:17:19 +04:00
|
|
|
if (!s->cid_checked && p_bs) {
|
2011-07-12 15:56:38 +04:00
|
|
|
cur_pcid = vmdk_read_cid(p_bs, 0);
|
|
|
|
if (s->parent_cid != cur_pcid) {
|
|
|
|
/* CID not valid */
|
2007-01-25 00:05:24 +03:00
|
|
|
return 0;
|
2011-07-12 15:56:38 +04:00
|
|
|
}
|
2007-01-25 00:05:24 +03:00
|
|
|
}
|
2013-10-18 09:17:19 +04:00
|
|
|
s->cid_checked = true;
|
2011-07-12 15:56:38 +04:00
|
|
|
/* CID valid */
|
2007-01-25 00:05:24 +03:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2012-09-20 23:13:30 +04:00
|
|
|
/* Queue extents, if any, for reopen() */
|
|
|
|
static int vmdk_reopen_prepare(BDRVReopenState *state,
|
|
|
|
BlockReopenQueue *queue, Error **errp)
|
|
|
|
{
|
|
|
|
BDRVVmdkState *s;
|
|
|
|
int ret = -1;
|
|
|
|
int i;
|
|
|
|
VmdkExtent *e;
|
|
|
|
|
|
|
|
assert(state != NULL);
|
|
|
|
assert(state->bs != NULL);
|
|
|
|
|
|
|
|
if (queue == NULL) {
|
2013-10-11 11:43:22 +04:00
|
|
|
error_setg(errp, "No reopen queue for VMDK extents");
|
2012-09-20 23:13:30 +04:00
|
|
|
goto exit;
|
|
|
|
}
|
|
|
|
|
|
|
|
s = state->bs->opaque;
|
|
|
|
|
|
|
|
assert(s != NULL);
|
|
|
|
|
|
|
|
for (i = 0; i < s->num_extents; i++) {
|
|
|
|
e = &s->extents[i];
|
|
|
|
if (e->file != state->bs->file) {
|
|
|
|
bdrv_reopen_queue(queue, e->file, state->flags);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ret = 0;
|
|
|
|
|
|
|
|
exit:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2010-04-16 23:07:19 +04:00
|
|
|
static int vmdk_parent_open(BlockDriverState *bs)
|
2007-01-25 00:05:24 +03:00
|
|
|
{
|
2007-09-17 01:08:06 +04:00
|
|
|
char *p_name;
|
2011-07-19 04:38:22 +04:00
|
|
|
char desc[DESC_SIZE + 1];
|
2011-07-12 15:56:32 +04:00
|
|
|
BDRVVmdkState *s = bs->opaque;
|
2011-10-20 15:16:19 +04:00
|
|
|
int ret;
|
2007-01-25 00:05:24 +03:00
|
|
|
|
2011-07-19 04:38:22 +04:00
|
|
|
desc[DESC_SIZE] = '\0';
|
2011-10-20 15:16:19 +04:00
|
|
|
ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE);
|
|
|
|
if (ret < 0) {
|
|
|
|
return ret;
|
2011-07-12 15:56:32 +04:00
|
|
|
}
|
2007-01-25 00:05:24 +03:00
|
|
|
|
2011-07-12 15:56:38 +04:00
|
|
|
p_name = strstr(desc, "parentFileNameHint");
|
|
|
|
if (p_name != NULL) {
|
2007-01-25 00:05:24 +03:00
|
|
|
char *end_name;
|
|
|
|
|
|
|
|
p_name += sizeof("parentFileNameHint") + 1;
|
2011-07-12 15:56:38 +04:00
|
|
|
end_name = strchr(p_name, '\"');
|
|
|
|
if (end_name == NULL) {
|
2011-10-20 15:16:19 +04:00
|
|
|
return -EINVAL;
|
2011-07-12 15:56:38 +04:00
|
|
|
}
|
|
|
|
if ((end_name - p_name) > sizeof(bs->backing_file) - 1) {
|
2011-10-20 15:16:19 +04:00
|
|
|
return -EINVAL;
|
2011-07-12 15:56:38 +04:00
|
|
|
}
|
2007-09-17 12:09:54 +04:00
|
|
|
|
2009-07-17 10:20:41 +04:00
|
|
|
pstrcpy(bs->backing_file, end_name - p_name + 1, p_name);
|
2005-04-27 01:08:00 +04:00
|
|
|
}
|
2007-01-25 00:05:24 +03:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2011-07-12 15:56:28 +04:00
|
|
|
/* Create and append extent to the extent array. Return the added VmdkExtent
|
|
|
|
* address. return NULL if allocation failed. */
|
2013-08-06 11:44:51 +04:00
|
|
|
static int vmdk_add_extent(BlockDriverState *bs,
|
2011-07-12 15:56:28 +04:00
|
|
|
BlockDriverState *file, bool flat, int64_t sectors,
|
|
|
|
int64_t l1_offset, int64_t l1_backup_offset,
|
|
|
|
uint32_t l1_size,
|
2013-08-06 11:44:51 +04:00
|
|
|
int l2_size, uint64_t cluster_sectors,
|
2013-10-11 11:43:22 +04:00
|
|
|
VmdkExtent **new_extent,
|
|
|
|
Error **errp)
|
2011-07-12 15:56:28 +04:00
|
|
|
{
|
|
|
|
VmdkExtent *extent;
|
|
|
|
BDRVVmdkState *s = bs->opaque;
|
2014-08-21 16:36:19 +04:00
|
|
|
int64_t nb_sectors;
|
2011-07-12 15:56:28 +04:00
|
|
|
|
2013-08-06 11:44:51 +04:00
|
|
|
if (cluster_sectors > 0x200000) {
|
|
|
|
/* 0x200000 * 512Bytes = 1GB for one cluster is unrealistic */
|
2013-10-11 11:43:22 +04:00
|
|
|
error_setg(errp, "Invalid granularity, image may be corrupt");
|
|
|
|
return -EFBIG;
|
2013-08-06 11:44:51 +04:00
|
|
|
}
|
2013-08-19 14:54:25 +04:00
|
|
|
if (l1_size > 512 * 1024 * 1024) {
|
|
|
|
/* Although with big capacity and small l1_entry_sectors, we can get a
|
|
|
|
* big l1_size, we don't want unbounded value to allocate the table.
|
|
|
|
* Limit it to 512M, which is 16PB for default cluster and L2 table
|
|
|
|
* size */
|
2013-10-11 11:43:22 +04:00
|
|
|
error_setg(errp, "L1 size too big");
|
2013-08-19 14:54:25 +04:00
|
|
|
return -EFBIG;
|
|
|
|
}
|
2013-08-06 11:44:51 +04:00
|
|
|
|
2014-08-21 16:36:19 +04:00
|
|
|
nb_sectors = bdrv_nb_sectors(file);
|
|
|
|
if (nb_sectors < 0) {
|
|
|
|
return nb_sectors;
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 10:39:10 +04:00
|
|
|
}
|
|
|
|
|
block: Use g_new() & friends where that makes obvious sense
g_new(T, n) is neater than g_malloc(sizeof(T) * n). It's also safer,
for two reasons. One, it catches multiplication overflowing size_t.
Two, it returns T * rather than void *, which lets the compiler catch
more type errors.
Patch created with Coccinelle, with two manual changes on top:
* Add const to bdrv_iterate_format() to keep the types straight
* Convert the allocation in bdrv_drop_intermediate(), which Coccinelle
inexplicably misses
Coccinelle semantic patch:
@@
type T;
@@
-g_malloc(sizeof(T))
+g_new(T, 1)
@@
type T;
@@
-g_try_malloc(sizeof(T))
+g_try_new(T, 1)
@@
type T;
@@
-g_malloc0(sizeof(T))
+g_new0(T, 1)
@@
type T;
@@
-g_try_malloc0(sizeof(T))
+g_try_new0(T, 1)
@@
type T;
expression n;
@@
-g_malloc(sizeof(T) * (n))
+g_new(T, n)
@@
type T;
expression n;
@@
-g_try_malloc(sizeof(T) * (n))
+g_try_new(T, n)
@@
type T;
expression n;
@@
-g_malloc0(sizeof(T) * (n))
+g_new0(T, n)
@@
type T;
expression n;
@@
-g_try_malloc0(sizeof(T) * (n))
+g_try_new0(T, n)
@@
type T;
expression p, n;
@@
-g_realloc(p, sizeof(T) * (n))
+g_renew(T, p, n)
@@
type T;
expression p, n;
@@
-g_try_realloc(p, sizeof(T) * (n))
+g_try_renew(T, p, n)
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2014-08-19 12:31:08 +04:00
|
|
|
s->extents = g_renew(VmdkExtent, s->extents, s->num_extents + 1);
|
2011-07-12 15:56:28 +04:00
|
|
|
extent = &s->extents[s->num_extents];
|
|
|
|
s->num_extents++;
|
|
|
|
|
|
|
|
memset(extent, 0, sizeof(VmdkExtent));
|
|
|
|
extent->file = file;
|
|
|
|
extent->flat = flat;
|
|
|
|
extent->sectors = sectors;
|
|
|
|
extent->l1_table_offset = l1_offset;
|
|
|
|
extent->l1_backup_table_offset = l1_backup_offset;
|
|
|
|
extent->l1_size = l1_size;
|
|
|
|
extent->l1_entry_sectors = l2_size * cluster_sectors;
|
|
|
|
extent->l2_size = l2_size;
|
2013-09-23 13:18:29 +04:00
|
|
|
extent->cluster_sectors = flat ? sectors : cluster_sectors;
|
2014-08-21 16:36:19 +04:00
|
|
|
extent->next_cluster_sector = ROUND_UP(nb_sectors, cluster_sectors);
|
2011-07-12 15:56:28 +04:00
|
|
|
|
|
|
|
if (s->num_extents > 1) {
|
|
|
|
extent->end_sector = (*(extent - 1)).end_sector + extent->sectors;
|
|
|
|
} else {
|
|
|
|
extent->end_sector = extent->sectors;
|
|
|
|
}
|
|
|
|
bs->total_sectors = extent->end_sector;
|
2013-08-06 11:44:51 +04:00
|
|
|
if (new_extent) {
|
|
|
|
*new_extent = extent;
|
|
|
|
}
|
|
|
|
return 0;
|
2011-07-12 15:56:28 +04:00
|
|
|
}
|
|
|
|
|
2013-10-11 11:43:22 +04:00
|
|
|
static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent,
|
|
|
|
Error **errp)
|
2007-01-25 00:05:24 +03:00
|
|
|
{
|
2011-07-12 15:56:31 +04:00
|
|
|
int ret;
|
|
|
|
int l1_size, i;
|
2007-01-25 00:05:24 +03:00
|
|
|
|
2004-08-02 01:59:26 +04:00
|
|
|
/* read the L1 table */
|
2011-07-12 15:56:28 +04:00
|
|
|
l1_size = extent->l1_size * sizeof(uint32_t);
|
2014-05-20 15:56:27 +04:00
|
|
|
extent->l1_table = g_try_malloc(l1_size);
|
|
|
|
if (l1_size && extent->l1_table == NULL) {
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
2011-07-12 15:56:31 +04:00
|
|
|
ret = bdrv_pread(extent->file,
|
2013-10-11 11:43:22 +04:00
|
|
|
extent->l1_table_offset,
|
|
|
|
extent->l1_table,
|
|
|
|
l1_size);
|
2011-07-12 15:56:31 +04:00
|
|
|
if (ret < 0) {
|
2013-10-11 11:43:22 +04:00
|
|
|
error_setg_errno(errp, -ret,
|
|
|
|
"Could not read l1 table from extent '%s'",
|
|
|
|
extent->file->filename);
|
2011-07-12 15:56:31 +04:00
|
|
|
goto fail_l1;
|
2011-07-12 15:56:28 +04:00
|
|
|
}
|
|
|
|
for (i = 0; i < extent->l1_size; i++) {
|
|
|
|
le32_to_cpus(&extent->l1_table[i]);
|
2004-08-02 01:59:26 +04:00
|
|
|
}
|
|
|
|
|
2011-07-12 15:56:28 +04:00
|
|
|
if (extent->l1_backup_table_offset) {
|
2014-05-20 15:56:27 +04:00
|
|
|
extent->l1_backup_table = g_try_malloc(l1_size);
|
|
|
|
if (l1_size && extent->l1_backup_table == NULL) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto fail_l1;
|
|
|
|
}
|
2011-07-12 15:56:31 +04:00
|
|
|
ret = bdrv_pread(extent->file,
|
2013-10-11 11:43:22 +04:00
|
|
|
extent->l1_backup_table_offset,
|
|
|
|
extent->l1_backup_table,
|
|
|
|
l1_size);
|
2011-07-12 15:56:31 +04:00
|
|
|
if (ret < 0) {
|
2013-10-11 11:43:22 +04:00
|
|
|
error_setg_errno(errp, -ret,
|
|
|
|
"Could not read l1 backup table from extent '%s'",
|
|
|
|
extent->file->filename);
|
2011-07-12 15:56:31 +04:00
|
|
|
goto fail_l1b;
|
2011-07-12 15:56:28 +04:00
|
|
|
}
|
|
|
|
for (i = 0; i < extent->l1_size; i++) {
|
|
|
|
le32_to_cpus(&extent->l1_backup_table[i]);
|
2005-04-27 01:08:00 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-07-12 15:56:28 +04:00
|
|
|
extent->l2_cache =
|
block: Use g_new() & friends where that makes obvious sense
g_new(T, n) is neater than g_malloc(sizeof(T) * n). It's also safer,
for two reasons. One, it catches multiplication overflowing size_t.
Two, it returns T * rather than void *, which lets the compiler catch
more type errors.
Patch created with Coccinelle, with two manual changes on top:
* Add const to bdrv_iterate_format() to keep the types straight
* Convert the allocation in bdrv_drop_intermediate(), which Coccinelle
inexplicably misses
Coccinelle semantic patch:
@@
type T;
@@
-g_malloc(sizeof(T))
+g_new(T, 1)
@@
type T;
@@
-g_try_malloc(sizeof(T))
+g_try_new(T, 1)
@@
type T;
@@
-g_malloc0(sizeof(T))
+g_new0(T, 1)
@@
type T;
@@
-g_try_malloc0(sizeof(T))
+g_try_new0(T, 1)
@@
type T;
expression n;
@@
-g_malloc(sizeof(T) * (n))
+g_new(T, n)
@@
type T;
expression n;
@@
-g_try_malloc(sizeof(T) * (n))
+g_try_new(T, n)
@@
type T;
expression n;
@@
-g_malloc0(sizeof(T) * (n))
+g_new0(T, n)
@@
type T;
expression n;
@@
-g_try_malloc0(sizeof(T) * (n))
+g_try_new0(T, n)
@@
type T;
expression p, n;
@@
-g_realloc(p, sizeof(T) * (n))
+g_renew(T, p, n)
@@
type T;
expression p, n;
@@
-g_try_realloc(p, sizeof(T) * (n))
+g_try_renew(T, p, n)
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2014-08-19 12:31:08 +04:00
|
|
|
g_new(uint32_t, extent->l2_size * L2_CACHE_SIZE);
|
2004-08-02 01:59:26 +04:00
|
|
|
return 0;
|
2011-07-12 15:56:31 +04:00
|
|
|
fail_l1b:
|
2011-08-21 07:09:37 +04:00
|
|
|
g_free(extent->l1_backup_table);
|
2011-07-12 15:56:31 +04:00
|
|
|
fail_l1:
|
2011-08-21 07:09:37 +04:00
|
|
|
g_free(extent->l1_table);
|
2011-07-12 15:56:31 +04:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2013-08-19 14:54:27 +04:00
|
|
|
static int vmdk_open_vmfs_sparse(BlockDriverState *bs,
|
|
|
|
BlockDriverState *file,
|
2013-10-11 11:43:22 +04:00
|
|
|
int flags, Error **errp)
|
2011-07-12 15:56:31 +04:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
uint32_t magic;
|
|
|
|
VMDK3Header header;
|
|
|
|
VmdkExtent *extent;
|
|
|
|
|
2011-08-12 19:19:28 +04:00
|
|
|
ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header));
|
2011-07-12 15:56:31 +04:00
|
|
|
if (ret < 0) {
|
2013-10-11 11:43:22 +04:00
|
|
|
error_setg_errno(errp, -ret,
|
|
|
|
"Could not read header from file '%s'",
|
|
|
|
file->filename);
|
2011-08-12 19:19:28 +04:00
|
|
|
return ret;
|
2011-07-12 15:56:31 +04:00
|
|
|
}
|
2013-08-19 14:54:26 +04:00
|
|
|
ret = vmdk_add_extent(bs, file, false,
|
|
|
|
le32_to_cpu(header.disk_sectors),
|
2015-04-27 17:23:01 +03:00
|
|
|
(int64_t)le32_to_cpu(header.l1dir_offset) << 9,
|
2013-08-19 14:54:26 +04:00
|
|
|
0,
|
|
|
|
le32_to_cpu(header.l1dir_size),
|
|
|
|
4096,
|
|
|
|
le32_to_cpu(header.granularity),
|
2013-10-11 11:43:22 +04:00
|
|
|
&extent,
|
|
|
|
errp);
|
2013-08-06 11:44:51 +04:00
|
|
|
if (ret < 0) {
|
|
|
|
return ret;
|
|
|
|
}
|
2013-10-11 11:43:22 +04:00
|
|
|
ret = vmdk_init_tables(bs, extent, errp);
|
2011-07-12 15:56:31 +04:00
|
|
|
if (ret) {
|
2011-08-12 19:19:28 +04:00
|
|
|
/* free extent allocated by vmdk_add_extent */
|
|
|
|
vmdk_free_last_extent(bs);
|
2011-07-12 15:56:31 +04:00
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2014-02-17 17:44:03 +04:00
|
|
|
static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf,
|
|
|
|
Error **errp);
|
2011-08-12 19:19:33 +04:00
|
|
|
|
2014-02-17 17:44:02 +04:00
|
|
|
static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset,
|
|
|
|
Error **errp)
|
|
|
|
{
|
|
|
|
int64_t size;
|
|
|
|
char *buf;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
size = bdrv_getlength(file);
|
|
|
|
if (size < 0) {
|
|
|
|
error_setg_errno(errp, -size, "Could not access file");
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2014-12-04 02:28:32 +03:00
|
|
|
if (size < 4) {
|
|
|
|
/* Both descriptor file and sparse image must be much larger than 4
|
|
|
|
* bytes, also callers of vmdk_read_desc want to compare the first 4
|
|
|
|
* bytes with VMDK4_MAGIC, let's error out if less is read. */
|
|
|
|
error_setg(errp, "File is too small, not a valid image");
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2014-12-04 02:28:31 +03:00
|
|
|
size = MIN(size, (1 << 20) - 1); /* avoid unbounded allocation */
|
|
|
|
buf = g_malloc(size + 1);
|
2014-02-17 17:44:02 +04:00
|
|
|
|
|
|
|
ret = bdrv_pread(file, desc_offset, buf, size);
|
|
|
|
if (ret < 0) {
|
|
|
|
error_setg_errno(errp, -ret, "Could not read from file");
|
|
|
|
g_free(buf);
|
|
|
|
return NULL;
|
|
|
|
}
|
2014-12-04 02:28:31 +03:00
|
|
|
buf[ret] = 0;
|
2014-02-17 17:44:02 +04:00
|
|
|
|
|
|
|
return buf;
|
|
|
|
}
|
|
|
|
|
2011-08-12 19:19:28 +04:00
|
|
|
static int vmdk_open_vmdk4(BlockDriverState *bs,
|
|
|
|
BlockDriverState *file,
|
2013-10-11 11:43:22 +04:00
|
|
|
int flags, Error **errp)
|
2011-07-12 15:56:31 +04:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
uint32_t magic;
|
|
|
|
uint32_t l1_size, l1_entry_sectors;
|
|
|
|
VMDK4Header header;
|
|
|
|
VmdkExtent *extent;
|
2013-10-31 06:06:23 +04:00
|
|
|
BDRVVmdkState *s = bs->opaque;
|
2011-08-12 19:19:34 +04:00
|
|
|
int64_t l1_backup_offset = 0;
|
2011-07-12 15:56:31 +04:00
|
|
|
|
2011-08-12 19:19:28 +04:00
|
|
|
ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header));
|
2011-07-12 15:56:31 +04:00
|
|
|
if (ret < 0) {
|
2013-10-11 11:43:22 +04:00
|
|
|
error_setg_errno(errp, -ret,
|
|
|
|
"Could not read header from file '%s'",
|
|
|
|
file->filename);
|
2014-02-17 17:44:05 +04:00
|
|
|
return -EINVAL;
|
2011-07-12 15:56:31 +04:00
|
|
|
}
|
2013-06-10 13:07:33 +04:00
|
|
|
if (header.capacity == 0) {
|
2013-08-06 11:44:48 +04:00
|
|
|
uint64_t desc_offset = le64_to_cpu(header.desc_offset);
|
2013-06-10 13:07:33 +04:00
|
|
|
if (desc_offset) {
|
2014-02-17 17:44:03 +04:00
|
|
|
char *buf = vmdk_read_desc(file, desc_offset << 9, errp);
|
|
|
|
if (!buf) {
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
ret = vmdk_open_desc_file(bs, flags, buf, errp);
|
|
|
|
g_free(buf);
|
|
|
|
return ret;
|
2013-06-10 13:07:33 +04:00
|
|
|
}
|
2011-08-12 19:19:33 +04:00
|
|
|
}
|
2012-08-16 12:39:33 +04:00
|
|
|
|
2013-10-31 06:06:23 +04:00
|
|
|
if (!s->create_type) {
|
|
|
|
s->create_type = g_strdup("monolithicSparse");
|
|
|
|
}
|
|
|
|
|
2012-08-16 12:39:33 +04:00
|
|
|
if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) {
|
|
|
|
/*
|
|
|
|
* The footer takes precedence over the header, so read it in. The
|
|
|
|
* footer starts at offset -1024 from the end: One sector for the
|
|
|
|
* footer, and another one for the end-of-stream marker.
|
|
|
|
*/
|
|
|
|
struct {
|
|
|
|
struct {
|
|
|
|
uint64_t val;
|
|
|
|
uint32_t size;
|
|
|
|
uint32_t type;
|
|
|
|
uint8_t pad[512 - 16];
|
|
|
|
} QEMU_PACKED footer_marker;
|
|
|
|
|
|
|
|
uint32_t magic;
|
|
|
|
VMDK4Header header;
|
|
|
|
uint8_t pad[512 - 4 - sizeof(VMDK4Header)];
|
|
|
|
|
|
|
|
struct {
|
|
|
|
uint64_t val;
|
|
|
|
uint32_t size;
|
|
|
|
uint32_t type;
|
|
|
|
uint8_t pad[512 - 16];
|
|
|
|
} QEMU_PACKED eos_marker;
|
|
|
|
} QEMU_PACKED footer;
|
|
|
|
|
|
|
|
ret = bdrv_pread(file,
|
|
|
|
bs->file->total_sectors * 512 - 1536,
|
|
|
|
&footer, sizeof(footer));
|
|
|
|
if (ret < 0) {
|
2014-12-04 02:28:34 +03:00
|
|
|
error_setg_errno(errp, -ret, "Failed to read footer");
|
2012-08-16 12:39:33 +04:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Some sanity checks for the footer */
|
|
|
|
if (be32_to_cpu(footer.magic) != VMDK4_MAGIC ||
|
|
|
|
le32_to_cpu(footer.footer_marker.size) != 0 ||
|
|
|
|
le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER ||
|
|
|
|
le64_to_cpu(footer.eos_marker.val) != 0 ||
|
|
|
|
le32_to_cpu(footer.eos_marker.size) != 0 ||
|
|
|
|
le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM)
|
|
|
|
{
|
2014-12-04 02:28:34 +03:00
|
|
|
error_setg(errp, "Invalid footer");
|
2012-08-16 12:39:33 +04:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
header = footer.header;
|
|
|
|
}
|
|
|
|
|
2013-11-28 05:48:03 +04:00
|
|
|
if (le32_to_cpu(header.version) > 3) {
|
2013-06-13 07:21:29 +04:00
|
|
|
char buf[64];
|
2014-04-17 07:34:37 +04:00
|
|
|
snprintf(buf, sizeof(buf), "VMDK version %" PRId32,
|
2013-06-13 07:21:29 +04:00
|
|
|
le32_to_cpu(header.version));
|
2014-02-17 17:44:05 +04:00
|
|
|
error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
|
2015-04-08 12:29:19 +03:00
|
|
|
bdrv_get_device_or_node_name(bs), "vmdk", buf);
|
2013-06-13 07:21:29 +04:00
|
|
|
return -ENOTSUP;
|
2013-11-28 05:48:03 +04:00
|
|
|
} else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) {
|
|
|
|
/* VMware KB 2064959 explains that version 3 added support for
|
|
|
|
* persistent changed block tracking (CBT), and backup software can
|
|
|
|
* read it as version=1 if it doesn't care about the changed area
|
|
|
|
* information. So we are safe to enable read only. */
|
|
|
|
error_setg(errp, "VMDK version 3 must be read only");
|
|
|
|
return -EINVAL;
|
2013-06-13 07:21:29 +04:00
|
|
|
}
|
|
|
|
|
2013-08-06 11:44:55 +04:00
|
|
|
if (le32_to_cpu(header.num_gtes_per_gt) > 512) {
|
2014-02-17 17:44:05 +04:00
|
|
|
error_setg(errp, "L2 table size too big");
|
2013-08-06 11:44:52 +04:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2013-08-06 11:44:55 +04:00
|
|
|
l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gt)
|
2011-07-12 15:56:31 +04:00
|
|
|
* le64_to_cpu(header.granularity);
|
2012-02-25 17:01:42 +04:00
|
|
|
if (l1_entry_sectors == 0) {
|
2014-12-04 02:28:34 +03:00
|
|
|
error_setg(errp, "L1 entry size is invalid");
|
2011-08-12 19:19:28 +04:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
2011-07-12 15:56:31 +04:00
|
|
|
l1_size = (le64_to_cpu(header.capacity) + l1_entry_sectors - 1)
|
|
|
|
/ l1_entry_sectors;
|
2011-08-12 19:19:34 +04:00
|
|
|
if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) {
|
|
|
|
l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9;
|
|
|
|
}
|
2014-06-26 15:23:22 +04:00
|
|
|
if (bdrv_nb_sectors(file) < le64_to_cpu(header.grain_offset)) {
|
2014-04-17 07:34:37 +04:00
|
|
|
error_setg(errp, "File truncated, expecting at least %" PRId64 " bytes",
|
|
|
|
(int64_t)(le64_to_cpu(header.grain_offset)
|
|
|
|
* BDRV_SECTOR_SIZE));
|
2014-01-21 11:07:43 +04:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2013-08-06 11:44:51 +04:00
|
|
|
ret = vmdk_add_extent(bs, file, false,
|
2011-07-12 15:56:31 +04:00
|
|
|
le64_to_cpu(header.capacity),
|
|
|
|
le64_to_cpu(header.gd_offset) << 9,
|
2011-08-12 19:19:34 +04:00
|
|
|
l1_backup_offset,
|
2011-07-12 15:56:31 +04:00
|
|
|
l1_size,
|
2013-08-06 11:44:55 +04:00
|
|
|
le32_to_cpu(header.num_gtes_per_gt),
|
2013-08-06 11:44:51 +04:00
|
|
|
le64_to_cpu(header.granularity),
|
2013-10-11 11:43:22 +04:00
|
|
|
&extent,
|
|
|
|
errp);
|
2013-08-06 11:44:51 +04:00
|
|
|
if (ret < 0) {
|
|
|
|
return ret;
|
|
|
|
}
|
2011-08-12 19:19:30 +04:00
|
|
|
extent->compressed =
|
|
|
|
le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE;
|
2014-01-23 11:10:52 +04:00
|
|
|
if (extent->compressed) {
|
|
|
|
g_free(s->create_type);
|
|
|
|
s->create_type = g_strdup("streamOptimized");
|
|
|
|
}
|
2011-08-12 19:19:30 +04:00
|
|
|
extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER;
|
2013-05-02 06:25:23 +04:00
|
|
|
extent->version = le32_to_cpu(header.version);
|
|
|
|
extent->has_zero_grain = le32_to_cpu(header.flags) & VMDK4_FLAG_ZERO_GRAIN;
|
2013-10-11 11:43:22 +04:00
|
|
|
ret = vmdk_init_tables(bs, extent, errp);
|
2011-07-12 15:56:31 +04:00
|
|
|
if (ret) {
|
2011-08-12 19:19:28 +04:00
|
|
|
/* free extent allocated by vmdk_add_extent */
|
|
|
|
vmdk_free_last_extent(bs);
|
2011-07-12 15:56:31 +04:00
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2011-07-19 04:38:22 +04:00
|
|
|
/* find an option value out of descriptor file */
|
|
|
|
static int vmdk_parse_description(const char *desc, const char *opt_name,
|
|
|
|
char *buf, int buf_size)
|
|
|
|
{
|
|
|
|
char *opt_pos, *opt_end;
|
|
|
|
const char *end = desc + strlen(desc);
|
|
|
|
|
|
|
|
opt_pos = strstr(desc, opt_name);
|
|
|
|
if (!opt_pos) {
|
2013-05-02 06:25:22 +04:00
|
|
|
return VMDK_ERROR;
|
2011-07-19 04:38:22 +04:00
|
|
|
}
|
|
|
|
/* Skip "=\"" following opt_name */
|
|
|
|
opt_pos += strlen(opt_name) + 2;
|
|
|
|
if (opt_pos >= end) {
|
2013-05-02 06:25:22 +04:00
|
|
|
return VMDK_ERROR;
|
2011-07-19 04:38:22 +04:00
|
|
|
}
|
|
|
|
opt_end = opt_pos;
|
|
|
|
while (opt_end < end && *opt_end != '"') {
|
|
|
|
opt_end++;
|
|
|
|
}
|
|
|
|
if (opt_end == end || buf_size < opt_end - opt_pos + 1) {
|
2013-05-02 06:25:22 +04:00
|
|
|
return VMDK_ERROR;
|
2011-07-19 04:38:22 +04:00
|
|
|
}
|
|
|
|
pstrcpy(buf, opt_end - opt_pos + 1, opt_pos);
|
2013-05-02 06:25:22 +04:00
|
|
|
return VMDK_OK;
|
2011-07-19 04:38:22 +04:00
|
|
|
}
|
|
|
|
|
2011-08-12 19:19:28 +04:00
|
|
|
/* Open an extent file and append to bs array */
|
|
|
|
static int vmdk_open_sparse(BlockDriverState *bs,
|
2014-02-17 17:44:03 +04:00
|
|
|
BlockDriverState *file, int flags,
|
|
|
|
char *buf, Error **errp)
|
2011-08-12 19:19:28 +04:00
|
|
|
{
|
|
|
|
uint32_t magic;
|
|
|
|
|
2014-02-17 17:44:03 +04:00
|
|
|
magic = ldl_be_p(buf);
|
2011-08-12 19:19:28 +04:00
|
|
|
switch (magic) {
|
|
|
|
case VMDK3_MAGIC:
|
2013-10-11 11:43:22 +04:00
|
|
|
return vmdk_open_vmfs_sparse(bs, file, flags, errp);
|
2011-08-12 19:19:28 +04:00
|
|
|
break;
|
|
|
|
case VMDK4_MAGIC:
|
2013-10-11 11:43:22 +04:00
|
|
|
return vmdk_open_vmdk4(bs, file, flags, errp);
|
2011-08-12 19:19:28 +04:00
|
|
|
break;
|
|
|
|
default:
|
2014-02-17 17:44:06 +04:00
|
|
|
error_setg(errp, "Image not in VMDK format");
|
|
|
|
return -EINVAL;
|
2011-08-12 19:19:28 +04:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-07-19 04:38:22 +04:00
|
|
|
static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
|
2013-10-11 11:43:22 +04:00
|
|
|
const char *desc_file_path, Error **errp)
|
2011-07-19 04:38:22 +04:00
|
|
|
{
|
|
|
|
int ret;
|
2015-01-22 16:03:25 +03:00
|
|
|
int matches;
|
2011-07-19 04:38:22 +04:00
|
|
|
char access[11];
|
|
|
|
char type[11];
|
|
|
|
char fname[512];
|
|
|
|
const char *p = desc;
|
|
|
|
int64_t sectors = 0;
|
|
|
|
int64_t flat_offset;
|
2015-01-22 16:03:26 +03:00
|
|
|
char *extent_path;
|
2011-08-12 19:19:28 +04:00
|
|
|
BlockDriverState *extent_file;
|
2013-10-31 06:06:23 +04:00
|
|
|
BDRVVmdkState *s = bs->opaque;
|
|
|
|
VmdkExtent *extent;
|
2011-07-19 04:38:22 +04:00
|
|
|
|
|
|
|
while (*p) {
|
2014-12-04 02:28:30 +03:00
|
|
|
/* parse extent line in one of below formats:
|
|
|
|
*
|
2011-07-19 04:38:22 +04:00
|
|
|
* RW [size in sectors] FLAT "file-name.vmdk" OFFSET
|
|
|
|
* RW [size in sectors] SPARSE "file-name.vmdk"
|
2014-12-04 02:28:30 +03:00
|
|
|
* RW [size in sectors] VMFS "file-name.vmdk"
|
|
|
|
* RW [size in sectors] VMFSSPARSE "file-name.vmdk"
|
2011-07-19 04:38:22 +04:00
|
|
|
*/
|
|
|
|
flat_offset = -1;
|
2015-01-22 16:03:25 +03:00
|
|
|
matches = sscanf(p, "%10s %" SCNd64 " %10s \"%511[^\n\r\"]\" %" SCNd64,
|
|
|
|
access, §ors, type, fname, &flat_offset);
|
|
|
|
if (matches < 4 || strcmp(access, "RW")) {
|
2011-07-19 04:38:22 +04:00
|
|
|
goto next_line;
|
|
|
|
} else if (!strcmp(type, "FLAT")) {
|
2015-01-22 16:03:25 +03:00
|
|
|
if (matches != 5 || flat_offset < 0) {
|
2013-10-11 11:43:22 +04:00
|
|
|
error_setg(errp, "Invalid extent lines: \n%s", p);
|
2011-07-19 04:38:22 +04:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
2013-10-18 11:07:33 +04:00
|
|
|
} else if (!strcmp(type, "VMFS")) {
|
2015-01-22 16:03:25 +03:00
|
|
|
if (matches == 4) {
|
2013-12-09 09:24:36 +04:00
|
|
|
flat_offset = 0;
|
|
|
|
} else {
|
|
|
|
error_setg(errp, "Invalid extent lines:\n%s", p);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
2015-01-22 16:03:25 +03:00
|
|
|
} else if (matches != 4) {
|
2013-12-09 09:24:36 +04:00
|
|
|
error_setg(errp, "Invalid extent lines:\n%s", p);
|
2011-07-19 04:38:22 +04:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (sectors <= 0 ||
|
2013-08-19 14:54:27 +04:00
|
|
|
(strcmp(type, "FLAT") && strcmp(type, "SPARSE") &&
|
2013-08-19 14:54:28 +04:00
|
|
|
strcmp(type, "VMFS") && strcmp(type, "VMFSSPARSE")) ||
|
2011-07-19 04:38:22 +04:00
|
|
|
(strcmp(access, "RW"))) {
|
|
|
|
goto next_line;
|
|
|
|
}
|
|
|
|
|
2014-12-03 16:57:22 +03:00
|
|
|
if (!path_is_absolute(fname) && !path_has_protocol(fname) &&
|
|
|
|
!desc_file_path[0])
|
|
|
|
{
|
|
|
|
error_setg(errp, "Cannot use relative extent paths with VMDK "
|
|
|
|
"descriptor file '%s'", bs->file->filename);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2015-01-22 16:03:26 +03:00
|
|
|
extent_path = g_malloc0(PATH_MAX);
|
2015-02-10 21:22:56 +03:00
|
|
|
path_combine(extent_path, PATH_MAX, desc_file_path, fname);
|
2014-02-18 21:33:07 +04:00
|
|
|
extent_file = NULL;
|
|
|
|
ret = bdrv_open(&extent_file, extent_path, NULL, NULL,
|
|
|
|
bs->open_flags | BDRV_O_PROTOCOL, NULL, errp);
|
2015-01-22 16:03:26 +03:00
|
|
|
g_free(extent_path);
|
2011-08-12 19:19:28 +04:00
|
|
|
if (ret) {
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2011-07-19 04:38:22 +04:00
|
|
|
/* save to extents array */
|
2013-08-19 14:54:28 +04:00
|
|
|
if (!strcmp(type, "FLAT") || !strcmp(type, "VMFS")) {
|
2011-07-19 04:38:22 +04:00
|
|
|
/* FLAT extent */
|
|
|
|
|
2013-08-06 11:44:51 +04:00
|
|
|
ret = vmdk_add_extent(bs, extent_file, true, sectors,
|
2013-10-11 11:43:22 +04:00
|
|
|
0, 0, 0, 0, 0, &extent, errp);
|
2013-08-06 11:44:51 +04:00
|
|
|
if (ret < 0) {
|
2014-09-05 00:04:42 +04:00
|
|
|
bdrv_unref(extent_file);
|
2013-08-06 11:44:51 +04:00
|
|
|
return ret;
|
|
|
|
}
|
2011-08-12 19:19:33 +04:00
|
|
|
extent->flat_start_offset = flat_offset << 9;
|
2013-08-19 14:54:27 +04:00
|
|
|
} else if (!strcmp(type, "SPARSE") || !strcmp(type, "VMFSSPARSE")) {
|
|
|
|
/* SPARSE extent and VMFSSPARSE extent are both "COWD" sparse file*/
|
2014-02-17 17:44:03 +04:00
|
|
|
char *buf = vmdk_read_desc(extent_file, 0, errp);
|
|
|
|
if (!buf) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
} else {
|
|
|
|
ret = vmdk_open_sparse(bs, extent_file, bs->open_flags, buf, errp);
|
|
|
|
}
|
2014-09-05 00:04:43 +04:00
|
|
|
g_free(buf);
|
2011-08-12 19:19:28 +04:00
|
|
|
if (ret) {
|
2013-08-23 05:14:47 +04:00
|
|
|
bdrv_unref(extent_file);
|
2011-08-12 19:19:28 +04:00
|
|
|
return ret;
|
|
|
|
}
|
2013-10-31 06:06:23 +04:00
|
|
|
extent = &s->extents[s->num_extents - 1];
|
2011-07-19 04:38:22 +04:00
|
|
|
} else {
|
2013-10-11 11:43:22 +04:00
|
|
|
error_setg(errp, "Unsupported extent type '%s'", type);
|
2014-09-05 00:04:42 +04:00
|
|
|
bdrv_unref(extent_file);
|
2011-07-19 04:38:22 +04:00
|
|
|
return -ENOTSUP;
|
|
|
|
}
|
2013-10-31 06:06:23 +04:00
|
|
|
extent->type = g_strdup(type);
|
2011-07-19 04:38:22 +04:00
|
|
|
next_line:
|
|
|
|
/* move to next line */
|
2013-10-11 15:48:29 +04:00
|
|
|
while (*p) {
|
|
|
|
if (*p == '\n') {
|
|
|
|
p++;
|
|
|
|
break;
|
|
|
|
}
|
2011-07-19 04:38:22 +04:00
|
|
|
p++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2014-02-17 17:44:03 +04:00
|
|
|
static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf,
|
|
|
|
Error **errp)
|
2011-07-19 04:38:22 +04:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
char ct[128];
|
|
|
|
BDRVVmdkState *s = bs->opaque;
|
|
|
|
|
|
|
|
if (vmdk_parse_description(buf, "createType", ct, sizeof(ct))) {
|
2014-02-17 17:44:06 +04:00
|
|
|
error_setg(errp, "invalid VMDK image descriptor");
|
|
|
|
ret = -EINVAL;
|
2013-06-12 15:06:30 +04:00
|
|
|
goto exit;
|
2011-07-19 04:38:22 +04:00
|
|
|
}
|
2011-08-12 19:19:27 +04:00
|
|
|
if (strcmp(ct, "monolithicFlat") &&
|
2013-08-19 14:54:28 +04:00
|
|
|
strcmp(ct, "vmfs") &&
|
2013-08-19 14:54:27 +04:00
|
|
|
strcmp(ct, "vmfsSparse") &&
|
2011-08-12 19:19:28 +04:00
|
|
|
strcmp(ct, "twoGbMaxExtentSparse") &&
|
2011-08-12 19:19:27 +04:00
|
|
|
strcmp(ct, "twoGbMaxExtentFlat")) {
|
2013-10-11 11:43:22 +04:00
|
|
|
error_setg(errp, "Unsupported image type '%s'", ct);
|
2013-06-12 15:06:30 +04:00
|
|
|
ret = -ENOTSUP;
|
|
|
|
goto exit;
|
2011-07-19 04:38:22 +04:00
|
|
|
}
|
2013-10-31 06:06:23 +04:00
|
|
|
s->create_type = g_strdup(ct);
|
2011-07-19 04:38:22 +04:00
|
|
|
s->desc_offset = 0;
|
2014-12-03 16:57:22 +03:00
|
|
|
ret = vmdk_parse_extents(buf, bs, bs->file->exact_filename, errp);
|
2013-06-12 15:06:30 +04:00
|
|
|
exit:
|
|
|
|
return ret;
|
2011-07-19 04:38:22 +04:00
|
|
|
}
|
|
|
|
|
2013-09-05 16:22:29 +04:00
|
|
|
static int vmdk_open(BlockDriverState *bs, QDict *options, int flags,
|
|
|
|
Error **errp)
|
2011-07-12 15:56:31 +04:00
|
|
|
{
|
2014-12-04 02:28:33 +03:00
|
|
|
char *buf;
|
2011-08-12 19:19:28 +04:00
|
|
|
int ret;
|
|
|
|
BDRVVmdkState *s = bs->opaque;
|
2014-02-17 17:44:04 +04:00
|
|
|
uint32_t magic;
|
2011-07-12 15:56:31 +04:00
|
|
|
|
2014-02-17 17:44:03 +04:00
|
|
|
buf = vmdk_read_desc(bs->file, 0, errp);
|
|
|
|
if (!buf) {
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2014-02-17 17:44:04 +04:00
|
|
|
magic = ldl_be_p(buf);
|
|
|
|
switch (magic) {
|
|
|
|
case VMDK3_MAGIC:
|
|
|
|
case VMDK4_MAGIC:
|
|
|
|
ret = vmdk_open_sparse(bs, bs->file, flags, buf, errp);
|
|
|
|
s->desc_offset = 0x200;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
ret = vmdk_open_desc_file(bs, flags, buf, errp);
|
|
|
|
break;
|
2011-07-12 15:56:31 +04:00
|
|
|
}
|
2014-02-17 17:44:04 +04:00
|
|
|
if (ret) {
|
|
|
|
goto fail;
|
|
|
|
}
|
|
|
|
|
vmdk: clean up open
Move vmdk_parent_open to vmdk_open. There's another path how
vmdk_parent_open can be reached:
vmdk_parse_extents() -> vmdk_open_sparse() -> vmdk_open_vmdk4() ->
vmdk_open_desc_file().
If that can happen, however, the code is bogus. vmdk_parent_open
reads from bs->file:
if (bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE) != DESC_SIZE) {
but it is always called with s->desc_offset == 0 and with the same
bs->file. So the data that vmdk_parent_open reads comes always from the
same place, and anyway there is only one place where it can write it,
namely bs->backing_file.
So, if it cannot happen, the patched code is okay.
It is also possible that the recursive call can happen, but only once. In
that case there would still be a bug in vmdk_open_desc_file setting
s->desc_offset = 0, but the patched code is okay.
Finally, in the case where multiple recursive calls can happen the code
would need to be rewritten anyway. It is likely that this would anyway
involve adding several parameters to vmdk_parent_open, and calling it from
vmdk_open_vmdk4.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2011-10-20 15:16:20 +04:00
|
|
|
/* try to open parent images, if exist */
|
|
|
|
ret = vmdk_parent_open(bs);
|
|
|
|
if (ret) {
|
|
|
|
goto fail;
|
|
|
|
}
|
2013-10-31 06:06:23 +04:00
|
|
|
s->cid = vmdk_read_cid(bs, 0);
|
vmdk: clean up open
Move vmdk_parent_open to vmdk_open. There's another path how
vmdk_parent_open can be reached:
vmdk_parse_extents() -> vmdk_open_sparse() -> vmdk_open_vmdk4() ->
vmdk_open_desc_file().
If that can happen, however, the code is bogus. vmdk_parent_open
reads from bs->file:
if (bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE) != DESC_SIZE) {
but it is always called with s->desc_offset == 0 and with the same
bs->file. So the data that vmdk_parent_open reads comes always from the
same place, and anyway there is only one place where it can write it,
namely bs->backing_file.
So, if it cannot happen, the patched code is okay.
It is also possible that the recursive call can happen, but only once. In
that case there would still be a bug in vmdk_open_desc_file setting
s->desc_offset = 0, but the patched code is okay.
Finally, in the case where multiple recursive calls can happen the code
would need to be rewritten anyway. It is likely that this would anyway
involve adding several parameters to vmdk_parent_open, and calling it from
vmdk_open_vmdk4.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2011-10-20 15:16:20 +04:00
|
|
|
s->parent_cid = vmdk_read_cid(bs, 1);
|
2011-10-20 15:16:21 +04:00
|
|
|
qemu_co_mutex_init(&s->lock);
|
2011-11-22 19:50:27 +04:00
|
|
|
|
|
|
|
/* Disable migration when VMDK images are used */
|
2015-04-08 12:29:19 +03:00
|
|
|
error_setg(&s->migration_blocker, "The vmdk format used by node '%s' "
|
|
|
|
"does not support live migration",
|
|
|
|
bdrv_get_device_or_node_name(bs));
|
2011-11-22 19:50:27 +04:00
|
|
|
migrate_add_blocker(s->migration_blocker);
|
2014-02-17 17:44:03 +04:00
|
|
|
g_free(buf);
|
2011-11-22 19:50:27 +04:00
|
|
|
return 0;
|
vmdk: clean up open
Move vmdk_parent_open to vmdk_open. There's another path how
vmdk_parent_open can be reached:
vmdk_parse_extents() -> vmdk_open_sparse() -> vmdk_open_vmdk4() ->
vmdk_open_desc_file().
If that can happen, however, the code is bogus. vmdk_parent_open
reads from bs->file:
if (bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE) != DESC_SIZE) {
but it is always called with s->desc_offset == 0 and with the same
bs->file. So the data that vmdk_parent_open reads comes always from the
same place, and anyway there is only one place where it can write it,
namely bs->backing_file.
So, if it cannot happen, the patched code is okay.
It is also possible that the recursive call can happen, but only once. In
that case there would still be a bug in vmdk_open_desc_file setting
s->desc_offset = 0, but the patched code is okay.
Finally, in the case where multiple recursive calls can happen the code
would need to be rewritten anyway. It is likely that this would anyway
involve adding several parameters to vmdk_parent_open, and calling it from
vmdk_open_vmdk4.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2011-10-20 15:16:20 +04:00
|
|
|
|
|
|
|
fail:
|
2014-02-17 17:44:03 +04:00
|
|
|
g_free(buf);
|
2013-10-31 06:06:23 +04:00
|
|
|
g_free(s->create_type);
|
|
|
|
s->create_type = NULL;
|
vmdk: clean up open
Move vmdk_parent_open to vmdk_open. There's another path how
vmdk_parent_open can be reached:
vmdk_parse_extents() -> vmdk_open_sparse() -> vmdk_open_vmdk4() ->
vmdk_open_desc_file().
If that can happen, however, the code is bogus. vmdk_parent_open
reads from bs->file:
if (bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE) != DESC_SIZE) {
but it is always called with s->desc_offset == 0 and with the same
bs->file. So the data that vmdk_parent_open reads comes always from the
same place, and anyway there is only one place where it can write it,
namely bs->backing_file.
So, if it cannot happen, the patched code is okay.
It is also possible that the recursive call can happen, but only once. In
that case there would still be a bug in vmdk_open_desc_file setting
s->desc_offset = 0, but the patched code is okay.
Finally, in the case where multiple recursive calls can happen the code
would need to be rewritten anyway. It is likely that this would anyway
involve adding several parameters to vmdk_parent_open, and calling it from
vmdk_open_vmdk4.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2011-10-20 15:16:20 +04:00
|
|
|
vmdk_free_extents(bs);
|
|
|
|
return ret;
|
2004-08-02 01:59:26 +04:00
|
|
|
}
|
|
|
|
|
2013-12-11 22:26:16 +04:00
|
|
|
|
2014-07-16 19:48:16 +04:00
|
|
|
static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp)
|
2013-12-11 22:26:16 +04:00
|
|
|
{
|
|
|
|
BDRVVmdkState *s = bs->opaque;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < s->num_extents; i++) {
|
|
|
|
if (!s->extents[i].flat) {
|
|
|
|
bs->bl.write_zeroes_alignment =
|
|
|
|
MAX(bs->bl.write_zeroes_alignment,
|
|
|
|
s->extents[i].cluster_sectors);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 10:39:10 +04:00
|
|
|
/**
|
|
|
|
* get_whole_cluster
|
|
|
|
*
|
|
|
|
* Copy backing file's cluster that covers @sector_num, otherwise write zero,
|
|
|
|
* to the cluster at @cluster_sector_num.
|
|
|
|
*
|
|
|
|
* If @skip_start_sector < @skip_end_sector, the relative range
|
|
|
|
* [@skip_start_sector, @skip_end_sector) is not copied or written, and leave
|
|
|
|
* it for call to write user data in the request.
|
|
|
|
*/
|
2011-07-12 15:56:28 +04:00
|
|
|
static int get_whole_cluster(BlockDriverState *bs,
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 10:39:10 +04:00
|
|
|
VmdkExtent *extent,
|
|
|
|
uint64_t cluster_sector_num,
|
|
|
|
uint64_t sector_num,
|
|
|
|
uint64_t skip_start_sector,
|
|
|
|
uint64_t skip_end_sector)
|
2007-01-25 00:05:24 +03:00
|
|
|
{
|
2013-08-06 11:44:54 +04:00
|
|
|
int ret = VMDK_OK;
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 10:39:10 +04:00
|
|
|
int64_t cluster_bytes;
|
|
|
|
uint8_t *whole_grain;
|
|
|
|
|
|
|
|
/* For COW, align request sector_num to cluster start */
|
|
|
|
sector_num = QEMU_ALIGN_DOWN(sector_num, extent->cluster_sectors);
|
|
|
|
cluster_bytes = extent->cluster_sectors << BDRV_SECTOR_BITS;
|
|
|
|
whole_grain = qemu_blockalign(bs, cluster_bytes);
|
2007-01-25 00:05:24 +03:00
|
|
|
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 10:39:10 +04:00
|
|
|
if (!bs->backing_hd) {
|
|
|
|
memset(whole_grain, 0, skip_start_sector << BDRV_SECTOR_BITS);
|
|
|
|
memset(whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), 0,
|
|
|
|
cluster_bytes - (skip_end_sector << BDRV_SECTOR_BITS));
|
|
|
|
}
|
|
|
|
|
|
|
|
assert(skip_end_sector <= extent->cluster_sectors);
|
2011-07-12 15:56:29 +04:00
|
|
|
/* we will be here if it's first write on non-exist grain(cluster).
|
|
|
|
* try to read from parent image, if exist */
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 10:39:10 +04:00
|
|
|
if (bs->backing_hd && !vmdk_is_cid_valid(bs)) {
|
|
|
|
ret = VMDK_ERROR;
|
|
|
|
goto exit;
|
|
|
|
}
|
2007-01-25 00:05:24 +03:00
|
|
|
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 10:39:10 +04:00
|
|
|
/* Read backing data before skip range */
|
|
|
|
if (skip_start_sector > 0) {
|
|
|
|
if (bs->backing_hd) {
|
|
|
|
ret = bdrv_read(bs->backing_hd, sector_num,
|
|
|
|
whole_grain, skip_start_sector);
|
|
|
|
if (ret < 0) {
|
|
|
|
ret = VMDK_ERROR;
|
|
|
|
goto exit;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ret = bdrv_write(extent->file, cluster_sector_num, whole_grain,
|
|
|
|
skip_start_sector);
|
2010-04-16 21:28:14 +04:00
|
|
|
if (ret < 0) {
|
2013-08-06 11:44:54 +04:00
|
|
|
ret = VMDK_ERROR;
|
|
|
|
goto exit;
|
2010-04-16 21:28:14 +04:00
|
|
|
}
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 10:39:10 +04:00
|
|
|
}
|
|
|
|
/* Read backing data after skip range */
|
|
|
|
if (skip_end_sector < extent->cluster_sectors) {
|
|
|
|
if (bs->backing_hd) {
|
|
|
|
ret = bdrv_read(bs->backing_hd, sector_num + skip_end_sector,
|
|
|
|
whole_grain + (skip_end_sector << BDRV_SECTOR_BITS),
|
|
|
|
extent->cluster_sectors - skip_end_sector);
|
|
|
|
if (ret < 0) {
|
|
|
|
ret = VMDK_ERROR;
|
|
|
|
goto exit;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ret = bdrv_write(extent->file, cluster_sector_num + skip_end_sector,
|
|
|
|
whole_grain + (skip_end_sector << BDRV_SECTOR_BITS),
|
|
|
|
extent->cluster_sectors - skip_end_sector);
|
2010-04-16 21:28:14 +04:00
|
|
|
if (ret < 0) {
|
2013-08-06 11:44:54 +04:00
|
|
|
ret = VMDK_ERROR;
|
|
|
|
goto exit;
|
2007-06-18 19:01:30 +04:00
|
|
|
}
|
|
|
|
}
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 10:39:10 +04:00
|
|
|
|
2013-08-06 11:44:54 +04:00
|
|
|
exit:
|
|
|
|
qemu_vfree(whole_grain);
|
|
|
|
return ret;
|
2007-06-18 19:01:30 +04:00
|
|
|
}
|
|
|
|
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 10:39:10 +04:00
|
|
|
static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data,
|
|
|
|
uint32_t offset)
|
2007-06-18 19:01:30 +04:00
|
|
|
{
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 10:39:10 +04:00
|
|
|
offset = cpu_to_le32(offset);
|
2007-06-18 19:01:30 +04:00
|
|
|
/* update L2 table */
|
2011-07-12 15:56:28 +04:00
|
|
|
if (bdrv_pwrite_sync(
|
|
|
|
extent->file,
|
|
|
|
((int64_t)m_data->l2_offset * 512)
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 10:39:10 +04:00
|
|
|
+ (m_data->l2_index * sizeof(offset)),
|
2013-05-02 06:25:26 +04:00
|
|
|
&offset, sizeof(offset)) < 0) {
|
2013-05-02 06:25:22 +04:00
|
|
|
return VMDK_ERROR;
|
2011-07-12 15:56:28 +04:00
|
|
|
}
|
2007-06-18 19:01:30 +04:00
|
|
|
/* update backup L2 table */
|
2011-07-12 15:56:28 +04:00
|
|
|
if (extent->l1_backup_table_offset != 0) {
|
|
|
|
m_data->l2_offset = extent->l1_backup_table[m_data->l1_index];
|
|
|
|
if (bdrv_pwrite_sync(
|
|
|
|
extent->file,
|
|
|
|
((int64_t)m_data->l2_offset * 512)
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 10:39:10 +04:00
|
|
|
+ (m_data->l2_index * sizeof(offset)),
|
2013-05-02 06:25:26 +04:00
|
|
|
&offset, sizeof(offset)) < 0) {
|
2013-05-02 06:25:22 +04:00
|
|
|
return VMDK_ERROR;
|
2011-07-12 15:56:28 +04:00
|
|
|
}
|
2007-01-25 00:05:24 +03:00
|
|
|
}
|
2013-05-02 06:25:27 +04:00
|
|
|
if (m_data->l2_cache_entry) {
|
|
|
|
*m_data->l2_cache_entry = offset;
|
|
|
|
}
|
2007-06-18 19:01:30 +04:00
|
|
|
|
2013-05-02 06:25:22 +04:00
|
|
|
return VMDK_OK;
|
2007-01-25 00:05:24 +03:00
|
|
|
}
|
|
|
|
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 10:39:10 +04:00
|
|
|
/**
|
|
|
|
* get_cluster_offset
|
|
|
|
*
|
|
|
|
* Look up cluster offset in extent file by sector number, and store in
|
|
|
|
* @cluster_offset.
|
|
|
|
*
|
|
|
|
* For flat extents, the start offset as parsed from the description file is
|
|
|
|
* returned.
|
|
|
|
*
|
|
|
|
* For sparse extents, look up in L1, L2 table. If allocate is true, return an
|
|
|
|
* offset for a new cluster and update L2 cache. If there is a backing file,
|
|
|
|
* COW is done before returning; otherwise, zeroes are written to the allocated
|
|
|
|
* cluster. Both COW and zero writing skips the sector range
|
|
|
|
* [@skip_start_sector, @skip_end_sector) passed in by caller, because caller
|
|
|
|
* has new data to write there.
|
|
|
|
*
|
|
|
|
* Returns: VMDK_OK if cluster exists and mapped in the image.
|
|
|
|
* VMDK_UNALLOC if cluster is not mapped and @allocate is false.
|
|
|
|
* VMDK_ERROR if failed.
|
|
|
|
*/
|
2011-07-12 15:56:35 +04:00
|
|
|
static int get_cluster_offset(BlockDriverState *bs,
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 10:39:10 +04:00
|
|
|
VmdkExtent *extent,
|
|
|
|
VmdkMetaData *m_data,
|
|
|
|
uint64_t offset,
|
|
|
|
bool allocate,
|
|
|
|
uint64_t *cluster_offset,
|
|
|
|
uint64_t skip_start_sector,
|
|
|
|
uint64_t skip_end_sector)
|
2004-08-02 01:59:26 +04:00
|
|
|
{
|
|
|
|
unsigned int l1_index, l2_offset, l2_index;
|
|
|
|
int min_index, i, j;
|
2013-05-02 06:25:26 +04:00
|
|
|
uint32_t min_count, *l2_table;
|
2013-05-02 06:25:23 +04:00
|
|
|
bool zeroed = false;
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 10:39:10 +04:00
|
|
|
int64_t ret;
|
2014-09-23 05:56:21 +04:00
|
|
|
int64_t cluster_sector;
|
2007-06-18 19:01:30 +04:00
|
|
|
|
2011-07-12 15:56:38 +04:00
|
|
|
if (m_data) {
|
2007-06-18 19:01:30 +04:00
|
|
|
m_data->valid = 0;
|
2011-07-12 15:56:38 +04:00
|
|
|
}
|
2011-07-12 15:56:35 +04:00
|
|
|
if (extent->flat) {
|
2011-07-19 04:38:22 +04:00
|
|
|
*cluster_offset = extent->flat_start_offset;
|
2013-05-02 06:25:22 +04:00
|
|
|
return VMDK_OK;
|
2011-07-12 15:56:35 +04:00
|
|
|
}
|
2007-06-18 19:01:30 +04:00
|
|
|
|
2011-08-12 19:19:27 +04:00
|
|
|
offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE;
|
2011-07-12 15:56:28 +04:00
|
|
|
l1_index = (offset >> 9) / extent->l1_entry_sectors;
|
|
|
|
if (l1_index >= extent->l1_size) {
|
2013-05-02 06:25:22 +04:00
|
|
|
return VMDK_ERROR;
|
2011-07-12 15:56:28 +04:00
|
|
|
}
|
|
|
|
l2_offset = extent->l1_table[l1_index];
|
|
|
|
if (!l2_offset) {
|
2013-05-02 06:25:22 +04:00
|
|
|
return VMDK_UNALLOC;
|
2011-07-12 15:56:28 +04:00
|
|
|
}
|
2011-07-12 15:56:31 +04:00
|
|
|
for (i = 0; i < L2_CACHE_SIZE; i++) {
|
2011-07-12 15:56:28 +04:00
|
|
|
if (l2_offset == extent->l2_cache_offsets[i]) {
|
2004-08-02 01:59:26 +04:00
|
|
|
/* increment the hit count */
|
2011-07-12 15:56:28 +04:00
|
|
|
if (++extent->l2_cache_counts[i] == 0xffffffff) {
|
2011-07-12 15:56:31 +04:00
|
|
|
for (j = 0; j < L2_CACHE_SIZE; j++) {
|
2011-07-12 15:56:28 +04:00
|
|
|
extent->l2_cache_counts[j] >>= 1;
|
2004-08-02 01:59:26 +04:00
|
|
|
}
|
|
|
|
}
|
2011-07-12 15:56:28 +04:00
|
|
|
l2_table = extent->l2_cache + (i * extent->l2_size);
|
2004-08-02 01:59:26 +04:00
|
|
|
goto found;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/* not found: load a new entry in the least used one */
|
|
|
|
min_index = 0;
|
|
|
|
min_count = 0xffffffff;
|
2011-07-12 15:56:31 +04:00
|
|
|
for (i = 0; i < L2_CACHE_SIZE; i++) {
|
2011-07-12 15:56:28 +04:00
|
|
|
if (extent->l2_cache_counts[i] < min_count) {
|
|
|
|
min_count = extent->l2_cache_counts[i];
|
2004-08-02 01:59:26 +04:00
|
|
|
min_index = i;
|
|
|
|
}
|
|
|
|
}
|
2011-07-12 15:56:28 +04:00
|
|
|
l2_table = extent->l2_cache + (min_index * extent->l2_size);
|
|
|
|
if (bdrv_pread(
|
|
|
|
extent->file,
|
|
|
|
(int64_t)l2_offset * 512,
|
|
|
|
l2_table,
|
|
|
|
extent->l2_size * sizeof(uint32_t)
|
|
|
|
) != extent->l2_size * sizeof(uint32_t)) {
|
2013-05-02 06:25:22 +04:00
|
|
|
return VMDK_ERROR;
|
2011-07-12 15:56:28 +04:00
|
|
|
}
|
2007-01-25 00:05:24 +03:00
|
|
|
|
2011-07-12 15:56:28 +04:00
|
|
|
extent->l2_cache_offsets[min_index] = l2_offset;
|
|
|
|
extent->l2_cache_counts[min_index] = 1;
|
2004-08-02 01:59:26 +04:00
|
|
|
found:
|
2011-07-12 15:56:28 +04:00
|
|
|
l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 10:39:10 +04:00
|
|
|
cluster_sector = le32_to_cpu(l2_table[l2_index]);
|
2007-06-18 19:01:30 +04:00
|
|
|
|
2013-05-02 06:25:27 +04:00
|
|
|
if (m_data) {
|
|
|
|
m_data->valid = 1;
|
|
|
|
m_data->l1_index = l1_index;
|
|
|
|
m_data->l2_index = l2_index;
|
|
|
|
m_data->l2_offset = l2_offset;
|
|
|
|
m_data->l2_cache_entry = &l2_table[l2_index];
|
|
|
|
}
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 10:39:10 +04:00
|
|
|
if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) {
|
2013-05-02 06:25:23 +04:00
|
|
|
zeroed = true;
|
|
|
|
}
|
|
|
|
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 10:39:10 +04:00
|
|
|
if (!cluster_sector || zeroed) {
|
2011-07-12 15:56:35 +04:00
|
|
|
if (!allocate) {
|
2013-05-02 06:25:23 +04:00
|
|
|
return zeroed ? VMDK_ZEROED : VMDK_UNALLOC;
|
2011-07-12 15:56:35 +04:00
|
|
|
}
|
2010-04-16 23:07:19 +04:00
|
|
|
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 10:39:10 +04:00
|
|
|
cluster_sector = extent->next_cluster_sector;
|
|
|
|
extent->next_cluster_sector += extent->cluster_sectors;
|
2007-06-18 19:01:30 +04:00
|
|
|
|
|
|
|
/* First of all we write grain itself, to avoid race condition
|
|
|
|
* that may to corrupt the image.
|
|
|
|
* This problem may occur because of insufficient space on host disk
|
|
|
|
* or inappropriate VM shutdown.
|
|
|
|
*/
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 10:39:10 +04:00
|
|
|
ret = get_whole_cluster(bs, extent,
|
|
|
|
cluster_sector,
|
|
|
|
offset >> BDRV_SECTOR_BITS,
|
|
|
|
skip_start_sector, skip_end_sector);
|
|
|
|
if (ret) {
|
|
|
|
return ret;
|
2007-06-18 19:01:30 +04:00
|
|
|
}
|
2005-04-27 01:08:00 +04:00
|
|
|
}
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 10:39:10 +04:00
|
|
|
*cluster_offset = cluster_sector << BDRV_SECTOR_BITS;
|
2013-05-02 06:25:22 +04:00
|
|
|
return VMDK_OK;
|
2004-08-02 01:59:26 +04:00
|
|
|
}
|
|
|
|
|
2011-07-12 15:56:28 +04:00
|
|
|
static VmdkExtent *find_extent(BDRVVmdkState *s,
|
|
|
|
int64_t sector_num, VmdkExtent *start_hint)
|
|
|
|
{
|
|
|
|
VmdkExtent *extent = start_hint;
|
|
|
|
|
|
|
|
if (!extent) {
|
|
|
|
extent = &s->extents[0];
|
|
|
|
}
|
|
|
|
while (extent < &s->extents[s->num_extents]) {
|
|
|
|
if (sector_num < extent->end_sector) {
|
|
|
|
return extent;
|
|
|
|
}
|
|
|
|
extent++;
|
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2013-09-04 21:00:28 +04:00
|
|
|
static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs,
|
2011-11-14 16:44:21 +04:00
|
|
|
int64_t sector_num, int nb_sectors, int *pnum)
|
2004-08-02 01:59:26 +04:00
|
|
|
{
|
|
|
|
BDRVVmdkState *s = bs->opaque;
|
2011-07-12 15:56:28 +04:00
|
|
|
int64_t index_in_cluster, n, ret;
|
|
|
|
uint64_t offset;
|
|
|
|
VmdkExtent *extent;
|
|
|
|
|
|
|
|
extent = find_extent(s, sector_num, NULL);
|
|
|
|
if (!extent) {
|
|
|
|
return 0;
|
|
|
|
}
|
2011-11-14 16:44:21 +04:00
|
|
|
qemu_co_mutex_lock(&s->lock);
|
2011-07-12 15:56:35 +04:00
|
|
|
ret = get_cluster_offset(bs, extent, NULL,
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 10:39:10 +04:00
|
|
|
sector_num * 512, false, &offset,
|
|
|
|
0, 0);
|
2011-11-14 16:44:21 +04:00
|
|
|
qemu_co_mutex_unlock(&s->lock);
|
2013-05-02 06:25:23 +04:00
|
|
|
|
2013-09-04 21:00:30 +04:00
|
|
|
switch (ret) {
|
|
|
|
case VMDK_ERROR:
|
|
|
|
ret = -EIO;
|
|
|
|
break;
|
|
|
|
case VMDK_UNALLOC:
|
|
|
|
ret = 0;
|
|
|
|
break;
|
|
|
|
case VMDK_ZEROED:
|
|
|
|
ret = BDRV_BLOCK_ZERO;
|
|
|
|
break;
|
|
|
|
case VMDK_OK:
|
|
|
|
ret = BDRV_BLOCK_DATA;
|
2014-02-26 13:47:57 +04:00
|
|
|
if (extent->file == bs->file && !extent->compressed) {
|
2013-09-04 21:00:30 +04:00
|
|
|
ret |= BDRV_BLOCK_OFFSET_VALID | offset;
|
|
|
|
}
|
|
|
|
|
|
|
|
break;
|
|
|
|
}
|
2011-07-12 15:56:35 +04:00
|
|
|
|
|
|
|
index_in_cluster = sector_num % extent->cluster_sectors;
|
|
|
|
n = extent->cluster_sectors - index_in_cluster;
|
2011-07-12 15:56:38 +04:00
|
|
|
if (n > nb_sectors) {
|
2004-08-02 01:59:26 +04:00
|
|
|
n = nb_sectors;
|
2011-07-12 15:56:38 +04:00
|
|
|
}
|
2004-08-02 01:59:26 +04:00
|
|
|
*pnum = n;
|
2011-07-12 15:56:28 +04:00
|
|
|
return ret;
|
2004-08-02 01:59:26 +04:00
|
|
|
}
|
|
|
|
|
2011-08-12 19:19:29 +04:00
|
|
|
static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
|
|
|
|
int64_t offset_in_cluster, const uint8_t *buf,
|
|
|
|
int nb_sectors, int64_t sector_num)
|
|
|
|
{
|
|
|
|
int ret;
|
2011-08-12 19:19:31 +04:00
|
|
|
VmdkGrainMarker *data = NULL;
|
|
|
|
uLongf buf_len;
|
2011-08-12 19:19:29 +04:00
|
|
|
const uint8_t *write_buf = buf;
|
|
|
|
int write_len = nb_sectors * 512;
|
|
|
|
|
2011-08-12 19:19:31 +04:00
|
|
|
if (extent->compressed) {
|
|
|
|
if (!extent->has_marker) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
buf_len = (extent->cluster_sectors << 9) * 2;
|
|
|
|
data = g_malloc(buf_len + sizeof(VmdkGrainMarker));
|
|
|
|
if (compress(data->data, &buf_len, buf, nb_sectors << 9) != Z_OK ||
|
|
|
|
buf_len == 0) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
data->lba = sector_num;
|
|
|
|
data->size = buf_len;
|
|
|
|
write_buf = (uint8_t *)data;
|
|
|
|
write_len = buf_len + sizeof(VmdkGrainMarker);
|
|
|
|
}
|
2011-08-12 19:19:29 +04:00
|
|
|
ret = bdrv_pwrite(extent->file,
|
|
|
|
cluster_offset + offset_in_cluster,
|
|
|
|
write_buf,
|
|
|
|
write_len);
|
|
|
|
if (ret != write_len) {
|
|
|
|
ret = ret < 0 ? ret : -EIO;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
ret = 0;
|
|
|
|
out:
|
2011-08-12 19:19:31 +04:00
|
|
|
g_free(data);
|
2011-08-12 19:19:29 +04:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset,
|
|
|
|
int64_t offset_in_cluster, uint8_t *buf,
|
|
|
|
int nb_sectors)
|
|
|
|
{
|
|
|
|
int ret;
|
2011-08-12 19:19:31 +04:00
|
|
|
int cluster_bytes, buf_bytes;
|
|
|
|
uint8_t *cluster_buf, *compressed_data;
|
|
|
|
uint8_t *uncomp_buf;
|
|
|
|
uint32_t data_len;
|
|
|
|
VmdkGrainMarker *marker;
|
|
|
|
uLongf buf_len;
|
|
|
|
|
2011-08-12 19:19:29 +04:00
|
|
|
|
2011-08-12 19:19:31 +04:00
|
|
|
if (!extent->compressed) {
|
|
|
|
ret = bdrv_pread(extent->file,
|
|
|
|
cluster_offset + offset_in_cluster,
|
|
|
|
buf, nb_sectors * 512);
|
|
|
|
if (ret == nb_sectors * 512) {
|
|
|
|
return 0;
|
|
|
|
} else {
|
|
|
|
return -EIO;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
cluster_bytes = extent->cluster_sectors * 512;
|
|
|
|
/* Read two clusters in case GrainMarker + compressed data > one cluster */
|
|
|
|
buf_bytes = cluster_bytes * 2;
|
|
|
|
cluster_buf = g_malloc(buf_bytes);
|
|
|
|
uncomp_buf = g_malloc(cluster_bytes);
|
2011-08-12 19:19:29 +04:00
|
|
|
ret = bdrv_pread(extent->file,
|
2011-08-12 19:19:31 +04:00
|
|
|
cluster_offset,
|
|
|
|
cluster_buf, buf_bytes);
|
|
|
|
if (ret < 0) {
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
compressed_data = cluster_buf;
|
|
|
|
buf_len = cluster_bytes;
|
|
|
|
data_len = cluster_bytes;
|
|
|
|
if (extent->has_marker) {
|
|
|
|
marker = (VmdkGrainMarker *)cluster_buf;
|
|
|
|
compressed_data = marker->data;
|
|
|
|
data_len = le32_to_cpu(marker->size);
|
|
|
|
}
|
|
|
|
if (!data_len || data_len > buf_bytes) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
ret = uncompress(uncomp_buf, &buf_len, compressed_data, data_len);
|
|
|
|
if (ret != Z_OK) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
}
|
|
|
|
if (offset_in_cluster < 0 ||
|
|
|
|
offset_in_cluster + nb_sectors * 512 > buf_len) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto out;
|
2011-08-12 19:19:29 +04:00
|
|
|
}
|
2011-08-12 19:19:31 +04:00
|
|
|
memcpy(buf, uncomp_buf + offset_in_cluster, nb_sectors * 512);
|
|
|
|
ret = 0;
|
|
|
|
|
|
|
|
out:
|
|
|
|
g_free(uncomp_buf);
|
|
|
|
g_free(cluster_buf);
|
|
|
|
return ret;
|
2011-08-12 19:19:29 +04:00
|
|
|
}
|
|
|
|
|
2007-09-17 01:08:06 +04:00
|
|
|
static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
|
2004-08-02 01:59:26 +04:00
|
|
|
uint8_t *buf, int nb_sectors)
|
|
|
|
{
|
|
|
|
BDRVVmdkState *s = bs->opaque;
|
2011-07-12 15:56:28 +04:00
|
|
|
int ret;
|
|
|
|
uint64_t n, index_in_cluster;
|
2012-11-10 12:22:18 +04:00
|
|
|
uint64_t extent_begin_sector, extent_relative_sector_num;
|
2011-07-12 15:56:28 +04:00
|
|
|
VmdkExtent *extent = NULL;
|
2004-08-02 01:59:26 +04:00
|
|
|
uint64_t cluster_offset;
|
2007-01-25 00:05:24 +03:00
|
|
|
|
2004-08-02 01:59:26 +04:00
|
|
|
while (nb_sectors > 0) {
|
2011-07-12 15:56:28 +04:00
|
|
|
extent = find_extent(s, sector_num, extent);
|
|
|
|
if (!extent) {
|
|
|
|
return -EIO;
|
|
|
|
}
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 10:39:10 +04:00
|
|
|
ret = get_cluster_offset(bs, extent, NULL,
|
|
|
|
sector_num << 9, false, &cluster_offset,
|
|
|
|
0, 0);
|
2012-11-10 12:22:18 +04:00
|
|
|
extent_begin_sector = extent->end_sector - extent->sectors;
|
|
|
|
extent_relative_sector_num = sector_num - extent_begin_sector;
|
|
|
|
index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
|
2011-07-12 15:56:28 +04:00
|
|
|
n = extent->cluster_sectors - index_in_cluster;
|
2011-07-12 15:56:38 +04:00
|
|
|
if (n > nb_sectors) {
|
2004-08-02 01:59:26 +04:00
|
|
|
n = nb_sectors;
|
2011-07-12 15:56:38 +04:00
|
|
|
}
|
2013-05-02 06:25:23 +04:00
|
|
|
if (ret != VMDK_OK) {
|
2011-07-12 15:56:35 +04:00
|
|
|
/* if not allocated, try to read from parent image, if exist */
|
2013-05-02 06:25:23 +04:00
|
|
|
if (bs->backing_hd && ret != VMDK_ZEROED) {
|
2011-07-12 15:56:38 +04:00
|
|
|
if (!vmdk_is_cid_valid(bs)) {
|
2011-07-19 04:38:22 +04:00
|
|
|
return -EINVAL;
|
2011-07-12 15:56:38 +04:00
|
|
|
}
|
2009-07-17 10:20:41 +04:00
|
|
|
ret = bdrv_read(bs->backing_hd, sector_num, buf, n);
|
2011-07-12 15:56:38 +04:00
|
|
|
if (ret < 0) {
|
2011-07-19 04:38:22 +04:00
|
|
|
return ret;
|
2011-07-12 15:56:38 +04:00
|
|
|
}
|
2007-01-25 00:05:24 +03:00
|
|
|
} else {
|
|
|
|
memset(buf, 0, 512 * n);
|
|
|
|
}
|
2004-08-02 01:59:26 +04:00
|
|
|
} else {
|
2011-08-12 19:19:29 +04:00
|
|
|
ret = vmdk_read_extent(extent,
|
|
|
|
cluster_offset, index_in_cluster * 512,
|
|
|
|
buf, n);
|
|
|
|
if (ret) {
|
2011-07-19 04:38:22 +04:00
|
|
|
return ret;
|
|
|
|
}
|
2004-08-02 01:59:26 +04:00
|
|
|
}
|
|
|
|
nb_sectors -= n;
|
|
|
|
sector_num += n;
|
|
|
|
buf += n * 512;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2011-10-20 15:16:22 +04:00
|
|
|
static coroutine_fn int vmdk_co_read(BlockDriverState *bs, int64_t sector_num,
|
|
|
|
uint8_t *buf, int nb_sectors)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
BDRVVmdkState *s = bs->opaque;
|
|
|
|
qemu_co_mutex_lock(&s->lock);
|
|
|
|
ret = vmdk_read(bs, sector_num, buf, nb_sectors);
|
|
|
|
qemu_co_mutex_unlock(&s->lock);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2013-05-02 06:25:27 +04:00
|
|
|
/**
|
|
|
|
* vmdk_write:
|
|
|
|
* @zeroed: buf is ignored (data is zero), use zeroed_grain GTE feature
|
2013-08-01 14:12:17 +04:00
|
|
|
* if possible, otherwise return -ENOTSUP.
|
|
|
|
* @zero_dry_run: used for zeroed == true only, don't update L2 table, just try
|
|
|
|
* with each cluster. By dry run we can find if the zero write
|
|
|
|
* is possible without modifying image data.
|
2013-05-02 06:25:27 +04:00
|
|
|
*
|
|
|
|
* Returns: error code with 0 for success.
|
|
|
|
*/
|
2007-09-17 01:08:06 +04:00
|
|
|
static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
|
2013-05-02 06:25:27 +04:00
|
|
|
const uint8_t *buf, int nb_sectors,
|
|
|
|
bool zeroed, bool zero_dry_run)
|
2004-08-02 01:59:26 +04:00
|
|
|
{
|
2005-04-27 01:08:00 +04:00
|
|
|
BDRVVmdkState *s = bs->opaque;
|
2011-07-12 15:56:28 +04:00
|
|
|
VmdkExtent *extent = NULL;
|
2014-01-08 05:42:07 +04:00
|
|
|
int ret;
|
|
|
|
int64_t index_in_cluster, n;
|
2012-11-10 12:22:18 +04:00
|
|
|
uint64_t extent_begin_sector, extent_relative_sector_num;
|
2005-04-27 01:08:00 +04:00
|
|
|
uint64_t cluster_offset;
|
2011-07-12 15:56:28 +04:00
|
|
|
VmdkMetaData m_data;
|
2005-04-27 01:08:00 +04:00
|
|
|
|
2007-06-18 19:01:30 +04:00
|
|
|
if (sector_num > bs->total_sectors) {
|
2013-10-11 11:43:22 +04:00
|
|
|
error_report("Wrong offset: sector_num=0x%" PRIx64
|
2007-09-21 10:09:39 +04:00
|
|
|
" total_sectors=0x%" PRIx64 "\n",
|
2007-06-18 19:01:30 +04:00
|
|
|
sector_num, bs->total_sectors);
|
2011-07-19 04:38:22 +04:00
|
|
|
return -EIO;
|
2007-06-18 19:01:30 +04:00
|
|
|
}
|
|
|
|
|
2005-04-27 01:08:00 +04:00
|
|
|
while (nb_sectors > 0) {
|
2011-07-12 15:56:28 +04:00
|
|
|
extent = find_extent(s, sector_num, extent);
|
|
|
|
if (!extent) {
|
|
|
|
return -EIO;
|
|
|
|
}
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 10:39:10 +04:00
|
|
|
extent_begin_sector = extent->end_sector - extent->sectors;
|
|
|
|
extent_relative_sector_num = sector_num - extent_begin_sector;
|
|
|
|
index_in_cluster = extent_relative_sector_num % extent->cluster_sectors;
|
|
|
|
n = extent->cluster_sectors - index_in_cluster;
|
|
|
|
if (n > nb_sectors) {
|
|
|
|
n = nb_sectors;
|
|
|
|
}
|
|
|
|
ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9,
|
|
|
|
!(extent->compressed || zeroed),
|
|
|
|
&cluster_offset,
|
|
|
|
index_in_cluster, index_in_cluster + n);
|
2011-08-12 19:19:31 +04:00
|
|
|
if (extent->compressed) {
|
2013-05-02 06:25:22 +04:00
|
|
|
if (ret == VMDK_OK) {
|
2011-08-12 19:19:31 +04:00
|
|
|
/* Refuse write to allocated cluster for streamOptimized */
|
2013-10-11 11:43:22 +04:00
|
|
|
error_report("Could not write to allocated cluster"
|
|
|
|
" for streamOptimized");
|
2011-08-12 19:19:31 +04:00
|
|
|
return -EIO;
|
|
|
|
} else {
|
|
|
|
/* allocate */
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 10:39:10 +04:00
|
|
|
ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9,
|
|
|
|
true, &cluster_offset, 0, 0);
|
2011-08-12 19:19:31 +04:00
|
|
|
}
|
|
|
|
}
|
2013-05-02 06:25:27 +04:00
|
|
|
if (ret == VMDK_ERROR) {
|
2011-07-12 15:56:35 +04:00
|
|
|
return -EINVAL;
|
2011-07-12 15:56:28 +04:00
|
|
|
}
|
2013-05-02 06:25:27 +04:00
|
|
|
if (zeroed) {
|
|
|
|
/* Do zeroed write, buf is ignored */
|
|
|
|
if (extent->has_zero_grain &&
|
|
|
|
index_in_cluster == 0 &&
|
|
|
|
n >= extent->cluster_sectors) {
|
|
|
|
n = extent->cluster_sectors;
|
|
|
|
if (!zero_dry_run) {
|
|
|
|
/* update L2 tables */
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 10:39:10 +04:00
|
|
|
if (vmdk_L2update(extent, &m_data, VMDK_GTE_ZEROED)
|
|
|
|
!= VMDK_OK) {
|
2013-05-02 06:25:27 +04:00
|
|
|
return -EIO;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
return -ENOTSUP;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
ret = vmdk_write_extent(extent,
|
|
|
|
cluster_offset, index_in_cluster * 512,
|
|
|
|
buf, n, sector_num);
|
|
|
|
if (ret) {
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
if (m_data.valid) {
|
|
|
|
/* update L2 tables */
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 10:39:10 +04:00
|
|
|
if (vmdk_L2update(extent, &m_data,
|
|
|
|
cluster_offset >> BDRV_SECTOR_BITS)
|
|
|
|
!= VMDK_OK) {
|
2013-05-02 06:25:27 +04:00
|
|
|
return -EIO;
|
|
|
|
}
|
2011-07-12 15:56:28 +04:00
|
|
|
}
|
2007-06-18 19:01:30 +04:00
|
|
|
}
|
2005-04-27 01:08:00 +04:00
|
|
|
nb_sectors -= n;
|
|
|
|
sector_num += n;
|
|
|
|
buf += n * 512;
|
2007-01-25 00:05:24 +03:00
|
|
|
|
2011-07-12 15:56:38 +04:00
|
|
|
/* update CID on the first write every time the virtual disk is
|
|
|
|
* opened */
|
2011-07-12 15:56:34 +04:00
|
|
|
if (!s->cid_updated) {
|
2014-12-04 02:28:29 +03:00
|
|
|
ret = vmdk_write_cid(bs, g_random_int());
|
2011-10-26 14:25:25 +04:00
|
|
|
if (ret < 0) {
|
|
|
|
return ret;
|
|
|
|
}
|
2011-07-12 15:56:34 +04:00
|
|
|
s->cid_updated = true;
|
2007-01-25 00:05:24 +03:00
|
|
|
}
|
2005-04-27 01:08:00 +04:00
|
|
|
}
|
|
|
|
return 0;
|
2004-08-02 01:59:26 +04:00
|
|
|
}
|
|
|
|
|
2011-10-20 15:16:23 +04:00
|
|
|
static coroutine_fn int vmdk_co_write(BlockDriverState *bs, int64_t sector_num,
|
|
|
|
const uint8_t *buf, int nb_sectors)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
BDRVVmdkState *s = bs->opaque;
|
|
|
|
qemu_co_mutex_lock(&s->lock);
|
2013-05-02 06:25:27 +04:00
|
|
|
ret = vmdk_write(bs, sector_num, buf, nb_sectors, false, false);
|
|
|
|
qemu_co_mutex_unlock(&s->lock);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2014-05-06 17:08:44 +04:00
|
|
|
static int vmdk_write_compressed(BlockDriverState *bs,
|
|
|
|
int64_t sector_num,
|
|
|
|
const uint8_t *buf,
|
|
|
|
int nb_sectors)
|
|
|
|
{
|
|
|
|
BDRVVmdkState *s = bs->opaque;
|
|
|
|
if (s->num_extents == 1 && s->extents[0].compressed) {
|
|
|
|
return vmdk_write(bs, sector_num, buf, nb_sectors, false, false);
|
|
|
|
} else {
|
|
|
|
return -ENOTSUP;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-05-02 06:25:27 +04:00
|
|
|
static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs,
|
|
|
|
int64_t sector_num,
|
2013-10-24 14:06:51 +04:00
|
|
|
int nb_sectors,
|
|
|
|
BdrvRequestFlags flags)
|
2013-05-02 06:25:27 +04:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
BDRVVmdkState *s = bs->opaque;
|
|
|
|
qemu_co_mutex_lock(&s->lock);
|
2013-08-01 14:12:17 +04:00
|
|
|
/* write zeroes could fail if sectors not aligned to cluster, test it with
|
|
|
|
* dry_run == true before really updating image */
|
2013-05-02 06:25:27 +04:00
|
|
|
ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, true);
|
|
|
|
if (!ret) {
|
|
|
|
ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, false);
|
|
|
|
}
|
2011-10-20 15:16:23 +04:00
|
|
|
qemu_co_mutex_unlock(&s->lock);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2011-08-12 19:19:32 +04:00
|
|
|
static int vmdk_create_extent(const char *filename, int64_t filesize,
|
2013-12-20 05:48:48 +04:00
|
|
|
bool flat, bool compress, bool zeroed_grain,
|
qemu-img create: add 'nocow' option
Add 'nocow' option so that users could have a chance to set NOCOW flag to
newly created files. It's useful on btrfs file system to enhance performance.
Btrfs has low performance when hosting VM images, even more when the guest
in those VM are also using btrfs as file system. One way to mitigate this bad
performance is to turn off COW attributes on VM files. Generally, there are
two ways to turn off NOCOW on btrfs: a) by mounting fs with nodatacow, then
all newly created files will be NOCOW. b) per file. Add the NOCOW file
attribute. It could only be done to empty or new files.
This patch tries the second way, according to the option, it could add NOCOW
per file.
For most block drivers, since the create file step is in raw-posix.c, so we
can do setting NOCOW flag ioctl in raw-posix.c only.
But there are some exceptions, like block/vpc.c and block/vdi.c, they are
creating file by calling qemu_open directly. For them, do the same setting
NOCOW flag ioctl work in them separately.
[Fixed up 082.out due to the new 'nocow' creation option
--Stefan]
Signed-off-by: Chunyan Liu <cyliu@suse.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-06-30 10:29:58 +04:00
|
|
|
QemuOpts *opts, Error **errp)
|
2005-07-02 18:02:54 +04:00
|
|
|
{
|
2011-07-19 04:45:23 +04:00
|
|
|
int ret, i;
|
2013-12-20 05:48:48 +04:00
|
|
|
BlockDriverState *bs = NULL;
|
2005-07-02 18:02:54 +04:00
|
|
|
VMDK4Header header;
|
2014-05-28 07:38:58 +04:00
|
|
|
Error *local_err = NULL;
|
2013-12-20 05:48:48 +04:00
|
|
|
uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count;
|
|
|
|
uint32_t *gd_buf = NULL;
|
|
|
|
int gd_buf_size;
|
2009-05-18 18:42:10 +04:00
|
|
|
|
qemu-img create: add 'nocow' option
Add 'nocow' option so that users could have a chance to set NOCOW flag to
newly created files. It's useful on btrfs file system to enhance performance.
Btrfs has low performance when hosting VM images, even more when the guest
in those VM are also using btrfs as file system. One way to mitigate this bad
performance is to turn off COW attributes on VM files. Generally, there are
two ways to turn off NOCOW on btrfs: a) by mounting fs with nodatacow, then
all newly created files will be NOCOW. b) per file. Add the NOCOW file
attribute. It could only be done to empty or new files.
This patch tries the second way, according to the option, it could add NOCOW
per file.
For most block drivers, since the create file step is in raw-posix.c, so we
can do setting NOCOW flag ioctl in raw-posix.c only.
But there are some exceptions, like block/vpc.c and block/vdi.c, they are
creating file by calling qemu_open directly. For them, do the same setting
NOCOW flag ioctl work in them separately.
[Fixed up 082.out due to the new 'nocow' creation option
--Stefan]
Signed-off-by: Chunyan Liu <cyliu@suse.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-06-30 10:29:58 +04:00
|
|
|
ret = bdrv_create_file(filename, opts, &local_err);
|
2013-12-20 05:48:48 +04:00
|
|
|
if (ret < 0) {
|
|
|
|
error_propagate(errp, local_err);
|
|
|
|
goto exit;
|
2009-05-18 18:42:10 +04:00
|
|
|
}
|
2013-12-20 05:48:48 +04:00
|
|
|
|
2014-02-18 21:33:07 +04:00
|
|
|
assert(bs == NULL);
|
|
|
|
ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
|
|
|
|
NULL, &local_err);
|
2013-12-20 05:48:48 +04:00
|
|
|
if (ret < 0) {
|
|
|
|
error_propagate(errp, local_err);
|
|
|
|
goto exit;
|
|
|
|
}
|
|
|
|
|
2011-07-19 04:45:23 +04:00
|
|
|
if (flat) {
|
2013-12-20 05:48:48 +04:00
|
|
|
ret = bdrv_truncate(bs, filesize);
|
2011-07-19 04:45:23 +04:00
|
|
|
if (ret < 0) {
|
2014-02-12 23:46:24 +04:00
|
|
|
error_setg_errno(errp, -ret, "Could not truncate file");
|
2011-07-19 04:45:23 +04:00
|
|
|
}
|
|
|
|
goto exit;
|
2007-01-25 00:05:24 +03:00
|
|
|
}
|
2005-07-02 18:02:54 +04:00
|
|
|
magic = cpu_to_be32(VMDK4_MAGIC);
|
|
|
|
memset(&header, 0, sizeof(header));
|
2013-05-02 06:25:24 +04:00
|
|
|
header.version = zeroed_grain ? 2 : 1;
|
2013-05-02 06:25:25 +04:00
|
|
|
header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT
|
2013-05-02 06:25:24 +04:00
|
|
|
| (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0)
|
|
|
|
| (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0);
|
2011-08-12 19:19:32 +04:00
|
|
|
header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0;
|
2013-12-20 05:48:48 +04:00
|
|
|
header.capacity = filesize / BDRV_SECTOR_SIZE;
|
2011-05-25 02:46:55 +04:00
|
|
|
header.granularity = 128;
|
2013-12-20 05:48:48 +04:00
|
|
|
header.num_gtes_per_gt = BDRV_SECTOR_SIZE;
|
2005-07-02 18:02:54 +04:00
|
|
|
|
2013-12-20 05:48:48 +04:00
|
|
|
grains = DIV_ROUND_UP(filesize / BDRV_SECTOR_SIZE, header.granularity);
|
|
|
|
gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t),
|
|
|
|
BDRV_SECTOR_SIZE);
|
|
|
|
gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt);
|
|
|
|
gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE);
|
2005-07-02 18:02:54 +04:00
|
|
|
|
|
|
|
header.desc_offset = 1;
|
|
|
|
header.desc_size = 20;
|
|
|
|
header.rgd_offset = header.desc_offset + header.desc_size;
|
2013-12-20 05:48:48 +04:00
|
|
|
header.gd_offset = header.rgd_offset + gd_sectors + (gt_size * gt_count);
|
2005-07-02 18:02:54 +04:00
|
|
|
header.grain_offset =
|
2013-12-20 05:48:48 +04:00
|
|
|
ROUND_UP(header.gd_offset + gd_sectors + (gt_size * gt_count),
|
|
|
|
header.granularity);
|
2011-05-25 02:46:55 +04:00
|
|
|
/* swap endianness for all header fields */
|
|
|
|
header.version = cpu_to_le32(header.version);
|
|
|
|
header.flags = cpu_to_le32(header.flags);
|
|
|
|
header.capacity = cpu_to_le64(header.capacity);
|
|
|
|
header.granularity = cpu_to_le64(header.granularity);
|
2013-08-06 11:44:55 +04:00
|
|
|
header.num_gtes_per_gt = cpu_to_le32(header.num_gtes_per_gt);
|
2005-07-02 18:02:54 +04:00
|
|
|
header.desc_offset = cpu_to_le64(header.desc_offset);
|
|
|
|
header.desc_size = cpu_to_le64(header.desc_size);
|
|
|
|
header.rgd_offset = cpu_to_le64(header.rgd_offset);
|
|
|
|
header.gd_offset = cpu_to_le64(header.gd_offset);
|
|
|
|
header.grain_offset = cpu_to_le64(header.grain_offset);
|
2011-08-12 19:19:32 +04:00
|
|
|
header.compressAlgorithm = cpu_to_le16(header.compressAlgorithm);
|
2005-07-02 18:02:54 +04:00
|
|
|
|
|
|
|
header.check_bytes[0] = 0xa;
|
|
|
|
header.check_bytes[1] = 0x20;
|
|
|
|
header.check_bytes[2] = 0xd;
|
|
|
|
header.check_bytes[3] = 0xa;
|
2007-09-17 12:09:54 +04:00
|
|
|
|
|
|
|
/* write all the data */
|
2013-12-20 05:48:48 +04:00
|
|
|
ret = bdrv_pwrite(bs, 0, &magic, sizeof(magic));
|
|
|
|
if (ret < 0) {
|
|
|
|
error_set(errp, QERR_IO_ERROR);
|
2010-01-20 02:56:13 +03:00
|
|
|
goto exit;
|
|
|
|
}
|
2013-12-20 05:48:48 +04:00
|
|
|
ret = bdrv_pwrite(bs, sizeof(magic), &header, sizeof(header));
|
|
|
|
if (ret < 0) {
|
|
|
|
error_set(errp, QERR_IO_ERROR);
|
2010-01-20 02:56:13 +03:00
|
|
|
goto exit;
|
|
|
|
}
|
2005-07-02 18:02:54 +04:00
|
|
|
|
2013-12-20 05:48:48 +04:00
|
|
|
ret = bdrv_truncate(bs, le64_to_cpu(header.grain_offset) << 9);
|
2010-01-20 02:56:13 +03:00
|
|
|
if (ret < 0) {
|
2014-02-12 23:46:24 +04:00
|
|
|
error_setg_errno(errp, -ret, "Could not truncate file");
|
2010-01-20 02:56:13 +03:00
|
|
|
goto exit;
|
|
|
|
}
|
2005-07-02 18:02:54 +04:00
|
|
|
|
|
|
|
/* write grain directory */
|
2013-12-20 05:48:48 +04:00
|
|
|
gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE;
|
|
|
|
gd_buf = g_malloc0(gd_buf_size);
|
|
|
|
for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors;
|
2010-01-20 02:56:13 +03:00
|
|
|
i < gt_count; i++, tmp += gt_size) {
|
2013-12-20 05:48:48 +04:00
|
|
|
gd_buf[i] = cpu_to_le32(tmp);
|
|
|
|
}
|
|
|
|
ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE,
|
|
|
|
gd_buf, gd_buf_size);
|
|
|
|
if (ret < 0) {
|
|
|
|
error_set(errp, QERR_IO_ERROR);
|
|
|
|
goto exit;
|
2010-01-20 02:56:13 +03:00
|
|
|
}
|
2007-09-17 12:09:54 +04:00
|
|
|
|
2005-07-02 18:02:54 +04:00
|
|
|
/* write backup grain directory */
|
2013-12-20 05:48:48 +04:00
|
|
|
for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors;
|
2010-01-20 02:56:13 +03:00
|
|
|
i < gt_count; i++, tmp += gt_size) {
|
2013-12-20 05:48:48 +04:00
|
|
|
gd_buf[i] = cpu_to_le32(tmp);
|
|
|
|
}
|
|
|
|
ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE,
|
|
|
|
gd_buf, gd_buf_size);
|
|
|
|
if (ret < 0) {
|
|
|
|
error_set(errp, QERR_IO_ERROR);
|
|
|
|
goto exit;
|
2010-01-20 02:56:13 +03:00
|
|
|
}
|
2005-07-02 18:02:54 +04:00
|
|
|
|
2011-07-19 04:45:23 +04:00
|
|
|
ret = 0;
|
2013-12-20 05:48:48 +04:00
|
|
|
exit:
|
|
|
|
if (bs) {
|
|
|
|
bdrv_unref(bs);
|
|
|
|
}
|
|
|
|
g_free(gd_buf);
|
2011-07-19 04:45:23 +04:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int filename_decompose(const char *filename, char *path, char *prefix,
|
2013-10-11 11:43:22 +04:00
|
|
|
char *postfix, size_t buf_len, Error **errp)
|
2011-07-19 04:45:23 +04:00
|
|
|
{
|
|
|
|
const char *p, *q;
|
|
|
|
|
|
|
|
if (filename == NULL || !strlen(filename)) {
|
2013-10-11 11:43:22 +04:00
|
|
|
error_setg(errp, "No filename provided");
|
2013-05-02 06:25:22 +04:00
|
|
|
return VMDK_ERROR;
|
2011-07-19 04:45:23 +04:00
|
|
|
}
|
|
|
|
p = strrchr(filename, '/');
|
|
|
|
if (p == NULL) {
|
|
|
|
p = strrchr(filename, '\\');
|
|
|
|
}
|
|
|
|
if (p == NULL) {
|
|
|
|
p = strrchr(filename, ':');
|
|
|
|
}
|
|
|
|
if (p != NULL) {
|
|
|
|
p++;
|
|
|
|
if (p - filename >= buf_len) {
|
2013-05-02 06:25:22 +04:00
|
|
|
return VMDK_ERROR;
|
2011-07-19 04:45:23 +04:00
|
|
|
}
|
|
|
|
pstrcpy(path, p - filename + 1, filename);
|
|
|
|
} else {
|
|
|
|
p = filename;
|
|
|
|
path[0] = '\0';
|
|
|
|
}
|
|
|
|
q = strrchr(p, '.');
|
|
|
|
if (q == NULL) {
|
|
|
|
pstrcpy(prefix, buf_len, p);
|
|
|
|
postfix[0] = '\0';
|
|
|
|
} else {
|
|
|
|
if (q - p >= buf_len) {
|
2013-05-02 06:25:22 +04:00
|
|
|
return VMDK_ERROR;
|
2011-07-19 04:45:23 +04:00
|
|
|
}
|
|
|
|
pstrcpy(prefix, q - p + 1, p);
|
|
|
|
pstrcpy(postfix, buf_len, q);
|
|
|
|
}
|
2013-05-02 06:25:22 +04:00
|
|
|
return VMDK_OK;
|
2011-07-19 04:45:23 +04:00
|
|
|
}
|
|
|
|
|
2014-06-05 13:21:09 +04:00
|
|
|
static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
|
2011-07-19 04:45:23 +04:00
|
|
|
{
|
2013-12-20 05:48:48 +04:00
|
|
|
int idx = 0;
|
|
|
|
BlockDriverState *new_bs = NULL;
|
2014-05-28 07:38:58 +04:00
|
|
|
Error *local_err = NULL;
|
2013-12-03 06:41:05 +04:00
|
|
|
char *desc = NULL;
|
2011-07-19 04:45:23 +04:00
|
|
|
int64_t total_size = 0, filesize;
|
2014-06-05 13:21:09 +04:00
|
|
|
char *adapter_type = NULL;
|
|
|
|
char *backing_file = NULL;
|
|
|
|
char *fmt = NULL;
|
2011-07-19 04:45:23 +04:00
|
|
|
int flags = 0;
|
|
|
|
int ret = 0;
|
2011-08-12 19:19:32 +04:00
|
|
|
bool flat, split, compress;
|
2013-12-03 06:41:05 +04:00
|
|
|
GString *ext_desc_lines;
|
2015-01-22 16:03:26 +03:00
|
|
|
char *path = g_malloc0(PATH_MAX);
|
|
|
|
char *prefix = g_malloc0(PATH_MAX);
|
|
|
|
char *postfix = g_malloc0(PATH_MAX);
|
|
|
|
char *desc_line = g_malloc0(BUF_SIZE);
|
|
|
|
char *ext_filename = g_malloc0(PATH_MAX);
|
|
|
|
char *desc_filename = g_malloc0(PATH_MAX);
|
2011-07-19 04:45:23 +04:00
|
|
|
const int64_t split_size = 0x80000000; /* VMDK has constant split size */
|
|
|
|
const char *desc_extent_line;
|
2015-01-22 16:03:26 +03:00
|
|
|
char *parent_desc_line = g_malloc0(BUF_SIZE);
|
2011-07-19 04:45:23 +04:00
|
|
|
uint32_t parent_cid = 0xffffffff;
|
2013-01-30 03:26:52 +04:00
|
|
|
uint32_t number_heads = 16;
|
2013-05-02 06:25:24 +04:00
|
|
|
bool zeroed_grain = false;
|
2013-12-20 05:48:48 +04:00
|
|
|
uint32_t desc_offset = 0, desc_len;
|
2011-07-19 04:45:23 +04:00
|
|
|
const char desc_template[] =
|
|
|
|
"# Disk DescriptorFile\n"
|
|
|
|
"version=1\n"
|
2014-04-17 14:43:53 +04:00
|
|
|
"CID=%" PRIx32 "\n"
|
|
|
|
"parentCID=%" PRIx32 "\n"
|
2011-07-19 04:45:23 +04:00
|
|
|
"createType=\"%s\"\n"
|
|
|
|
"%s"
|
|
|
|
"\n"
|
|
|
|
"# Extent description\n"
|
|
|
|
"%s"
|
|
|
|
"\n"
|
|
|
|
"# The Disk Data Base\n"
|
|
|
|
"#DDB\n"
|
|
|
|
"\n"
|
|
|
|
"ddb.virtualHWVersion = \"%d\"\n"
|
|
|
|
"ddb.geometry.cylinders = \"%" PRId64 "\"\n"
|
2014-04-17 07:34:37 +04:00
|
|
|
"ddb.geometry.heads = \"%" PRIu32 "\"\n"
|
2011-07-19 04:45:23 +04:00
|
|
|
"ddb.geometry.sectors = \"63\"\n"
|
2013-01-30 03:26:52 +04:00
|
|
|
"ddb.adapterType = \"%s\"\n";
|
2011-07-19 04:45:23 +04:00
|
|
|
|
2013-12-03 06:41:05 +04:00
|
|
|
ext_desc_lines = g_string_new(NULL);
|
|
|
|
|
2013-10-11 11:43:22 +04:00
|
|
|
if (filename_decompose(filename, path, prefix, postfix, PATH_MAX, errp)) {
|
2013-12-03 06:41:05 +04:00
|
|
|
ret = -EINVAL;
|
|
|
|
goto exit;
|
2011-07-19 04:45:23 +04:00
|
|
|
}
|
|
|
|
/* Read out options */
|
2014-09-10 13:05:45 +04:00
|
|
|
total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
|
|
|
|
BDRV_SECTOR_SIZE);
|
2014-06-05 13:21:09 +04:00
|
|
|
adapter_type = qemu_opt_get_del(opts, BLOCK_OPT_ADAPTER_TYPE);
|
|
|
|
backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
|
|
|
|
if (qemu_opt_get_bool_del(opts, BLOCK_OPT_COMPAT6, false)) {
|
|
|
|
flags |= BLOCK_FLAG_COMPAT6;
|
|
|
|
}
|
|
|
|
fmt = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT);
|
|
|
|
if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ZEROED_GRAIN, false)) {
|
|
|
|
zeroed_grain = true;
|
2011-07-19 04:45:23 +04:00
|
|
|
}
|
2014-06-05 13:21:09 +04:00
|
|
|
|
2013-01-30 03:26:52 +04:00
|
|
|
if (!adapter_type) {
|
2014-06-05 13:21:09 +04:00
|
|
|
adapter_type = g_strdup("ide");
|
2013-01-30 03:26:52 +04:00
|
|
|
} else if (strcmp(adapter_type, "ide") &&
|
|
|
|
strcmp(adapter_type, "buslogic") &&
|
|
|
|
strcmp(adapter_type, "lsilogic") &&
|
|
|
|
strcmp(adapter_type, "legacyESX")) {
|
2013-10-11 11:43:22 +04:00
|
|
|
error_setg(errp, "Unknown adapter type: '%s'", adapter_type);
|
2013-12-03 06:41:05 +04:00
|
|
|
ret = -EINVAL;
|
|
|
|
goto exit;
|
2013-01-30 03:26:52 +04:00
|
|
|
}
|
|
|
|
if (strcmp(adapter_type, "ide") != 0) {
|
|
|
|
/* that's the number of heads with which vmware operates when
|
|
|
|
creating, exporting, etc. vmdk files with a non-ide adapter type */
|
|
|
|
number_heads = 255;
|
|
|
|
}
|
2011-07-19 04:45:23 +04:00
|
|
|
if (!fmt) {
|
|
|
|
/* Default format to monolithicSparse */
|
2014-06-05 13:21:09 +04:00
|
|
|
fmt = g_strdup("monolithicSparse");
|
2011-07-19 04:45:23 +04:00
|
|
|
} else if (strcmp(fmt, "monolithicFlat") &&
|
|
|
|
strcmp(fmt, "monolithicSparse") &&
|
|
|
|
strcmp(fmt, "twoGbMaxExtentSparse") &&
|
2011-08-12 19:19:32 +04:00
|
|
|
strcmp(fmt, "twoGbMaxExtentFlat") &&
|
|
|
|
strcmp(fmt, "streamOptimized")) {
|
2013-10-11 11:43:22 +04:00
|
|
|
error_setg(errp, "Unknown subformat: '%s'", fmt);
|
2013-12-03 06:41:05 +04:00
|
|
|
ret = -EINVAL;
|
|
|
|
goto exit;
|
2011-07-19 04:45:23 +04:00
|
|
|
}
|
|
|
|
split = !(strcmp(fmt, "twoGbMaxExtentFlat") &&
|
|
|
|
strcmp(fmt, "twoGbMaxExtentSparse"));
|
|
|
|
flat = !(strcmp(fmt, "monolithicFlat") &&
|
|
|
|
strcmp(fmt, "twoGbMaxExtentFlat"));
|
2011-08-12 19:19:32 +04:00
|
|
|
compress = !strcmp(fmt, "streamOptimized");
|
2011-07-19 04:45:23 +04:00
|
|
|
if (flat) {
|
2014-04-17 07:34:37 +04:00
|
|
|
desc_extent_line = "RW %" PRId64 " FLAT \"%s\" 0\n";
|
2011-07-19 04:45:23 +04:00
|
|
|
} else {
|
2014-04-17 07:34:37 +04:00
|
|
|
desc_extent_line = "RW %" PRId64 " SPARSE \"%s\"\n";
|
2011-07-19 04:45:23 +04:00
|
|
|
}
|
|
|
|
if (flat && backing_file) {
|
2013-10-11 11:43:22 +04:00
|
|
|
error_setg(errp, "Flat image can't have backing file");
|
2013-12-03 06:41:05 +04:00
|
|
|
ret = -ENOTSUP;
|
|
|
|
goto exit;
|
2011-07-19 04:45:23 +04:00
|
|
|
}
|
2013-10-11 11:43:23 +04:00
|
|
|
if (flat && zeroed_grain) {
|
|
|
|
error_setg(errp, "Flat image can't enable zeroed grain");
|
2013-12-03 06:41:05 +04:00
|
|
|
ret = -ENOTSUP;
|
|
|
|
goto exit;
|
2013-10-11 11:43:23 +04:00
|
|
|
}
|
2011-07-19 04:45:23 +04:00
|
|
|
if (backing_file) {
|
2014-02-18 21:33:05 +04:00
|
|
|
BlockDriverState *bs = NULL;
|
2014-11-26 19:20:28 +03:00
|
|
|
char *full_backing = g_new0(char, PATH_MAX);
|
|
|
|
bdrv_get_full_backing_filename_from_filename(filename, backing_file,
|
|
|
|
full_backing, PATH_MAX,
|
|
|
|
&local_err);
|
|
|
|
if (local_err) {
|
|
|
|
g_free(full_backing);
|
|
|
|
error_propagate(errp, local_err);
|
|
|
|
ret = -ENOENT;
|
|
|
|
goto exit;
|
|
|
|
}
|
|
|
|
ret = bdrv_open(&bs, full_backing, NULL, NULL, BDRV_O_NO_BACKING, NULL,
|
2014-02-18 21:33:06 +04:00
|
|
|
errp);
|
2014-11-26 19:20:28 +03:00
|
|
|
g_free(full_backing);
|
2011-07-19 04:45:23 +04:00
|
|
|
if (ret != 0) {
|
2013-12-03 06:41:05 +04:00
|
|
|
goto exit;
|
2011-07-19 04:45:23 +04:00
|
|
|
}
|
|
|
|
if (strcmp(bs->drv->format_name, "vmdk")) {
|
2013-08-23 05:14:47 +04:00
|
|
|
bdrv_unref(bs);
|
2013-12-03 06:41:05 +04:00
|
|
|
ret = -EINVAL;
|
|
|
|
goto exit;
|
2011-07-19 04:45:23 +04:00
|
|
|
}
|
|
|
|
parent_cid = vmdk_read_cid(bs, 0);
|
2013-08-23 05:14:47 +04:00
|
|
|
bdrv_unref(bs);
|
2015-01-22 16:03:26 +03:00
|
|
|
snprintf(parent_desc_line, BUF_SIZE,
|
2013-06-26 13:24:32 +04:00
|
|
|
"parentFileNameHint=\"%s\"", backing_file);
|
2011-07-19 04:45:23 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Create extents */
|
|
|
|
filesize = total_size;
|
|
|
|
while (filesize > 0) {
|
|
|
|
int64_t size = filesize;
|
|
|
|
|
|
|
|
if (split && size > split_size) {
|
|
|
|
size = split_size;
|
|
|
|
}
|
|
|
|
if (split) {
|
2015-01-22 16:03:26 +03:00
|
|
|
snprintf(desc_filename, PATH_MAX, "%s-%c%03d%s",
|
2011-07-19 04:45:23 +04:00
|
|
|
prefix, flat ? 'f' : 's', ++idx, postfix);
|
|
|
|
} else if (flat) {
|
2015-01-22 16:03:26 +03:00
|
|
|
snprintf(desc_filename, PATH_MAX, "%s-flat%s", prefix, postfix);
|
2011-07-19 04:45:23 +04:00
|
|
|
} else {
|
2015-01-22 16:03:26 +03:00
|
|
|
snprintf(desc_filename, PATH_MAX, "%s%s", prefix, postfix);
|
2011-07-19 04:45:23 +04:00
|
|
|
}
|
2015-01-22 16:03:26 +03:00
|
|
|
snprintf(ext_filename, PATH_MAX, "%s%s", path, desc_filename);
|
2011-07-19 04:45:23 +04:00
|
|
|
|
2013-05-02 06:25:24 +04:00
|
|
|
if (vmdk_create_extent(ext_filename, size,
|
qemu-img create: add 'nocow' option
Add 'nocow' option so that users could have a chance to set NOCOW flag to
newly created files. It's useful on btrfs file system to enhance performance.
Btrfs has low performance when hosting VM images, even more when the guest
in those VM are also using btrfs as file system. One way to mitigate this bad
performance is to turn off COW attributes on VM files. Generally, there are
two ways to turn off NOCOW on btrfs: a) by mounting fs with nodatacow, then
all newly created files will be NOCOW. b) per file. Add the NOCOW file
attribute. It could only be done to empty or new files.
This patch tries the second way, according to the option, it could add NOCOW
per file.
For most block drivers, since the create file step is in raw-posix.c, so we
can do setting NOCOW flag ioctl in raw-posix.c only.
But there are some exceptions, like block/vpc.c and block/vdi.c, they are
creating file by calling qemu_open directly. For them, do the same setting
NOCOW flag ioctl work in them separately.
[Fixed up 082.out due to the new 'nocow' creation option
--Stefan]
Signed-off-by: Chunyan Liu <cyliu@suse.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-06-30 10:29:58 +04:00
|
|
|
flat, compress, zeroed_grain, opts, errp)) {
|
2013-12-03 06:41:05 +04:00
|
|
|
ret = -EINVAL;
|
|
|
|
goto exit;
|
2011-07-19 04:45:23 +04:00
|
|
|
}
|
|
|
|
filesize -= size;
|
|
|
|
|
|
|
|
/* Format description line */
|
2015-01-22 16:03:26 +03:00
|
|
|
snprintf(desc_line, BUF_SIZE,
|
2013-12-20 05:48:48 +04:00
|
|
|
desc_extent_line, size / BDRV_SECTOR_SIZE, desc_filename);
|
2013-12-03 06:41:05 +04:00
|
|
|
g_string_append(ext_desc_lines, desc_line);
|
2011-07-19 04:45:23 +04:00
|
|
|
}
|
|
|
|
/* generate descriptor file */
|
2013-12-03 06:41:05 +04:00
|
|
|
desc = g_strdup_printf(desc_template,
|
2014-12-04 02:28:29 +03:00
|
|
|
g_random_int(),
|
2013-12-03 06:41:05 +04:00
|
|
|
parent_cid,
|
|
|
|
fmt,
|
|
|
|
parent_desc_line,
|
|
|
|
ext_desc_lines->str,
|
|
|
|
(flags & BLOCK_FLAG_COMPAT6 ? 6 : 4),
|
2013-12-20 05:48:48 +04:00
|
|
|
total_size /
|
|
|
|
(int64_t)(63 * number_heads * BDRV_SECTOR_SIZE),
|
2013-12-03 06:41:05 +04:00
|
|
|
number_heads,
|
|
|
|
adapter_type);
|
2013-12-20 05:48:48 +04:00
|
|
|
desc_len = strlen(desc);
|
|
|
|
/* the descriptor offset = 0x200 */
|
|
|
|
if (!split && !flat) {
|
|
|
|
desc_offset = 0x200;
|
2011-07-19 04:45:23 +04:00
|
|
|
} else {
|
2014-06-05 13:21:11 +04:00
|
|
|
ret = bdrv_create_file(filename, opts, &local_err);
|
2013-12-20 05:48:48 +04:00
|
|
|
if (ret < 0) {
|
2014-05-28 07:38:58 +04:00
|
|
|
error_propagate(errp, local_err);
|
2013-12-20 05:48:48 +04:00
|
|
|
goto exit;
|
|
|
|
}
|
2011-07-19 04:45:23 +04:00
|
|
|
}
|
2014-02-18 21:33:07 +04:00
|
|
|
assert(new_bs == NULL);
|
|
|
|
ret = bdrv_open(&new_bs, filename, NULL, NULL,
|
|
|
|
BDRV_O_RDWR | BDRV_O_PROTOCOL, NULL, &local_err);
|
2013-12-20 05:48:48 +04:00
|
|
|
if (ret < 0) {
|
2014-05-28 07:38:58 +04:00
|
|
|
error_propagate(errp, local_err);
|
2013-12-03 06:41:05 +04:00
|
|
|
goto exit;
|
2011-07-19 04:45:23 +04:00
|
|
|
}
|
2013-12-20 05:48:48 +04:00
|
|
|
ret = bdrv_pwrite(new_bs, desc_offset, desc, desc_len);
|
|
|
|
if (ret < 0) {
|
|
|
|
error_setg_errno(errp, -ret, "Could not write description");
|
|
|
|
goto exit;
|
2011-07-19 04:45:23 +04:00
|
|
|
}
|
2013-12-20 05:48:48 +04:00
|
|
|
/* bdrv_pwrite write padding zeros to align to sector, we don't need that
|
|
|
|
* for description file */
|
|
|
|
if (desc_offset == 0) {
|
|
|
|
ret = bdrv_truncate(new_bs, desc_len);
|
|
|
|
if (ret < 0) {
|
2014-02-12 23:46:24 +04:00
|
|
|
error_setg_errno(errp, -ret, "Could not truncate file");
|
2013-12-20 05:48:48 +04:00
|
|
|
}
|
2010-01-20 02:56:13 +03:00
|
|
|
}
|
2013-12-03 06:41:05 +04:00
|
|
|
exit:
|
2013-12-20 05:48:48 +04:00
|
|
|
if (new_bs) {
|
|
|
|
bdrv_unref(new_bs);
|
|
|
|
}
|
2014-06-05 13:21:09 +04:00
|
|
|
g_free(adapter_type);
|
|
|
|
g_free(backing_file);
|
|
|
|
g_free(fmt);
|
2013-12-03 06:41:05 +04:00
|
|
|
g_free(desc);
|
2015-01-22 16:03:26 +03:00
|
|
|
g_free(path);
|
|
|
|
g_free(prefix);
|
|
|
|
g_free(postfix);
|
|
|
|
g_free(desc_line);
|
|
|
|
g_free(ext_filename);
|
|
|
|
g_free(desc_filename);
|
|
|
|
g_free(parent_desc_line);
|
2013-12-03 06:41:05 +04:00
|
|
|
g_string_free(ext_desc_lines, true);
|
2010-01-20 02:56:13 +03:00
|
|
|
return ret;
|
2005-07-02 18:02:54 +04:00
|
|
|
}
|
|
|
|
|
2004-09-18 23:32:11 +04:00
|
|
|
static void vmdk_close(BlockDriverState *bs)
|
2004-08-02 01:59:26 +04:00
|
|
|
{
|
2011-11-22 19:50:27 +04:00
|
|
|
BDRVVmdkState *s = bs->opaque;
|
|
|
|
|
2011-07-12 15:56:28 +04:00
|
|
|
vmdk_free_extents(bs);
|
2013-10-31 06:06:23 +04:00
|
|
|
g_free(s->create_type);
|
2011-11-22 19:50:27 +04:00
|
|
|
|
|
|
|
migrate_del_blocker(s->migration_blocker);
|
|
|
|
error_free(s->migration_blocker);
|
2004-08-02 01:59:26 +04:00
|
|
|
}
|
|
|
|
|
2011-10-20 15:16:24 +04:00
|
|
|
static coroutine_fn int vmdk_co_flush(BlockDriverState *bs)
|
2006-06-04 15:39:07 +04:00
|
|
|
{
|
2011-07-12 15:56:33 +04:00
|
|
|
BDRVVmdkState *s = bs->opaque;
|
2012-03-12 21:26:01 +04:00
|
|
|
int i, err;
|
|
|
|
int ret = 0;
|
2011-07-12 15:56:33 +04:00
|
|
|
|
|
|
|
for (i = 0; i < s->num_extents; i++) {
|
2011-10-20 15:16:24 +04:00
|
|
|
err = bdrv_co_flush(s->extents[i].file);
|
2011-07-12 15:56:33 +04:00
|
|
|
if (err < 0) {
|
|
|
|
ret = err;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return ret;
|
2006-06-04 15:39:07 +04:00
|
|
|
}
|
|
|
|
|
2011-07-12 15:56:39 +04:00
|
|
|
static int64_t vmdk_get_allocated_file_size(BlockDriverState *bs)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
int64_t ret = 0;
|
|
|
|
int64_t r;
|
|
|
|
BDRVVmdkState *s = bs->opaque;
|
|
|
|
|
|
|
|
ret = bdrv_get_allocated_file_size(bs->file);
|
|
|
|
if (ret < 0) {
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
for (i = 0; i < s->num_extents; i++) {
|
|
|
|
if (s->extents[i].file == bs->file) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
r = bdrv_get_allocated_file_size(s->extents[i].file);
|
|
|
|
if (r < 0) {
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
ret += r;
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
2009-05-18 18:42:10 +04:00
|
|
|
|
2013-07-01 07:33:17 +04:00
|
|
|
static int vmdk_has_zero_init(BlockDriverState *bs)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
BDRVVmdkState *s = bs->opaque;
|
|
|
|
|
|
|
|
/* If has a flat extent and its underlying storage doesn't have zero init,
|
|
|
|
* return 0. */
|
|
|
|
for (i = 0; i < s->num_extents; i++) {
|
|
|
|
if (s->extents[i].flat) {
|
|
|
|
if (!bdrv_has_zero_init(s->extents[i].file)) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2013-10-31 06:06:23 +04:00
|
|
|
static ImageInfo *vmdk_get_extent_info(VmdkExtent *extent)
|
|
|
|
{
|
|
|
|
ImageInfo *info = g_new0(ImageInfo, 1);
|
|
|
|
|
|
|
|
*info = (ImageInfo){
|
|
|
|
.filename = g_strdup(extent->file->filename),
|
|
|
|
.format = g_strdup(extent->type),
|
|
|
|
.virtual_size = extent->sectors * BDRV_SECTOR_SIZE,
|
|
|
|
.compressed = extent->compressed,
|
|
|
|
.has_compressed = extent->compressed,
|
|
|
|
.cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE,
|
|
|
|
.has_cluster_size = !extent->flat,
|
|
|
|
};
|
|
|
|
|
|
|
|
return info;
|
|
|
|
}
|
|
|
|
|
2014-01-29 12:34:16 +04:00
|
|
|
static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result,
|
|
|
|
BdrvCheckMode fix)
|
|
|
|
{
|
|
|
|
BDRVVmdkState *s = bs->opaque;
|
|
|
|
VmdkExtent *extent = NULL;
|
|
|
|
int64_t sector_num = 0;
|
2014-06-26 15:23:22 +04:00
|
|
|
int64_t total_sectors = bdrv_nb_sectors(bs);
|
2014-01-29 12:34:16 +04:00
|
|
|
int ret;
|
|
|
|
uint64_t cluster_offset;
|
|
|
|
|
|
|
|
if (fix) {
|
|
|
|
return -ENOTSUP;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (;;) {
|
|
|
|
if (sector_num >= total_sectors) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
extent = find_extent(s, sector_num, extent);
|
|
|
|
if (!extent) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"ERROR: could not find extent for sector %" PRId64 "\n",
|
|
|
|
sector_num);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
ret = get_cluster_offset(bs, extent, NULL,
|
|
|
|
sector_num << BDRV_SECTOR_BITS,
|
vmdk: Optimize cluster allocation
This drops the unnecessary bdrv_truncate() from, and also improves,
cluster allocation code path.
Before, when we need a new cluster, get_cluster_offset truncates the
image to bdrv_getlength() + cluster_size, and returns the offset of
added area, i.e. the image length before truncating.
This is not efficient, so it's now rewritten as:
- Save the extent file length when opening.
- When allocating cluster, use the saved length as cluster offset.
- Don't truncate image, because we'll anyway write data there: just
write any data at the EOF position, in descending priority:
* New user data (cluster allocation happens in a write request).
* Filling data in the beginning and/or ending of the new cluster, if
not covered by user data: either backing file content (COW), or
zero for standalone images.
One major benifit of this change is, on host mounted NFS images, even
over a fast network, ftruncate is slow (see the example below). This
change significantly speeds up cluster allocation. Comparing by
converting a cirros image (296M) to VMDK on an NFS mount point, over
1Gbe LAN:
$ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk
Before:
real 0m21.796s
user 0m0.130s
sys 0m0.483s
After:
real 0m2.017s
user 0m0.047s
sys 0m0.190s
We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and
get a little more documentation in function comments.
Tested that this passes qemu-iotests for all VMDK subformats.
Signed-off-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2014-07-30 10:39:10 +04:00
|
|
|
false, &cluster_offset, 0, 0);
|
2014-01-29 12:34:16 +04:00
|
|
|
if (ret == VMDK_ERROR) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"ERROR: could not get cluster_offset for sector %"
|
|
|
|
PRId64 "\n", sector_num);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (ret == VMDK_OK && cluster_offset >= bdrv_getlength(extent->file)) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"ERROR: cluster offset for sector %"
|
|
|
|
PRId64 " points after EOF\n", sector_num);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
sector_num += extent->cluster_sectors;
|
|
|
|
}
|
|
|
|
|
|
|
|
result->corruptions++;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-10-31 06:06:23 +04:00
|
|
|
static ImageInfoSpecific *vmdk_get_specific_info(BlockDriverState *bs)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
BDRVVmdkState *s = bs->opaque;
|
|
|
|
ImageInfoSpecific *spec_info = g_new0(ImageInfoSpecific, 1);
|
|
|
|
ImageInfoList **next;
|
|
|
|
|
|
|
|
*spec_info = (ImageInfoSpecific){
|
|
|
|
.kind = IMAGE_INFO_SPECIFIC_KIND_VMDK,
|
|
|
|
{
|
|
|
|
.vmdk = g_new0(ImageInfoSpecificVmdk, 1),
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
|
|
|
*spec_info->vmdk = (ImageInfoSpecificVmdk) {
|
|
|
|
.create_type = g_strdup(s->create_type),
|
|
|
|
.cid = s->cid,
|
|
|
|
.parent_cid = s->parent_cid,
|
|
|
|
};
|
|
|
|
|
|
|
|
next = &spec_info->vmdk->extents;
|
|
|
|
for (i = 0; i < s->num_extents; i++) {
|
|
|
|
*next = g_new0(ImageInfoList, 1);
|
|
|
|
(*next)->value = vmdk_get_extent_info(&s->extents[i]);
|
|
|
|
(*next)->next = NULL;
|
|
|
|
next = &(*next)->next;
|
|
|
|
}
|
|
|
|
|
|
|
|
return spec_info;
|
|
|
|
}
|
|
|
|
|
2014-11-14 07:09:21 +03:00
|
|
|
static bool vmdk_extents_type_eq(const VmdkExtent *a, const VmdkExtent *b)
|
|
|
|
{
|
|
|
|
return a->flat == b->flat &&
|
|
|
|
a->compressed == b->compressed &&
|
|
|
|
(a->flat || a->cluster_sectors == b->cluster_sectors);
|
|
|
|
}
|
|
|
|
|
2014-05-06 17:08:45 +04:00
|
|
|
static int vmdk_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
BDRVVmdkState *s = bs->opaque;
|
|
|
|
assert(s->num_extents);
|
2014-11-14 07:09:21 +03:00
|
|
|
|
2014-05-06 17:08:45 +04:00
|
|
|
/* See if we have multiple extents but they have different cases */
|
|
|
|
for (i = 1; i < s->num_extents; i++) {
|
2014-11-14 07:09:21 +03:00
|
|
|
if (!vmdk_extents_type_eq(&s->extents[0], &s->extents[i])) {
|
2014-05-06 17:08:45 +04:00
|
|
|
return -ENOTSUP;
|
|
|
|
}
|
|
|
|
}
|
2014-11-14 07:09:21 +03:00
|
|
|
bdi->needs_compressed_writes = s->extents[0].compressed;
|
|
|
|
if (!s->extents[0].flat) {
|
|
|
|
bdi->cluster_size = s->extents[0].cluster_sectors << BDRV_SECTOR_BITS;
|
|
|
|
}
|
2014-05-06 17:08:45 +04:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2014-05-08 18:34:54 +04:00
|
|
|
static void vmdk_detach_aio_context(BlockDriverState *bs)
|
|
|
|
{
|
|
|
|
BDRVVmdkState *s = bs->opaque;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < s->num_extents; i++) {
|
|
|
|
bdrv_detach_aio_context(s->extents[i].file);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void vmdk_attach_aio_context(BlockDriverState *bs,
|
|
|
|
AioContext *new_context)
|
|
|
|
{
|
|
|
|
BDRVVmdkState *s = bs->opaque;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < s->num_extents; i++) {
|
|
|
|
bdrv_attach_aio_context(s->extents[i].file, new_context);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-06-05 13:21:09 +04:00
|
|
|
static QemuOptsList vmdk_create_opts = {
|
|
|
|
.name = "vmdk-create-opts",
|
|
|
|
.head = QTAILQ_HEAD_INITIALIZER(vmdk_create_opts.head),
|
|
|
|
.desc = {
|
|
|
|
{
|
|
|
|
.name = BLOCK_OPT_SIZE,
|
|
|
|
.type = QEMU_OPT_SIZE,
|
|
|
|
.help = "Virtual disk size"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = BLOCK_OPT_ADAPTER_TYPE,
|
|
|
|
.type = QEMU_OPT_STRING,
|
|
|
|
.help = "Virtual adapter type, can be one of "
|
|
|
|
"ide (default), lsilogic, buslogic or legacyESX"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = BLOCK_OPT_BACKING_FILE,
|
|
|
|
.type = QEMU_OPT_STRING,
|
|
|
|
.help = "File name of a base image"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = BLOCK_OPT_COMPAT6,
|
|
|
|
.type = QEMU_OPT_BOOL,
|
|
|
|
.help = "VMDK version 6 image",
|
|
|
|
.def_value_str = "off"
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = BLOCK_OPT_SUBFMT,
|
|
|
|
.type = QEMU_OPT_STRING,
|
|
|
|
.help =
|
|
|
|
"VMDK flat extent format, can be one of "
|
|
|
|
"{monolithicSparse (default) | monolithicFlat | twoGbMaxExtentSparse | twoGbMaxExtentFlat | streamOptimized} "
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = BLOCK_OPT_ZEROED_GRAIN,
|
|
|
|
.type = QEMU_OPT_BOOL,
|
|
|
|
.help = "Enable efficient zero writes "
|
|
|
|
"using the zeroed-grain GTE feature"
|
|
|
|
},
|
|
|
|
{ /* end of list */ }
|
|
|
|
}
|
2009-05-18 18:42:10 +04:00
|
|
|
};
|
|
|
|
|
2009-05-10 02:03:42 +04:00
|
|
|
static BlockDriver bdrv_vmdk = {
|
2013-07-01 07:33:17 +04:00
|
|
|
.format_name = "vmdk",
|
|
|
|
.instance_size = sizeof(BDRVVmdkState),
|
|
|
|
.bdrv_probe = vmdk_probe,
|
|
|
|
.bdrv_open = vmdk_open,
|
2014-01-29 12:34:16 +04:00
|
|
|
.bdrv_check = vmdk_check,
|
2013-07-01 07:33:17 +04:00
|
|
|
.bdrv_reopen_prepare = vmdk_reopen_prepare,
|
|
|
|
.bdrv_read = vmdk_co_read,
|
|
|
|
.bdrv_write = vmdk_co_write,
|
2014-05-06 17:08:44 +04:00
|
|
|
.bdrv_write_compressed = vmdk_write_compressed,
|
2013-07-01 07:33:17 +04:00
|
|
|
.bdrv_co_write_zeroes = vmdk_co_write_zeroes,
|
|
|
|
.bdrv_close = vmdk_close,
|
2014-06-05 13:21:11 +04:00
|
|
|
.bdrv_create = vmdk_create,
|
2013-07-01 07:33:17 +04:00
|
|
|
.bdrv_co_flush_to_disk = vmdk_co_flush,
|
2013-09-04 21:00:28 +04:00
|
|
|
.bdrv_co_get_block_status = vmdk_co_get_block_status,
|
2013-07-01 07:33:17 +04:00
|
|
|
.bdrv_get_allocated_file_size = vmdk_get_allocated_file_size,
|
|
|
|
.bdrv_has_zero_init = vmdk_has_zero_init,
|
2013-10-31 06:06:23 +04:00
|
|
|
.bdrv_get_specific_info = vmdk_get_specific_info,
|
2013-12-11 22:26:16 +04:00
|
|
|
.bdrv_refresh_limits = vmdk_refresh_limits,
|
2014-05-06 17:08:45 +04:00
|
|
|
.bdrv_get_info = vmdk_get_info,
|
2014-05-08 18:34:54 +04:00
|
|
|
.bdrv_detach_aio_context = vmdk_detach_aio_context,
|
|
|
|
.bdrv_attach_aio_context = vmdk_attach_aio_context,
|
2013-07-01 07:33:17 +04:00
|
|
|
|
2014-06-04 17:09:35 +04:00
|
|
|
.supports_backing = true,
|
2014-06-05 13:21:09 +04:00
|
|
|
.create_opts = &vmdk_create_opts,
|
2004-08-02 01:59:26 +04:00
|
|
|
};
|
2009-05-10 02:03:42 +04:00
|
|
|
|
|
|
|
static void bdrv_vmdk_init(void)
|
|
|
|
{
|
|
|
|
bdrv_register(&bdrv_vmdk);
|
|
|
|
}
|
|
|
|
|
|
|
|
block_init(bdrv_vmdk_init);
|