From 65a9bb25d6b7a4bb41edb102fa0363d81800b417 Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Thu, 26 Jun 2014 13:23:17 +0200 Subject: [PATCH 01/59] block: New bdrv_nb_sectors() A call to retrieve the image size converts between bytes and sectors several times: * BlockDriver method bdrv_getlength() returns bytes. * refresh_total_sectors() converts to sectors, rounding up, and stores in total_sectors. * bdrv_getlength() converts total_sectors back to bytes (now rounded up to a multiple of the sector size). * Callers wanting sectors rather bytes convert it right back. Example: bdrv_get_geometry(). bdrv_nb_sectors() provides a way to omit the last two conversions. It's exactly bdrv_getlength() with the conversion to bytes omitted. It's functionally like bdrv_get_geometry() without its odd error handling. Reimplement bdrv_getlength() and bdrv_get_geometry() on top of bdrv_nb_sectors(). The next patches will convert some users of bdrv_getlength() to bdrv_nb_sectors(). Signed-off-by: Markus Armbruster Reviewed-by: Eric Blake Reviewed-by: Benoit Canet Signed-off-by: Stefan Hajnoczi --- block.c | 29 +++++++++++++++++++---------- include/block/block.h | 1 + 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/block.c b/block.c index 8cf519ba3a..1239bb3661 100644 --- a/block.c +++ b/block.c @@ -701,6 +701,7 @@ static int find_image_format(BlockDriverState *bs, const char *filename, /** * Set the current 'total_sectors' value + * Return 0 on success, -errno on error. */ static int refresh_total_sectors(BlockDriverState *bs, int64_t hint) { @@ -3536,11 +3537,12 @@ int64_t bdrv_get_allocated_file_size(BlockDriverState *bs) } /** - * Length of a file in bytes. Return < 0 if error or unknown. + * Return number of sectors on success, -errno on error. */ -int64_t bdrv_getlength(BlockDriverState *bs) +int64_t bdrv_nb_sectors(BlockDriverState *bs) { BlockDriver *drv = bs->drv; + if (!drv) return -ENOMEDIUM; @@ -3550,19 +3552,26 @@ int64_t bdrv_getlength(BlockDriverState *bs) return ret; } } - return bs->total_sectors * BDRV_SECTOR_SIZE; + return bs->total_sectors; +} + +/** + * Return length in bytes on success, -errno on error. + * The length is always a multiple of BDRV_SECTOR_SIZE. + */ +int64_t bdrv_getlength(BlockDriverState *bs) +{ + int64_t ret = bdrv_nb_sectors(bs); + + return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE; } /* return 0 as number of sectors if no device present or error */ void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr) { - int64_t length; - length = bdrv_getlength(bs); - if (length < 0) - length = 0; - else - length = length >> BDRV_SECTOR_BITS; - *nb_sectors_ptr = length; + int64_t nb_sectors = bdrv_nb_sectors(bs); + + *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors; } void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error, diff --git a/include/block/block.h b/include/block/block.h index f08471d7e1..d4c816d738 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -275,6 +275,7 @@ BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, const char *backing_file); int bdrv_get_backing_file_depth(BlockDriverState *bs); int bdrv_truncate(BlockDriverState *bs, int64_t offset); +int64_t bdrv_nb_sectors(BlockDriverState *bs); int64_t bdrv_getlength(BlockDriverState *bs); int64_t bdrv_get_allocated_file_size(BlockDriverState *bs); void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr); From d32f7c101b37c03f222008331db3b1d09493a4a3 Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Thu, 26 Jun 2014 13:23:18 +0200 Subject: [PATCH 02/59] block: Use bdrv_nb_sectors() in bdrv_make_zero() Instead of bdrv_getlength(). Variable target_size is initially in bytes, then changes meaning to sectors. Ugh. Replace by target_sectors. Signed-off-by: Markus Armbruster Reviewed-by: Eric Blake Reviewed-by: Benoit Canet Signed-off-by: Stefan Hajnoczi --- block.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/block.c b/block.c index 1239bb3661..75553fb2b6 100644 --- a/block.c +++ b/block.c @@ -2828,18 +2828,16 @@ int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, */ int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) { - int64_t target_size; - int64_t ret, nb_sectors, sector_num = 0; + int64_t target_sectors, ret, nb_sectors, sector_num = 0; int n; - target_size = bdrv_getlength(bs); - if (target_size < 0) { - return target_size; + target_sectors = bdrv_nb_sectors(bs); + if (target_sectors < 0) { + return target_sectors; } - target_size /= BDRV_SECTOR_SIZE; for (;;) { - nb_sectors = target_size - sector_num; + nb_sectors = target_sectors - sector_num; if (nb_sectors <= 0) { return 0; } From 4049082c4b489959850743ec22d1fec125752038 Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Thu, 26 Jun 2014 13:23:19 +0200 Subject: [PATCH 03/59] block: Use bdrv_nb_sectors() in bdrv_aligned_preadv() Instead of bdrv_getlength(). Eliminate variable len. Signed-off-by: Markus Armbruster Reviewed-by: Eric Blake Reviewed-by: Benoit Canet Signed-off-by: Stefan Hajnoczi --- block.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/block.c b/block.c index 75553fb2b6..5b7e05385c 100644 --- a/block.c +++ b/block.c @@ -3055,15 +3055,14 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); } else { /* Read zeros after EOF of growable BDSes */ - int64_t len, total_sectors, max_nb_sectors; + int64_t total_sectors, max_nb_sectors; - len = bdrv_getlength(bs); - if (len < 0) { - ret = len; + total_sectors = bdrv_nb_sectors(bs); + if (total_sectors < 0) { + ret = total_sectors; goto out; } - total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE); max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), align >> BDRV_SECTOR_BITS); if (max_nb_sectors > 0) { From 30a7f2fc9178080a0ce22c12767ce994f882937e Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Thu, 26 Jun 2014 13:23:20 +0200 Subject: [PATCH 04/59] block: Use bdrv_nb_sectors() in bdrv_co_get_block_status() Instead of bdrv_getlength(). Replace variables length, length2 by total_sectors, nb_sectors2. Bonus: use total_sectors instead of the slightly unclean bs->total_sectors. Signed-off-by: Markus Armbruster Reviewed-by: Eric Blake Reviewed-by: Benoit Canet Signed-off-by: Stefan Hajnoczi --- block.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/block.c b/block.c index 5b7e05385c..c8ef425302 100644 --- a/block.c +++ b/block.c @@ -3951,21 +3951,21 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum) { - int64_t length; + int64_t total_sectors; int64_t n; int64_t ret, ret2; - length = bdrv_getlength(bs); - if (length < 0) { - return length; + total_sectors = bdrv_nb_sectors(bs); + if (total_sectors < 0) { + return total_sectors; } - if (sector_num >= (length >> BDRV_SECTOR_BITS)) { + if (sector_num >= total_sectors) { *pnum = 0; return 0; } - n = bs->total_sectors - sector_num; + n = total_sectors - sector_num; if (n < nb_sectors) { nb_sectors = n; } @@ -4000,8 +4000,8 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, ret |= BDRV_BLOCK_ZERO; } else if (bs->backing_hd) { BlockDriverState *bs2 = bs->backing_hd; - int64_t length2 = bdrv_getlength(bs2); - if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) { + int64_t nb_sectors2 = bdrv_nb_sectors(bs2); + if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { ret |= BDRV_BLOCK_ZERO; } } From 43716fa8058a9095b9c1f9be4397dbaf7c521ee0 Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Thu, 26 Jun 2014 13:23:21 +0200 Subject: [PATCH 05/59] block: Use bdrv_nb_sectors() in img_convert() Instead of bdrv_getlength(). Replace variable output_length by output_sectors. Signed-off-by: Markus Armbruster Reviewed-by: Eric Blake Reviewed-by: Benoit Canet Signed-off-by: Stefan Hajnoczi --- qemu-img.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/qemu-img.c b/qemu-img.c index d4518e724f..9832a7ee12 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -1469,13 +1469,13 @@ static int img_convert(int argc, char **argv) buf = qemu_blockalign(out_bs, bufsectors * BDRV_SECTOR_SIZE); if (skip_create) { - int64_t output_length = bdrv_getlength(out_bs); - if (output_length < 0) { + int64_t output_sectors = bdrv_nb_sectors(out_bs); + if (output_sectors < 0) { error_report("unable to get output image length: %s\n", - strerror(-output_length)); + strerror(-output_sectors)); ret = -1; goto out; - } else if (output_length < total_sectors << BDRV_SECTOR_BITS) { + } else if (output_sectors < total_sectors) { error_report("output file is smaller than input file"); ret = -1; goto out; From 57322b7811a9136507742bb7ebc2215dd8c2e911 Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Thu, 26 Jun 2014 13:23:22 +0200 Subject: [PATCH 06/59] block: Use bdrv_nb_sectors() where sectors, not bytes are wanted Instead of bdrv_getlength(). Aside: a few of these callers don't handle errors. I didn't investigate whether they should. Signed-off-by: Markus Armbruster Reviewed-by: Eric Blake Reviewed-by: Benoit Canet Signed-off-by: Stefan Hajnoczi --- block-migration.c | 9 ++++----- block.c | 3 +-- block/qcow2.c | 2 +- block/vmdk.c | 5 ++--- 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/block-migration.c b/block-migration.c index 73cdd07e8c..ba3ed36f77 100644 --- a/block-migration.c +++ b/block-migration.c @@ -186,7 +186,7 @@ static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector) { int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK; - if ((sector << BDRV_SECTOR_BITS) < bdrv_getlength(bmds->bs)) { + if (sector < bdrv_nb_sectors(bmds->bs)) { return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] & (1UL << (chunk % (sizeof(unsigned long) * 8)))); } else { @@ -223,8 +223,7 @@ static void alloc_aio_bitmap(BlkMigDevState *bmds) BlockDriverState *bs = bmds->bs; int64_t bitmap_size; - bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) + - BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1; + bitmap_size = bdrv_nb_sectors(bs) + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1; bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8; bmds->aio_bitmap = g_malloc0(bitmap_size); @@ -350,7 +349,7 @@ static void init_blk_migration_it(void *opaque, BlockDriverState *bs) int64_t sectors; if (!bdrv_is_read_only(bs)) { - sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS; + sectors = bdrv_nb_sectors(bs); if (sectors <= 0) { return; } @@ -799,7 +798,7 @@ static int block_load(QEMUFile *f, void *opaque, int version_id) if (bs != bs_prev) { bs_prev = bs; - total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS; + total_sectors = bdrv_nb_sectors(bs); if (total_sectors <= 0) { error_report("Error getting length of block device %s", device_name); diff --git a/block.c b/block.c index c8ef425302..2fcc2f738a 100644 --- a/block.c +++ b/block.c @@ -5283,13 +5283,12 @@ BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity, granularity >>= BDRV_SECTOR_BITS; assert(granularity); - bitmap_size = bdrv_getlength(bs); + bitmap_size = bdrv_nb_sectors(bs); if (bitmap_size < 0) { error_setg_errno(errp, -bitmap_size, "could not get length of device"); errno = -bitmap_size; return NULL; } - bitmap_size >>= BDRV_SECTOR_BITS; bitmap = g_malloc0(sizeof(BdrvDirtyBitmap)); bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1); QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list); diff --git a/block/qcow2.c b/block/qcow2.c index 1e3ab6bd02..ad93824c6c 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1557,7 +1557,7 @@ static int preallocate(BlockDriverState *bs) int ret; QCowL2Meta *meta; - nb_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS; + nb_sectors = bdrv_nb_sectors(bs); offset = 0; while (nb_sectors) { diff --git a/block/vmdk.c b/block/vmdk.c index 0517bbaf91..33d9bec02b 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -669,8 +669,7 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) { l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9; } - if (bdrv_getlength(file) < - le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE) { + if (bdrv_nb_sectors(file) < le64_to_cpu(header.grain_offset)) { error_setg(errp, "File truncated, expecting at least %" PRId64 " bytes", (int64_t)(le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE)); @@ -1999,7 +1998,7 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, BDRVVmdkState *s = bs->opaque; VmdkExtent *extent = NULL; int64_t sector_num = 0; - int64_t total_sectors = bdrv_getlength(bs) / BDRV_SECTOR_SIZE; + int64_t total_sectors = bdrv_nb_sectors(bs); int ret; uint64_t cluster_offset; From 75d3d21f9ed9fc66a56d4d7cf491d504541fc6ac Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Thu, 26 Jun 2014 13:23:23 +0200 Subject: [PATCH 07/59] block: Drop superfluous aligning of bdrv_getlength()'s value It returns a multiple of the sector size. Signed-off-by: Markus Armbruster Reviewed-by: Max Reitz Reviewed-by: Benoit Canet Signed-off-by: Stefan Hajnoczi --- block.c | 1 - block/qcow2.c | 1 - 2 files changed, 2 deletions(-) diff --git a/block.c b/block.c index 2fcc2f738a..1af74e601b 100644 --- a/block.c +++ b/block.c @@ -1314,7 +1314,6 @@ int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp) error_setg_errno(errp, -total_size, "Could not get image size"); goto out; } - total_size &= BDRV_SECTOR_MASK; /* Create the temporary image */ ret = get_tmp_filename(tmp_filename, PATH_MAX + 1); diff --git a/block/qcow2.c b/block/qcow2.c index ad93824c6c..964ab93b08 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1947,7 +1947,6 @@ static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num, /* align end of file to a sector boundary to ease reading with sector based I/Os */ cluster_offset = bdrv_getlength(bs->file); - cluster_offset = (cluster_offset + 511) & ~511; bdrv_truncate(bs->file, cluster_offset); return 0; } From d739f1c41094141268a33fa8fc6715fbab714fbe Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Thu, 26 Jun 2014 13:23:24 +0200 Subject: [PATCH 08/59] qemu-img: Make img_convert() get image size just once per image Chiefly so I don't have to do the error checking in quadruplicate in the next commit. Moreover, replacing the frequently updated bs_sectors by an array assigned just once makes the code easier to understand. Signed-off-by: Markus Armbruster Reviewed-by: Max Reitz Reviewed-by: Benoit Canet Signed-off-by: Stefan Hajnoczi --- qemu-img.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/qemu-img.c b/qemu-img.c index 9832a7ee12..b8329974aa 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -1182,7 +1182,7 @@ static int img_convert(int argc, char **argv) BlockDriver *drv, *proto_drv; BlockDriverState **bs = NULL, *out_bs = NULL; int64_t total_sectors, nb_sectors, sector_num, bs_offset; - uint64_t bs_sectors; + uint64_t *bs_sectors = NULL; uint8_t * buf = NULL; size_t bufsectors = IO_BUF_SIZE / BDRV_SECTOR_SIZE; const uint8_t *buf1; @@ -1323,7 +1323,8 @@ static int img_convert(int argc, char **argv) qemu_progress_print(0, 100); - bs = g_malloc0(bs_n * sizeof(BlockDriverState *)); + bs = g_new0(BlockDriverState *, bs_n); + bs_sectors = g_new(uint64_t, bs_n); total_sectors = 0; for (bs_i = 0; bs_i < bs_n; bs_i++) { @@ -1337,8 +1338,8 @@ static int img_convert(int argc, char **argv) ret = -1; goto out; } - bdrv_get_geometry(bs[bs_i], &bs_sectors); - total_sectors += bs_sectors; + bdrv_get_geometry(bs[bs_i], &bs_sectors[bs_i]); + total_sectors += bs_sectors[bs_i]; } if (sn_opts) { @@ -1456,7 +1457,6 @@ static int img_convert(int argc, char **argv) bs_i = 0; bs_offset = 0; - bdrv_get_geometry(bs[0], &bs_sectors); /* increase bufsectors from the default 4096 (2M) if opt_transfer_length * or discard_alignment of the out_bs is greater. Limit to 32768 (16MB) @@ -1523,19 +1523,19 @@ static int img_convert(int argc, char **argv) buf2 = buf; while (remainder > 0) { int nlow; - while (bs_num == bs_sectors) { + while (bs_num == bs_sectors[bs_i]) { + bs_offset += bs_sectors[bs_i]; bs_i++; assert (bs_i < bs_n); - bs_offset += bs_sectors; - bdrv_get_geometry(bs[bs_i], &bs_sectors); bs_num = 0; /* printf("changing part: sector_num=%" PRId64 ", " "bs_i=%d, bs_offset=%" PRId64 ", bs_sectors=%" PRId64 - "\n", sector_num, bs_i, bs_offset, bs_sectors); */ + "\n", sector_num, bs_i, bs_offset, bs_sectors[bs_i]); */ } - assert (bs_num < bs_sectors); + assert (bs_num < bs_sectors[bs_i]); - nlow = (remainder > bs_sectors - bs_num) ? bs_sectors - bs_num : remainder; + nlow = remainder > bs_sectors[bs_i] - bs_num + ? bs_sectors[bs_i] - bs_num : remainder; ret = bdrv_read(bs[bs_i], bs_num, buf2, nlow); if (ret < 0) { @@ -1596,14 +1596,13 @@ restart: break; } - while (sector_num - bs_offset >= bs_sectors) { + while (sector_num - bs_offset >= bs_sectors[bs_i]) { + bs_offset += bs_sectors[bs_i]; bs_i ++; assert (bs_i < bs_n); - bs_offset += bs_sectors; - bdrv_get_geometry(bs[bs_i], &bs_sectors); /* printf("changing part: sector_num=%" PRId64 ", bs_i=%d, " "bs_offset=%" PRId64 ", bs_sectors=%" PRId64 "\n", - sector_num, bs_i, bs_offset, bs_sectors); */ + sector_num, bs_i, bs_offset, bs_sectors[bs_i]); */ } if ((out_baseimg || has_zero_init) && @@ -1656,7 +1655,7 @@ restart: } } - n = MIN(n, bs_sectors - (sector_num - bs_offset)); + n = MIN(n, bs_sectors[bs_i] - (sector_num - bs_offset)); sectors_read += n; if (count_allocated_sectors) { @@ -1714,6 +1713,7 @@ out: } g_free(bs); } + g_free(bs_sectors); fail_getopt: g_free(options); From 52bf1e722d996d1accfc35e29283f172d003d9b2 Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Thu, 26 Jun 2014 13:23:25 +0200 Subject: [PATCH 09/59] block: Avoid bdrv_get_geometry() where errors should be detected bdrv_get_geometry() hides errors. Use bdrv_nb_sectors() or bdrv_getlength() instead where that's obviously inappropriate. Signed-off-by: Markus Armbruster Reviewed-by: Eric Blake Reviewed-by: Max Reitz Reviewed-by: Benoit Canet Signed-off-by: Stefan Hajnoczi --- block.c | 11 ++++++--- block/qapi.c | 14 ++++++++---- qemu-img.c | 64 ++++++++++++++++++++++++++++++++++++++++------------ 3 files changed, 68 insertions(+), 21 deletions(-) diff --git a/block.c b/block.c index 1af74e601b..7cafc49612 100644 --- a/block.c +++ b/block.c @@ -5595,7 +5595,7 @@ void bdrv_img_create(const char *filename, const char *fmt, if (size == -1) { if (backing_file) { BlockDriverState *bs; - uint64_t size; + int64_t size; int back_flags; /* backing files always opened read-only */ @@ -5613,8 +5613,13 @@ void bdrv_img_create(const char *filename, const char *fmt, local_err = NULL; goto out; } - bdrv_get_geometry(bs, &size); - size *= 512; + size = bdrv_getlength(bs); + if (size < 0) { + error_setg_errno(errp, -size, "Could not get size of '%s'", + backing_file); + bdrv_unref(bs); + goto out; + } qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size); diff --git a/block/qapi.c b/block/qapi.c index f44f6b4012..28ebb62c8e 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -165,19 +165,25 @@ void bdrv_query_image_info(BlockDriverState *bs, ImageInfo **p_info, Error **errp) { - uint64_t total_sectors; + int64_t size; const char *backing_filename; char backing_filename2[1024]; BlockDriverInfo bdi; int ret; Error *err = NULL; - ImageInfo *info = g_new0(ImageInfo, 1); + ImageInfo *info; - bdrv_get_geometry(bs, &total_sectors); + size = bdrv_getlength(bs); + if (size < 0) { + error_setg_errno(errp, -size, "Can't get size of device '%s'", + bdrv_get_device_name(bs)); + return; + } + info = g_new0(ImageInfo, 1); info->filename = g_strdup(bs->filename); info->format = g_strdup(bdrv_get_format_name(bs)); - info->virtual_size = total_sectors * 512; + info->virtual_size = size; info->actual_size = bdrv_get_allocated_file_size(bs); info->has_actual_size = info->actual_size >= 0; if (bdrv_is_encrypted(bs)) { diff --git a/qemu-img.c b/qemu-img.c index b8329974aa..9566faeb56 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -955,7 +955,6 @@ static int img_compare(int argc, char **argv) int64_t sector_num = 0; int64_t nb_sectors; int c, pnum; - uint64_t bs_sectors; uint64_t progress_base; for (;;) { @@ -1017,10 +1016,20 @@ static int img_compare(int argc, char **argv) buf1 = qemu_blockalign(bs1, IO_BUF_SIZE); buf2 = qemu_blockalign(bs2, IO_BUF_SIZE); - bdrv_get_geometry(bs1, &bs_sectors); - total_sectors1 = bs_sectors; - bdrv_get_geometry(bs2, &bs_sectors); - total_sectors2 = bs_sectors; + total_sectors1 = bdrv_nb_sectors(bs1); + if (total_sectors1 < 0) { + error_report("Can't get size of %s: %s", + filename1, strerror(-total_sectors1)); + ret = 4; + goto out; + } + total_sectors2 = bdrv_nb_sectors(bs2); + if (total_sectors2 < 0) { + error_report("Can't get size of %s: %s", + filename2, strerror(-total_sectors2)); + ret = 4; + goto out; + } total_sectors = MIN(total_sectors1, total_sectors2); progress_base = MAX(total_sectors1, total_sectors2); @@ -1182,7 +1191,7 @@ static int img_convert(int argc, char **argv) BlockDriver *drv, *proto_drv; BlockDriverState **bs = NULL, *out_bs = NULL; int64_t total_sectors, nb_sectors, sector_num, bs_offset; - uint64_t *bs_sectors = NULL; + int64_t *bs_sectors = NULL; uint8_t * buf = NULL; size_t bufsectors = IO_BUF_SIZE / BDRV_SECTOR_SIZE; const uint8_t *buf1; @@ -1324,7 +1333,7 @@ static int img_convert(int argc, char **argv) qemu_progress_print(0, 100); bs = g_new0(BlockDriverState *, bs_n); - bs_sectors = g_new(uint64_t, bs_n); + bs_sectors = g_new(int64_t, bs_n); total_sectors = 0; for (bs_i = 0; bs_i < bs_n; bs_i++) { @@ -1338,7 +1347,13 @@ static int img_convert(int argc, char **argv) ret = -1; goto out; } - bdrv_get_geometry(bs[bs_i], &bs_sectors[bs_i]); + bs_sectors[bs_i] = bdrv_nb_sectors(bs[bs_i]); + if (bs_sectors[bs_i] < 0) { + error_report("Could not get size of %s: %s", + argv[optind + bs_i], strerror(-bs_sectors[bs_i])); + ret = -1; + goto out; + } total_sectors += bs_sectors[bs_i]; } @@ -2413,9 +2428,9 @@ static int img_rebase(int argc, char **argv) * the image is the same as the original one at any time. */ if (!unsafe) { - uint64_t num_sectors; - uint64_t old_backing_num_sectors; - uint64_t new_backing_num_sectors = 0; + int64_t num_sectors; + int64_t old_backing_num_sectors; + int64_t new_backing_num_sectors = 0; uint64_t sector; int n; uint8_t * buf_old; @@ -2425,10 +2440,31 @@ static int img_rebase(int argc, char **argv) buf_old = qemu_blockalign(bs, IO_BUF_SIZE); buf_new = qemu_blockalign(bs, IO_BUF_SIZE); - bdrv_get_geometry(bs, &num_sectors); - bdrv_get_geometry(bs_old_backing, &old_backing_num_sectors); + num_sectors = bdrv_nb_sectors(bs); + if (num_sectors < 0) { + error_report("Could not get size of '%s': %s", + filename, strerror(-num_sectors)); + ret = -1; + goto out; + } + old_backing_num_sectors = bdrv_nb_sectors(bs_old_backing); + if (old_backing_num_sectors < 0) { + char backing_name[1024]; + + bdrv_get_backing_filename(bs, backing_name, sizeof(backing_name)); + error_report("Could not get size of '%s': %s", + backing_name, strerror(-old_backing_num_sectors)); + ret = -1; + goto out; + } if (bs_new_backing) { - bdrv_get_geometry(bs_new_backing, &new_backing_num_sectors); + new_backing_num_sectors = bdrv_nb_sectors(bs_new_backing); + if (new_backing_num_sectors < 0) { + error_report("Could not get size of '%s': %s", + out_baseimg, strerror(-new_backing_num_sectors)); + ret = -1; + goto out; + } } if (num_sectors != 0) { From 8e436ec1f307a01882fd9166477667370c9dbfff Mon Sep 17 00:00:00 2001 From: Maria Kustova Date: Mon, 21 Jul 2014 15:16:33 +0400 Subject: [PATCH 10/59] docs: Make the recommendation for the backing file name position a requirement The current version of the qcow2 specification recommends to save the backing file name in the end of the first cluster. It follows that the backing file name can be saved somewhere in the image, but the first cluster, which contradicts the current QEMU implementation. The patch makes the backing file name required to be placed after the header extensions in the first image cluster. Signed-off-by: Maria Kustova Signed-off-by: Kevin Wolf --- docs/specs/qcow2.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/specs/qcow2.txt b/docs/specs/qcow2.txt index 3f713a6447..cfbc8b070c 100644 --- a/docs/specs/qcow2.txt +++ b/docs/specs/qcow2.txt @@ -135,12 +135,12 @@ be stored. Each extension has a structure like the following: Unless stated otherwise, each header extension type shall appear at most once in the same image. -The remaining space between the end of the header extension area and the end of -the first cluster can be used for the backing file name. It is not allowed to -store other data here, so that an implementation can safely modify the header -and add extensions without harming data of compatible features that it -doesn't support. Compatible features that need space for additional data can -use a header extension. +If the image has a backing file then the backing file name should be stored in +the remaining space between the end of the header extension area and the end of +the first cluster. It is not allowed to store other data here, so that an +implementation can safely modify the header and add extensions without harming +data of compatible features that it doesn't support. Compatible features that +need space for additional data can use a header extension. == Feature name table == From 8efc936336ea7e572b117ed7049ce0136952c003 Mon Sep 17 00:00:00 2001 From: Hu Tao Date: Thu, 26 Jun 2014 17:34:50 +0800 Subject: [PATCH 11/59] configure: explicitly state version requirements to devel packages Signed-off-by: Hu Tao Signed-off-by: Stefan Hajnoczi --- configure | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/configure b/configure index f7685b565c..5715fe6deb 100755 --- a/configure +++ b/configure @@ -3086,7 +3086,8 @@ if test "$glusterfs" != "no" ; then fi else if test "$glusterfs" = "yes" ; then - feature_not_found "GlusterFS backend support" "Install glusterfs-api devel" + feature_not_found "GlusterFS backend support" \ + "Install glusterfs-api devel >= 3" fi glusterfs="no" fi @@ -3531,7 +3532,8 @@ EOF spice_server_version=$($pkg_config --modversion spice-server) else if test "$spice" = "yes" ; then - feature_not_found "spice" "Install spice-server and spice-protocol devel" + feature_not_found "spice" \ + "Install spice-server(>=0.12.0) and spice-protocol(>=0.12.3) devel" fi spice="no" fi @@ -3562,7 +3564,7 @@ EOF smartcard_nss="yes" else if test "$smartcard_nss" = "yes"; then - feature_not_found "nss" + feature_not_found "nss" "Install nss devel >= 3.12.8" fi smartcard_nss="no" fi @@ -3578,7 +3580,7 @@ if test "$libusb" != "no" ; then libs_softmmu="$libs_softmmu $libusb_libs" else if test "$libusb" = "yes"; then - feature_not_found "libusb" "Install libusb devel" + feature_not_found "libusb" "Install libusb devel >= 1.0.13" fi libusb="no" fi @@ -4003,7 +4005,7 @@ if test "$libnfs" != "no" ; then LIBS="$LIBS $libnfs_libs" else if test "$libnfs" = "yes" ; then - feature_not_found "libnfs" + feature_not_found "libnfs" "Install libnfs devel >= 1.9.3" fi libnfs="no" fi From 8cced121436a3298e5866dbfaec91cd475ad59cf Mon Sep 17 00:00:00 2001 From: "Gonglei (Arei)" Date: Mon, 28 Jul 2014 06:03:45 +0000 Subject: [PATCH 12/59] xen_disk: fix possible null-ptr dereference Signed-off-by: Gonglei Signed-off-by: Stefan Hajnoczi --- hw/block/xen_disk.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/block/xen_disk.c b/hw/block/xen_disk.c index aed5b5b3e9..a221d0bfca 100644 --- a/hw/block/xen_disk.c +++ b/hw/block/xen_disk.c @@ -589,6 +589,7 @@ static int blk_send_response_one(struct ioreq *ioreq) break; default: dst = NULL; + return 0; } memcpy(dst, &resp, sizeof(resp)); blkdev->rings.common.rsp_prod_pvt++; From ef558696b5c688a8a3bef4ab8f6b27937cc24c89 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 23 Jul 2014 12:55:32 +0100 Subject: [PATCH 13/59] docs/multiple-iothreads.txt: add documentation on IOThread programming This document explains how IOThreads and the main loop are related, especially how to write code that can run in an IOThread. Currently only virtio-blk-data-plane uses these techniques. The next obvious target is virtio-scsi; there has also been work on virtio-net. Signed-off-by: Stefan Hajnoczi Reviewed-by: Eric Blake --- docs/multiple-iothreads.txt | 134 ++++++++++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) create mode 100644 docs/multiple-iothreads.txt diff --git a/docs/multiple-iothreads.txt b/docs/multiple-iothreads.txt new file mode 100644 index 0000000000..40b8419916 --- /dev/null +++ b/docs/multiple-iothreads.txt @@ -0,0 +1,134 @@ +Copyright (c) 2014 Red Hat Inc. + +This work is licensed under the terms of the GNU GPL, version 2 or later. See +the COPYING file in the top-level directory. + + +This document explains the IOThread feature and how to write code that runs +outside the QEMU global mutex. + +The main loop and IOThreads +--------------------------- +QEMU is an event-driven program that can do several things at once using an +event loop. The VNC server and the QMP monitor are both processed from the +same event loop, which monitors their file descriptors until they become +readable and then invokes a callback. + +The default event loop is called the main loop (see main-loop.c). It is +possible to create additional event loop threads using -object +iothread,id=my-iothread. + +Side note: The main loop and IOThread are both event loops but their code is +not shared completely. Sometimes it is useful to remember that although they +are conceptually similar they are currently not interchangeable. + +Why IOThreads are useful +------------------------ +IOThreads allow the user to control the placement of work. The main loop is a +scalability bottleneck on hosts with many CPUs. Work can be spread across +several IOThreads instead of just one main loop. When set up correctly this +can improve I/O latency and reduce jitter seen by the guest. + +The main loop is also deeply associated with the QEMU global mutex, which is a +scalability bottleneck in itself. vCPU threads and the main loop use the QEMU +global mutex to serialize execution of QEMU code. This mutex is necessary +because a lot of QEMU's code historically was not thread-safe. + +The fact that all I/O processing is done in a single main loop and that the +QEMU global mutex is contended by all vCPU threads and the main loop explain +why it is desirable to place work into IOThreads. + +The experimental virtio-blk data-plane implementation has been benchmarked and +shows these effects: +ftp://public.dhe.ibm.com/linux/pdfs/KVM_Virtualized_IO_Performance_Paper.pdf + +How to program for IOThreads +---------------------------- +The main difference between legacy code and new code that can run in an +IOThread is dealing explicitly with the event loop object, AioContext +(see include/block/aio.h). Code that only works in the main loop +implicitly uses the main loop's AioContext. Code that supports running +in IOThreads must be aware of its AioContext. + +AioContext supports the following services: + * File descriptor monitoring (read/write/error on POSIX hosts) + * Event notifiers (inter-thread signalling) + * Timers + * Bottom Halves (BH) deferred callbacks + +There are several old APIs that use the main loop AioContext: + * LEGACY qemu_aio_set_fd_handler() - monitor a file descriptor + * LEGACY qemu_aio_set_event_notifier() - monitor an event notifier + * LEGACY timer_new_ms() - create a timer + * LEGACY qemu_bh_new() - create a BH + * LEGACY qemu_aio_wait() - run an event loop iteration + +Since they implicitly work on the main loop they cannot be used in code that +runs in an IOThread. They might cause a crash or deadlock if called from an +IOThread since the QEMU global mutex is not held. + +Instead, use the AioContext functions directly (see include/block/aio.h): + * aio_set_fd_handler() - monitor a file descriptor + * aio_set_event_notifier() - monitor an event notifier + * aio_timer_new() - create a timer + * aio_bh_new() - create a BH + * aio_poll() - run an event loop iteration + +The AioContext can be obtained from the IOThread using +iothread_get_aio_context() or for the main loop using qemu_get_aio_context(). +Code that takes an AioContext argument works both in IOThreads or the main +loop, depending on which AioContext instance the caller passes in. + +How to synchronize with an IOThread +----------------------------------- +AioContext is not thread-safe so some rules must be followed when using file +descriptors, event notifiers, timers, or BHs across threads: + +1. AioContext functions can be called safely from file descriptor, event +notifier, timer, or BH callbacks invoked by the AioContext. No locking is +necessary. + +2. Other threads wishing to access the AioContext must use +aio_context_acquire()/aio_context_release() for mutual exclusion. Once the +context is acquired no other thread can access it or run event loop iterations +in this AioContext. + +aio_context_acquire()/aio_context_release() calls may be nested. This +means you can call them if you're not sure whether #1 applies. + +There is currently no lock ordering rule if a thread needs to acquire multiple +AioContexts simultaneously. Therefore, it is only safe for code holding the +QEMU global mutex to acquire other AioContexts. + +Side note: the best way to schedule a function call across threads is to create +a BH in the target AioContext beforehand and then call qemu_bh_schedule(). No +acquire/release or locking is needed for the qemu_bh_schedule() call. But be +sure to acquire the AioContext for aio_bh_new() if necessary. + +The relationship between AioContext and the block layer +------------------------------------------------------- +The AioContext originates from the QEMU block layer because it provides a +scoped way of running event loop iterations until all work is done. This +feature is used to complete all in-flight block I/O requests (see +bdrv_drain_all()). Nowadays AioContext is a generic event loop that can be +used by any QEMU subsystem. + +The block layer has support for AioContext integrated. Each BlockDriverState +is associated with an AioContext using bdrv_set_aio_context() and +bdrv_get_aio_context(). This allows block layer code to process I/O inside the +right AioContext. Other subsystems may wish to follow a similar approach. + +Block layer code must therefore expect to run in an IOThread and avoid using +old APIs that implicitly use the main loop. See the "How to program for +IOThreads" above for information on how to do that. + +If main loop code such as a QMP function wishes to access a BlockDriverState it +must first call aio_context_acquire(bdrv_get_aio_context(bs)) to ensure the +IOThread does not run in parallel. + +Long-running jobs (usually in the form of coroutines) are best scheduled in the +BlockDriverState's AioContext to avoid the need to acquire/release around each +bdrv_*() call. Be aware that there is currently no mechanism to get notified +when bdrv_set_aio_context() moves this BlockDriverState to a different +AioContext (see bdrv_detach_aio_context()/bdrv_attach_aio_context()), so you +may need to add this if you want to support long-running jobs. From 4115dd6527fbdf49dbd1eba24ad68e0fae1e305a Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 9 Jul 2014 14:01:31 +0200 Subject: [PATCH 14/59] qmp: hide "hotplugged" device property from device-list-properties The "hotplugged" device property was not reported before commit f4eb32b590bf58c1c67570775eb78beb09964fad ("qmp: show QOM properties in device-list-properties"). Fix this difference. Signed-off-by: Stefan Hajnoczi Reviewed-by: Eric Blake --- qmp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/qmp.c b/qmp.c index 0d2553abf5..c6767c4df3 100644 --- a/qmp.c +++ b/qmp.c @@ -509,6 +509,7 @@ DevicePropertyInfoList *qmp_device_list_properties(const char *typename, if (strcmp(prop->name, "type") == 0 || strcmp(prop->name, "realized") == 0 || strcmp(prop->name, "hotpluggable") == 0 || + strcmp(prop->name, "hotplugged") == 0 || strcmp(prop->name, "parent_bus") == 0) { continue; } From ef523587da4f213ca17133a90402d0815ecf08ee Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 9 Jul 2014 14:01:32 +0200 Subject: [PATCH 15/59] qdev-monitor: include QOM properties in -device FOO, help output Update -device FOO,help to include QOM properties in addition to qdev properties. Devices are gradually adding more QOM properties that are not reflected as qdev properties. It is important to report all device properties since management tools like libvirt use this information (and device-list-properties QMP) to detect the presence of QEMU features. This patch reuses the device-list-properties QMP machinery to avoid code duplication. Reported-by: Cole Robinson Signed-off-by: Stefan Hajnoczi Reviewed-by: Eric Blake Tested-by: Cole Robinson --- qdev-monitor.c | 40 +++++++++++++++++----------------------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/qdev-monitor.c b/qdev-monitor.c index f87f3d89cd..5fe5e75a88 100644 --- a/qdev-monitor.c +++ b/qdev-monitor.c @@ -182,9 +182,10 @@ static const char *find_typename_by_alias(const char *alias) int qdev_device_help(QemuOpts *opts) { + Error *local_err = NULL; const char *driver; - Property *prop; - ObjectClass *klass; + DevicePropertyInfoList *prop_list; + DevicePropertyInfoList *prop; driver = qemu_opt_get(opts, "driver"); if (driver && is_help_option(driver)) { @@ -196,35 +197,28 @@ int qdev_device_help(QemuOpts *opts) return 0; } - klass = object_class_by_name(driver); - if (!klass) { + if (!object_class_by_name(driver)) { const char *typename = find_typename_by_alias(driver); if (typename) { driver = typename; - klass = object_class_by_name(driver); } } - if (!object_class_dynamic_cast(klass, TYPE_DEVICE)) { - return 0; + prop_list = qmp_device_list_properties(driver, &local_err); + if (!prop_list) { + error_printf("%s\n", error_get_pretty(local_err)); + error_free(local_err); + return 1; } - do { - for (prop = DEVICE_CLASS(klass)->props; prop && prop->name; prop++) { - /* - * TODO Properties without a parser are just for dirty hacks. - * qdev_prop_ptr is the only such PropertyInfo. It's marked - * for removal. This conditional should be removed along with - * it. - */ - if (!prop->info->set) { - continue; /* no way to set it, don't show */ - } - error_printf("%s.%s=%s\n", driver, prop->name, - prop->info->legacy_name ?: prop->info->name); - } - klass = object_class_get_parent(klass); - } while (klass != object_class_by_name(TYPE_DEVICE)); + + for (prop = prop_list; prop; prop = prop->next) { + error_printf("%s.%s=%s\n", driver, + prop->value->name, + prop->value->type); + } + + qapi_free_DevicePropertyInfoList(prop_list); return 1; } From a8d8a1a06c8ebf630f39bb7263bc711b8297d367 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Wed, 30 Jul 2014 14:39:09 +0800 Subject: [PATCH 16/59] qemu-iotests: Add data pattern in version3 VMDK sample image in 059 It's possible that we diverge from the specification with our implementation. Having a reference image in the test cases may detect such problems when we introduce a bug that can read what it creates, but can't handle a real VMDK. Signed-off-by: Fam Zheng Signed-off-by: Stefan Hajnoczi --- tests/qemu-iotests/059 | 4 + tests/qemu-iotests/059.out | 202 +++++++++++++++++- .../sample_images/iotest-version3.vmdk.bz2 | Bin 414 -> 4764 bytes 3 files changed, 205 insertions(+), 1 deletion(-) diff --git a/tests/qemu-iotests/059 b/tests/qemu-iotests/059 index 26a2fd3e0e..3c053c29b4 100755 --- a/tests/qemu-iotests/059 +++ b/tests/qemu-iotests/059 @@ -114,6 +114,10 @@ echo echo "=== Testing version 3 ===" _use_sample_img iotest-version3.vmdk.bz2 _img_info +for i in {0..99}; do + $QEMU_IO -r -c "read -P $(( i % 10 + 0x30 )) $(( i * 64 * 1024 * 10 + i * 512 )) 512" $TEST_IMG \ + | _filter_qemu_io +done echo echo "=== Testing 4TB monolithicFlat creation and IO ===" diff --git a/tests/qemu-iotests/059.out b/tests/qemu-iotests/059.out index eba0dedda0..0dadba658f 100644 --- a/tests/qemu-iotests/059.out +++ b/tests/qemu-iotests/059.out @@ -2056,8 +2056,208 @@ wrote 512/512 bytes at offset 10240 === Testing version 3 === image: TEST_DIR/iotest-version3.IMGFMT file format: IMGFMT -virtual size: 1.0G (1073741824 bytes) +virtual size: 16G (17179869184 bytes) cluster_size: 65536 +read 512/512 bytes at offset 0 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 655872 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 1311744 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 1967616 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 2623488 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 3279360 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 3935232 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 4591104 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 5246976 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 5902848 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 6558720 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 7214592 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 7870464 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 8526336 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 9182208 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 9838080 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 10493952 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 11149824 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 11805696 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 12461568 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 13117440 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 13773312 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 14429184 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 15085056 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 15740928 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 16396800 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 17052672 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 17708544 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 18364416 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 19020288 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 19676160 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 20332032 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 20987904 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 21643776 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 22299648 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 22955520 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 23611392 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 24267264 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 24923136 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 25579008 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 26234880 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 26890752 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 27546624 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 28202496 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 28858368 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 29514240 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 30170112 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 30825984 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 31481856 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 32137728 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 32793600 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 33449472 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 34105344 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 34761216 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 35417088 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 36072960 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 36728832 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 37384704 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 38040576 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 38696448 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 39352320 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 40008192 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 40664064 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 41319936 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 41975808 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 42631680 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 43287552 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 43943424 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 44599296 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 45255168 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 45911040 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 46566912 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 47222784 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 47878656 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 48534528 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 49190400 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 49846272 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 50502144 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 51158016 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 51813888 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 52469760 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 53125632 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 53781504 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 54437376 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 55093248 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 55749120 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 56404992 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 57060864 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 57716736 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 58372608 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 59028480 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 59684352 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 60340224 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 60996096 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 61651968 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 62307840 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 62963712 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 63619584 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 64275456 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 64931328 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) === Testing 4TB monolithicFlat creation and IO === Formatting 'TEST_DIR/iotest-version3.IMGFMT', fmt=IMGFMT size=4398046511104 diff --git a/tests/qemu-iotests/sample_images/iotest-version3.vmdk.bz2 b/tests/qemu-iotests/sample_images/iotest-version3.vmdk.bz2 index 30abf217e72d38e97b1e34a5db6add15ca3812d9..619329a246cd44f2592c034cd11d9d0afbcc1d38 100644 GIT binary patch literal 4764 zcmai0c{mde_}^x8%r$I`q+!k^)+kqPnEM!7&Pt0MQ6dV_jKYY`5yn^UE6q`^iX0}^#FD;aUSylMPq%IQv%*kK!G8>UKmWt{TJ zgwrJ>rwCsYqO2n=jnT#m-B&hI^c~s3Cl;2) zn%JPQmiNw#8e@meY}9WTrjJb66FGtAMY?77PX0Xj0%1T0uOhwyAWAf{<;_nY!3t4~ zZ~(co93x!B10f0z4gfGh0hLRkPWWX2kI8}S>k9w?I6OFD83lX5fW-$fz(JnOLG*xu zlt_y9PBvykmt_1HWjU3J30MBN5L+zPYcO3oTM=7E6vkqG zWrT&1Of4{Pju6GPBo15hr?Rr%SEjsj&evg%X|MK0H+|Q}Pbk&N_KX$AkZD_h@sG%- zi#jYv^>lmkF^@Oyk1$L(x#&sF_5+eq zjt1u2Rkw*#Rk>{;|7>?1_kAK{og?FO7hg&Z&psaixOrE@Ub3^_7Dmr<;hX6@Msh2H zir>xpPw=xe_b;>!!J{eDC!>B^`AYg$Y1zOHzn`OGM;^(Q=I)ZtdtW0zZ|kBr_Vf)5 ziHxWBz0+v+3)x>}vG%g)AIDxbteP7OrseSufhUeNh>sMh}TwnmkUdk?x- zFNZn*bNl;i|A(qX5Gup7)=>>kBPbjY z7u5sNfWsx#snib7S|X9?Qb+EFBuQ%cmy+Pa>J?Q*ai$IgN!23S0o1Fis%QsVAt*e% z_a~5gw5d@OLJ$>E;D5pRXklz@VPWb(9v~-LRB4d4#o$%v1U2z8M7Q3Gk&3vpVqyFV zR}0S1Wqo+i2c0RHRe(zQqEf9PI0+IA0qR9bp3`za&Fk0C)5?+C?~Mk2U;2*3uZ!N18B6=x+D-eBj!fq_7Uad# zy^t<0)_LQ4?1m>-EpIxQTB_G-ptE$|(^Og)(k#*c2Q8^eAcZOg=n0DeBjrQVwyYv| zcWq1Qllnb($Gi#C)pnbi#P{y;_hLykZLL@Rs#^SREVBS{V^oRRR!=jrpCD#c)i`gr zAIVMW`i{jIFCT28DxLr{87B}8eir$cjsRuO0*tb$g5PoGD>T8B1XZUZsL>8MbOVCW z+aThnmxy)&L+i$!g+X|mmu83o>>zmsg3mo`+%=9d_p{-?bjDxblMSfQ-^wH0-fj_I#@`0HB>P#6`F8 zJ9x6m-AQmNDofJM%sFRyE~O@RN&lir=-;;w&!i@{&0Uy3)NIP0w^^@IMRef#gv8c{ zpG?R4>~fE7zPcQOid}A=aqP%FSR3cklT6%A?n<5Rr*v?N#Au>GGU3D;@Pu&}kKZDWwbTSyXQg`QEXyDqw{OKOi z^P!CjRi)jX@B77~y6fH;f2yzfd;vLc9F%Gz|y}3aEz(!a$C|5fn_r6GQ@gC72O*Z3A`o6`)-fq##{d7tej3^8#=SHuA8t z#w$DTHAYACdyD0Xy5k*48DBn4L>>V!DX{6n=!Gc=yagz*bh-icYLbD)#X`FM_lIGv zQ8uc9T5azP8cd)?*zsPzvk%lZYAup1bLgnfX^TFa$&_0PkWRtzm2a5|+L*9~$(A>m zUY~3Fy^{^s=U#t$Twxp^7i`bt5&F&J!l z(~&?mS?zLd?XC4BzK1t1O6Q4F2eRYHc~30FE&)rS-?V!eciy%;WFpIK;F|6%>HvSh z2i=LMW|bJvY!#EB$)XUFnt0S(J@4W3Mb)n2o}S>pHRH2_h=OqB^v1g@&{8u+u>|)2 z8uEy|Rr1xyfQolbE0$jVl6-?KR+y?pb88sBkWMg*D!*X^NI?k+CHU%}8-itA?l}>D zo4*6-fJYGRs+Q(SAGziaxs5(UT#0A4{ zKFVEaJ;hu6HNtu&wqQYzqjxN2zS!K)ZiZhz{74p{jml>Ys8T?fS_1egXsA($R2dCu z0i@AlFDUdLRVb)F7^14oIZA>nmKoXiSk+u5UEM6v+KnM=u;3qa1vt|oXgJp8&1{9oFn|d_6iK)R^<|F5AC>Jc{;oW!`0IIXDdI+ zPunU9=L_^DN3$+mjL~u1;cLWsQky27`z|UIz8`vTpEZebM@v zpzj+c#Jk8oRj#H4$^DXy(u(8mZKuvv>$#&PjZ@`TGX-LFQg-}mmAF?{eq}D)F;~pH z|Dvpj)Wcb&4!rV?xNK>J{wwu^bAMg!*`elldvom%5TEcuWs_%*nJd}#SOuM+N8d$x ze#yBT-LC=+(&Ckx*9Cd85DK*7qJ{a92WifFlGj%$x}wx`03x(HDvnmxx_0O{pTdn5XT+Zf^64gf4WUt z+~B3XlS7~YvC~H81atC`Ln|J^B26h6Pmjw;>bWG5PQH8TUA^$C1wU{V5i`2cKZK5n zmWTfQB`yMeSN!wk+pD3%8>T7aDk5@XP2NWh!k65X-o2a{g}oN|3h48%*FV=`t}Hrf z*2x>OlJG&kf~%|Aw>QI_axK*%mliN7BSP+ygFAm;+C2Fk^>xf__MN;w6c;T2pW;*= zS?7skJ&b{7huueNbXN4qr#&3Ab_mYp*J=X9AWPn!wrO@IW_XOpU)e9#`!e&bS>NYJ zgIjbDK2aE9{N{odGe?A%8xymHw5qW2Xr>2N6pb?3YGRy4F%jb_L;6SiQ$PhfNUy$g z7&&YtF7As}Zh%R-`UuEnS)m%Gkw2`K_ZD^*iKm(w3`YBF=_+Tl-dvv4&|5<&{MOTEJfZGK z>mI1vs5$CsMB-3GU>`;4l;@r+00E*@?l;vol`RL>23^w34N8}|a^-rD(^(|z_4nhr z4#Zb$Z<;Z8!s*E{B}O4kyLh!Ru-lv>&tNjjM{aw!2V8e!#zpFN7He(UQ z6W1F6r)T;|tKQJZd3Voo!#PJW3|fU-{>rjvO}9l^%O~E)PjZ!WNTf8?LyLy(ojz#E z{WUPeeGw9Dh8gj0{j%O3c!$Nxz7=g;Ip1V^VWYe^{@QgiNEgv69R(GlD3@6Mi3)x+ zlc#noE^|$ga&$EJo0OBhDQ?VE;~!+ndSAUG+6a6bv3T62cDwMRp|SpDP)3nTU7i2E zve;vDgpYp&#zS>(oiqT4cF8A!(4X9hvtq>f?`Dp3degW&vwD&DH z*%1wwLvdZIBxzxAiI-ruv*V!NOo zvT2Un+XhIvj;$X~L-j7QmQJ@rKTGlnPpr4gT;us+RR!-Dv_%*XJ66#ond&j^NcLq_ z_WZSimYEyHpQh#lcK(a6u1ARhaBx4I_OG_~{jF$^+!m9K95(9eVP?9I^sF0rXkm&R zydFUOw)X81bL1nc`=nO=+R>GjsA%nn6T*gExOT`j8T5>ETtX;GKUsD2^P2~EOT>l~ z?Y5t7YY&!Ps$2{hea~KM2sydC7qW)CHJ8%mdiCSxQMK!I*7?-TS2gu5_+$7(V6dQ8 z!JJ;P7yQUPC2OU-acIJHLT z^xHzroK57-U`^SIzeR4B6aXntIhl^X&xp>x672z;abrEvS%UGmBZbLYT(h31nGLoX zS^Z&W+&kBz$EFHbbXpiEcT2m!T+Tm%mbdOIwmI5Q{(u=J4Gi6D?miOOthXqU2ByeG zbID<%1C9AR@4KmrD;5P*4?%k+#~K;dgMhb(tgWrhlXLp6_&3~r5w=6<$4hkAiGe_- zbD=oZFZ~;__d=id$c3-?HV2eQ-#`dF6(LHq8-VM5Yd5#}CdNzcRYd0DyD zM&2Hk*}ngIBupTb*1i2H!JOoZ~l)ZrbH%Q}oWup($ZYNuP+onz zC89&oo+5m#SRo3f$tj3j4cj-uY<9AFkA%eEJTaa)*;e;U06X$QBY(3aGcne4@Z5Q` zOld1c2c2n5eaHx1cF@GqGfd88M9D~UB914v;WOL96@WSu-<9};viD>nCB_lZ=L5MA zWAhh{E!lb{@J3L6J+%};gkaKNrNrxivB)qiB85a>b>(iaNAFyd9Z|7XRWs!&h*2TY z2uM_*TOC2Lp=5yL%rudbN4;JP^)svmH_P%A-P+35?p=v$3M>tU*);6x0S=pmRj{{| z7O Date: Wed, 30 Jul 2014 14:39:10 +0800 Subject: [PATCH 17/59] vmdk: Optimize cluster allocation This drops the unnecessary bdrv_truncate() from, and also improves, cluster allocation code path. Before, when we need a new cluster, get_cluster_offset truncates the image to bdrv_getlength() + cluster_size, and returns the offset of added area, i.e. the image length before truncating. This is not efficient, so it's now rewritten as: - Save the extent file length when opening. - When allocating cluster, use the saved length as cluster offset. - Don't truncate image, because we'll anyway write data there: just write any data at the EOF position, in descending priority: * New user data (cluster allocation happens in a write request). * Filling data in the beginning and/or ending of the new cluster, if not covered by user data: either backing file content (COW), or zero for standalone images. One major benifit of this change is, on host mounted NFS images, even over a fast network, ftruncate is slow (see the example below). This change significantly speeds up cluster allocation. Comparing by converting a cirros image (296M) to VMDK on an NFS mount point, over 1Gbe LAN: $ time qemu-img convert cirros-0.3.1.img /mnt/a.raw -O vmdk Before: real 0m21.796s user 0m0.130s sys 0m0.483s After: real 0m2.017s user 0m0.047s sys 0m0.190s We also get rid of unchecked bdrv_getlength() and bdrv_truncate(), and get a little more documentation in function comments. Tested that this passes qemu-iotests for all VMDK subformats. Signed-off-by: Fam Zheng Signed-off-by: Stefan Hajnoczi --- block/vmdk.c | 228 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 143 insertions(+), 85 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 33d9bec02b..3700a11f5f 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -106,6 +106,7 @@ typedef struct VmdkExtent { uint32_t l2_cache_counts[L2_CACHE_SIZE]; int64_t cluster_sectors; + int64_t next_cluster_sector; char *type; } VmdkExtent; @@ -124,7 +125,6 @@ typedef struct BDRVVmdkState { } BDRVVmdkState; typedef struct VmdkMetaData { - uint32_t offset; unsigned int l1_index; unsigned int l2_index; unsigned int l2_offset; @@ -397,6 +397,7 @@ static int vmdk_add_extent(BlockDriverState *bs, { VmdkExtent *extent; BDRVVmdkState *s = bs->opaque; + int64_t length; if (cluster_sectors > 0x200000) { /* 0x200000 * 512Bytes = 1GB for one cluster is unrealistic */ @@ -412,6 +413,11 @@ static int vmdk_add_extent(BlockDriverState *bs, return -EFBIG; } + length = bdrv_getlength(file); + if (length < 0) { + return length; + } + s->extents = g_realloc(s->extents, (s->num_extents + 1) * sizeof(VmdkExtent)); extent = &s->extents[s->num_extents]; @@ -427,6 +433,8 @@ static int vmdk_add_extent(BlockDriverState *bs, extent->l1_entry_sectors = l2_size * cluster_sectors; extent->l2_size = l2_size; extent->cluster_sectors = flat ? sectors : cluster_sectors; + extent->next_cluster_sector = + ROUND_UP(DIV_ROUND_UP(length, BDRV_SECTOR_SIZE), cluster_sectors); if (s->num_extents > 1) { extent->end_sector = (*(extent - 1)).end_sector + extent->sectors; @@ -951,57 +959,97 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) } } +/** + * get_whole_cluster + * + * Copy backing file's cluster that covers @sector_num, otherwise write zero, + * to the cluster at @cluster_sector_num. + * + * If @skip_start_sector < @skip_end_sector, the relative range + * [@skip_start_sector, @skip_end_sector) is not copied or written, and leave + * it for call to write user data in the request. + */ static int get_whole_cluster(BlockDriverState *bs, - VmdkExtent *extent, - uint64_t cluster_offset, - uint64_t offset, - bool allocate) + VmdkExtent *extent, + uint64_t cluster_sector_num, + uint64_t sector_num, + uint64_t skip_start_sector, + uint64_t skip_end_sector) { int ret = VMDK_OK; - uint8_t *whole_grain = NULL; + int64_t cluster_bytes; + uint8_t *whole_grain; + /* For COW, align request sector_num to cluster start */ + sector_num = QEMU_ALIGN_DOWN(sector_num, extent->cluster_sectors); + cluster_bytes = extent->cluster_sectors << BDRV_SECTOR_BITS; + whole_grain = qemu_blockalign(bs, cluster_bytes); + + if (!bs->backing_hd) { + memset(whole_grain, 0, skip_start_sector << BDRV_SECTOR_BITS); + memset(whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), 0, + cluster_bytes - (skip_end_sector << BDRV_SECTOR_BITS)); + } + + assert(skip_end_sector <= extent->cluster_sectors); /* we will be here if it's first write on non-exist grain(cluster). * try to read from parent image, if exist */ - if (bs->backing_hd) { - whole_grain = - qemu_blockalign(bs, extent->cluster_sectors << BDRV_SECTOR_BITS); - if (!vmdk_is_cid_valid(bs)) { - ret = VMDK_ERROR; - goto exit; - } + if (bs->backing_hd && !vmdk_is_cid_valid(bs)) { + ret = VMDK_ERROR; + goto exit; + } - /* floor offset to cluster */ - offset -= offset % (extent->cluster_sectors * 512); - ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain, - extent->cluster_sectors); - if (ret < 0) { - ret = VMDK_ERROR; - goto exit; + /* Read backing data before skip range */ + if (skip_start_sector > 0) { + if (bs->backing_hd) { + ret = bdrv_read(bs->backing_hd, sector_num, + whole_grain, skip_start_sector); + if (ret < 0) { + ret = VMDK_ERROR; + goto exit; + } } - - /* Write grain only into the active image */ - ret = bdrv_write(extent->file, cluster_offset, whole_grain, - extent->cluster_sectors); + ret = bdrv_write(extent->file, cluster_sector_num, whole_grain, + skip_start_sector); if (ret < 0) { ret = VMDK_ERROR; goto exit; } } + /* Read backing data after skip range */ + if (skip_end_sector < extent->cluster_sectors) { + if (bs->backing_hd) { + ret = bdrv_read(bs->backing_hd, sector_num + skip_end_sector, + whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), + extent->cluster_sectors - skip_end_sector); + if (ret < 0) { + ret = VMDK_ERROR; + goto exit; + } + } + ret = bdrv_write(extent->file, cluster_sector_num + skip_end_sector, + whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), + extent->cluster_sectors - skip_end_sector); + if (ret < 0) { + ret = VMDK_ERROR; + goto exit; + } + } + exit: qemu_vfree(whole_grain); return ret; } -static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data) +static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, + uint32_t offset) { - uint32_t offset; - QEMU_BUILD_BUG_ON(sizeof(offset) != sizeof(m_data->offset)); - offset = cpu_to_le32(m_data->offset); + offset = cpu_to_le32(offset); /* update L2 table */ if (bdrv_pwrite_sync( extent->file, ((int64_t)m_data->l2_offset * 512) - + (m_data->l2_index * sizeof(m_data->offset)), + + (m_data->l2_index * sizeof(offset)), &offset, sizeof(offset)) < 0) { return VMDK_ERROR; } @@ -1011,7 +1059,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data) if (bdrv_pwrite_sync( extent->file, ((int64_t)m_data->l2_offset * 512) - + (m_data->l2_index * sizeof(m_data->offset)), + + (m_data->l2_index * sizeof(offset)), &offset, sizeof(offset)) < 0) { return VMDK_ERROR; } @@ -1023,17 +1071,41 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data) return VMDK_OK; } +/** + * get_cluster_offset + * + * Look up cluster offset in extent file by sector number, and store in + * @cluster_offset. + * + * For flat extents, the start offset as parsed from the description file is + * returned. + * + * For sparse extents, look up in L1, L2 table. If allocate is true, return an + * offset for a new cluster and update L2 cache. If there is a backing file, + * COW is done before returning; otherwise, zeroes are written to the allocated + * cluster. Both COW and zero writing skips the sector range + * [@skip_start_sector, @skip_end_sector) passed in by caller, because caller + * has new data to write there. + * + * Returns: VMDK_OK if cluster exists and mapped in the image. + * VMDK_UNALLOC if cluster is not mapped and @allocate is false. + * VMDK_ERROR if failed. + */ static int get_cluster_offset(BlockDriverState *bs, - VmdkExtent *extent, - VmdkMetaData *m_data, - uint64_t offset, - int allocate, - uint64_t *cluster_offset) + VmdkExtent *extent, + VmdkMetaData *m_data, + uint64_t offset, + bool allocate, + uint64_t *cluster_offset, + uint64_t skip_start_sector, + uint64_t skip_end_sector) { unsigned int l1_index, l2_offset, l2_index; int min_index, i, j; uint32_t min_count, *l2_table; bool zeroed = false; + int64_t ret; + int32_t cluster_sector; if (m_data) { m_data->valid = 0; @@ -1087,52 +1159,41 @@ static int get_cluster_offset(BlockDriverState *bs, extent->l2_cache_counts[min_index] = 1; found: l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size; - *cluster_offset = le32_to_cpu(l2_table[l2_index]); + cluster_sector = le32_to_cpu(l2_table[l2_index]); if (m_data) { m_data->valid = 1; m_data->l1_index = l1_index; m_data->l2_index = l2_index; - m_data->offset = *cluster_offset; m_data->l2_offset = l2_offset; m_data->l2_cache_entry = &l2_table[l2_index]; } - if (extent->has_zero_grain && *cluster_offset == VMDK_GTE_ZEROED) { + if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) { zeroed = true; } - if (!*cluster_offset || zeroed) { + if (!cluster_sector || zeroed) { if (!allocate) { return zeroed ? VMDK_ZEROED : VMDK_UNALLOC; } - /* Avoid the L2 tables update for the images that have snapshots. */ - *cluster_offset = bdrv_getlength(extent->file); - if (!extent->compressed) { - bdrv_truncate( - extent->file, - *cluster_offset + (extent->cluster_sectors << 9) - ); - } - - *cluster_offset >>= 9; - l2_table[l2_index] = cpu_to_le32(*cluster_offset); + cluster_sector = extent->next_cluster_sector; + extent->next_cluster_sector += extent->cluster_sectors; /* First of all we write grain itself, to avoid race condition * that may to corrupt the image. * This problem may occur because of insufficient space on host disk * or inappropriate VM shutdown. */ - if (get_whole_cluster( - bs, extent, *cluster_offset, offset, allocate) == -1) { - return VMDK_ERROR; - } - - if (m_data) { - m_data->offset = *cluster_offset; + ret = get_whole_cluster(bs, extent, + cluster_sector, + offset >> BDRV_SECTOR_BITS, + skip_start_sector, skip_end_sector); + if (ret) { + return ret; } } - *cluster_offset <<= 9; + *cluster_offset = cluster_sector << BDRV_SECTOR_BITS; return VMDK_OK; } @@ -1167,7 +1228,8 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, } qemu_co_mutex_lock(&s->lock); ret = get_cluster_offset(bs, extent, NULL, - sector_num * 512, 0, &offset); + sector_num * 512, false, &offset, + 0, 0); qemu_co_mutex_unlock(&s->lock); switch (ret) { @@ -1320,9 +1382,9 @@ static int vmdk_read(BlockDriverState *bs, int64_t sector_num, if (!extent) { return -EIO; } - ret = get_cluster_offset( - bs, extent, NULL, - sector_num << 9, 0, &cluster_offset); + ret = get_cluster_offset(bs, extent, NULL, + sector_num << 9, false, &cluster_offset, + 0, 0); extent_begin_sector = extent->end_sector - extent->sectors; extent_relative_sector_num = sector_num - extent_begin_sector; index_in_cluster = extent_relative_sector_num % extent->cluster_sectors; @@ -1403,12 +1465,17 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, if (!extent) { return -EIO; } - ret = get_cluster_offset( - bs, - extent, - &m_data, - sector_num << 9, !extent->compressed, - &cluster_offset); + extent_begin_sector = extent->end_sector - extent->sectors; + extent_relative_sector_num = sector_num - extent_begin_sector; + index_in_cluster = extent_relative_sector_num % extent->cluster_sectors; + n = extent->cluster_sectors - index_in_cluster; + if (n > nb_sectors) { + n = nb_sectors; + } + ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9, + !(extent->compressed || zeroed), + &cluster_offset, + index_in_cluster, index_in_cluster + n); if (extent->compressed) { if (ret == VMDK_OK) { /* Refuse write to allocated cluster for streamOptimized */ @@ -1417,24 +1484,13 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, return -EIO; } else { /* allocate */ - ret = get_cluster_offset( - bs, - extent, - &m_data, - sector_num << 9, 1, - &cluster_offset); + ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9, + true, &cluster_offset, 0, 0); } } if (ret == VMDK_ERROR) { return -EINVAL; } - extent_begin_sector = extent->end_sector - extent->sectors; - extent_relative_sector_num = sector_num - extent_begin_sector; - index_in_cluster = extent_relative_sector_num % extent->cluster_sectors; - n = extent->cluster_sectors - index_in_cluster; - if (n > nb_sectors) { - n = nb_sectors; - } if (zeroed) { /* Do zeroed write, buf is ignored */ if (extent->has_zero_grain && @@ -1442,9 +1498,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, n >= extent->cluster_sectors) { n = extent->cluster_sectors; if (!zero_dry_run) { - m_data.offset = VMDK_GTE_ZEROED; /* update L2 tables */ - if (vmdk_L2update(extent, &m_data) != VMDK_OK) { + if (vmdk_L2update(extent, &m_data, VMDK_GTE_ZEROED) + != VMDK_OK) { return -EIO; } } @@ -1460,7 +1516,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, } if (m_data.valid) { /* update L2 tables */ - if (vmdk_L2update(extent, &m_data) != VMDK_OK) { + if (vmdk_L2update(extent, &m_data, + cluster_offset >> BDRV_SECTOR_BITS) + != VMDK_OK) { return -EIO; } } @@ -2019,7 +2077,7 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, } ret = get_cluster_offset(bs, extent, NULL, sector_num << BDRV_SECTOR_BITS, - 0, &cluster_offset); + false, &cluster_offset, 0, 0); if (ret == VMDK_ERROR) { fprintf(stderr, "ERROR: could not get cluster_offset for sector %" From 000c4dfff4d7686e2fba3066a477a1290ed60622 Mon Sep 17 00:00:00 2001 From: Chunyan Liu Date: Wed, 30 Jul 2014 10:55:06 +0800 Subject: [PATCH 18/59] qemu-img info: show nocow info Add nocow info in 'qemu-img info' output to show whether the file currently has NOCOW flag set or not. Signed-off-by: Chunyan Liu Reviewed-by: Eric Blake Signed-off-by: Stefan Hajnoczi --- block/qapi.c | 26 ++++++++++++++++++++++++++ qapi/block-core.json | 5 ++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/block/qapi.c b/block/qapi.c index 28ebb62c8e..79d1e6a9f4 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -28,6 +28,13 @@ #include "qapi-visit.h" #include "qapi/qmp-output-visitor.h" #include "qapi/qmp/types.h" +#ifdef __linux__ +#include +#include +#ifndef FS_NOCOW_FL +#define FS_NOCOW_FL 0x00800000 /* Do not cow file */ +#endif +#endif BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs) { @@ -172,6 +179,9 @@ void bdrv_query_image_info(BlockDriverState *bs, int ret; Error *err = NULL; ImageInfo *info; +#ifdef __linux__ + int fd, attr; +#endif size = bdrv_getlength(bs); if (size < 0) { @@ -201,6 +211,18 @@ void bdrv_query_image_info(BlockDriverState *bs, info->format_specific = bdrv_get_specific_info(bs); info->has_format_specific = info->format_specific != NULL; +#ifdef __linux__ + /* get NOCOW info */ + fd = qemu_open(bs->filename, O_RDONLY | O_NONBLOCK); + if (fd >= 0) { + if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0 && (attr & FS_NOCOW_FL)) { + info->has_nocow = true; + info->nocow = true; + } + qemu_close(fd); + } +#endif + backing_filename = bs->backing_file; if (backing_filename[0] != '\0') { info->backing_filename = g_strdup(backing_filename); @@ -631,4 +653,8 @@ void bdrv_image_info_dump(fprintf_function func_fprintf, void *f, func_fprintf(f, "Format specific information:\n"); bdrv_image_info_specific_dump(func_fprintf, f, info->format_specific); } + + if (info->has_nocow && info->nocow) { + func_fprintf(f, "NOCOW flag: set\n"); + } } diff --git a/qapi/block-core.json b/qapi/block-core.json index e378653a77..72b4015fb4 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -115,6 +115,8 @@ # @format-specific: #optional structure supplying additional format-specific # information (since 1.7) # +# @nocow: #optional info of whether NOCOW flag is set or not. (since 2.2) +# # Since: 1.3 # ## @@ -126,7 +128,8 @@ '*backing-filename': 'str', '*full-backing-filename': 'str', '*backing-filename-format': 'str', '*snapshots': ['SnapshotInfo'], '*backing-image': 'ImageInfo', - '*format-specific': 'ImageInfoSpecific' } } + '*format-specific': 'ImageInfoSpecific', + '*nocow': 'bool' } } ## # @ImageCheck: From c9a12e751b9f18afbc1b1db4197d9d13893a4c89 Mon Sep 17 00:00:00 2001 From: Chrysostomos Nanakos Date: Mon, 4 Aug 2014 17:35:32 +0300 Subject: [PATCH 19/59] block: Support Archipelago as a QEMU block backend VM Image on Archipelago volume is specified like this: file.driver=archipelago,file.volume=[,file.mport=[, file.vport=][,file.segment=]] 'archipelago' is the protocol. 'mport' is the port number on which mapperd is listening. This is optional and if not specified, QEMU will make Archipelago to use the default port. 'vport' is the port number on which vlmcd is listening. This is optional and if not specified, QEMU will make Archipelago to use the default port. 'segment' is the name of the shared memory segment Archipelago stack is using. This is optional and if not specified, QEMU will make Archipelago to use the default value, 'archipelago'. Examples: file.driver=archipelago,file.volume=my_vm_volume file.driver=archipelago,file.volume=my_vm_volume,file.mport=123 file.driver=archipelago,file.volume=my_vm_volume,file.mport=123, file.vport=1234 file.driver=archipelago,file.volume=my_vm_volume,file.mport=123, file.vport=1234,file.segment=my_segment Signed-off-by: Chrysostomos Nanakos Reviewed-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- MAINTAINERS | 6 + block/Makefile.objs | 2 + block/archipelago.c | 787 ++++++++++++++++++++++++++++++++++++++++++++ configure | 40 +++ 4 files changed, 835 insertions(+) create mode 100644 block/archipelago.c diff --git a/MAINTAINERS b/MAINTAINERS index 906f252477..59940f97ad 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1000,3 +1000,9 @@ SSH M: Richard W.M. Jones S: Supported F: block/ssh.c + +ARCHIPELAGO +M: Chrysostomos Nanakos +M: Chrysostomos Nanakos +S: Maintained +F: block/archipelago.c diff --git a/block/Makefile.objs b/block/Makefile.objs index fd88c03ece..858d2b387b 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -17,6 +17,7 @@ block-obj-$(CONFIG_LIBNFS) += nfs.o block-obj-$(CONFIG_CURL) += curl.o block-obj-$(CONFIG_RBD) += rbd.o block-obj-$(CONFIG_GLUSTERFS) += gluster.o +block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o block-obj-$(CONFIG_LIBSSH2) += ssh.o endif @@ -35,5 +36,6 @@ gluster.o-cflags := $(GLUSTERFS_CFLAGS) gluster.o-libs := $(GLUSTERFS_LIBS) ssh.o-cflags := $(LIBSSH2_CFLAGS) ssh.o-libs := $(LIBSSH2_LIBS) +archipelago.o-libs := $(ARCHIPELAGO_LIBS) qcow.o-libs := -lz linux-aio.o-libs := -laio diff --git a/block/archipelago.c b/block/archipelago.c new file mode 100644 index 0000000000..d91872d90e --- /dev/null +++ b/block/archipelago.c @@ -0,0 +1,787 @@ +/* + * QEMU Block driver for Archipelago + * + * Copyright (C) 2014 Chrysostomos Nanakos + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +/* + * VM Image on Archipelago volume is specified like this: + * + * file.driver=archipelago,file.volume= + * [,file.mport=[,file.vport=] + * [,file.segment=]] + * + * 'archipelago' is the protocol. + * + * 'mport' is the port number on which mapperd is listening. This is optional + * and if not specified, QEMU will make Archipelago to use the default port. + * + * 'vport' is the port number on which vlmcd is listening. This is optional + * and if not specified, QEMU will make Archipelago to use the default port. + * + * 'segment' is the name of the shared memory segment Archipelago stack + * is using. This is optional and if not specified, QEMU will make Archipelago + * to use the default value, 'archipelago'. + * + * Examples: + * + * file.driver=archipelago,file.volume=my_vm_volume + * file.driver=archipelago,file.volume=my_vm_volume,file.mport=123 + * file.driver=archipelago,file.volume=my_vm_volume,file.mport=123, + * file.vport=1234 + * file.driver=archipelago,file.volume=my_vm_volume,file.mport=123, + * file.vport=1234,file.segment=my_segment + */ + +#include "block/block_int.h" +#include "qemu/error-report.h" +#include "qemu/thread.h" +#include "qapi/qmp/qint.h" +#include "qapi/qmp/qstring.h" +#include "qapi/qmp/qjson.h" + +#include +#include +#include + +#define ARCHIP_FD_READ 0 +#define ARCHIP_FD_WRITE 1 +#define MAX_REQUEST_SIZE 524288 + +#define ARCHIPELAGO_OPT_VOLUME "volume" +#define ARCHIPELAGO_OPT_SEGMENT "segment" +#define ARCHIPELAGO_OPT_MPORT "mport" +#define ARCHIPELAGO_OPT_VPORT "vport" +#define ARCHIPELAGO_DFL_MPORT 1001 +#define ARCHIPELAGO_DFL_VPORT 501 + +#define archipelagolog(fmt, ...) \ + do { \ + fprintf(stderr, "archipelago\t%-24s: " fmt, __func__, ##__VA_ARGS__); \ + } while (0) + +typedef enum { + ARCHIP_OP_READ, + ARCHIP_OP_WRITE, + ARCHIP_OP_FLUSH, + ARCHIP_OP_VOLINFO, +} ARCHIPCmd; + +typedef struct ArchipelagoAIOCB { + BlockDriverAIOCB common; + QEMUBH *bh; + struct BDRVArchipelagoState *s; + QEMUIOVector *qiov; + ARCHIPCmd cmd; + bool cancelled; + int status; + int64_t size; + int64_t ret; +} ArchipelagoAIOCB; + +typedef struct BDRVArchipelagoState { + ArchipelagoAIOCB *event_acb; + char *volname; + char *segment_name; + uint64_t size; + /* Archipelago specific */ + struct xseg *xseg; + struct xseg_port *port; + xport srcport; + xport sport; + xport mportno; + xport vportno; + QemuMutex archip_mutex; + QemuCond archip_cond; + bool is_signaled; + /* Request handler specific */ + QemuThread request_th; + QemuCond request_cond; + QemuMutex request_mutex; + bool th_is_signaled; + bool stopping; +} BDRVArchipelagoState; + +typedef struct ArchipelagoSegmentedRequest { + size_t count; + size_t total; + int ref; + int failed; +} ArchipelagoSegmentedRequest; + +typedef struct AIORequestData { + const char *volname; + off_t offset; + size_t size; + uint64_t bufidx; + int ret; + int op; + ArchipelagoAIOCB *aio_cb; + ArchipelagoSegmentedRequest *segreq; +} AIORequestData; + +static void qemu_archipelago_complete_aio(void *opaque); + +static void init_local_signal(struct xseg *xseg, xport sport, xport srcport) +{ + if (xseg && (sport != srcport)) { + xseg_init_local_signal(xseg, srcport); + sport = srcport; + } +} + +static void archipelago_finish_aiocb(AIORequestData *reqdata) +{ + if (reqdata->aio_cb->ret != reqdata->segreq->total) { + reqdata->aio_cb->ret = -EIO; + } else if (reqdata->aio_cb->ret == reqdata->segreq->total) { + reqdata->aio_cb->ret = 0; + } + reqdata->aio_cb->bh = aio_bh_new( + bdrv_get_aio_context(reqdata->aio_cb->common.bs), + qemu_archipelago_complete_aio, reqdata + ); + qemu_bh_schedule(reqdata->aio_cb->bh); +} + +static int wait_reply(struct xseg *xseg, xport srcport, struct xseg_port *port, + struct xseg_request *expected_req) +{ + struct xseg_request *req; + xseg_prepare_wait(xseg, srcport); + void *psd = xseg_get_signal_desc(xseg, port); + while (1) { + req = xseg_receive(xseg, srcport, X_NONBLOCK); + if (req) { + if (req != expected_req) { + archipelagolog("Unknown received request\n"); + xseg_put_request(xseg, req, srcport); + } else if (!(req->state & XS_SERVED)) { + return -1; + } else { + break; + } + } + xseg_wait_signal(xseg, psd, 100000UL); + } + xseg_cancel_wait(xseg, srcport); + return 0; +} + +static void xseg_request_handler(void *state) +{ + BDRVArchipelagoState *s = (BDRVArchipelagoState *) state; + void *psd = xseg_get_signal_desc(s->xseg, s->port); + qemu_mutex_lock(&s->request_mutex); + + while (!s->stopping) { + struct xseg_request *req; + void *data; + xseg_prepare_wait(s->xseg, s->srcport); + req = xseg_receive(s->xseg, s->srcport, X_NONBLOCK); + if (req) { + AIORequestData *reqdata; + ArchipelagoSegmentedRequest *segreq; + xseg_get_req_data(s->xseg, req, (void **)&reqdata); + + switch (reqdata->op) { + case ARCHIP_OP_READ: + data = xseg_get_data(s->xseg, req); + segreq = reqdata->segreq; + segreq->count += req->serviced; + + qemu_iovec_from_buf(reqdata->aio_cb->qiov, reqdata->bufidx, + data, + req->serviced); + + xseg_put_request(s->xseg, req, s->srcport); + + if ((__sync_add_and_fetch(&segreq->ref, -1)) == 0) { + if (!segreq->failed) { + reqdata->aio_cb->ret = segreq->count; + archipelago_finish_aiocb(reqdata); + g_free(segreq); + } else { + g_free(segreq); + g_free(reqdata); + } + } else { + g_free(reqdata); + } + break; + case ARCHIP_OP_WRITE: + case ARCHIP_OP_FLUSH: + segreq = reqdata->segreq; + segreq->count += req->serviced; + xseg_put_request(s->xseg, req, s->srcport); + + if ((__sync_add_and_fetch(&segreq->ref, -1)) == 0) { + if (!segreq->failed) { + reqdata->aio_cb->ret = segreq->count; + archipelago_finish_aiocb(reqdata); + g_free(segreq); + } else { + g_free(segreq); + g_free(reqdata); + } + } else { + g_free(reqdata); + } + break; + case ARCHIP_OP_VOLINFO: + s->is_signaled = true; + qemu_cond_signal(&s->archip_cond); + break; + } + } else { + xseg_wait_signal(s->xseg, psd, 100000UL); + } + xseg_cancel_wait(s->xseg, s->srcport); + } + + s->th_is_signaled = true; + qemu_cond_signal(&s->request_cond); + qemu_mutex_unlock(&s->request_mutex); + qemu_thread_exit(NULL); +} + +static int qemu_archipelago_xseg_init(BDRVArchipelagoState *s) +{ + if (xseg_initialize()) { + archipelagolog("Cannot initialize XSEG\n"); + goto err_exit; + } + + s->xseg = xseg_join("posix", s->segment_name, + "posixfd", NULL); + if (!s->xseg) { + archipelagolog("Cannot join XSEG shared memory segment\n"); + goto err_exit; + } + s->port = xseg_bind_dynport(s->xseg); + s->srcport = s->port->portno; + init_local_signal(s->xseg, s->sport, s->srcport); + return 0; + +err_exit: + return -1; +} + +static int qemu_archipelago_init(BDRVArchipelagoState *s) +{ + int ret; + + ret = qemu_archipelago_xseg_init(s); + if (ret < 0) { + error_report("Cannot initialize XSEG. Aborting...\n"); + goto err_exit; + } + + qemu_cond_init(&s->archip_cond); + qemu_mutex_init(&s->archip_mutex); + qemu_cond_init(&s->request_cond); + qemu_mutex_init(&s->request_mutex); + s->th_is_signaled = false; + qemu_thread_create(&s->request_th, "xseg_io_th", + (void *) xseg_request_handler, + (void *) s, QEMU_THREAD_JOINABLE); + +err_exit: + return ret; +} + +static void qemu_archipelago_complete_aio(void *opaque) +{ + AIORequestData *reqdata = (AIORequestData *) opaque; + ArchipelagoAIOCB *aio_cb = (ArchipelagoAIOCB *) reqdata->aio_cb; + + qemu_bh_delete(aio_cb->bh); + aio_cb->common.cb(aio_cb->common.opaque, aio_cb->ret); + aio_cb->status = 0; + + if (!aio_cb->cancelled) { + qemu_aio_release(aio_cb); + } + g_free(reqdata); +} + +static QemuOptsList archipelago_runtime_opts = { + .name = "archipelago", + .head = QTAILQ_HEAD_INITIALIZER(archipelago_runtime_opts.head), + .desc = { + { + .name = ARCHIPELAGO_OPT_VOLUME, + .type = QEMU_OPT_STRING, + .help = "Name of the volume image", + }, + { + .name = ARCHIPELAGO_OPT_SEGMENT, + .type = QEMU_OPT_STRING, + .help = "Name of the Archipelago shared memory segment", + }, + { + .name = ARCHIPELAGO_OPT_MPORT, + .type = QEMU_OPT_NUMBER, + .help = "Archipelago mapperd port number" + }, + { + .name = ARCHIPELAGO_OPT_VPORT, + .type = QEMU_OPT_NUMBER, + .help = "Archipelago vlmcd port number" + + }, + { /* end of list */ } + }, +}; + +static int qemu_archipelago_open(BlockDriverState *bs, + QDict *options, + int bdrv_flags, + Error **errp) +{ + int ret = 0; + const char *volume, *segment_name; + QemuOpts *opts; + Error *local_err = NULL; + BDRVArchipelagoState *s = bs->opaque; + + opts = qemu_opts_create(&archipelago_runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto err_exit; + } + + s->mportno = qemu_opt_get_number(opts, ARCHIPELAGO_OPT_MPORT, + ARCHIPELAGO_DFL_MPORT); + s->vportno = qemu_opt_get_number(opts, ARCHIPELAGO_OPT_VPORT, + ARCHIPELAGO_DFL_VPORT); + + segment_name = qemu_opt_get(opts, ARCHIPELAGO_OPT_SEGMENT); + if (segment_name == NULL) { + s->segment_name = g_strdup("archipelago"); + } else { + s->segment_name = g_strdup(segment_name); + } + + volume = qemu_opt_get(opts, ARCHIPELAGO_OPT_VOLUME); + if (volume == NULL) { + error_setg(errp, "archipelago block driver requires the 'volume'" + " option"); + ret = -EINVAL; + goto err_exit; + } + s->volname = g_strdup(volume); + + /* Initialize XSEG, join shared memory segment */ + ret = qemu_archipelago_init(s); + if (ret < 0) { + error_setg(errp, "cannot initialize XSEG and join shared " + "memory segment"); + goto err_exit; + } + + qemu_opts_del(opts); + return 0; + +err_exit: + g_free(s->volname); + g_free(s->segment_name); + qemu_opts_del(opts); + return ret; +} + +static void qemu_archipelago_close(BlockDriverState *bs) +{ + int r, targetlen; + char *target; + struct xseg_request *req; + BDRVArchipelagoState *s = bs->opaque; + + s->stopping = true; + + qemu_mutex_lock(&s->request_mutex); + while (!s->th_is_signaled) { + qemu_cond_wait(&s->request_cond, + &s->request_mutex); + } + qemu_mutex_unlock(&s->request_mutex); + qemu_thread_join(&s->request_th); + qemu_cond_destroy(&s->request_cond); + qemu_mutex_destroy(&s->request_mutex); + + qemu_cond_destroy(&s->archip_cond); + qemu_mutex_destroy(&s->archip_mutex); + + targetlen = strlen(s->volname); + req = xseg_get_request(s->xseg, s->srcport, s->vportno, X_ALLOC); + if (!req) { + archipelagolog("Cannot get XSEG request\n"); + goto err_exit; + } + r = xseg_prep_request(s->xseg, req, targetlen, 0); + if (r < 0) { + xseg_put_request(s->xseg, req, s->srcport); + archipelagolog("Cannot prepare XSEG close request\n"); + goto err_exit; + } + + target = xseg_get_target(s->xseg, req); + memcpy(target, s->volname, targetlen); + req->size = req->datalen; + req->offset = 0; + req->op = X_CLOSE; + + xport p = xseg_submit(s->xseg, req, s->srcport, X_ALLOC); + if (p == NoPort) { + xseg_put_request(s->xseg, req, s->srcport); + archipelagolog("Cannot submit XSEG close request\n"); + goto err_exit; + } + + xseg_signal(s->xseg, p); + wait_reply(s->xseg, s->srcport, s->port, req); + + xseg_put_request(s->xseg, req, s->srcport); + +err_exit: + g_free(s->volname); + g_free(s->segment_name); + xseg_quit_local_signal(s->xseg, s->srcport); + xseg_leave_dynport(s->xseg, s->port); + xseg_leave(s->xseg); +} + +static void qemu_archipelago_aio_cancel(BlockDriverAIOCB *blockacb) +{ + ArchipelagoAIOCB *aio_cb = (ArchipelagoAIOCB *) blockacb; + aio_cb->cancelled = true; + while (aio_cb->status == -EINPROGRESS) { + aio_poll(bdrv_get_aio_context(aio_cb->common.bs), true); + } + qemu_aio_release(aio_cb); +} + +static const AIOCBInfo archipelago_aiocb_info = { + .aiocb_size = sizeof(ArchipelagoAIOCB), + .cancel = qemu_archipelago_aio_cancel, +}; + +static int archipelago_submit_request(BDRVArchipelagoState *s, + uint64_t bufidx, + size_t count, + off_t offset, + ArchipelagoAIOCB *aio_cb, + ArchipelagoSegmentedRequest *segreq, + int op) +{ + int ret, targetlen; + char *target; + void *data = NULL; + struct xseg_request *req; + AIORequestData *reqdata = g_malloc(sizeof(AIORequestData)); + + targetlen = strlen(s->volname); + req = xseg_get_request(s->xseg, s->srcport, s->vportno, X_ALLOC); + if (!req) { + archipelagolog("Cannot get XSEG request\n"); + goto err_exit2; + } + ret = xseg_prep_request(s->xseg, req, targetlen, count); + if (ret < 0) { + archipelagolog("Cannot prepare XSEG request\n"); + goto err_exit; + } + target = xseg_get_target(s->xseg, req); + if (!target) { + archipelagolog("Cannot get XSEG target\n"); + goto err_exit; + } + memcpy(target, s->volname, targetlen); + req->size = count; + req->offset = offset; + + switch (op) { + case ARCHIP_OP_READ: + req->op = X_READ; + break; + case ARCHIP_OP_WRITE: + req->op = X_WRITE; + break; + case ARCHIP_OP_FLUSH: + req->op = X_FLUSH; + break; + } + reqdata->volname = s->volname; + reqdata->offset = offset; + reqdata->size = count; + reqdata->bufidx = bufidx; + reqdata->aio_cb = aio_cb; + reqdata->segreq = segreq; + reqdata->op = op; + + xseg_set_req_data(s->xseg, req, reqdata); + if (op == ARCHIP_OP_WRITE) { + data = xseg_get_data(s->xseg, req); + if (!data) { + archipelagolog("Cannot get XSEG data\n"); + goto err_exit; + } + qemu_iovec_to_buf(aio_cb->qiov, bufidx, data, count); + } + + xport p = xseg_submit(s->xseg, req, s->srcport, X_ALLOC); + if (p == NoPort) { + archipelagolog("Could not submit XSEG request\n"); + goto err_exit; + } + xseg_signal(s->xseg, p); + return 0; + +err_exit: + g_free(reqdata); + xseg_put_request(s->xseg, req, s->srcport); + return -EIO; +err_exit2: + g_free(reqdata); + return -EIO; +} + +static int archipelago_aio_segmented_rw(BDRVArchipelagoState *s, + size_t count, + off_t offset, + ArchipelagoAIOCB *aio_cb, + int op) +{ + int i, ret, segments_nr, last_segment_size; + ArchipelagoSegmentedRequest *segreq; + + segreq = g_malloc(sizeof(ArchipelagoSegmentedRequest)); + + if (op == ARCHIP_OP_FLUSH) { + segments_nr = 1; + segreq->ref = segments_nr; + segreq->total = count; + segreq->count = 0; + segreq->failed = 0; + ret = archipelago_submit_request(s, 0, count, offset, aio_cb, + segreq, ARCHIP_OP_FLUSH); + if (ret < 0) { + goto err_exit; + } + return 0; + } + + segments_nr = (int)(count / MAX_REQUEST_SIZE) + \ + ((count % MAX_REQUEST_SIZE) ? 1 : 0); + last_segment_size = (int)(count % MAX_REQUEST_SIZE); + + segreq->ref = segments_nr; + segreq->total = count; + segreq->count = 0; + segreq->failed = 0; + + for (i = 0; i < segments_nr - 1; i++) { + ret = archipelago_submit_request(s, i * MAX_REQUEST_SIZE, + MAX_REQUEST_SIZE, + offset + i * MAX_REQUEST_SIZE, + aio_cb, segreq, op); + + if (ret < 0) { + goto err_exit; + } + } + + if ((segments_nr > 1) && last_segment_size) { + ret = archipelago_submit_request(s, i * MAX_REQUEST_SIZE, + last_segment_size, + offset + i * MAX_REQUEST_SIZE, + aio_cb, segreq, op); + } else if ((segments_nr > 1) && !last_segment_size) { + ret = archipelago_submit_request(s, i * MAX_REQUEST_SIZE, + MAX_REQUEST_SIZE, + offset + i * MAX_REQUEST_SIZE, + aio_cb, segreq, op); + } else if (segments_nr == 1) { + ret = archipelago_submit_request(s, 0, count, offset, aio_cb, + segreq, op); + } + + if (ret < 0) { + goto err_exit; + } + + return 0; + +err_exit: + __sync_add_and_fetch(&segreq->failed, 1); + if (segments_nr == 1) { + if (__sync_add_and_fetch(&segreq->ref, -1) == 0) { + g_free(segreq); + } + } else { + if ((__sync_add_and_fetch(&segreq->ref, -segments_nr + i)) == 0) { + g_free(segreq); + } + } + + return ret; +} + +static BlockDriverAIOCB *qemu_archipelago_aio_rw(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque, + int op) +{ + ArchipelagoAIOCB *aio_cb; + BDRVArchipelagoState *s = bs->opaque; + int64_t size, off; + int ret; + + aio_cb = qemu_aio_get(&archipelago_aiocb_info, bs, cb, opaque); + aio_cb->cmd = op; + aio_cb->qiov = qiov; + + aio_cb->ret = 0; + aio_cb->s = s; + aio_cb->cancelled = false; + aio_cb->status = -EINPROGRESS; + + off = sector_num * BDRV_SECTOR_SIZE; + size = nb_sectors * BDRV_SECTOR_SIZE; + aio_cb->size = size; + + ret = archipelago_aio_segmented_rw(s, size, off, + aio_cb, op); + if (ret < 0) { + goto err_exit; + } + return &aio_cb->common; + +err_exit: + error_report("qemu_archipelago_aio_rw(): I/O Error\n"); + qemu_aio_release(aio_cb); + return NULL; +} + +static BlockDriverAIOCB *qemu_archipelago_aio_readv(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + return qemu_archipelago_aio_rw(bs, sector_num, qiov, nb_sectors, cb, + opaque, ARCHIP_OP_READ); +} + +static BlockDriverAIOCB *qemu_archipelago_aio_writev(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + return qemu_archipelago_aio_rw(bs, sector_num, qiov, nb_sectors, cb, + opaque, ARCHIP_OP_WRITE); +} + +static int64_t archipelago_volume_info(BDRVArchipelagoState *s) +{ + uint64_t size; + int ret, targetlen; + struct xseg_request *req; + struct xseg_reply_info *xinfo; + AIORequestData *reqdata = g_malloc(sizeof(AIORequestData)); + + const char *volname = s->volname; + targetlen = strlen(volname); + req = xseg_get_request(s->xseg, s->srcport, s->mportno, X_ALLOC); + if (!req) { + archipelagolog("Cannot get XSEG request\n"); + goto err_exit2; + } + ret = xseg_prep_request(s->xseg, req, targetlen, + sizeof(struct xseg_reply_info)); + if (ret < 0) { + archipelagolog("Cannot prepare XSEG request\n"); + goto err_exit; + } + char *target = xseg_get_target(s->xseg, req); + if (!target) { + archipelagolog("Cannot get XSEG target\n"); + goto err_exit; + } + memcpy(target, volname, targetlen); + req->size = req->datalen; + req->offset = 0; + req->op = X_INFO; + + reqdata->op = ARCHIP_OP_VOLINFO; + reqdata->volname = volname; + xseg_set_req_data(s->xseg, req, reqdata); + + xport p = xseg_submit(s->xseg, req, s->srcport, X_ALLOC); + if (p == NoPort) { + archipelagolog("Cannot submit XSEG request\n"); + goto err_exit; + } + xseg_signal(s->xseg, p); + qemu_mutex_lock(&s->archip_mutex); + while (!s->is_signaled) { + qemu_cond_wait(&s->archip_cond, &s->archip_mutex); + } + s->is_signaled = false; + qemu_mutex_unlock(&s->archip_mutex); + + xinfo = (struct xseg_reply_info *) xseg_get_data(s->xseg, req); + size = xinfo->size; + xseg_put_request(s->xseg, req, s->srcport); + g_free(reqdata); + s->size = size; + return size; + +err_exit: + xseg_put_request(s->xseg, req, s->srcport); +err_exit2: + g_free(reqdata); + return -EIO; +} + +static int64_t qemu_archipelago_getlength(BlockDriverState *bs) +{ + int64_t ret; + BDRVArchipelagoState *s = bs->opaque; + + ret = archipelago_volume_info(s); + return ret; +} + +static BlockDriverAIOCB *qemu_archipelago_aio_flush(BlockDriverState *bs, + BlockDriverCompletionFunc *cb, void *opaque) +{ + return qemu_archipelago_aio_rw(bs, 0, NULL, 0, cb, opaque, + ARCHIP_OP_FLUSH); +} + +static BlockDriver bdrv_archipelago = { + .format_name = "archipelago", + .protocol_name = "archipelago", + .instance_size = sizeof(BDRVArchipelagoState), + .bdrv_file_open = qemu_archipelago_open, + .bdrv_close = qemu_archipelago_close, + .bdrv_getlength = qemu_archipelago_getlength, + .bdrv_aio_readv = qemu_archipelago_aio_readv, + .bdrv_aio_writev = qemu_archipelago_aio_writev, + .bdrv_aio_flush = qemu_archipelago_aio_flush, + .bdrv_has_zero_init = bdrv_has_zero_init_1, +}; + +static void bdrv_archipelago_init(void) +{ + bdrv_register(&bdrv_archipelago); +} + +block_init(bdrv_archipelago_init); diff --git a/configure b/configure index 5715fe6deb..09e4f98322 100755 --- a/configure +++ b/configure @@ -326,6 +326,7 @@ seccomp="" glusterfs="" glusterfs_discard="no" glusterfs_zerofill="no" +archipelago="" virtio_blk_data_plane="" gtk="" gtkabi="" @@ -1087,6 +1088,10 @@ for opt do ;; --enable-glusterfs) glusterfs="yes" ;; + --disable-archipelago) archipelago="no" + ;; + --enable-archipelago) archipelago="yes" + ;; --disable-virtio-blk-data-plane) virtio_blk_data_plane="no" ;; --enable-virtio-blk-data-plane) virtio_blk_data_plane="yes" @@ -1382,6 +1387,8 @@ Advanced options (experts only): --enable-coroutine-pool enable coroutine freelist (better performance) --enable-glusterfs enable GlusterFS backend --disable-glusterfs disable GlusterFS backend + --enable-archipelago enable Archipelago backend + --disable-archipelago disable Archipelago backend --enable-gcov enable test coverage analysis with gcov --gcov=GCOV use specified gcov [$gcov_tool] --disable-tpm disable TPM support @@ -3071,6 +3078,33 @@ EOF fi fi + +########################################## +# archipelago probe +if test "$archipelago" != "no" ; then + cat > $TMPC < +#include +#include +int main(void) { + xseg_initialize(); + return 0; +} +EOF + archipelago_libs=-lxseg + if compile_prog "" "$archipelago_libs"; then + archipelago="yes" + libs_tools="$archipelago_libs $libs_tools" + libs_softmmu="$archipelago_libs $libs_softmmu" + else + if test "$archipelago" = "yes" ; then + feature_not_found "Archipelago backend support" "Install libxseg devel" + fi + archipelago="no" + fi +fi + + ########################################## # glusterfs probe if test "$glusterfs" != "no" ; then @@ -4252,6 +4286,7 @@ echo "seccomp support $seccomp" echo "coroutine backend $coroutine" echo "coroutine pool $coroutine_pool" echo "GlusterFS support $glusterfs" +echo "Archipelago support $archipelago" echo "virtio-blk-data-plane $virtio_blk_data_plane" echo "gcov $gcov_tool" echo "gcov enabled $gcov" @@ -4690,6 +4725,11 @@ if test "$glusterfs_zerofill" = "yes" ; then echo "CONFIG_GLUSTERFS_ZEROFILL=y" >> $config_host_mak fi +if test "$archipelago" = "yes" ; then + echo "CONFIG_ARCHIPELAGO=m" >> $config_host_mak + echo "ARCHIPELAGO_LIBS=$archipelago_libs" >> $config_host_mak +fi + if test "$libssh2" = "yes" ; then echo "CONFIG_LIBSSH2=m" >> $config_host_mak echo "LIBSSH2_CFLAGS=$libssh2_cflags" >> $config_host_mak From 70537a8506d5fd098dbb7a739e882fca18476d09 Mon Sep 17 00:00:00 2001 From: Chrysostomos Nanakos Date: Wed, 23 Jul 2014 17:07:30 +0300 Subject: [PATCH 20/59] block/archipelago: Implement bdrv_parse_filename() VM Image on Archipelago volume can also be specified like this: file=archipelago:[/mport=[:vport=][: segment=]] Examples: file=archipelago:my_vm_volume file=archipelago:my_vm_volume/mport=123 file=archipelago:my_vm_volume/mport=123:vport=1234 file=archipelago:my_vm_volume/mport=123:vport=1234:segment=my_segment Signed-off-by: Chrysostomos Nanakos Signed-off-by: Stefan Hajnoczi --- block/archipelago.c | 140 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 138 insertions(+), 2 deletions(-) diff --git a/block/archipelago.c b/block/archipelago.c index d91872d90e..297fc27529 100644 --- a/block/archipelago.c +++ b/block/archipelago.c @@ -15,6 +15,11 @@ * [,file.mport=[,file.vport=] * [,file.segment=]] * + * or + * + * file=archipelago:[/mport=[:vport=][: + * segment=]] + * * 'archipelago' is the protocol. * * 'mport' is the port number on which mapperd is listening. This is optional @@ -32,11 +37,20 @@ * file.driver=archipelago,file.volume=my_vm_volume * file.driver=archipelago,file.volume=my_vm_volume,file.mport=123 * file.driver=archipelago,file.volume=my_vm_volume,file.mport=123, - * file.vport=1234 + * file.vport=1234 * file.driver=archipelago,file.volume=my_vm_volume,file.mport=123, - * file.vport=1234,file.segment=my_segment + * file.vport=1234,file.segment=my_segment + * + * or + * + * file=archipelago:my_vm_volume + * file=archipelago:my_vm_volume/mport=123 + * file=archipelago:my_vm_volume/mport=123:vport=1234 + * file=archipelago:my_vm_volume/mport=123:vport=1234:segment=my_segment + * */ +#include "qemu-common.h" #include "block/block_int.h" #include "qemu/error-report.h" #include "qemu/thread.h" @@ -309,6 +323,127 @@ static void qemu_archipelago_complete_aio(void *opaque) g_free(reqdata); } +static void xseg_find_port(char *pstr, const char *needle, xport *aport) +{ + const char *a; + char *endptr = NULL; + unsigned long port; + if (strstart(pstr, needle, &a)) { + if (strlen(a) > 0) { + port = strtoul(a, &endptr, 10); + if (strlen(endptr)) { + *aport = -2; + return; + } + *aport = (xport) port; + } + } +} + +static void xseg_find_segment(char *pstr, const char *needle, + char **segment_name) +{ + const char *a; + if (strstart(pstr, needle, &a)) { + if (strlen(a) > 0) { + *segment_name = g_strdup(a); + } + } +} + +static void parse_filename_opts(const char *filename, Error **errp, + char **volume, char **segment_name, + xport *mport, xport *vport) +{ + const char *start; + char *tokens[4], *ds; + int idx; + xport lmport = NoPort, lvport = NoPort; + + strstart(filename, "archipelago:", &start); + + ds = g_strdup(start); + tokens[0] = strtok(ds, "/"); + tokens[1] = strtok(NULL, ":"); + tokens[2] = strtok(NULL, ":"); + tokens[3] = strtok(NULL, "\0"); + + if (!strlen(tokens[0])) { + error_setg(errp, "volume name must be specified first"); + g_free(ds); + return; + } + + for (idx = 1; idx < 4; idx++) { + if (tokens[idx] != NULL) { + if (strstart(tokens[idx], "mport=", NULL)) { + xseg_find_port(tokens[idx], "mport=", &lmport); + } + if (strstart(tokens[idx], "vport=", NULL)) { + xseg_find_port(tokens[idx], "vport=", &lvport); + } + if (strstart(tokens[idx], "segment=", NULL)) { + xseg_find_segment(tokens[idx], "segment=", segment_name); + } + } + } + + if ((lmport == -2) || (lvport == -2)) { + error_setg(errp, "mport and/or vport must be set"); + g_free(ds); + return; + } + *volume = g_strdup(tokens[0]); + *mport = lmport; + *vport = lvport; + g_free(ds); +} + +static void archipelago_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + const char *start; + char *volume = NULL, *segment_name = NULL; + xport mport = NoPort, vport = NoPort; + + if (qdict_haskey(options, ARCHIPELAGO_OPT_VOLUME) + || qdict_haskey(options, ARCHIPELAGO_OPT_SEGMENT) + || qdict_haskey(options, ARCHIPELAGO_OPT_MPORT) + || qdict_haskey(options, ARCHIPELAGO_OPT_VPORT)) { + error_setg(errp, "volume/mport/vport/segment and a file name may not" + " be specified at the same time"); + return; + } + + if (!strstart(filename, "archipelago:", &start)) { + error_setg(errp, "File name must start with 'archipelago:'"); + return; + } + + if (!strlen(start) || strstart(start, "/", NULL)) { + error_setg(errp, "volume name must be specified"); + return; + } + + parse_filename_opts(filename, errp, &volume, &segment_name, &mport, &vport); + + if (volume) { + qdict_put(options, ARCHIPELAGO_OPT_VOLUME, qstring_from_str(volume)); + g_free(volume); + } + if (segment_name) { + qdict_put(options, ARCHIPELAGO_OPT_SEGMENT, + qstring_from_str(segment_name)); + g_free(segment_name); + } + if (mport != NoPort) { + qdict_put(options, ARCHIPELAGO_OPT_MPORT, qint_from_int(mport)); + } + if (vport != NoPort) { + qdict_put(options, ARCHIPELAGO_OPT_VPORT, qint_from_int(vport)); + } +} + static QemuOptsList archipelago_runtime_opts = { .name = "archipelago", .head = QTAILQ_HEAD_INITIALIZER(archipelago_runtime_opts.head), @@ -770,6 +905,7 @@ static BlockDriver bdrv_archipelago = { .format_name = "archipelago", .protocol_name = "archipelago", .instance_size = sizeof(BDRVArchipelagoState), + .bdrv_parse_filename = archipelago_parse_filename, .bdrv_file_open = qemu_archipelago_open, .bdrv_close = qemu_archipelago_close, .bdrv_getlength = qemu_archipelago_getlength, From 76d3d83a377aea9d3117a56de707df489c801b1a Mon Sep 17 00:00:00 2001 From: Chrysostomos Nanakos Date: Wed, 23 Jul 2014 17:07:31 +0300 Subject: [PATCH 21/59] block/archipelago: Add support for creating images qemu-img archipelago:[/mport=[:vport=] [:segment=]] [size] Signed-off-by: Chrysostomos Nanakos Signed-off-by: Stefan Hajnoczi --- block/archipelago.c | 146 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) diff --git a/block/archipelago.c b/block/archipelago.c index 297fc27529..6629d03a1d 100644 --- a/block/archipelago.c +++ b/block/archipelago.c @@ -592,6 +592,137 @@ err_exit: xseg_leave(s->xseg); } +static int qemu_archipelago_create_volume(Error **errp, const char *volname, + char *segment_name, + uint64_t size, xport mportno, + xport vportno) +{ + int ret, targetlen; + struct xseg *xseg = NULL; + struct xseg_request *req; + struct xseg_request_clone *xclone; + struct xseg_port *port; + xport srcport = NoPort, sport = NoPort; + char *target; + + /* Try default values if none has been set */ + if (mportno == (xport) -1) { + mportno = ARCHIPELAGO_DFL_MPORT; + } + + if (vportno == (xport) -1) { + vportno = ARCHIPELAGO_DFL_VPORT; + } + + if (xseg_initialize()) { + error_setg(errp, "Cannot initialize XSEG"); + return -1; + } + + xseg = xseg_join("posix", segment_name, + "posixfd", NULL); + + if (!xseg) { + error_setg(errp, "Cannot join XSEG shared memory segment"); + return -1; + } + + port = xseg_bind_dynport(xseg); + srcport = port->portno; + init_local_signal(xseg, sport, srcport); + + req = xseg_get_request(xseg, srcport, mportno, X_ALLOC); + if (!req) { + error_setg(errp, "Cannot get XSEG request"); + return -1; + } + + targetlen = strlen(volname); + ret = xseg_prep_request(xseg, req, targetlen, + sizeof(struct xseg_request_clone)); + if (ret < 0) { + error_setg(errp, "Cannot prepare XSEG request"); + goto err_exit; + } + + target = xseg_get_target(xseg, req); + if (!target) { + error_setg(errp, "Cannot get XSEG target.\n"); + goto err_exit; + } + memcpy(target, volname, targetlen); + xclone = (struct xseg_request_clone *) xseg_get_data(xseg, req); + memset(xclone->target, 0 , XSEG_MAX_TARGETLEN); + xclone->targetlen = 0; + xclone->size = size; + req->offset = 0; + req->size = req->datalen; + req->op = X_CLONE; + + xport p = xseg_submit(xseg, req, srcport, X_ALLOC); + if (p == NoPort) { + error_setg(errp, "Could not submit XSEG request"); + goto err_exit; + } + xseg_signal(xseg, p); + + ret = wait_reply(xseg, srcport, port, req); + if (ret < 0) { + error_setg(errp, "wait_reply() error."); + } + + xseg_put_request(xseg, req, srcport); + xseg_quit_local_signal(xseg, srcport); + xseg_leave_dynport(xseg, port); + xseg_leave(xseg); + return ret; + +err_exit: + xseg_put_request(xseg, req, srcport); + xseg_quit_local_signal(xseg, srcport); + xseg_leave_dynport(xseg, port); + xseg_leave(xseg); + return -1; +} + +static int qemu_archipelago_create(const char *filename, + QemuOpts *options, + Error **errp) +{ + int ret = 0; + uint64_t total_size = 0; + char *volname = NULL, *segment_name = NULL; + const char *start; + xport mport = NoPort, vport = NoPort; + + if (!strstart(filename, "archipelago:", &start)) { + error_setg(errp, "File name must start with 'archipelago:'"); + return -1; + } + + if (!strlen(start) || strstart(start, "/", NULL)) { + error_setg(errp, "volume name must be specified"); + return -1; + } + + parse_filename_opts(filename, errp, &volname, &segment_name, &mport, + &vport); + total_size = qemu_opt_get_size_del(options, BLOCK_OPT_SIZE, 0); + + if (segment_name == NULL) { + segment_name = g_strdup("archipelago"); + } + + /* Create an Archipelago volume */ + ret = qemu_archipelago_create_volume(errp, volname, segment_name, + total_size, mport, + vport); + + g_free(volname); + g_free(segment_name); + return ret; +} + static void qemu_archipelago_aio_cancel(BlockDriverAIOCB *blockacb) { ArchipelagoAIOCB *aio_cb = (ArchipelagoAIOCB *) blockacb; @@ -894,6 +1025,19 @@ static int64_t qemu_archipelago_getlength(BlockDriverState *bs) return ret; } +static QemuOptsList qemu_archipelago_create_opts = { + .name = "archipelago-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(qemu_archipelago_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { /* end of list */ } + } +}; + static BlockDriverAIOCB *qemu_archipelago_aio_flush(BlockDriverState *bs, BlockDriverCompletionFunc *cb, void *opaque) { @@ -908,11 +1052,13 @@ static BlockDriver bdrv_archipelago = { .bdrv_parse_filename = archipelago_parse_filename, .bdrv_file_open = qemu_archipelago_open, .bdrv_close = qemu_archipelago_close, + .bdrv_create = qemu_archipelago_create, .bdrv_getlength = qemu_archipelago_getlength, .bdrv_aio_readv = qemu_archipelago_aio_readv, .bdrv_aio_writev = qemu_archipelago_aio_writev, .bdrv_aio_flush = qemu_archipelago_aio_flush, .bdrv_has_zero_init = bdrv_has_zero_init_1, + .create_opts = &qemu_archipelago_create_opts, }; static void bdrv_archipelago_init(void) From b1de5f439d46d690518e067faa429d4cd131ada5 Mon Sep 17 00:00:00 2001 From: Chrysostomos Nanakos Date: Wed, 30 Jul 2014 20:59:09 +0300 Subject: [PATCH 22/59] QMP: Add support for Archipelago Introduce new enum BlockdevOptionsArchipelago. @volume: #Name of the Archipelago volume image @mport: #'mport' is the port number on which mapperd is listening. This is optional and if not specified, QEMU will make Archipelago to use the default port. @vport: #'vport' is the port number on which vlmcd is listening. This is optional and if not specified, QEMU will make Archipelago to use the default port. @segment: #optional The name of the shared memory segment Archipelago stack is using. This is optional and if not specified, QEMU will make Archipelago use the default value, 'archipelago'. Signed-off-by: Chrysostomos Nanakos Signed-off-by: Stefan Hajnoczi --- qapi/block-core.json | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/qapi/block-core.json b/qapi/block-core.json index 72b4015fb4..fb74c56e32 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -197,6 +197,7 @@ # 'file', 'file', 'ftp', 'ftps', 'host_cdrom', 'host_device', # 'host_floppy', 'http', 'https', 'nbd', 'parallels', 'qcow', # 'qcow2', 'raw', 'tftp', 'vdi', 'vmdk', 'vpc', 'vvfat' +# 2.2: 'archipelago' # # @backing_file: #optional the name of the backing file (for copy-on-write) # @@ -1146,7 +1147,7 @@ # Since: 2.0 ## { 'enum': 'BlockdevDriver', - 'data': [ 'file', 'host_device', 'host_cdrom', 'host_floppy', + 'data': [ 'archipelago', 'file', 'host_device', 'host_cdrom', 'host_floppy', 'http', 'https', 'ftp', 'ftps', 'tftp', 'vvfat', 'blkdebug', 'blkverify', 'bochs', 'cloop', 'cow', 'dmg', 'parallels', 'qcow', 'qcow2', 'qed', 'raw', 'vdi', 'vhdx', 'vmdk', 'vpc', 'quorum' ] } @@ -1276,6 +1277,37 @@ '*pass-discard-snapshot': 'bool', '*pass-discard-other': 'bool' } } + +## +# @BlockdevOptionsArchipelago +# +# Driver specific block device options for Archipelago. +# +# @volume: Name of the Archipelago volume image +# +# @mport: #optional The port number on which mapperd is +# listening. This is optional +# and if not specified, QEMU will make Archipelago +# use the default port (1001). +# +# @vport: #optional The port number on which vlmcd is +# listening. This is optional +# and if not specified, QEMU will make Archipelago +# use the default port (501). +# +# @segment: #optional The name of the shared memory segment +# Archipelago stack is using. This is optional +# and if not specified, QEMU will make Archipelago +# use the default value, 'archipelago'. +# Since: 2.2 +## +{ 'type': 'BlockdevOptionsArchipelago', + 'data': { 'volume': 'str', + '*mport': 'int', + '*vport': 'int', + '*segment': 'str' } } + + ## # @BlkdebugEvent # @@ -1419,6 +1451,7 @@ 'base': 'BlockdevOptionsBase', 'discriminator': 'driver', 'data': { + 'archipelago':'BlockdevOptionsArchipelago', 'file': 'BlockdevOptionsFile', 'host_device':'BlockdevOptionsFile', 'host_cdrom': 'BlockdevOptionsFile', From 746ebfa77aa7a9550d11d3e7c8c10ecb8393ec76 Mon Sep 17 00:00:00 2001 From: Chrysostomos Nanakos Date: Wed, 23 Jul 2014 17:07:33 +0300 Subject: [PATCH 23/59] qemu-iotests: add support for Archipelago protocol Signed-off-by: Chrysostomos Nanakos Signed-off-by: Stefan Hajnoczi --- tests/qemu-iotests/common | 6 ++++++ tests/qemu-iotests/common.rc | 9 ++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/tests/qemu-iotests/common b/tests/qemu-iotests/common index e4083f4212..70df659d5b 100644 --- a/tests/qemu-iotests/common +++ b/tests/qemu-iotests/common @@ -152,6 +152,7 @@ check options -nbd test nbd -ssh test ssh -nfs test nfs + -archipelago test archipelago -xdiff graphical mode diff -nocache use O_DIRECT on backing file -misalign misalign memory allocations @@ -263,6 +264,11 @@ testlist options xpand=false ;; + -archipelago) + IMGPROTO=archipelago + xpand=false + ;; + -nocache) CACHEMODE="none" CACHEMODE_IS_DEFAULT=false diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc index e0ea7e3a7d..3fd691e6cd 100644 --- a/tests/qemu-iotests/common.rc +++ b/tests/qemu-iotests/common.rc @@ -64,6 +64,8 @@ elif [ "$IMGPROTO" = "ssh" ]; then elif [ "$IMGPROTO" = "nfs" ]; then TEST_DIR="nfs://127.0.0.1/$TEST_DIR" TEST_IMG=$TEST_DIR/t.$IMGFMT +elif [ "$IMGPROTO" = "archipelago" ]; then + TEST_IMG="archipelago:at.$IMGFMT" else TEST_IMG=$IMGPROTO:$TEST_DIR/t.$IMGFMT fi @@ -163,7 +165,8 @@ _make_test_img() -e "s# lazy_refcounts=\\(on\\|off\\)##g" \ -e "s# block_size=[0-9]\\+##g" \ -e "s# block_state_zero=\\(on\\|off\\)##g" \ - -e "s# log_size=[0-9]\\+##g" + -e "s# log_size=[0-9]\\+##g" \ + -e "s/archipelago:a/TEST_DIR\//g" # Start an NBD server on the image file, which is what we'll be talking to if [ $IMGPROTO = "nbd" ]; then @@ -206,6 +209,10 @@ _cleanup_test_img() rbd --no-progress rm "$TEST_DIR/t.$IMGFMT" > /dev/null ;; + archipelago) + vlmc remove "at.$IMGFMT" > /dev/null + ;; + sheepdog) collie vdi delete "$TEST_DIR/t.$IMGFMT" ;; From ac2662a913ee5854b1269256adbdc14e57ba480a Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 7 Jul 2014 15:15:52 +0200 Subject: [PATCH 24/59] coroutine: make pool size dynamic Allow coroutine users to adjust the pool size. For example, if the guest has multiple emulated disk drives we should keep around more coroutines. Signed-off-by: Stefan Hajnoczi Reviewed-by: Eric Blake --- include/block/coroutine.h | 11 +++++++++++ qemu-coroutine.c | 26 +++++++++++++++++++++++--- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/include/block/coroutine.h b/include/block/coroutine.h index b408f96b82..b9b7f488c9 100644 --- a/include/block/coroutine.h +++ b/include/block/coroutine.h @@ -223,4 +223,15 @@ void coroutine_fn co_aio_sleep_ns(AioContext *ctx, QEMUClockType type, * Note that this function clobbers the handlers for the file descriptor. */ void coroutine_fn yield_until_fd_readable(int fd); + +/** + * Add or subtract from the coroutine pool size + * + * The coroutine implementation keeps a pool of coroutines to be reused by + * qemu_coroutine_create(). This makes coroutine creation cheap. Heavy + * coroutine users should call this to reserve pool space. Call it again with + * a negative number to release pool space. + */ +void qemu_coroutine_adjust_pool_size(int n); + #endif /* QEMU_COROUTINE_H */ diff --git a/qemu-coroutine.c b/qemu-coroutine.c index 470852100a..bd574aa1b5 100644 --- a/qemu-coroutine.c +++ b/qemu-coroutine.c @@ -19,14 +19,14 @@ #include "block/coroutine_int.h" enum { - /* Maximum free pool size prevents holding too many freed coroutines */ - POOL_MAX_SIZE = 64, + POOL_DEFAULT_SIZE = 64, }; /** Free list to speed up creation */ static QemuMutex pool_lock; static QSLIST_HEAD(, Coroutine) pool = QSLIST_HEAD_INITIALIZER(pool); static unsigned int pool_size; +static unsigned int pool_max_size = POOL_DEFAULT_SIZE; Coroutine *qemu_coroutine_create(CoroutineEntry *entry) { @@ -55,7 +55,7 @@ static void coroutine_delete(Coroutine *co) { if (CONFIG_COROUTINE_POOL) { qemu_mutex_lock(&pool_lock); - if (pool_size < POOL_MAX_SIZE) { + if (pool_size < pool_max_size) { QSLIST_INSERT_HEAD(&pool, co, pool_next); co->caller = NULL; pool_size++; @@ -137,3 +137,23 @@ void coroutine_fn qemu_coroutine_yield(void) self->caller = NULL; coroutine_swap(self, to); } + +void qemu_coroutine_adjust_pool_size(int n) +{ + qemu_mutex_lock(&pool_lock); + + pool_max_size += n; + + /* Callers should never take away more than they added */ + assert(pool_max_size >= POOL_DEFAULT_SIZE); + + /* Trim oversized pool down to new max */ + while (pool_size > pool_max_size) { + Coroutine *co = QSLIST_FIRST(&pool); + QSLIST_REMOVE_HEAD(&pool, pool_next); + pool_size--; + qemu_coroutine_delete(co); + } + + qemu_mutex_unlock(&pool_lock); +} From 2a87151fb2725658f7419038a80f91065159f34b Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 7 Jul 2014 15:15:53 +0200 Subject: [PATCH 25/59] block: bump coroutine pool size for drives When a BlockDriverState is associated with a storage controller DeviceState we expect guest I/O. Use this opportunity to bump the coroutine pool size by 64. This patch ensures that the coroutine pool size scales with the number of drives attached to the guest. It should increase coroutine pool usage (which makes qemu_coroutine_create() fast) without hogging too much memory when fewer drives are attached. Signed-off-by: Stefan Hajnoczi Reviewed-by: Eric Blake --- block.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/block.c b/block.c index 7cafc49612..6b29285381 100644 --- a/block.c +++ b/block.c @@ -57,6 +57,8 @@ struct BdrvDirtyBitmap { #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ +#define COROUTINE_POOL_RESERVATION 64 /* number of coroutines to reserve */ + static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load); static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, @@ -2107,6 +2109,9 @@ int bdrv_attach_dev(BlockDriverState *bs, void *dev) } bs->dev = dev; bdrv_iostatus_reset(bs); + + /* We're expecting I/O from the device so bump up coroutine pool size */ + qemu_coroutine_adjust_pool_size(COROUTINE_POOL_RESERVATION); return 0; } @@ -2126,6 +2131,7 @@ void bdrv_detach_dev(BlockDriverState *bs, void *dev) bs->dev_ops = NULL; bs->dev_opaque = NULL; bs->guest_block_size = 512; + qemu_coroutine_adjust_pool_size(-COROUTINE_POOL_RESERVATION); } /* TODO change to return DeviceState * when all users are qdevified */ From c2e50e3d11a0bf4c973cc30478c1af0f2d5f8e81 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 15 Jul 2014 16:44:25 +0200 Subject: [PATCH 26/59] thread-pool: avoid per-thread-pool EventNotifier EventNotifier is implemented using an eventfd or pipe. It therefore consumes file descriptors, which can be limited by rlimits and should therefore be used sparingly. Switch from EventNotifier to QEMUBH in thread-pool.c. Originally EventNotifier was used because qemu_bh_schedule() was not thread-safe yet. Reported-by: Christian Borntraeger Signed-off-by: Stefan Hajnoczi --- thread-pool.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/thread-pool.c b/thread-pool.c index dfb699dd94..4cfd07893f 100644 --- a/thread-pool.c +++ b/thread-pool.c @@ -21,7 +21,6 @@ #include "block/coroutine.h" #include "trace.h" #include "block/block_int.h" -#include "qemu/event_notifier.h" #include "block/thread-pool.h" #include "qemu/main-loop.h" @@ -57,8 +56,8 @@ struct ThreadPoolElement { }; struct ThreadPool { - EventNotifier notifier; AioContext *ctx; + QEMUBH *completion_bh; QemuMutex lock; QemuCond check_cancel; QemuCond worker_stopped; @@ -119,7 +118,7 @@ static void *worker_thread(void *opaque) qemu_cond_broadcast(&pool->check_cancel); } - event_notifier_set(&pool->notifier); + qemu_bh_schedule(pool->completion_bh); } pool->cur_threads--; @@ -168,12 +167,11 @@ static void spawn_thread(ThreadPool *pool) } } -static void event_notifier_ready(EventNotifier *notifier) +static void thread_pool_completion_bh(void *opaque) { - ThreadPool *pool = container_of(notifier, ThreadPool, notifier); + ThreadPool *pool = opaque; ThreadPoolElement *elem, *next; - event_notifier_test_and_clear(notifier); restart: QLIST_FOREACH_SAFE(elem, &pool->head, all, next) { if (elem->state != THREAD_CANCELED && elem->state != THREAD_DONE) { @@ -215,7 +213,7 @@ static void thread_pool_cancel(BlockDriverAIOCB *acb) qemu_sem_timedwait(&pool->sem, 0) == 0) { QTAILQ_REMOVE(&pool->request_list, elem, reqs); elem->state = THREAD_CANCELED; - event_notifier_set(&pool->notifier); + qemu_bh_schedule(pool->completion_bh); } else { pool->pending_cancellations++; while (elem->state != THREAD_CANCELED && elem->state != THREAD_DONE) { @@ -224,7 +222,7 @@ static void thread_pool_cancel(BlockDriverAIOCB *acb) pool->pending_cancellations--; } qemu_mutex_unlock(&pool->lock); - event_notifier_ready(&pool->notifier); + thread_pool_completion_bh(pool); } static const AIOCBInfo thread_pool_aiocb_info = { @@ -293,8 +291,8 @@ static void thread_pool_init_one(ThreadPool *pool, AioContext *ctx) } memset(pool, 0, sizeof(*pool)); - event_notifier_init(&pool->notifier, false); pool->ctx = ctx; + pool->completion_bh = aio_bh_new(ctx, thread_pool_completion_bh, pool); qemu_mutex_init(&pool->lock); qemu_cond_init(&pool->check_cancel); qemu_cond_init(&pool->worker_stopped); @@ -304,8 +302,6 @@ static void thread_pool_init_one(ThreadPool *pool, AioContext *ctx) QLIST_INIT(&pool->head); QTAILQ_INIT(&pool->request_list); - - aio_set_event_notifier(ctx, &pool->notifier, event_notifier_ready); } ThreadPool *thread_pool_new(AioContext *ctx) @@ -339,11 +335,10 @@ void thread_pool_free(ThreadPool *pool) qemu_mutex_unlock(&pool->lock); - aio_set_event_notifier(pool->ctx, &pool->notifier, NULL); + qemu_bh_delete(pool->completion_bh); qemu_sem_destroy(&pool->sem); qemu_cond_destroy(&pool->check_cancel); qemu_cond_destroy(&pool->worker_stopped); qemu_mutex_destroy(&pool->lock); - event_notifier_cleanup(&pool->notifier); g_free(pool); } From 3c80ca158c96ff902a30883a8933e755988948b1 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 15 Jul 2014 16:44:26 +0200 Subject: [PATCH 27/59] thread-pool: avoid deadlock in nested aio_poll() calls The thread pool has a race condition if two elements complete before thread_pool_completion_bh() runs: If element A's callback waits for element B using aio_poll() it will deadlock since pool->completion_bh is not marked scheduled when the nested aio_poll() runs. Fix this by marking the BH scheduled while thread_pool_completion_bh() is executing. This way any nested aio_poll() loops will enter thread_pool_completion_bh() and complete the remaining elements. Signed-off-by: Stefan Hajnoczi --- thread-pool.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/thread-pool.c b/thread-pool.c index 4cfd07893f..23888dcfc4 100644 --- a/thread-pool.c +++ b/thread-pool.c @@ -185,6 +185,12 @@ restart: QLIST_REMOVE(elem, all); /* Read state before ret. */ smp_rmb(); + + /* Schedule ourselves in case elem->common.cb() calls aio_poll() to + * wait for another request that completed at the same time. + */ + qemu_bh_schedule(pool->completion_bh); + elem->common.cb(elem->common.opaque, elem->ret); qemu_aio_release(elem); goto restart; From 349592e0b9112171500e940dd921bb96cfc496d2 Mon Sep 17 00:00:00 2001 From: Jeff Cody Date: Wed, 6 Aug 2014 15:54:57 -0400 Subject: [PATCH 28/59] block: vhdx - add error check This add an error check for an invalid descriptor entry signature, when flushing the log descriptor entries. Signed-off-by: Jeff Cody Reviewed-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/vhdx-log.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/block/vhdx-log.c b/block/vhdx-log.c index a77c040ee0..7c2630d659 100644 --- a/block/vhdx-log.c +++ b/block/vhdx-log.c @@ -435,6 +435,11 @@ static int vhdx_log_flush_desc(BlockDriverState *bs, VHDXLogDescriptor *desc, /* write 'count' sectors of sector */ memset(buffer, 0, VHDX_LOG_SECTOR_SIZE); count = desc->zero_length / VHDX_LOG_SECTOR_SIZE; + } else { + error_report("Invalid VHDX log descriptor entry signature 0x%" PRIx32, + desc->signature); + ret = -EINVAL; + goto exit; } file_offset = desc->file_offset; From 4f75b52a07e0a0e71c1c81d0cb0ddf9c52c8e1bc Mon Sep 17 00:00:00 2001 From: Jeff Cody Date: Wed, 6 Aug 2014 15:54:58 -0400 Subject: [PATCH 29/59] block: VHDX endian fixes This patch contains several changes for endian conversion fixes for VHDX, particularly for big-endian machines (multibyte values in VHDX are all on disk in LE format). Tests were done with existing qemu-iotests on an IBM POWER7 (8406-71Y). This includes sample images created by Hyper-V, both with dirty logs and without. In addition, VHDX image files created (and written to) on a BE machine were tested on a LE machine, and vice-versa. Reported-by: Markus Armburster Reported-by: Paolo Bonzini Signed-off-by: Jeff Cody Reviewed-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block/vhdx-endian.c | 11 +++++- block/vhdx-log.c | 43 +++++++++++++--------- block/vhdx.c | 89 +++++++++++++++++++++++++++------------------ block/vhdx.h | 1 + 4 files changed, 88 insertions(+), 56 deletions(-) diff --git a/block/vhdx-endian.c b/block/vhdx-endian.c index fe879ed995..0640d3f4a9 100644 --- a/block/vhdx-endian.c +++ b/block/vhdx-endian.c @@ -82,8 +82,6 @@ void vhdx_log_desc_le_import(VHDXLogDescriptor *d) assert(d != NULL); le32_to_cpus(&d->signature); - le32_to_cpus(&d->trailing_bytes); - le64_to_cpus(&d->leading_bytes); le64_to_cpus(&d->file_offset); le64_to_cpus(&d->sequence_number); } @@ -99,6 +97,15 @@ void vhdx_log_desc_le_export(VHDXLogDescriptor *d) cpu_to_le64s(&d->sequence_number); } +void vhdx_log_data_le_import(VHDXLogDataSector *d) +{ + assert(d != NULL); + + le32_to_cpus(&d->data_signature); + le32_to_cpus(&d->sequence_high); + le32_to_cpus(&d->sequence_low); +} + void vhdx_log_data_le_export(VHDXLogDataSector *d) { assert(d != NULL); diff --git a/block/vhdx-log.c b/block/vhdx-log.c index 7c2630d659..0088be8747 100644 --- a/block/vhdx-log.c +++ b/block/vhdx-log.c @@ -84,6 +84,7 @@ static int vhdx_log_peek_hdr(BlockDriverState *bs, VHDXLogEntries *log, if (ret < 0) { goto exit; } + vhdx_log_entry_hdr_le_import(hdr); exit: return ret; @@ -211,7 +212,7 @@ static bool vhdx_log_hdr_is_valid(VHDXLogEntries *log, VHDXLogEntryHeader *hdr, { int valid = false; - if (memcmp(&hdr->signature, "loge", 4)) { + if (hdr->signature != VHDX_LOG_SIGNATURE) { goto exit; } @@ -275,12 +276,12 @@ static bool vhdx_log_desc_is_valid(VHDXLogDescriptor *desc, goto exit; } - if (!memcmp(&desc->signature, "zero", 4)) { + if (desc->signature == VHDX_LOG_ZERO_SIGNATURE) { if (desc->zero_length % VHDX_LOG_SECTOR_SIZE == 0) { /* valid */ ret = true; } - } else if (!memcmp(&desc->signature, "desc", 4)) { + } else if (desc->signature == VHDX_LOG_DESC_SIGNATURE) { /* valid */ ret = true; } @@ -327,13 +328,15 @@ static int vhdx_compute_desc_sectors(uint32_t desc_cnt) * passed into this function. Each descriptor will also be validated, * and error returned if any are invalid. */ static int vhdx_log_read_desc(BlockDriverState *bs, BDRVVHDXState *s, - VHDXLogEntries *log, VHDXLogDescEntries **buffer) + VHDXLogEntries *log, VHDXLogDescEntries **buffer, + bool convert_endian) { int ret = 0; uint32_t desc_sectors; uint32_t sectors_read; VHDXLogEntryHeader hdr; VHDXLogDescEntries *desc_entries = NULL; + VHDXLogDescriptor desc; int i; assert(*buffer == NULL); @@ -342,7 +345,7 @@ static int vhdx_log_read_desc(BlockDriverState *bs, BDRVVHDXState *s, if (ret < 0) { goto exit; } - vhdx_log_entry_hdr_le_import(&hdr); + if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) { ret = -EINVAL; goto exit; @@ -363,12 +366,19 @@ static int vhdx_log_read_desc(BlockDriverState *bs, BDRVVHDXState *s, /* put in proper endianness, and validate each desc */ for (i = 0; i < hdr.descriptor_count; i++) { - vhdx_log_desc_le_import(&desc_entries->desc[i]); - if (vhdx_log_desc_is_valid(&desc_entries->desc[i], &hdr) == false) { + desc = desc_entries->desc[i]; + vhdx_log_desc_le_import(&desc); + if (convert_endian) { + desc_entries->desc[i] = desc; + } + if (vhdx_log_desc_is_valid(&desc, &hdr) == false) { ret = -EINVAL; goto free_and_exit; } } + if (convert_endian) { + desc_entries->hdr = hdr; + } *buffer = desc_entries; goto exit; @@ -403,7 +413,7 @@ static int vhdx_log_flush_desc(BlockDriverState *bs, VHDXLogDescriptor *desc, buffer = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE); - if (!memcmp(&desc->signature, "desc", 4)) { + if (desc->signature == VHDX_LOG_DESC_SIGNATURE) { /* data sector */ if (data == NULL) { ret = -EFAULT; @@ -431,7 +441,7 @@ static int vhdx_log_flush_desc(BlockDriverState *bs, VHDXLogDescriptor *desc, memcpy(buffer+offset, &desc->trailing_bytes, 4); - } else if (!memcmp(&desc->signature, "zero", 4)) { + } else if (desc->signature == VHDX_LOG_ZERO_SIGNATURE) { /* write 'count' sectors of sector */ memset(buffer, 0, VHDX_LOG_SECTOR_SIZE); count = desc->zero_length / VHDX_LOG_SECTOR_SIZE; @@ -498,13 +508,13 @@ static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s, goto exit; } - ret = vhdx_log_read_desc(bs, s, &logs->log, &desc_entries); + ret = vhdx_log_read_desc(bs, s, &logs->log, &desc_entries, true); if (ret < 0) { goto exit; } for (i = 0; i < desc_entries->hdr.descriptor_count; i++) { - if (!memcmp(&desc_entries->desc[i].signature, "desc", 4)) { + if (desc_entries->desc[i].signature == VHDX_LOG_DESC_SIGNATURE) { /* data sector, so read a sector to flush */ ret = vhdx_log_read_sectors(bs, &logs->log, §ors_read, data, 1, false); @@ -515,6 +525,7 @@ static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s, ret = -EINVAL; goto exit; } + vhdx_log_data_le_import(data); } ret = vhdx_log_flush_desc(bs, &desc_entries->desc[i], data); @@ -563,9 +574,6 @@ static int vhdx_validate_log_entry(BlockDriverState *bs, BDRVVHDXState *s, goto inc_and_exit; } - vhdx_log_entry_hdr_le_import(&hdr); - - if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) { goto inc_and_exit; } @@ -578,13 +586,13 @@ static int vhdx_validate_log_entry(BlockDriverState *bs, BDRVVHDXState *s, desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count); - /* Read desc sectors, and calculate log checksum */ + /* Read all log sectors, and calculate log checksum */ total_sectors = hdr.entry_length / VHDX_LOG_SECTOR_SIZE; /* read_desc() will increment the read idx */ - ret = vhdx_log_read_desc(bs, s, log, &desc_buffer); + ret = vhdx_log_read_desc(bs, s, log, &desc_buffer, false); if (ret < 0) { goto free_and_exit; } @@ -607,7 +615,7 @@ static int vhdx_validate_log_entry(BlockDriverState *bs, BDRVVHDXState *s, } } crc ^= 0xffffffff; - if (crc != desc_buffer->hdr.checksum) { + if (crc != hdr.checksum) { goto free_and_exit; } @@ -967,7 +975,6 @@ static int vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s, * last data sector */ vhdx_update_checksum(buffer, total_length, offsetof(VHDXLogEntryHeader, checksum)); - cpu_to_le32s((uint32_t *)(buffer + 4)); /* now write to the log */ ret = vhdx_log_write_sectors(bs, &s->log, §ors_written, buffer, diff --git a/block/vhdx.c b/block/vhdx.c index fedcf9f9ca..febce21f82 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -135,10 +135,8 @@ typedef struct VHDXSectorInfo { * buf: buffer pointer * size: size of buffer (must be > crc_offset+4) * - * Note: The resulting checksum is in the CPU endianness, not necessarily - * in the file format endianness (LE). Any header export to disk should - * make sure that vhdx_header_le_export() is used to convert to the - * correct endianness + * Note: The buffer should have all multi-byte data in little-endian format, + * and the resulting checksum is in little endian format. */ uint32_t vhdx_update_checksum(uint8_t *buf, size_t size, int crc_offset) { @@ -149,6 +147,7 @@ uint32_t vhdx_update_checksum(uint8_t *buf, size_t size, int crc_offset) memset(buf + crc_offset, 0, sizeof(crc)); crc = crc32c(0xffffffff, buf, size); + cpu_to_le32s(&crc); memcpy(buf + crc_offset, &crc, sizeof(crc)); return crc; @@ -300,7 +299,7 @@ static int vhdx_write_header(BlockDriverState *bs_file, VHDXHeader *hdr, { uint8_t *buffer = NULL; int ret; - VHDXHeader header_le; + VHDXHeader *header_le; assert(bs_file != NULL); assert(hdr != NULL); @@ -321,11 +320,12 @@ static int vhdx_write_header(BlockDriverState *bs_file, VHDXHeader *hdr, } /* overwrite the actual VHDXHeader portion */ - memcpy(buffer, hdr, sizeof(VHDXHeader)); - hdr->checksum = vhdx_update_checksum(buffer, VHDX_HEADER_SIZE, - offsetof(VHDXHeader, checksum)); - vhdx_header_le_export(hdr, &header_le); - ret = bdrv_pwrite_sync(bs_file, offset, &header_le, sizeof(VHDXHeader)); + header_le = (VHDXHeader *)buffer; + memcpy(header_le, hdr, sizeof(VHDXHeader)); + vhdx_header_le_export(hdr, header_le); + vhdx_update_checksum(buffer, VHDX_HEADER_SIZE, + offsetof(VHDXHeader, checksum)); + ret = bdrv_pwrite_sync(bs_file, offset, header_le, sizeof(VHDXHeader)); exit: qemu_vfree(buffer); @@ -432,13 +432,14 @@ static void vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s, } /* copy over just the relevant portion that we need */ memcpy(header1, buffer, sizeof(VHDXHeader)); - vhdx_header_le_import(header1); - if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) && - !memcmp(&header1->signature, "head", 4) && - header1->version == 1) { - h1_seq = header1->sequence_number; - h1_valid = true; + if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4)) { + vhdx_header_le_import(header1); + if (header1->signature == VHDX_HEADER_SIGNATURE && + header1->version == 1) { + h1_seq = header1->sequence_number; + h1_valid = true; + } } ret = bdrv_pread(bs->file, VHDX_HEADER2_OFFSET, buffer, VHDX_HEADER_SIZE); @@ -447,13 +448,14 @@ static void vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s, } /* copy over just the relevant portion that we need */ memcpy(header2, buffer, sizeof(VHDXHeader)); - vhdx_header_le_import(header2); - if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) && - !memcmp(&header2->signature, "head", 4) && - header2->version == 1) { - h2_seq = header2->sequence_number; - h2_valid = true; + if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4)) { + vhdx_header_le_import(header2); + if (header2->signature == VHDX_HEADER_SIGNATURE && + header2->version == 1) { + h2_seq = header2->sequence_number; + h2_valid = true; + } } /* If there is only 1 valid header (or no valid headers), we @@ -519,15 +521,21 @@ static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s) goto fail; } memcpy(&s->rt, buffer, sizeof(s->rt)); - vhdx_region_header_le_import(&s->rt); offset += sizeof(s->rt); - if (!vhdx_checksum_is_valid(buffer, VHDX_HEADER_BLOCK_SIZE, 4) || - memcmp(&s->rt.signature, "regi", 4)) { + if (!vhdx_checksum_is_valid(buffer, VHDX_HEADER_BLOCK_SIZE, 4)) { ret = -EINVAL; goto fail; } + vhdx_region_header_le_import(&s->rt); + + if (s->rt.signature != VHDX_REGION_SIGNATURE) { + ret = -EINVAL; + goto fail; + } + + /* Per spec, maximum region table entry count is 2047 */ if (s->rt.entry_count > 2047) { ret = -EINVAL; @@ -630,7 +638,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) vhdx_metadata_header_le_import(&s->metadata_hdr); - if (memcmp(&s->metadata_hdr.signature, "metadata", 8)) { + if (s->metadata_hdr.signature != VHDX_METADATA_SIGNATURE) { ret = -EINVAL; goto exit; } @@ -951,7 +959,6 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags, /* s->bat is freed in vhdx_close() */ s->bat = qemu_blockalign(bs, s->bat_rt.length); - ret = bdrv_pread(bs->file, s->bat_offset, s->bat, s->bat_rt.length); if (ret < 0) { goto fail; @@ -1540,7 +1547,8 @@ exit: */ static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s, uint64_t image_size, VHDXImageType type, - bool use_zero_blocks, VHDXRegionTableEntry *rt_bat) + bool use_zero_blocks, uint64_t file_offset, + uint32_t length) { int ret = 0; uint64_t data_file_offset; @@ -1555,7 +1563,7 @@ static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s, /* this gives a data start after BAT/bitmap entries, and well * past any metadata entries (with a 4 MB buffer for future * expansion */ - data_file_offset = rt_bat->file_offset + rt_bat->length + 5 * MiB; + data_file_offset = file_offset + length + 5 * MiB; total_sectors = image_size >> s->logical_sector_size_bits; if (type == VHDX_TYPE_DYNAMIC) { @@ -1579,7 +1587,7 @@ static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s, use_zero_blocks || bdrv_has_zero_init(bs) == 0) { /* for a fixed file, the default BAT entry is not zero */ - s->bat = g_malloc0(rt_bat->length); + s->bat = g_malloc0(length); block_state = type == VHDX_TYPE_FIXED ? PAYLOAD_BLOCK_FULLY_PRESENT : PAYLOAD_BLOCK_NOT_PRESENT; block_state = use_zero_blocks ? PAYLOAD_BLOCK_ZERO : block_state; @@ -1594,7 +1602,7 @@ static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s, cpu_to_le64s(&s->bat[sinfo.bat_idx]); sector_num += s->sectors_per_block; } - ret = bdrv_pwrite(bs, rt_bat->file_offset, s->bat, rt_bat->length); + ret = bdrv_pwrite(bs, file_offset, s->bat, length); if (ret < 0) { goto exit; } @@ -1626,6 +1634,8 @@ static int vhdx_create_new_region_table(BlockDriverState *bs, int ret = 0; uint32_t offset = 0; void *buffer = NULL; + uint64_t bat_file_offset; + uint32_t bat_length; BDRVVHDXState *s = NULL; VHDXRegionTableHeader *region_table; VHDXRegionTableEntry *rt_bat; @@ -1674,19 +1684,26 @@ static int vhdx_create_new_region_table(BlockDriverState *bs, rt_metadata->length = 1 * MiB; /* min size, and more than enough */ *metadata_offset = rt_metadata->file_offset; + bat_file_offset = rt_bat->file_offset; + bat_length = rt_bat->length; + + vhdx_region_header_le_export(region_table); + vhdx_region_entry_le_export(rt_bat); + vhdx_region_entry_le_export(rt_metadata); + vhdx_update_checksum(buffer, VHDX_HEADER_BLOCK_SIZE, offsetof(VHDXRegionTableHeader, checksum)); /* The region table gives us the data we need to create the BAT, * so do that now */ - ret = vhdx_create_bat(bs, s, image_size, type, use_zero_blocks, rt_bat); + ret = vhdx_create_bat(bs, s, image_size, type, use_zero_blocks, + bat_file_offset, bat_length); + if (ret < 0) { + goto exit; + } /* Now write out the region headers to disk */ - vhdx_region_header_le_export(region_table); - vhdx_region_entry_le_export(rt_bat); - vhdx_region_entry_le_export(rt_metadata); - ret = bdrv_pwrite(bs, VHDX_REGION_TABLE_OFFSET, buffer, VHDX_HEADER_BLOCK_SIZE); if (ret < 0) { diff --git a/block/vhdx.h b/block/vhdx.h index 5370010c59..b4a12a081e 100644 --- a/block/vhdx.h +++ b/block/vhdx.h @@ -435,6 +435,7 @@ void vhdx_header_le_import(VHDXHeader *h); void vhdx_header_le_export(VHDXHeader *orig_h, VHDXHeader *new_h); void vhdx_log_desc_le_import(VHDXLogDescriptor *d); void vhdx_log_desc_le_export(VHDXLogDescriptor *d); +void vhdx_log_data_le_import(VHDXLogDataSector *d); void vhdx_log_data_le_export(VHDXLogDataSector *d); void vhdx_log_entry_hdr_le_import(VHDXLogEntryHeader *hdr); void vhdx_log_entry_hdr_le_export(VHDXLogEntryHeader *hdr); From 58803ce74f986785badfb2ef0f630e1cd97f52e5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 6 Aug 2014 11:33:41 +0200 Subject: [PATCH 30/59] test-coroutine: add baseline test that times the cost of function calls This can be used to compute the cost of coroutine operations. In the end the cost of the function call is a few clock cycles, so it's pretty cheap for now, but it may become more relevant as the coroutine code is optimized. For example, here are the results on my machine: Function call 100000000 iterations: 0.173884 s Yield 100000000 iterations: 8.445064 s Lifecycle 1000000 iterations: 0.098445 s Nesting 10000 iterations of 1000 depth each: 7.406431 s One yield takes 83 nanoseconds, one enter takes 97 nanoseconds, one coroutine allocation takes (roughly, since some of the allocations in the nesting test do hit the pool) 739 nanoseconds: (8.445064 - 0.173884) * 10^9 / 100000000 = 82.7 (0.098445 * 100 - 0.173884) * 10^9 / 100000000 = 96.7 (7.406431 * 10 - 0.173884) * 10^9 / 100000000 = 738.9 Signed-off-by: Paolo Bonzini Reviewed-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- tests/test-coroutine.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/test-coroutine.c b/tests/test-coroutine.c index 760636ddc4..6e634f4a78 100644 --- a/tests/test-coroutine.c +++ b/tests/test-coroutine.c @@ -288,6 +288,29 @@ static void perf_yield(void) maxcycles, duration); } +static __attribute__((noinline)) void dummy(unsigned *i) +{ + (*i)--; +} + +static void perf_baseline(void) +{ + unsigned int i, maxcycles; + double duration; + + maxcycles = 100000000; + i = maxcycles; + + g_test_timer_start(); + while (i > 0) { + dummy(&i); + } + duration = g_test_timer_elapsed(); + + g_test_message("Function call %u iterations: %f s\n", + maxcycles, duration); +} + int main(int argc, char **argv) { g_test_init(&argc, &argv, NULL); @@ -301,6 +324,7 @@ int main(int argc, char **argv) g_test_add_func("/perf/lifecycle", perf_lifecycle); g_test_add_func("/perf/nesting", perf_nesting); g_test_add_func("/perf/yield", perf_yield); + g_test_add_func("/perf/function-call", perf_baseline); } return g_test_run(); } From 9a4d5ca60772e09d0cbac01f1b4778aa68e00eaa Mon Sep 17 00:00:00 2001 From: Jeff Cody Date: Wed, 23 Jul 2014 17:22:57 -0400 Subject: [PATCH 31/59] block: allow bdrv_unref() to be passed NULL pointers If bdrv_unref() is passed a NULL BDS pointer, it is safe to exit with no operation. This will allow cleanup code to blindly call bdrv_unref() on a BDS that has been initialized to NULL. Reviewed-by: Max Reitz Signed-off-by: Jeff Cody Signed-off-by: Kevin Wolf --- block.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block.c b/block.c index 6b29285381..d7cb7d48fa 100644 --- a/block.c +++ b/block.c @@ -5381,6 +5381,9 @@ void bdrv_ref(BlockDriverState *bs) * deleted. */ void bdrv_unref(BlockDriverState *bs) { + if (!bs) { + return; + } assert(bs->refcnt > 0); if (--bs->refcnt == 0) { bdrv_delete(bs); From 70747862f129ea0af5e3910f204cc93174c549e4 Mon Sep 17 00:00:00 2001 From: Jeff Cody Date: Wed, 23 Jul 2014 17:22:58 -0400 Subject: [PATCH 32/59] block: vdi - use block layer ops in vdi_create, instead of posix calls Use the block layer to create, and write to, the image file in the VDI .bdrv_create() operation. This has a couple of benefits: Images can now be created over protocols, and hacks such as NOCOW are not needed in the image format driver, and the underlying file protocol appropriate for the host OS can be relied upon. Also some minor cleanup for error handling. Reviewed-by: Max Reitz Signed-off-by: Jeff Cody Signed-off-by: Kevin Wolf --- block/vdi.c | 75 +++++++++++++++++++++-------------------------------- 1 file changed, 29 insertions(+), 46 deletions(-) diff --git a/block/vdi.c b/block/vdi.c index 197bd77c97..070acb622b 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -53,13 +53,6 @@ #include "block/block_int.h" #include "qemu/module.h" #include "migration/migration.h" -#ifdef __linux__ -#include -#include -#ifndef FS_NOCOW_FL -#define FS_NOCOW_FL 0x00800000 /* Do not cow file */ -#endif -#endif #if defined(CONFIG_UUID) #include @@ -681,7 +674,6 @@ static int vdi_co_write(BlockDriverState *bs, static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) { - int fd; int result = 0; uint64_t bytes = 0; uint32_t blocks; @@ -690,7 +682,10 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) VdiHeader header; size_t i; size_t bmap_size; - bool nocow = false; + int64_t offset = 0; + Error *local_err = NULL; + BlockDriverState *bs = NULL; + uint32_t *bmap = NULL; logout("\n"); @@ -707,7 +702,6 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) image_type = VDI_TYPE_STATIC; } #endif - nocow = qemu_opt_get_bool_del(opts, BLOCK_OPT_NOCOW, false); if (bytes > VDI_DISK_SIZE_MAX) { result = -ENOTSUP; @@ -717,27 +711,16 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) goto exit; } - fd = qemu_open(filename, - O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, - 0644); - if (fd < 0) { - result = -errno; + result = bdrv_create_file(filename, opts, &local_err); + if (result < 0) { + error_propagate(errp, local_err); goto exit; } - - if (nocow) { -#ifdef __linux__ - /* Set NOCOW flag to solve performance issue on fs like btrfs. - * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value will - * be ignored since any failure of this operation should not block the - * left work. - */ - int attr; - if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) { - attr |= FS_NOCOW_FL; - ioctl(fd, FS_IOC_SETFLAGS, &attr); - } -#endif + result = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, + NULL, &local_err); + if (result < 0) { + error_propagate(errp, local_err); + goto exit; } /* We need enough blocks to store the given disk size, @@ -769,13 +752,15 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) vdi_header_print(&header); #endif vdi_header_to_le(&header); - if (write(fd, &header, sizeof(header)) < 0) { - result = -errno; - goto close_and_exit; + result = bdrv_pwrite_sync(bs, offset, &header, sizeof(header)); + if (result < 0) { + error_setg(errp, "Error writing header to %s", filename); + goto exit; } + offset += sizeof(header); if (bmap_size > 0) { - uint32_t *bmap = g_malloc0(bmap_size); + bmap = g_malloc0(bmap_size); for (i = 0; i < blocks; i++) { if (image_type == VDI_TYPE_STATIC) { bmap[i] = i; @@ -783,27 +768,25 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) bmap[i] = VDI_UNALLOCATED; } } - if (write(fd, bmap, bmap_size) < 0) { - result = -errno; - g_free(bmap); - goto close_and_exit; + result = bdrv_pwrite_sync(bs, offset, bmap, bmap_size); + if (result < 0) { + error_setg(errp, "Error writing bmap to %s", filename); + goto exit; } - g_free(bmap); + offset += bmap_size; } if (image_type == VDI_TYPE_STATIC) { - if (ftruncate(fd, sizeof(header) + bmap_size + blocks * block_size)) { - result = -errno; - goto close_and_exit; + result = bdrv_truncate(bs, offset + blocks * block_size); + if (result < 0) { + error_setg(errp, "Failed to statically allocate %s", filename); + goto exit; } } -close_and_exit: - if ((close(fd) < 0) && !result) { - result = -errno; - } - exit: + bdrv_unref(bs); + g_free(bmap); return result; } From dddc7750d68c3ea86d88b060a77acb60d2eeb4a7 Mon Sep 17 00:00:00 2001 From: Jeff Cody Date: Wed, 23 Jul 2014 17:22:59 -0400 Subject: [PATCH 33/59] block: use the standard 'ret' instead of 'result' Most QEMU code uses 'ret' for function return values. The VDI driver uses a mix of 'result' and 'ret'. This cleans that up, switching over to the standard 'ret' usage. Reviewed-by: Max Reitz Signed-off-by: Jeff Cody Signed-off-by: Kevin Wolf --- block/vdi.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/block/vdi.c b/block/vdi.c index 070acb622b..9d62a3cb97 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -350,23 +350,23 @@ static int vdi_make_empty(BlockDriverState *bs) static int vdi_probe(const uint8_t *buf, int buf_size, const char *filename) { const VdiHeader *header = (const VdiHeader *)buf; - int result = 0; + int ret = 0; logout("\n"); if (buf_size < sizeof(*header)) { /* Header too small, no VDI. */ } else if (le32_to_cpu(header->signature) == VDI_SIGNATURE) { - result = 100; + ret = 100; } - if (result == 0) { + if (ret == 0) { logout("no vdi image\n"); } else { logout("%s", header->text); } - return result; + return ret; } static int vdi_open(BlockDriverState *bs, QDict *options, int flags, @@ -674,7 +674,7 @@ static int vdi_co_write(BlockDriverState *bs, static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) { - int result = 0; + int ret = 0; uint64_t bytes = 0; uint32_t blocks; size_t block_size = DEFAULT_CLUSTER_SIZE; @@ -704,21 +704,21 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) #endif if (bytes > VDI_DISK_SIZE_MAX) { - result = -ENOTSUP; + ret = -ENOTSUP; error_setg(errp, "Unsupported VDI image size (size is 0x%" PRIx64 ", max supported is 0x%" PRIx64 ")", bytes, VDI_DISK_SIZE_MAX); goto exit; } - result = bdrv_create_file(filename, opts, &local_err); - if (result < 0) { + ret = bdrv_create_file(filename, opts, &local_err); + if (ret < 0) { error_propagate(errp, local_err); goto exit; } - result = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, - NULL, &local_err); - if (result < 0) { + ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, + NULL, &local_err); + if (ret < 0) { error_propagate(errp, local_err); goto exit; } @@ -752,8 +752,8 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) vdi_header_print(&header); #endif vdi_header_to_le(&header); - result = bdrv_pwrite_sync(bs, offset, &header, sizeof(header)); - if (result < 0) { + ret = bdrv_pwrite_sync(bs, offset, &header, sizeof(header)); + if (ret < 0) { error_setg(errp, "Error writing header to %s", filename); goto exit; } @@ -768,8 +768,8 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) bmap[i] = VDI_UNALLOCATED; } } - result = bdrv_pwrite_sync(bs, offset, bmap, bmap_size); - if (result < 0) { + ret = bdrv_pwrite_sync(bs, offset, bmap, bmap_size); + if (ret < 0) { error_setg(errp, "Error writing bmap to %s", filename); goto exit; } @@ -777,8 +777,8 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) } if (image_type == VDI_TYPE_STATIC) { - result = bdrv_truncate(bs, offset + blocks * block_size); - if (result < 0) { + ret = bdrv_truncate(bs, offset + blocks * block_size); + if (ret < 0) { error_setg(errp, "Failed to statically allocate %s", filename); goto exit; } @@ -787,7 +787,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) exit: bdrv_unref(bs); g_free(bmap); - return result; + return ret; } static void vdi_close(BlockDriverState *bs) From fef6070eff233400015cede968b0afe46c80bb0f Mon Sep 17 00:00:00 2001 From: Jeff Cody Date: Wed, 23 Jul 2014 17:23:00 -0400 Subject: [PATCH 34/59] block: vpc - use block layer ops in vpc_create, instead of posix calls Use the block layer to create, and write to, the image file in the VPC .bdrv_create() operation. This has a couple of benefits: Images can now be created over protocols, and hacks such as NOCOW are not needed in the image format driver, and the underlying file protocol appropriate for the host OS can be relied upon. Reviewed-by: Max Reitz Signed-off-by: Jeff Cody Signed-off-by: Kevin Wolf --- block/vpc.c | 106 +++++++++++++++++++++------------------------------- 1 file changed, 43 insertions(+), 63 deletions(-) diff --git a/block/vpc.c b/block/vpc.c index 8b376a40be..96903441b8 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -29,13 +29,6 @@ #if defined(CONFIG_UUID) #include #endif -#ifdef __linux__ -#include -#include -#ifndef FS_NOCOW_FL -#define FS_NOCOW_FL 0x00800000 /* Do not cow file */ -#endif -#endif /**************************************************************/ @@ -656,39 +649,41 @@ static int calculate_geometry(int64_t total_sectors, uint16_t* cyls, return 0; } -static int create_dynamic_disk(int fd, uint8_t *buf, int64_t total_sectors) +static int create_dynamic_disk(BlockDriverState *bs, uint8_t *buf, + int64_t total_sectors) { VHDDynDiskHeader *dyndisk_header = (VHDDynDiskHeader *) buf; size_t block_size, num_bat_entries; int i; - int ret = -EIO; + int ret; + int64_t offset = 0; // Write the footer (twice: at the beginning and at the end) block_size = 0x200000; num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512); - if (write(fd, buf, HEADER_SIZE) != HEADER_SIZE) { + ret = bdrv_pwrite_sync(bs, offset, buf, HEADER_SIZE); + if (ret) { goto fail; } - if (lseek(fd, 1536 + ((num_bat_entries * 4 + 511) & ~511), SEEK_SET) < 0) { - goto fail; - } - if (write(fd, buf, HEADER_SIZE) != HEADER_SIZE) { + offset = 1536 + ((num_bat_entries * 4 + 511) & ~511); + ret = bdrv_pwrite_sync(bs, offset, buf, HEADER_SIZE); + if (ret < 0) { goto fail; } // Write the initial BAT - if (lseek(fd, 3 * 512, SEEK_SET) < 0) { - goto fail; - } + offset = 3 * 512; memset(buf, 0xFF, 512); for (i = 0; i < (num_bat_entries * 4 + 511) / 512; i++) { - if (write(fd, buf, 512) != 512) { + ret = bdrv_pwrite_sync(bs, offset, buf, 512); + if (ret < 0) { goto fail; } + offset += 512; } // Prepare the Dynamic Disk Header @@ -709,39 +704,35 @@ static int create_dynamic_disk(int fd, uint8_t *buf, int64_t total_sectors) dyndisk_header->checksum = be32_to_cpu(vpc_checksum(buf, 1024)); // Write the header - if (lseek(fd, 512, SEEK_SET) < 0) { - goto fail; - } + offset = 512; - if (write(fd, buf, 1024) != 1024) { + ret = bdrv_pwrite_sync(bs, offset, buf, 1024); + if (ret < 0) { goto fail; } - ret = 0; fail: return ret; } -static int create_fixed_disk(int fd, uint8_t *buf, int64_t total_size) +static int create_fixed_disk(BlockDriverState *bs, uint8_t *buf, + int64_t total_size) { - int ret = -EIO; + int ret; /* Add footer to total size */ - total_size += 512; - if (ftruncate(fd, total_size) != 0) { - ret = -errno; - goto fail; - } - if (lseek(fd, -512, SEEK_END) < 0) { - goto fail; - } - if (write(fd, buf, HEADER_SIZE) != HEADER_SIZE) { - goto fail; + total_size += HEADER_SIZE; + + ret = bdrv_truncate(bs, total_size); + if (ret < 0) { + return ret; } - ret = 0; + ret = bdrv_pwrite_sync(bs, total_size - HEADER_SIZE, buf, HEADER_SIZE); + if (ret < 0) { + return ret; + } - fail: return ret; } @@ -750,7 +741,7 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) uint8_t buf[1024]; VHDFooter *footer = (VHDFooter *) buf; char *disk_type_param; - int fd, i; + int i; uint16_t cyls = 0; uint8_t heads = 0; uint8_t secs_per_cyl = 0; @@ -758,7 +749,8 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) int64_t total_size; int disk_type; int ret = -EIO; - bool nocow = false; + Error *local_err = NULL; + BlockDriverState *bs = NULL; /* Read out options */ total_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0); @@ -775,28 +767,17 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) } else { disk_type = VHD_DYNAMIC; } - nocow = qemu_opt_get_bool_del(opts, BLOCK_OPT_NOCOW, false); - /* Create the file */ - fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644); - if (fd < 0) { - ret = -EIO; + ret = bdrv_create_file(filename, opts, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); goto out; } - - if (nocow) { -#ifdef __linux__ - /* Set NOCOW flag to solve performance issue on fs like btrfs. - * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value will - * be ignored since any failure of this operation should not block the - * left work. - */ - int attr; - if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) { - attr |= FS_NOCOW_FL; - ioctl(fd, FS_IOC_SETFLAGS, &attr); - } -#endif + ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, + NULL, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + goto out; } /* @@ -810,7 +791,7 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) &secs_per_cyl)) { ret = -EFBIG; - goto fail; + goto out; } } @@ -856,14 +837,13 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) footer->checksum = be32_to_cpu(vpc_checksum(buf, HEADER_SIZE)); if (disk_type == VHD_DYNAMIC) { - ret = create_dynamic_disk(fd, buf, total_sectors); + ret = create_dynamic_disk(bs, buf, total_sectors); } else { - ret = create_fixed_disk(fd, buf, total_size); + ret = create_fixed_disk(bs, buf, total_size); } -fail: - qemu_close(fd); out: + bdrv_unref(bs); g_free(disk_type_param); return ret; } From 23d20b5b4fb7bde102e6779b7a13b88375e4db66 Mon Sep 17 00:00:00 2001 From: Jeff Cody Date: Wed, 23 Jul 2014 17:23:01 -0400 Subject: [PATCH 35/59] block: iotest - update 084 to test static VDI image creation This updates the VDI corruption test to also test static VDI image creation, as well as the default dynamic image creation. Reviewed-by: Max Reitz Signed-off-by: Jeff Cody Signed-off-by: Kevin Wolf --- tests/qemu-iotests/084 | 16 ++++++++++++++-- tests/qemu-iotests/084.out | 14 ++++++++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/tests/qemu-iotests/084 b/tests/qemu-iotests/084 index cb4d7b729e..ae33c2cba4 100755 --- a/tests/qemu-iotests/084 +++ b/tests/qemu-iotests/084 @@ -1,6 +1,7 @@ #!/bin/bash # -# Test case for VDI header corruption; image too large, and too many blocks +# Test case for VDI header corruption; image too large, and too many blocks. +# Also simple test for creating dynamic and static VDI images. # # Copyright (C) 2013 Red Hat, Inc. # @@ -43,14 +44,25 @@ _supported_fmt vdi _supported_proto generic _supported_os Linux +size=64M ds_offset=368 # disk image size field offset bs_offset=376 # block size field offset bii_offset=384 # block in image field offset +echo +echo "=== Statically allocated image creation ===" +echo +_make_test_img $size -o static +_img_info +stat -c"disk image file size in bytes: %s" "${TEST_IMG}" +_cleanup_test_img + echo echo "=== Testing image size bounds ===" echo -_make_test_img 64M +_make_test_img $size +_img_info +stat -c"disk image file size in bytes: %s" "${TEST_IMG}" # check for image size too large # poke max image size, and appropriate blocks_in_image value diff --git a/tests/qemu-iotests/084.out b/tests/qemu-iotests/084.out index c7120d9b0b..ea29ae0b9d 100644 --- a/tests/qemu-iotests/084.out +++ b/tests/qemu-iotests/084.out @@ -1,8 +1,22 @@ QA output created by 084 +=== Statically allocated image creation === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 +image: TEST_DIR/t.IMGFMT +file format: IMGFMT +virtual size: 64M (67108864 bytes) +cluster_size: 1048576 +disk image file size in bytes: 67109888 + === Testing image size bounds === Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 +image: TEST_DIR/t.IMGFMT +file format: IMGFMT +virtual size: 64M (67108864 bytes) +cluster_size: 1048576 +disk image file size in bytes: 1024 Test 1: Maximum size (1024 TB): qemu-img: Could not open 'TEST_DIR/t.IMGFMT': Could not open 'TEST_DIR/t.IMGFMT': Invalid argument From 7d2a35cc921ea4832083a7e8598461868bb538ce Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 20 May 2014 12:24:05 +0200 Subject: [PATCH 36/59] block: Introduce qemu_try_blockalign() This function returns NULL instead of aborting when an allocation fails. Signed-off-by: Kevin Wolf Reviewed-by: Benoit Canet --- block.c | 13 +++++++++++++ include/block/block.h | 1 + include/qemu/osdep.h | 1 + util/oslib-posix.c | 16 ++++++++++------ util/oslib-win32.c | 9 +++++++-- 5 files changed, 32 insertions(+), 8 deletions(-) diff --git a/block.c b/block.c index d7cb7d48fa..3365c05b9d 100644 --- a/block.c +++ b/block.c @@ -5258,6 +5258,19 @@ void *qemu_blockalign(BlockDriverState *bs, size_t size) return qemu_memalign(bdrv_opt_mem_align(bs), size); } +void *qemu_try_blockalign(BlockDriverState *bs, size_t size) +{ + size_t align = bdrv_opt_mem_align(bs); + + /* Ensure that NULL is never returned on success */ + assert(align > 0); + if (size == 0) { + size = align; + } + + return qemu_try_memalign(align, size); +} + /* * Check if all memory in this vector is sector aligned. */ diff --git a/include/block/block.h b/include/block/block.h index d4c816d738..e94b701667 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -455,6 +455,7 @@ void bdrv_img_create(const char *filename, const char *fmt, size_t bdrv_opt_mem_align(BlockDriverState *bs); void bdrv_set_guest_block_size(BlockDriverState *bs, int align); void *qemu_blockalign(BlockDriverState *bs, size_t size); +void *qemu_try_blockalign(BlockDriverState *bs, size_t size); bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov); struct HBitmapIter; diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index 8480d523f0..9dd43fc2db 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -95,6 +95,7 @@ typedef signed int int_fast16_t; #define qemu_printf printf int qemu_daemon(int nochdir, int noclose); +void *qemu_try_memalign(size_t alignment, size_t size); void *qemu_memalign(size_t alignment, size_t size); void *qemu_anon_ram_alloc(size_t size); void qemu_vfree(void *ptr); diff --git a/util/oslib-posix.c b/util/oslib-posix.c index cdbfb2e270..016a047cfc 100644 --- a/util/oslib-posix.c +++ b/util/oslib-posix.c @@ -94,7 +94,7 @@ void *qemu_oom_check(void *ptr) return ptr; } -void *qemu_memalign(size_t alignment, size_t size) +void *qemu_try_memalign(size_t alignment, size_t size) { void *ptr; @@ -106,19 +106,23 @@ void *qemu_memalign(size_t alignment, size_t size) int ret; ret = posix_memalign(&ptr, alignment, size); if (ret != 0) { - fprintf(stderr, "Failed to allocate %zu B: %s\n", - size, strerror(ret)); - abort(); + errno = ret; + ptr = NULL; } #elif defined(CONFIG_BSD) - ptr = qemu_oom_check(valloc(size)); + ptr = valloc(size); #else - ptr = qemu_oom_check(memalign(alignment, size)); + ptr = memalign(alignment, size); #endif trace_qemu_memalign(alignment, size, ptr); return ptr; } +void *qemu_memalign(size_t alignment, size_t size) +{ + return qemu_oom_check(qemu_try_memalign(alignment, size)); +} + /* alloc shared memory pages */ void *qemu_anon_ram_alloc(size_t size) { diff --git a/util/oslib-win32.c b/util/oslib-win32.c index 507cedd84d..a3eab4acba 100644 --- a/util/oslib-win32.c +++ b/util/oslib-win32.c @@ -50,18 +50,23 @@ void *qemu_oom_check(void *ptr) return ptr; } -void *qemu_memalign(size_t alignment, size_t size) +void *qemu_try_memalign(size_t alignment, size_t size) { void *ptr; if (!size) { abort(); } - ptr = qemu_oom_check(VirtualAlloc(NULL, size, MEM_COMMIT, PAGE_READWRITE)); + ptr = VirtualAlloc(NULL, size, MEM_COMMIT, PAGE_READWRITE); trace_qemu_memalign(alignment, size, ptr); return ptr; } +void *qemu_memalign(size_t alignment, size_t size) +{ + return qemu_oom_check(qemu_try_memalign(alignment, size)); +} + void *qemu_anon_ram_alloc(size_t size) { void *ptr; From 857d4f46c31d2f4d57d2f0fad9dfb584262bf9b9 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 20 May 2014 13:16:51 +0200 Subject: [PATCH 37/59] block: Handle failure for potentially large allocations Some code in the block layer makes potentially huge allocations. Failure is not completely unexpected there, so avoid aborting qemu and handle out-of-memory situations gracefully. This patch addresses bounce buffer allocations in block.c. While at it, convert bdrv_commit() from plain g_malloc() to qemu_try_blockalign(). Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake Reviewed-by: Stefan Hajnoczi --- block.c | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/block.c b/block.c index 3365c05b9d..6ef3465b4a 100644 --- a/block.c +++ b/block.c @@ -2275,7 +2275,14 @@ int bdrv_commit(BlockDriverState *bs) } total_sectors = length >> BDRV_SECTOR_BITS; - buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE); + + /* qemu_try_blockalign() for bs will choose an alignment that works for + * bs->backing_hd as well, so no need to compare the alignment manually. */ + buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE); + if (buf == NULL) { + ret = -ENOMEM; + goto ro_cleanup; + } for (sector = 0; sector < total_sectors; sector += n) { ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n); @@ -2313,7 +2320,7 @@ int bdrv_commit(BlockDriverState *bs) ret = 0; ro_cleanup: - g_free(buf); + qemu_vfree(buf); if (ro) { /* ignoring error return here */ @@ -2972,7 +2979,12 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, cluster_sector_num, cluster_nb_sectors); iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; - iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len); + iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); + if (bounce_buffer == NULL) { + ret = -ENOMEM; + goto err; + } + qemu_iovec_init_external(&bounce_qiov, &iov, 1); ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, @@ -3256,7 +3268,11 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, /* Fall back to bounce buffer if write zeroes is unsupported */ iov.iov_len = num * BDRV_SECTOR_SIZE; if (iov.iov_base == NULL) { - iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE); + iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE); + if (iov.iov_base == NULL) { + ret = -ENOMEM; + goto fail; + } memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); } qemu_iovec_init_external(&qiov, &iov, 1); @@ -3276,6 +3292,7 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, nb_sectors -= num; } +fail: qemu_vfree(iov.iov_base); return ret; } @@ -4618,8 +4635,9 @@ static void bdrv_aio_bh_cb(void *opaque) { BlockDriverAIOCBSync *acb = opaque; - if (!acb->is_write) + if (!acb->is_write && acb->ret >= 0) { qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); + } qemu_vfree(acb->bounce); acb->common.cb(acb->common.opaque, acb->ret); qemu_bh_delete(acb->bh); @@ -4641,10 +4659,12 @@ static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); acb->is_write = is_write; acb->qiov = qiov; - acb->bounce = qemu_blockalign(bs, qiov->size); + acb->bounce = qemu_try_blockalign(bs, qiov->size); acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb); - if (is_write) { + if (acb->bounce == NULL) { + acb->ret = -ENOMEM; + } else if (is_write) { qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors); } else { From 7bf665ee3504ab0ca4b8c29e1f603829a8272a26 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 20 May 2014 13:21:26 +0200 Subject: [PATCH 38/59] bochs: Handle failure for potentially large allocations Some code in the block layer makes potentially huge allocations. Failure is not completely unexpected there, so avoid aborting qemu and handle out-of-memory situations gracefully. This patch addresses the allocations in the bochs block driver. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi Reviewed-by: Benoit Canet --- block/bochs.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/block/bochs.c b/block/bochs.c index eba23df335..6674b27438 100644 --- a/block/bochs.c +++ b/block/bochs.c @@ -131,7 +131,11 @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags, return -EFBIG; } - s->catalog_bitmap = g_malloc(s->catalog_size * 4); + s->catalog_bitmap = g_try_malloc(s->catalog_size * 4); + if (s->catalog_size && s->catalog_bitmap == NULL) { + error_setg(errp, "Could not allocate memory for catalog"); + return -ENOMEM; + } ret = bdrv_pread(bs->file, le32_to_cpu(bochs.header), s->catalog_bitmap, s->catalog_size * 4); From 4ae7a52e435e4e50a27e0ccc441da80a7fccbc21 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 20 May 2014 13:22:38 +0200 Subject: [PATCH 39/59] cloop: Handle failure for potentially large allocations Some code in the block layer makes potentially huge allocations. Failure is not completely unexpected there, so avoid aborting qemu and handle out-of-memory situations gracefully. This patch addresses the allocations in the cloop block driver. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi Reviewed-by: Benoit Canet --- block/cloop.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/block/cloop.c b/block/cloop.c index 8457737922..f328be06f8 100644 --- a/block/cloop.c +++ b/block/cloop.c @@ -116,7 +116,12 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags, "try increasing block size"); return -EINVAL; } - s->offsets = g_malloc(offsets_size); + + s->offsets = g_try_malloc(offsets_size); + if (s->offsets == NULL) { + error_setg(errp, "Could not allocate offsets table"); + return -ENOMEM; + } ret = bdrv_pread(bs->file, 128 + 4 + 4, s->offsets, offsets_size); if (ret < 0) { @@ -158,8 +163,20 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags, } /* initialize zlib engine */ - s->compressed_block = g_malloc(max_compressed_block_size + 1); - s->uncompressed_block = g_malloc(s->block_size); + s->compressed_block = g_try_malloc(max_compressed_block_size + 1); + if (s->compressed_block == NULL) { + error_setg(errp, "Could not allocate compressed_block"); + ret = -ENOMEM; + goto fail; + } + + s->uncompressed_block = g_try_malloc(s->block_size); + if (s->uncompressed_block == NULL) { + error_setg(errp, "Could not allocate uncompressed_block"); + ret = -ENOMEM; + goto fail; + } + if (inflateInit(&s->zstream) != Z_OK) { ret = -EINVAL; goto fail; From 8dc7a7725bd6db2aa7e3c09b49bc21a1a25f40cb Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 20 May 2014 13:26:40 +0200 Subject: [PATCH 40/59] curl: Handle failure for potentially large allocations Some code in the block layer makes potentially huge allocations. Failure is not completely unexpected there, so avoid aborting qemu and handle out-of-memory situations gracefully. This patch addresses the allocations in the curl block driver. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi Reviewed-by: Benoit Canet --- block/curl.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/block/curl.c b/block/curl.c index 79ff2f1e41..d4b85d20a5 100644 --- a/block/curl.c +++ b/block/curl.c @@ -640,7 +640,13 @@ static void curl_readv_bh_cb(void *p) state->buf_start = start; state->buf_len = acb->end + s->readahead_size; end = MIN(start + state->buf_len, s->len) - 1; - state->orig_buf = g_malloc(state->buf_len); + state->orig_buf = g_try_malloc(state->buf_len); + if (state->buf_len && state->orig_buf == NULL) { + curl_clean_state(state); + acb->common.cb(acb->common.opaque, -ENOMEM); + qemu_aio_release(acb); + return; + } state->acb[0] = acb; snprintf(state->range, 127, "%zd-%zd", start, end); From b546a944749f963c5b4e27765354df10ac531853 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 20 May 2014 13:28:14 +0200 Subject: [PATCH 41/59] dmg: Handle failure for potentially large allocations Some code in the block layer makes potentially huge allocations. Failure is not completely unexpected there, so avoid aborting qemu and handle out-of-memory situations gracefully. This patch addresses the allocations in the dmg block driver. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi Reviewed-by: Benoit Canet --- block/dmg.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/block/dmg.c b/block/dmg.c index 1e153cd76d..e455886d2b 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -284,8 +284,15 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags, } /* initialize zlib engine */ - s->compressed_chunk = g_malloc(max_compressed_size + 1); - s->uncompressed_chunk = g_malloc(512 * max_sectors_per_chunk); + s->compressed_chunk = qemu_try_blockalign(bs->file, + max_compressed_size + 1); + s->uncompressed_chunk = qemu_try_blockalign(bs->file, + 512 * max_sectors_per_chunk); + if (s->compressed_chunk == NULL || s->uncompressed_chunk == NULL) { + ret = -ENOMEM; + goto fail; + } + if (inflateInit(&s->zstream) != Z_OK) { ret = -EINVAL; goto fail; @@ -302,8 +309,8 @@ fail: g_free(s->lengths); g_free(s->sectors); g_free(s->sectorcounts); - g_free(s->compressed_chunk); - g_free(s->uncompressed_chunk); + qemu_vfree(s->compressed_chunk); + qemu_vfree(s->uncompressed_chunk); return ret; } @@ -426,8 +433,8 @@ static void dmg_close(BlockDriverState *bs) g_free(s->lengths); g_free(s->sectors); g_free(s->sectorcounts); - g_free(s->compressed_chunk); - g_free(s->uncompressed_chunk); + qemu_vfree(s->compressed_chunk); + qemu_vfree(s->uncompressed_chunk); inflateEnd(&s->zstream); } From 4d5a3f888c7dbdd1bf892bab2a3fb5c1455ccc78 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 20 May 2014 13:30:49 +0200 Subject: [PATCH 42/59] iscsi: Handle failure for potentially large allocations Some code in the block layer makes potentially huge allocations. Failure is not completely unexpected there, so avoid aborting qemu and handle out-of-memory situations gracefully. This patch addresses the allocations in the iscsi block driver. Signed-off-by: Kevin Wolf Acked-by: Paolo Bonzini Reviewed-by: Benoit Canet Reviewed-by: Eric Blake --- block/iscsi.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/block/iscsi.c b/block/iscsi.c index a7bb6970ac..2c9cfc18eb 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -893,7 +893,10 @@ coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, nb_blocks = sector_qemu2lun(nb_sectors, iscsilun); if (iscsilun->zeroblock == NULL) { - iscsilun->zeroblock = g_malloc0(iscsilun->block_size); + iscsilun->zeroblock = g_try_malloc0(iscsilun->block_size); + if (iscsilun->zeroblock == NULL) { + return -ENOMEM; + } } iscsi_co_init_iscsitask(iscsilun, &iTask); From 2347dd7b6841c1543ceb49cb232d596eb5dd1ca3 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 20 May 2014 13:31:20 +0200 Subject: [PATCH 43/59] nfs: Handle failure for potentially large allocations Some code in the block layer makes potentially huge allocations. Failure is not completely unexpected there, so avoid aborting qemu and handle out-of-memory situations gracefully. This patch addresses the allocations in the nfs block driver. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi Reviewed-by: Benoit Canet --- block/nfs.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/block/nfs.c b/block/nfs.c index 8439e0d389..fe46c33709 100644 --- a/block/nfs.c +++ b/block/nfs.c @@ -172,7 +172,11 @@ static int coroutine_fn nfs_co_writev(BlockDriverState *bs, nfs_co_init_task(client, &task); - buf = g_malloc(nb_sectors * BDRV_SECTOR_SIZE); + buf = g_try_malloc(nb_sectors * BDRV_SECTOR_SIZE); + if (nb_sectors && buf == NULL) { + return -ENOMEM; + } + qemu_iovec_to_buf(iov, 0, buf, nb_sectors * BDRV_SECTOR_SIZE); if (nfs_pwrite_async(client->context, client->fh, From f7b593d93746ff1032a809542152212246e24fa7 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 20 May 2014 13:32:14 +0200 Subject: [PATCH 44/59] parallels: Handle failure for potentially large allocations Some code in the block layer makes potentially huge allocations. Failure is not completely unexpected there, so avoid aborting qemu and handle out-of-memory situations gracefully. This patch addresses the allocations in the parallels block driver. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi Reviewed-by: Benoit Canet --- block/parallels.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/block/parallels.c b/block/parallels.c index 1a5bd350b3..7325678a4d 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -105,7 +105,11 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags, ret = -EFBIG; goto fail; } - s->catalog_bitmap = g_malloc(s->catalog_size * 4); + s->catalog_bitmap = g_try_malloc(s->catalog_size * 4); + if (s->catalog_size && s->catalog_bitmap == NULL) { + ret = -ENOMEM; + goto fail; + } ret = bdrv_pread(bs->file, 64, s->catalog_bitmap, s->catalog_size * 4); if (ret < 0) { From 0df93305f21712e975ab5df260cc5a91e5daafca Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 20 May 2014 13:36:05 +0200 Subject: [PATCH 45/59] qcow1: Handle failure for potentially large allocations Some code in the block layer makes potentially huge allocations. Failure is not completely unexpected there, so avoid aborting qemu and handle out-of-memory situations gracefully. This patch addresses the allocations in the qcow1 block driver. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/qcow.c | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index a874056cf3..67332f03c1 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -182,7 +182,12 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, } s->l1_table_offset = header.l1_table_offset; - s->l1_table = g_malloc(s->l1_size * sizeof(uint64_t)); + s->l1_table = g_try_malloc(s->l1_size * sizeof(uint64_t)); + if (s->l1_table == NULL) { + error_setg(errp, "Could not allocate memory for L1 table"); + ret = -ENOMEM; + goto fail; + } ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)); @@ -193,8 +198,16 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, for(i = 0;i < s->l1_size; i++) { be64_to_cpus(&s->l1_table[i]); } - /* alloc L2 cache */ - s->l2_cache = g_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t)); + + /* alloc L2 cache (max. 64k * 16 * 8 = 8 MB) */ + s->l2_cache = + qemu_try_blockalign(bs->file, + s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t)); + if (s->l2_cache == NULL) { + error_setg(errp, "Could not allocate L2 table cache"); + ret = -ENOMEM; + goto fail; + } s->cluster_cache = g_malloc(s->cluster_size); s->cluster_data = g_malloc(s->cluster_size); s->cluster_cache_offset = -1; @@ -226,7 +239,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, fail: g_free(s->l1_table); - g_free(s->l2_cache); + qemu_vfree(s->l2_cache); g_free(s->cluster_cache); g_free(s->cluster_data); return ret; @@ -517,7 +530,10 @@ static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, void *orig_buf; if (qiov->niov > 1) { - buf = orig_buf = qemu_blockalign(bs, qiov->size); + buf = orig_buf = qemu_try_blockalign(bs, qiov->size); + if (buf == NULL) { + return -ENOMEM; + } } else { orig_buf = NULL; buf = (uint8_t *)qiov->iov->iov_base; @@ -619,7 +635,10 @@ static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, s->cluster_cache_offset = -1; /* disable compressed cache */ if (qiov->niov > 1) { - buf = orig_buf = qemu_blockalign(bs, qiov->size); + buf = orig_buf = qemu_try_blockalign(bs, qiov->size); + if (buf == NULL) { + return -ENOMEM; + } qemu_iovec_to_buf(qiov, 0, buf, qiov->size); } else { orig_buf = NULL; @@ -685,7 +704,7 @@ static void qcow_close(BlockDriverState *bs) BDRVQcowState *s = bs->opaque; g_free(s->l1_table); - g_free(s->l2_cache); + qemu_vfree(s->l2_cache); g_free(s->cluster_cache); g_free(s->cluster_data); From de82815db1c89da058b7fb941dab137d6d9ab738 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 20 May 2014 17:12:47 +0200 Subject: [PATCH 46/59] qcow2: Handle failure for potentially large allocations Some code in the block layer makes potentially huge allocations. Failure is not completely unexpected there, so avoid aborting qemu and handle out-of-memory situations gracefully. This patch addresses the allocations in the qcow2 block driver. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/qcow2-cache.c | 13 +++++++++++- block/qcow2-cluster.c | 36 ++++++++++++++++++++++++------- block/qcow2-refcount.c | 48 +++++++++++++++++++++++++++++++++--------- block/qcow2-snapshot.c | 23 +++++++++++++++----- block/qcow2.c | 42 +++++++++++++++++++++++++++++------- 5 files changed, 130 insertions(+), 32 deletions(-) diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c index 8ecbb5bc00..5353b44828 100644 --- a/block/qcow2-cache.c +++ b/block/qcow2-cache.c @@ -53,10 +53,21 @@ Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables) c->entries = g_malloc0(sizeof(*c->entries) * num_tables); for (i = 0; i < c->size; i++) { - c->entries[i].table = qemu_blockalign(bs, s->cluster_size); + c->entries[i].table = qemu_try_blockalign(bs->file, s->cluster_size); + if (c->entries[i].table == NULL) { + goto fail; + } } return c; + +fail: + for (i = 0; i < c->size; i++) { + qemu_vfree(c->entries[i].table); + } + g_free(c->entries); + g_free(c); + return NULL; } int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 4208dc08b5..e7c5f486cd 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -72,14 +72,20 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, #endif new_l1_size2 = sizeof(uint64_t) * new_l1_size; - new_l1_table = g_malloc0(align_offset(new_l1_size2, 512)); + new_l1_table = qemu_try_blockalign(bs->file, + align_offset(new_l1_size2, 512)); + if (new_l1_table == NULL) { + return -ENOMEM; + } + memset(new_l1_table, 0, align_offset(new_l1_size2, 512)); + memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t)); /* write new table (align to cluster) */ BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE); new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2); if (new_l1_table_offset < 0) { - g_free(new_l1_table); + qemu_vfree(new_l1_table); return new_l1_table_offset; } @@ -113,7 +119,7 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, if (ret < 0) { goto fail; } - g_free(s->l1_table); + qemu_vfree(s->l1_table); old_l1_table_offset = s->l1_table_offset; s->l1_table_offset = new_l1_table_offset; s->l1_table = new_l1_table; @@ -123,7 +129,7 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, QCOW2_DISCARD_OTHER); return 0; fail: - g_free(new_l1_table); + qemu_vfree(new_l1_table); qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2, QCOW2_DISCARD_OTHER); return ret; @@ -372,7 +378,10 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs, } iov.iov_len = n * BDRV_SECTOR_SIZE; - iov.iov_base = qemu_blockalign(bs, iov.iov_len); + iov.iov_base = qemu_try_blockalign(bs, iov.iov_len); + if (iov.iov_base == NULL) { + return -ENOMEM; + } qemu_iovec_init_external(&qiov, &iov, 1); @@ -702,7 +711,11 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters); assert(m->nb_clusters > 0); - old_cluster = g_malloc(m->nb_clusters * sizeof(uint64_t)); + old_cluster = g_try_malloc(m->nb_clusters * sizeof(uint64_t)); + if (old_cluster == NULL) { + ret = -ENOMEM; + goto err; + } /* copy content of unmodified sectors */ ret = perform_cow(bs, m, &m->cow_start); @@ -1562,7 +1575,10 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, if (!is_active_l1) { /* inactive L2 tables require a buffer to be stored in when loading * them from disk */ - l2_table = qemu_blockalign(bs, s->cluster_size); + l2_table = qemu_try_blockalign(bs->file, s->cluster_size); + if (l2_table == NULL) { + return -ENOMEM; + } } for (i = 0; i < l1_size; i++) { @@ -1740,7 +1756,11 @@ int qcow2_expand_zero_clusters(BlockDriverState *bs) nb_clusters = size_to_clusters(s, bs->file->total_sectors * BDRV_SECTOR_SIZE); - expanded_clusters = g_malloc0((nb_clusters + 7) / 8); + expanded_clusters = g_try_malloc0((nb_clusters + 7) / 8); + if (expanded_clusters == NULL) { + ret = -ENOMEM; + goto fail; + } ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size, &expanded_clusters, &nb_clusters); diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index cc6cf743d6..87a56d84af 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -46,8 +46,12 @@ int qcow2_refcount_init(BlockDriverState *bs) assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t)); refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t); - s->refcount_table = g_malloc(refcount_table_size2); + s->refcount_table = g_try_malloc(refcount_table_size2); + if (s->refcount_table_size > 0) { + if (s->refcount_table == NULL) { + goto fail; + } BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD); ret = bdrv_pread(bs->file, s->refcount_table_offset, s->refcount_table, refcount_table_size2); @@ -344,8 +348,14 @@ static int alloc_refcount_block(BlockDriverState *bs, uint64_t meta_offset = (blocks_used * refcount_block_clusters) * s->cluster_size; uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size; - uint16_t *new_blocks = g_malloc0(blocks_clusters * s->cluster_size); - uint64_t *new_table = g_malloc0(table_size * sizeof(uint64_t)); + uint64_t *new_table = g_try_malloc0(table_size * sizeof(uint64_t)); + uint16_t *new_blocks = g_try_malloc0(blocks_clusters * s->cluster_size); + + assert(table_size > 0 && blocks_clusters > 0); + if (new_table == NULL || new_blocks == NULL) { + ret = -ENOMEM; + goto fail_table; + } /* Fill the new refcount table */ memcpy(new_table, s->refcount_table, @@ -424,6 +434,7 @@ static int alloc_refcount_block(BlockDriverState *bs, return -EAGAIN; fail_table: + g_free(new_blocks); g_free(new_table); fail_block: if (*refcount_block != NULL) { @@ -847,7 +858,8 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, int64_t l1_table_offset, int l1_size, int addend) { BDRVQcowState *s = bs->opaque; - uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated; + uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2; + bool l1_allocated = false; int64_t old_offset, old_l2_offset; int i, j, l1_modified = 0, nb_csectors, refcount; int ret; @@ -862,8 +874,12 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, * l1_table_offset when it is the current s->l1_table_offset! Be careful * when changing this! */ if (l1_table_offset != s->l1_table_offset) { - l1_table = g_malloc0(align_offset(l1_size2, 512)); - l1_allocated = 1; + l1_table = g_try_malloc0(align_offset(l1_size2, 512)); + if (l1_size2 && l1_table == NULL) { + ret = -ENOMEM; + goto fail; + } + l1_allocated = true; ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2); if (ret < 0) { @@ -875,7 +891,7 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, } else { assert(l1_size == s->l1_size); l1_table = s->l1_table; - l1_allocated = 0; + l1_allocated = false; } for(i = 0; i < l1_size; i++) { @@ -1197,7 +1213,11 @@ static int check_refcounts_l1(BlockDriverState *bs, if (l1_size2 == 0) { l1_table = NULL; } else { - l1_table = g_malloc(l1_size2); + l1_table = g_try_malloc(l1_size2); + if (l1_table == NULL) { + ret = -ENOMEM; + goto fail; + } if (bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2) != l1_size2) goto fail; @@ -1501,7 +1521,11 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, return -EFBIG; } - refcount_table = g_malloc0(nb_clusters * sizeof(uint16_t)); + refcount_table = g_try_malloc0(nb_clusters * sizeof(uint16_t)); + if (nb_clusters && refcount_table == NULL) { + res->check_errors++; + return -ENOMEM; + } res->bfi.total_clusters = size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE); @@ -1753,9 +1777,13 @@ int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset, uint64_t l1_ofs = s->snapshots[i].l1_table_offset; uint32_t l1_sz = s->snapshots[i].l1_size; uint64_t l1_sz2 = l1_sz * sizeof(uint64_t); - uint64_t *l1 = g_malloc(l1_sz2); + uint64_t *l1 = g_try_malloc(l1_sz2); int ret; + if (l1_sz2 && l1 == NULL) { + return -ENOMEM; + } + ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2); if (ret < 0) { g_free(l1); diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index 0aa9defbe2..f67b47282f 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -381,7 +381,12 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) sn->l1_table_offset = l1_table_offset; sn->l1_size = s->l1_size; - l1_table = g_malloc(s->l1_size * sizeof(uint64_t)); + l1_table = g_try_malloc(s->l1_size * sizeof(uint64_t)); + if (s->l1_size && l1_table == NULL) { + ret = -ENOMEM; + goto fail; + } + for(i = 0; i < s->l1_size; i++) { l1_table[i] = cpu_to_be64(s->l1_table[i]); } @@ -499,7 +504,11 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) * Decrease the refcount referenced by the old one only when the L1 * table is overwritten. */ - sn_l1_table = g_malloc0(cur_l1_bytes); + sn_l1_table = g_try_malloc0(cur_l1_bytes); + if (cur_l1_bytes && sn_l1_table == NULL) { + ret = -ENOMEM; + goto fail; + } ret = bdrv_pread(bs->file, sn->l1_table_offset, sn_l1_table, sn_l1_bytes); if (ret < 0) { @@ -698,17 +707,21 @@ int qcow2_snapshot_load_tmp(BlockDriverState *bs, return -EFBIG; } new_l1_bytes = sn->l1_size * sizeof(uint64_t); - new_l1_table = g_malloc0(align_offset(new_l1_bytes, 512)); + new_l1_table = qemu_try_blockalign(bs->file, + align_offset(new_l1_bytes, 512)); + if (new_l1_table == NULL) { + return -ENOMEM; + } ret = bdrv_pread(bs->file, sn->l1_table_offset, new_l1_table, new_l1_bytes); if (ret < 0) { error_setg(errp, "Failed to read l1 table for snapshot"); - g_free(new_l1_table); + qemu_vfree(new_l1_table); return ret; } /* Switch the L1 table */ - g_free(s->l1_table); + qemu_vfree(s->l1_table); s->l1_size = sn->l1_size; s->l1_table_offset = sn->l1_table_offset; diff --git a/block/qcow2.c b/block/qcow2.c index 964ab93b08..435e0e11d0 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -688,8 +688,13 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, if (s->l1_size > 0) { - s->l1_table = g_malloc0( + s->l1_table = qemu_try_blockalign(bs->file, align_offset(s->l1_size * sizeof(uint64_t), 512)); + if (s->l1_table == NULL) { + error_setg(errp, "Could not allocate L1 table"); + ret = -ENOMEM; + goto fail; + } ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)); if (ret < 0) { @@ -704,11 +709,22 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, /* alloc L2 table/refcount block cache */ s->l2_table_cache = qcow2_cache_create(bs, L2_CACHE_SIZE); s->refcount_block_cache = qcow2_cache_create(bs, REFCOUNT_CACHE_SIZE); + if (s->l2_table_cache == NULL || s->refcount_block_cache == NULL) { + error_setg(errp, "Could not allocate metadata caches"); + ret = -ENOMEM; + goto fail; + } s->cluster_cache = g_malloc(s->cluster_size); /* one more sector for decompressed data alignment */ - s->cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size - + 512); + s->cluster_data = qemu_try_blockalign(bs->file, QCOW_MAX_CRYPT_CLUSTERS + * s->cluster_size + 512); + if (s->cluster_data == NULL) { + error_setg(errp, "Could not allocate temporary cluster buffer"); + ret = -ENOMEM; + goto fail; + } + s->cluster_cache_offset = -1; s->flags = flags; @@ -852,7 +868,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, cleanup_unknown_header_ext(bs); qcow2_free_snapshots(bs); qcow2_refcount_close(bs); - g_free(s->l1_table); + qemu_vfree(s->l1_table); /* else pre-write overlap checks in cache_destroy may crash */ s->l1_table = NULL; if (s->l2_table_cache) { @@ -1082,7 +1098,12 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, */ if (!cluster_data) { cluster_data = - qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); + qemu_try_blockalign(bs->file, QCOW_MAX_CRYPT_CLUSTERS + * s->cluster_size); + if (cluster_data == NULL) { + ret = -ENOMEM; + goto fail; + } } assert(cur_nr_sectors <= @@ -1182,8 +1203,13 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, if (s->crypt_method) { if (!cluster_data) { - cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * - s->cluster_size); + cluster_data = qemu_try_blockalign(bs->file, + QCOW_MAX_CRYPT_CLUSTERS + * s->cluster_size); + if (cluster_data == NULL) { + ret = -ENOMEM; + goto fail; + } } assert(hd_qiov.size <= @@ -1270,7 +1296,7 @@ fail: static void qcow2_close(BlockDriverState *bs) { BDRVQcowState *s = bs->opaque; - g_free(s->l1_table); + qemu_vfree(s->l1_table); /* else pre-write overlap checks in cache_destroy may crash */ s->l1_table = NULL; From 4f4896db5fb2285df016ff927508560c89b845a4 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 20 May 2014 13:39:57 +0200 Subject: [PATCH 47/59] qed: Handle failure for potentially large allocations Some code in the block layer makes potentially huge allocations. Failure is not completely unexpected there, so avoid aborting qemu and handle out-of-memory situations gracefully. This patch addresses the allocations in the qed block driver. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi Reviewed-by: Benoit Canet --- block/qed-check.c | 7 +++++-- block/qed.c | 6 +++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/block/qed-check.c b/block/qed-check.c index b473dcd61f..40a882cc93 100644 --- a/block/qed-check.c +++ b/block/qed-check.c @@ -227,8 +227,11 @@ int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix) }; int ret; - check.used_clusters = g_malloc0(((check.nclusters + 31) / 32) * - sizeof(check.used_clusters[0])); + check.used_clusters = g_try_malloc0(((check.nclusters + 31) / 32) * + sizeof(check.used_clusters[0])); + if (check.nclusters && check.used_clusters == NULL) { + return -ENOMEM; + } check.result->bfi.total_clusters = (s->header.image_size + s->header.cluster_size - 1) / diff --git a/block/qed.c b/block/qed.c index 7944832181..ba395af76a 100644 --- a/block/qed.c +++ b/block/qed.c @@ -1240,7 +1240,11 @@ static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len) struct iovec *iov = acb->qiov->iov; if (!iov->iov_base) { - iov->iov_base = qemu_blockalign(acb->common.bs, iov->iov_len); + iov->iov_base = qemu_try_blockalign(acb->common.bs, iov->iov_len); + if (iov->iov_base == NULL) { + qed_aio_complete(acb, -ENOMEM); + return; + } memset(iov->iov_base, 0, iov->iov_len); } } From 50d4a858e62b1d864227d13f07d2c79c118d046a Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 21 May 2014 18:02:42 +0200 Subject: [PATCH 48/59] raw-posix: Handle failure for potentially large allocations Some code in the block layer makes potentially huge allocations. Failure is not completely unexpected there, so avoid aborting qemu and handle out-of-memory situations gracefully. This patch addresses the allocations in the raw-posix block driver. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/raw-posix.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/block/raw-posix.c b/block/raw-posix.c index 8e9758e920..1194eb00ad 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -798,7 +798,11 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb) * Ok, we have to do it the hard way, copy all segments into * a single aligned buffer. */ - buf = qemu_blockalign(aiocb->bs, aiocb->aio_nbytes); + buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes); + if (buf == NULL) { + return -ENOMEM; + } + if (aiocb->aio_type & QEMU_AIO_WRITE) { char *p = buf; int i; From 4b6af3d58a73193017926dd59de3b3e7b4890323 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 21 May 2014 18:05:47 +0200 Subject: [PATCH 49/59] raw-win32: Handle failure for potentially large allocations Some code in the block layer makes potentially huge allocations. Failure is not completely unexpected there, so avoid aborting qemu and handle out-of-memory situations gracefully. This patch addresses the allocations in the raw-win32 block driver. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/win32-aio.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/block/win32-aio.c b/block/win32-aio.c index 8e417f70ae..5030e3274d 100644 --- a/block/win32-aio.c +++ b/block/win32-aio.c @@ -139,7 +139,10 @@ BlockDriverAIOCB *win32_aio_submit(BlockDriverState *bs, waiocb->is_read = (type == QEMU_AIO_READ); if (qiov->niov > 1) { - waiocb->buf = qemu_blockalign(bs, qiov->size); + waiocb->buf = qemu_try_blockalign(bs, qiov->size); + if (waiocb->buf == NULL) { + goto out; + } if (type & QEMU_AIO_WRITE) { iov_to_buf(qiov->iov, qiov->niov, 0, waiocb->buf, qiov->size); } @@ -168,6 +171,7 @@ BlockDriverAIOCB *win32_aio_submit(BlockDriverState *bs, out_dec_count: aio->count--; +out: qemu_aio_release(waiocb); return NULL; } From 0f7a02379bb672666e21dfcd6b549c3506f8d784 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 21 May 2014 18:11:48 +0200 Subject: [PATCH 50/59] rbd: Handle failure for potentially large allocations Some code in the block layer makes potentially huge allocations. Failure is not completely unexpected there, so avoid aborting qemu and handle out-of-memory situations gracefully. This patch addresses the allocations in the rbd block driver. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- block/rbd.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/block/rbd.c b/block/rbd.c index 2b797d3e8b..4459102adf 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -617,7 +617,7 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs, RBDAIOCmd cmd) { RBDAIOCB *acb; - RADOSCB *rcb; + RADOSCB *rcb = NULL; rbd_completion_t c; int64_t off, size; char *buf; @@ -631,7 +631,10 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs, if (cmd == RBD_AIO_DISCARD || cmd == RBD_AIO_FLUSH) { acb->bounce = NULL; } else { - acb->bounce = qemu_blockalign(bs, qiov->size); + acb->bounce = qemu_try_blockalign(bs, qiov->size); + if (acb->bounce == NULL) { + goto failed; + } } acb->ret = 0; acb->error = 0; From 17cce735780f0ff6a2ef173c34614bd47acd56e5 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 20 May 2014 13:25:43 +0200 Subject: [PATCH 51/59] vdi: Handle failure for potentially large allocations Some code in the block layer makes potentially huge allocations. Failure is not completely unexpected there, so avoid aborting qemu and handle out-of-memory situations gracefully. This patch addresses the allocations in the vdi block driver. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi Reviewed-by: Benoit Canet --- block/vdi.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/block/vdi.c b/block/vdi.c index 9d62a3cb97..adc6aa9a5f 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -292,7 +292,12 @@ static int vdi_check(BlockDriverState *bs, BdrvCheckResult *res, return -ENOTSUP; } - bmap = g_malloc(s->header.blocks_in_image * sizeof(uint32_t)); + bmap = g_try_malloc(s->header.blocks_in_image * sizeof(uint32_t)); + if (s->header.blocks_in_image && bmap == NULL) { + res->check_errors++; + return -ENOMEM; + } + memset(bmap, 0xff, s->header.blocks_in_image * sizeof(uint32_t)); /* Check block map and value of blocks_allocated. */ @@ -471,7 +476,12 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags, bmap_size = header.blocks_in_image * sizeof(uint32_t); bmap_size = (bmap_size + SECTOR_SIZE - 1) / SECTOR_SIZE; - s->bmap = g_malloc(bmap_size * SECTOR_SIZE); + s->bmap = qemu_try_blockalign(bs->file, bmap_size * SECTOR_SIZE); + if (s->bmap == NULL) { + ret = -ENOMEM; + goto fail; + } + ret = bdrv_read(bs->file, s->bmap_sector, (uint8_t *)s->bmap, bmap_size); if (ret < 0) { goto fail_free_bmap; @@ -486,7 +496,7 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags, return 0; fail_free_bmap: - g_free(s->bmap); + qemu_vfree(s->bmap); fail: return ret; @@ -760,7 +770,12 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) offset += sizeof(header); if (bmap_size > 0) { - bmap = g_malloc0(bmap_size); + bmap = g_try_malloc0(bmap_size); + if (bmap == NULL) { + ret = -ENOMEM; + error_setg(errp, "Could not allocate bmap"); + goto exit; + } for (i = 0; i < blocks; i++) { if (image_type == VDI_TYPE_STATIC) { bmap[i] = i; @@ -794,7 +809,7 @@ static void vdi_close(BlockDriverState *bs) { BDRVVdiState *s = bs->opaque; - g_free(s->bmap); + qemu_vfree(s->bmap); migrate_del_blocker(s->migration_blocker); error_free(s->migration_blocker); From a67e128a4f40cf07abd86f92d0d3c913db2ad885 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 20 May 2014 13:55:50 +0200 Subject: [PATCH 52/59] vhdx: Handle failure for potentially large allocations Some code in the block layer makes potentially huge allocations. Failure is not completely unexpected there, so avoid aborting qemu and handle out-of-memory situations gracefully. This patch addresses the allocations in the vhdx block driver. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi Reviewed-by: Benoit Canet --- block/vhdx-log.c | 7 ++++++- block/vhdx.c | 13 +++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/block/vhdx-log.c b/block/vhdx-log.c index 0088be8747..eb5c7a097b 100644 --- a/block/vhdx-log.c +++ b/block/vhdx-log.c @@ -352,7 +352,12 @@ static int vhdx_log_read_desc(BlockDriverState *bs, BDRVVHDXState *s, } desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count); - desc_entries = qemu_blockalign(bs, desc_sectors * VHDX_LOG_SECTOR_SIZE); + desc_entries = qemu_try_blockalign(bs->file, + desc_sectors * VHDX_LOG_SECTOR_SIZE); + if (desc_entries == NULL) { + ret = -ENOMEM; + goto exit; + } ret = vhdx_log_read_sectors(bs, log, §ors_read, desc_entries, desc_sectors, false); diff --git a/block/vhdx.c b/block/vhdx.c index febce21f82..f666940db7 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -958,7 +958,12 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags, } /* s->bat is freed in vhdx_close() */ - s->bat = qemu_blockalign(bs, s->bat_rt.length); + s->bat = qemu_try_blockalign(bs->file, s->bat_rt.length); + if (s->bat == NULL) { + ret = -ENOMEM; + goto fail; + } + ret = bdrv_pread(bs->file, s->bat_offset, s->bat, s->bat_rt.length); if (ret < 0) { goto fail; @@ -1587,7 +1592,11 @@ static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s, use_zero_blocks || bdrv_has_zero_init(bs) == 0) { /* for a fixed file, the default BAT entry is not zero */ - s->bat = g_malloc0(length); + s->bat = g_try_malloc0(length); + if (length && s->bat != NULL) { + ret = -ENOMEM; + goto exit; + } block_state = type == VHDX_TYPE_FIXED ? PAYLOAD_BLOCK_FULLY_PRESENT : PAYLOAD_BLOCK_NOT_PRESENT; block_state = use_zero_blocks ? PAYLOAD_BLOCK_ZERO : block_state; From d6e5993197990ff55c660714526681fa7028299e Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 20 May 2014 13:56:27 +0200 Subject: [PATCH 53/59] vmdk: Handle failure for potentially large allocations Some code in the block layer makes potentially huge allocations. Failure is not completely unexpected there, so avoid aborting qemu and handle out-of-memory situations gracefully. This patch addresses the allocations in the vmdk block driver. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi Reviewed-by: Benoit Canet --- block/vmdk.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 3700a11f5f..01412a8939 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -456,7 +456,11 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent, /* read the L1 table */ l1_size = extent->l1_size * sizeof(uint32_t); - extent->l1_table = g_malloc(l1_size); + extent->l1_table = g_try_malloc(l1_size); + if (l1_size && extent->l1_table == NULL) { + return -ENOMEM; + } + ret = bdrv_pread(extent->file, extent->l1_table_offset, extent->l1_table, @@ -472,7 +476,11 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent, } if (extent->l1_backup_table_offset) { - extent->l1_backup_table = g_malloc(l1_size); + extent->l1_backup_table = g_try_malloc(l1_size); + if (l1_size && extent->l1_backup_table == NULL) { + ret = -ENOMEM; + goto fail_l1; + } ret = bdrv_pread(extent->file, extent->l1_backup_table_offset, extent->l1_backup_table, From 5fb09cd5867aabf26d5b85b0913dccd496b71421 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 21 May 2014 18:08:38 +0200 Subject: [PATCH 54/59] vpc: Handle failure for potentially large allocations Some code in the block layer makes potentially huge allocations. Failure is not completely unexpected there, so avoid aborting qemu and handle out-of-memory situations gracefully. This patch addresses the allocations in the vpc block driver. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi Reviewed-by: Benoit Canet --- block/vpc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/block/vpc.c b/block/vpc.c index 96903441b8..055efc42d2 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -269,7 +269,11 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } - s->pagetable = qemu_blockalign(bs, s->max_table_entries * 4); + s->pagetable = qemu_try_blockalign(bs->file, s->max_table_entries * 4); + if (s->pagetable == NULL) { + ret = -ENOMEM; + goto fail; + } s->bat_offset = be64_to_cpu(dyndisk_header->table_offset); From 7504edf477d2bac9cbf53225811ec595abf045e4 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 21 May 2014 18:16:21 +0200 Subject: [PATCH 55/59] mirror: Handle failure for potentially large allocations Some code in the block layer makes potentially huge allocations. Failure is not completely unexpected there, so avoid aborting qemu and handle out-of-memory situations gracefully. This patch addresses the allocations in the mirror block job. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi Reviewed-by: Benoit Canet --- block/mirror.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/block/mirror.c b/block/mirror.c index c7a655fc58..5e7a166b39 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -367,7 +367,12 @@ static void coroutine_fn mirror_run(void *opaque) } end = s->common.len >> BDRV_SECTOR_BITS; - s->buf = qemu_blockalign(bs, s->buf_size); + s->buf = qemu_try_blockalign(bs, s->buf_size); + if (s->buf == NULL) { + ret = -ENOMEM; + goto immediate_exit; + } + sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; mirror_free_init(s); From 8fcffa9853473ab148d36858f15c5531161a1824 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Thu, 29 May 2014 00:19:54 +0200 Subject: [PATCH 56/59] qcow2: Return useful error code in refcount_init() If bdrv_pread() returns an error, it is very unlikely that it was ENOMEM. In this case, the return value should be passed along; as bdrv_pread() will always either return the number of bytes read or a negative value (the error code), the condition for checking whether bdrv_pread() failed can be simplified (and clarified) as well. Signed-off-by: Max Reitz Signed-off-by: Kevin Wolf Reviewed-by: Benoit Canet --- block/qcow2-refcount.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 87a56d84af..d60e2feb10 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -50,19 +50,21 @@ int qcow2_refcount_init(BlockDriverState *bs) if (s->refcount_table_size > 0) { if (s->refcount_table == NULL) { + ret = -ENOMEM; goto fail; } BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD); ret = bdrv_pread(bs->file, s->refcount_table_offset, s->refcount_table, refcount_table_size2); - if (ret != refcount_table_size2) + if (ret < 0) { goto fail; + } for(i = 0; i < s->refcount_table_size; i++) be64_to_cpus(&s->refcount_table[i]); } return 0; fail: - return -ENOMEM; + return ret; } void qcow2_refcount_close(BlockDriverState *bs) From ff52aab2df5c5e10f231481961b88d25a3021724 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Thu, 7 Aug 2014 22:47:53 +0200 Subject: [PATCH 57/59] qcow2: Catch !*host_offset for data allocation qcow2_alloc_cluster_offset() uses host_offset == 0 as "no preferred offset" for the (data) cluster range to be allocated. However, this offset is actually valid and may be allocated on images with a corrupted refcount table or first refcount block. In this case, the corruption prevention should normally catch that write anyway (because it would overwrite the image header). But since 0 is a special value here, the function assumes that nothing has been allocated at all which it asserts against. Because this condition is not qemu's fault but rather that of a broken image, it shouldn't throw an assertion but rather mark the image corrupt and show an appropriate message, which this patch does by calling the corruption check earlier than it would be called normally (before the assertion). Signed-off-by: Max Reitz Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- block/qcow2-cluster.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index e7c5f486cd..5b36018b3e 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -1119,6 +1119,17 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, return 0; } + /* !*host_offset would overwrite the image header and is reserved for "no + * host offset preferred". If 0 was a valid host offset, it'd trigger the + * following overlap check; do that now to avoid having an invalid value in + * *host_offset. */ + if (!alloc_cluster_offset) { + ret = qcow2_pre_write_overlap_check(bs, 0, alloc_cluster_offset, + nb_clusters * s->cluster_size); + assert(ret < 0); + goto fail; + } + /* * Save info needed for meta data update. * From a42f8a3d05c5598cee893c9cf5ffeee48bacb83e Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Thu, 7 Aug 2014 22:47:54 +0200 Subject: [PATCH 58/59] iotests: Add test for image header overlap Add a test for an image with an unallocated image header; instead of an assertion, this should result in the image being marked corrupt. Signed-off-by: Max Reitz Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- tests/qemu-iotests/060 | 9 +++++++++ tests/qemu-iotests/060.out | 8 ++++++++ 2 files changed, 17 insertions(+) diff --git a/tests/qemu-iotests/060 b/tests/qemu-iotests/060 index 3cffc12fec..830386fdaa 100755 --- a/tests/qemu-iotests/060 +++ b/tests/qemu-iotests/060 @@ -164,6 +164,15 @@ wait_break 0 write 64k 64k resume 0" | $QEMU_IO | _filter_qemu_io +echo +echo "=== Testing unallocated image header ===" +echo +_make_test_img 64M +# Create L1/L2 +$QEMU_IO -c "$OPEN_RW" -c "write 0 64k" | _filter_qemu_io +poke_file "$TEST_IMG" "$rb_offset" "\x00\x00" +$QEMU_IO -c "$OPEN_RW" -c "write 64k 64k" | _filter_qemu_io + # success, all done echo "*** done" rm -f $seq.full diff --git a/tests/qemu-iotests/060.out b/tests/qemu-iotests/060.out index a517948036..c27c952412 100644 --- a/tests/qemu-iotests/060.out +++ b/tests/qemu-iotests/060.out @@ -93,4 +93,12 @@ blkdebug: Suspended request '0' write failed: Input/output error blkdebug: Resuming request '0' aio_write failed: No medium found + +=== Testing unallocated image header === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 +wrote 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +qcow2: Preventing invalid write on metadata (overlaps with qcow2_header); image marked as corrupt. +write failed: Input/output error *** done From 908bcd540f489f7adf2d804347905b0025d808d3 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Thu, 7 Aug 2014 22:47:55 +0200 Subject: [PATCH 59/59] block: Catch !bs->drv in bdrv_check() qemu-img check calls bdrv_check() twice if the first run repaired some inconsistencies. If the first run however again triggered corruption prevention (on qcow2) due to very bad inconsistencies, bs->drv may be NULL afterwards. Thus, bdrv_check() should check whether bs->drv is set. Signed-off-by: Max Reitz Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- block.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block.c b/block.c index 6ef3465b4a..6fa0201074 100644 --- a/block.c +++ b/block.c @@ -2209,6 +2209,9 @@ bool bdrv_dev_is_medium_locked(BlockDriverState *bs) */ int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix) { + if (bs->drv == NULL) { + return -ENOMEDIUM; + } if (bs->drv->bdrv_check == NULL) { return -ENOTSUP; }