From 0725570b2dcaeedff3031fc271b0d731a7382bfd Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 19 May 2021 13:25:13 +0200 Subject: [PATCH 01/28] MAINTAINERS: update block/rbd.c maintainer Jason has moved on from working on RBD and Ceph. I'm taking over his role upstream. Signed-off-by: Ilya Dryomov Message-Id: <20210519112513.19694-1-idryomov@gmail.com> Acked-by: Stefano Garzarella Signed-off-by: Kevin Wolf --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 684142e12e..86ec36a062 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3060,7 +3060,7 @@ S: Supported F: block/vmdk.c RBD -M: Jason Dillaman +M: Ilya Dryomov L: qemu-block@nongnu.org S: Supported F: block/rbd.c From 42e4ac9ef5a65f1714dd6f332de160eb63e0d6a4 Mon Sep 17 00:00:00 2001 From: Or Ozeri Date: Sun, 27 Jun 2021 14:46:35 +0300 Subject: [PATCH 02/28] block/rbd: Add support for rbd image encryption Starting from ceph Pacific, RBD has built-in support for image-level encryption. Currently supported formats are LUKS version 1 and 2. There are 2 new relevant librbd APIs for controlling encryption, both expect an open image context: rbd_encryption_format: formats an image (i.e. writes the LUKS header) rbd_encryption_load: loads encryptor/decryptor to the image IO stack This commit extends the qemu rbd driver API to support the above. Signed-off-by: Or Ozeri Message-Id: <20210627114635.39326-1-oro@il.ibm.com> Reviewed-by: Ilya Dryomov Signed-off-by: Kevin Wolf --- block/rbd.c | 361 ++++++++++++++++++++++++++++++++++++++++++- qapi/block-core.json | 110 ++++++++++++- 2 files changed, 465 insertions(+), 6 deletions(-) diff --git a/block/rbd.c b/block/rbd.c index 26f64cce7c..f51adb3646 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -73,6 +73,18 @@ #define LIBRBD_USE_IOVEC 0 #endif +#define RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN 8 + +static const char rbd_luks_header_verification[ + RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN] = { + 'L', 'U', 'K', 'S', 0xBA, 0xBE, 0, 1 +}; + +static const char rbd_luks2_header_verification[ + RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN] = { + 'L', 'U', 'K', 'S', 0xBA, 0xBE, 0, 2 +}; + typedef enum { RBD_AIO_READ, RBD_AIO_WRITE, @@ -351,6 +363,203 @@ static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs) } } +#ifdef LIBRBD_SUPPORTS_ENCRYPTION +static int qemu_rbd_convert_luks_options( + RbdEncryptionOptionsLUKSBase *luks_opts, + char **passphrase, + size_t *passphrase_len, + Error **errp) +{ + return qcrypto_secret_lookup(luks_opts->key_secret, (uint8_t **)passphrase, + passphrase_len, errp); +} + +static int qemu_rbd_convert_luks_create_options( + RbdEncryptionCreateOptionsLUKSBase *luks_opts, + rbd_encryption_algorithm_t *alg, + char **passphrase, + size_t *passphrase_len, + Error **errp) +{ + int r = 0; + + r = qemu_rbd_convert_luks_options( + qapi_RbdEncryptionCreateOptionsLUKSBase_base(luks_opts), + passphrase, passphrase_len, errp); + if (r < 0) { + return r; + } + + if (luks_opts->has_cipher_alg) { + switch (luks_opts->cipher_alg) { + case QCRYPTO_CIPHER_ALG_AES_128: { + *alg = RBD_ENCRYPTION_ALGORITHM_AES128; + break; + } + case QCRYPTO_CIPHER_ALG_AES_256: { + *alg = RBD_ENCRYPTION_ALGORITHM_AES256; + break; + } + default: { + r = -ENOTSUP; + error_setg_errno(errp, -r, "unknown encryption algorithm: %u", + luks_opts->cipher_alg); + return r; + } + } + } else { + /* default alg */ + *alg = RBD_ENCRYPTION_ALGORITHM_AES256; + } + + return 0; +} + +static int qemu_rbd_encryption_format(rbd_image_t image, + RbdEncryptionCreateOptions *encrypt, + Error **errp) +{ + int r = 0; + g_autofree char *passphrase = NULL; + size_t passphrase_len; + rbd_encryption_format_t format; + rbd_encryption_options_t opts; + rbd_encryption_luks1_format_options_t luks_opts; + rbd_encryption_luks2_format_options_t luks2_opts; + size_t opts_size; + uint64_t raw_size, effective_size; + + r = rbd_get_size(image, &raw_size); + if (r < 0) { + error_setg_errno(errp, -r, "cannot get raw image size"); + return r; + } + + switch (encrypt->format) { + case RBD_IMAGE_ENCRYPTION_FORMAT_LUKS: { + memset(&luks_opts, 0, sizeof(luks_opts)); + format = RBD_ENCRYPTION_FORMAT_LUKS1; + opts = &luks_opts; + opts_size = sizeof(luks_opts); + r = qemu_rbd_convert_luks_create_options( + qapi_RbdEncryptionCreateOptionsLUKS_base(&encrypt->u.luks), + &luks_opts.alg, &passphrase, &passphrase_len, errp); + if (r < 0) { + return r; + } + luks_opts.passphrase = passphrase; + luks_opts.passphrase_size = passphrase_len; + break; + } + case RBD_IMAGE_ENCRYPTION_FORMAT_LUKS2: { + memset(&luks2_opts, 0, sizeof(luks2_opts)); + format = RBD_ENCRYPTION_FORMAT_LUKS2; + opts = &luks2_opts; + opts_size = sizeof(luks2_opts); + r = qemu_rbd_convert_luks_create_options( + qapi_RbdEncryptionCreateOptionsLUKS2_base( + &encrypt->u.luks2), + &luks2_opts.alg, &passphrase, &passphrase_len, errp); + if (r < 0) { + return r; + } + luks2_opts.passphrase = passphrase; + luks2_opts.passphrase_size = passphrase_len; + break; + } + default: { + r = -ENOTSUP; + error_setg_errno( + errp, -r, "unknown image encryption format: %u", + encrypt->format); + return r; + } + } + + r = rbd_encryption_format(image, format, opts, opts_size); + if (r < 0) { + error_setg_errno(errp, -r, "encryption format fail"); + return r; + } + + r = rbd_get_size(image, &effective_size); + if (r < 0) { + error_setg_errno(errp, -r, "cannot get effective image size"); + return r; + } + + r = rbd_resize(image, raw_size + (raw_size - effective_size)); + if (r < 0) { + error_setg_errno(errp, -r, "cannot resize image after format"); + return r; + } + + return 0; +} + +static int qemu_rbd_encryption_load(rbd_image_t image, + RbdEncryptionOptions *encrypt, + Error **errp) +{ + int r = 0; + g_autofree char *passphrase = NULL; + size_t passphrase_len; + rbd_encryption_luks1_format_options_t luks_opts; + rbd_encryption_luks2_format_options_t luks2_opts; + rbd_encryption_format_t format; + rbd_encryption_options_t opts; + size_t opts_size; + + switch (encrypt->format) { + case RBD_IMAGE_ENCRYPTION_FORMAT_LUKS: { + memset(&luks_opts, 0, sizeof(luks_opts)); + format = RBD_ENCRYPTION_FORMAT_LUKS1; + opts = &luks_opts; + opts_size = sizeof(luks_opts); + r = qemu_rbd_convert_luks_options( + qapi_RbdEncryptionOptionsLUKS_base(&encrypt->u.luks), + &passphrase, &passphrase_len, errp); + if (r < 0) { + return r; + } + luks_opts.passphrase = passphrase; + luks_opts.passphrase_size = passphrase_len; + break; + } + case RBD_IMAGE_ENCRYPTION_FORMAT_LUKS2: { + memset(&luks2_opts, 0, sizeof(luks2_opts)); + format = RBD_ENCRYPTION_FORMAT_LUKS2; + opts = &luks2_opts; + opts_size = sizeof(luks2_opts); + r = qemu_rbd_convert_luks_options( + qapi_RbdEncryptionOptionsLUKS2_base(&encrypt->u.luks2), + &passphrase, &passphrase_len, errp); + if (r < 0) { + return r; + } + luks2_opts.passphrase = passphrase; + luks2_opts.passphrase_size = passphrase_len; + break; + } + default: { + r = -ENOTSUP; + error_setg_errno( + errp, -r, "unknown image encryption format: %u", + encrypt->format); + return r; + } + } + + r = rbd_encryption_load(image, format, opts, opts_size); + if (r < 0) { + error_setg_errno(errp, -r, "encryption load fail"); + return r; + } + + return 0; +} +#endif + /* FIXME Deprecate and remove keypairs or make it available in QMP. */ static int qemu_rbd_do_create(BlockdevCreateOptions *options, const char *keypairs, const char *password_secret, @@ -368,6 +577,13 @@ static int qemu_rbd_do_create(BlockdevCreateOptions *options, return -EINVAL; } +#ifndef LIBRBD_SUPPORTS_ENCRYPTION + if (opts->has_encrypt) { + error_setg(errp, "RBD library does not support image encryption"); + return -ENOTSUP; + } +#endif + if (opts->has_cluster_size) { int64_t objsize = opts->cluster_size; if ((objsize - 1) & objsize) { /* not a power of 2? */ @@ -393,6 +609,28 @@ static int qemu_rbd_do_create(BlockdevCreateOptions *options, goto out; } +#ifdef LIBRBD_SUPPORTS_ENCRYPTION + if (opts->has_encrypt) { + rbd_image_t image; + + ret = rbd_open(io_ctx, opts->location->image, &image, NULL); + if (ret < 0) { + error_setg_errno(errp, -ret, + "error opening image '%s' for encryption format", + opts->location->image); + goto out; + } + + ret = qemu_rbd_encryption_format(image, opts->encrypt, errp); + rbd_close(image); + if (ret < 0) { + /* encryption format fail, try removing the image */ + rbd_remove(io_ctx, opts->location->image); + goto out; + } + } +#endif + ret = 0; out: rados_ioctx_destroy(io_ctx); @@ -405,6 +643,43 @@ static int qemu_rbd_co_create(BlockdevCreateOptions *options, Error **errp) return qemu_rbd_do_create(options, NULL, NULL, errp); } +static int qemu_rbd_extract_encryption_create_options( + QemuOpts *opts, + RbdEncryptionCreateOptions **spec, + Error **errp) +{ + QDict *opts_qdict; + QDict *encrypt_qdict; + Visitor *v; + int ret = 0; + + opts_qdict = qemu_opts_to_qdict(opts, NULL); + qdict_extract_subqdict(opts_qdict, &encrypt_qdict, "encrypt."); + qobject_unref(opts_qdict); + if (!qdict_size(encrypt_qdict)) { + *spec = NULL; + goto exit; + } + + /* Convert options into a QAPI object */ + v = qobject_input_visitor_new_flat_confused(encrypt_qdict, errp); + if (!v) { + ret = -EINVAL; + goto exit; + } + + visit_type_RbdEncryptionCreateOptions(v, NULL, spec, errp); + visit_free(v); + if (!*spec) { + ret = -EINVAL; + goto exit; + } + +exit: + qobject_unref(encrypt_qdict); + return ret; +} + static int coroutine_fn qemu_rbd_co_create_opts(BlockDriver *drv, const char *filename, QemuOpts *opts, @@ -413,6 +688,7 @@ static int coroutine_fn qemu_rbd_co_create_opts(BlockDriver *drv, BlockdevCreateOptions *create_options; BlockdevCreateOptionsRbd *rbd_opts; BlockdevOptionsRbd *loc; + RbdEncryptionCreateOptions *encrypt = NULL; Error *local_err = NULL; const char *keypairs, *password_secret; QDict *options = NULL; @@ -441,6 +717,13 @@ static int coroutine_fn qemu_rbd_co_create_opts(BlockDriver *drv, goto exit; } + ret = qemu_rbd_extract_encryption_create_options(opts, &encrypt, errp); + if (ret < 0) { + goto exit; + } + rbd_opts->encrypt = encrypt; + rbd_opts->has_encrypt = !!encrypt; + /* * Caution: while qdict_get_try_str() is fine, getting non-string * types would require more care. When @options come from -blockdev @@ -766,12 +1049,24 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, goto failed_open; } + if (opts->has_encrypt) { +#ifdef LIBRBD_SUPPORTS_ENCRYPTION + r = qemu_rbd_encryption_load(s->image, opts->encrypt, errp); + if (r < 0) { + goto failed_post_open; + } +#else + r = -ENOTSUP; + error_setg(errp, "RBD library does not support image encryption"); + goto failed_post_open; +#endif + } + r = rbd_get_size(s->image, &s->image_size); if (r < 0) { error_setg_errno(errp, -r, "error getting image size from %s", s->image_name); - rbd_close(s->image); - goto failed_open; + goto failed_post_open; } /* If we are using an rbd snapshot, we must be r/o, otherwise @@ -779,8 +1074,7 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, if (s->snap != NULL) { r = bdrv_apply_auto_read_only(bs, "rbd snapshots are read-only", errp); if (r < 0) { - rbd_close(s->image); - goto failed_open; + goto failed_post_open; } } @@ -790,6 +1084,8 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, r = 0; goto out; +failed_post_open: + rbd_close(s->image); failed_open: rados_ioctx_destroy(s->io_ctx); g_free(s->snap); @@ -1060,6 +1356,46 @@ static int qemu_rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi) return 0; } +static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs, + Error **errp) +{ + BDRVRBDState *s = bs->opaque; + ImageInfoSpecific *spec_info; + char buf[RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN] = {0}; + int r; + + if (s->image_size >= RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) { + r = rbd_read(s->image, 0, + RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN, buf); + if (r < 0) { + error_setg_errno(errp, -r, "cannot read image start for probe"); + return NULL; + } + } + + spec_info = g_new(ImageInfoSpecific, 1); + *spec_info = (ImageInfoSpecific){ + .type = IMAGE_INFO_SPECIFIC_KIND_RBD, + .u.rbd.data = g_new0(ImageInfoSpecificRbd, 1), + }; + + if (memcmp(buf, rbd_luks_header_verification, + RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) == 0) { + spec_info->u.rbd.data->encryption_format = + RBD_IMAGE_ENCRYPTION_FORMAT_LUKS; + spec_info->u.rbd.data->has_encryption_format = true; + } else if (memcmp(buf, rbd_luks2_header_verification, + RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN) == 0) { + spec_info->u.rbd.data->encryption_format = + RBD_IMAGE_ENCRYPTION_FORMAT_LUKS2; + spec_info->u.rbd.data->has_encryption_format = true; + } else { + spec_info->u.rbd.data->has_encryption_format = false; + } + + return spec_info; +} + static int64_t qemu_rbd_getlength(BlockDriverState *bs) { BDRVRBDState *s = bs->opaque; @@ -1253,6 +1589,22 @@ static QemuOptsList qemu_rbd_create_opts = { .type = QEMU_OPT_STRING, .help = "ID of secret providing the password", }, + { + .name = "encrypt.format", + .type = QEMU_OPT_STRING, + .help = "Encrypt the image, format choices: 'luks', 'luks2'", + }, + { + .name = "encrypt.cipher-alg", + .type = QEMU_OPT_STRING, + .help = "Name of encryption cipher algorithm" + " (allowed values: aes-128, aes-256)", + }, + { + .name = "encrypt.key-secret", + .type = QEMU_OPT_STRING, + .help = "ID of secret providing LUKS passphrase", + }, { /* end of list */ } } }; @@ -1282,6 +1634,7 @@ static BlockDriver bdrv_rbd = { .bdrv_co_create_opts = qemu_rbd_co_create_opts, .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_get_info = qemu_rbd_getinfo, + .bdrv_get_specific_info = qemu_rbd_get_specific_info, .create_opts = &qemu_rbd_create_opts, .bdrv_getlength = qemu_rbd_getlength, .bdrv_co_truncate = qemu_rbd_co_truncate, diff --git a/qapi/block-core.json b/qapi/block-core.json index 3114ba69bb..4a46552816 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -127,6 +127,18 @@ 'extents': ['ImageInfo'] } } +## +# @ImageInfoSpecificRbd: +# +# @encryption-format: Image encryption format +# +# Since: 6.1 +## +{ 'struct': 'ImageInfoSpecificRbd', + 'data': { + '*encryption-format': 'RbdImageEncryptionFormat' + } } + ## # @ImageInfoSpecific: # @@ -141,7 +153,8 @@ # If we need to add block driver specific parameters for # LUKS in future, then we'll subclass QCryptoBlockInfoLUKS # to define a ImageInfoSpecificLUKS - 'luks': 'QCryptoBlockInfoLUKS' + 'luks': 'QCryptoBlockInfoLUKS', + 'rbd': 'ImageInfoSpecificRbd' } } ## @@ -3613,6 +3626,94 @@ { 'enum': 'RbdAuthMode', 'data': [ 'cephx', 'none' ] } +## +# @RbdImageEncryptionFormat: +# +# Since: 6.1 +## +{ 'enum': 'RbdImageEncryptionFormat', + 'data': [ 'luks', 'luks2' ] } + +## +# @RbdEncryptionOptionsLUKSBase: +# +# @key-secret: ID of a QCryptoSecret object providing a passphrase +# for unlocking the encryption +# +# Since: 6.1 +## +{ 'struct': 'RbdEncryptionOptionsLUKSBase', + 'data': { 'key-secret': 'str' } } + +## +# @RbdEncryptionCreateOptionsLUKSBase: +# +# @cipher-alg: The encryption algorithm +# +# Since: 6.1 +## +{ 'struct': 'RbdEncryptionCreateOptionsLUKSBase', + 'base': 'RbdEncryptionOptionsLUKSBase', + 'data': { '*cipher-alg': 'QCryptoCipherAlgorithm' } } + +## +# @RbdEncryptionOptionsLUKS: +# +# Since: 6.1 +## +{ 'struct': 'RbdEncryptionOptionsLUKS', + 'base': 'RbdEncryptionOptionsLUKSBase', + 'data': { } } + +## +# @RbdEncryptionOptionsLUKS2: +# +# Since: 6.1 +## +{ 'struct': 'RbdEncryptionOptionsLUKS2', + 'base': 'RbdEncryptionOptionsLUKSBase', + 'data': { } } + +## +# @RbdEncryptionCreateOptionsLUKS: +# +# Since: 6.1 +## +{ 'struct': 'RbdEncryptionCreateOptionsLUKS', + 'base': 'RbdEncryptionCreateOptionsLUKSBase', + 'data': { } } + +## +# @RbdEncryptionCreateOptionsLUKS2: +# +# Since: 6.1 +## +{ 'struct': 'RbdEncryptionCreateOptionsLUKS2', + 'base': 'RbdEncryptionCreateOptionsLUKSBase', + 'data': { } } + +## +# @RbdEncryptionOptions: +# +# Since: 6.1 +## +{ 'union': 'RbdEncryptionOptions', + 'base': { 'format': 'RbdImageEncryptionFormat' }, + 'discriminator': 'format', + 'data': { 'luks': 'RbdEncryptionOptionsLUKS', + 'luks2': 'RbdEncryptionOptionsLUKS2' } } + +## +# @RbdEncryptionCreateOptions: +# +# Since: 6.1 +## +{ 'union': 'RbdEncryptionCreateOptions', + 'base': { 'format': 'RbdImageEncryptionFormat' }, + 'discriminator': 'format', + 'data': { 'luks': 'RbdEncryptionCreateOptionsLUKS', + 'luks2': 'RbdEncryptionCreateOptionsLUKS2' } } + ## # @BlockdevOptionsRbd: # @@ -3628,6 +3729,8 @@ # # @snapshot: Ceph snapshot name. # +# @encrypt: Image encryption options. (Since 6.1) +# # @user: Ceph id name. # # @auth-client-required: Acceptable authentication modes. @@ -3650,6 +3753,7 @@ 'image': 'str', '*conf': 'str', '*snapshot': 'str', + '*encrypt': 'RbdEncryptionOptions', '*user': 'str', '*auth-client-required': ['RbdAuthMode'], '*key-secret': 'str', @@ -4403,13 +4507,15 @@ # point to a snapshot. # @size: Size of the virtual disk in bytes # @cluster-size: RBD object size +# @encrypt: Image encryption options. (Since 6.1) # # Since: 2.12 ## { 'struct': 'BlockdevCreateOptionsRbd', 'data': { 'location': 'BlockdevOptionsRbd', 'size': 'size', - '*cluster-size' : 'size' } } + '*cluster-size' : 'size', + '*encrypt' : 'RbdEncryptionCreateOptions' } } ## # @BlockdevVmdkSubformat: From 48672ac058419a2162ea4579d507278e091c1e3e Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Fri, 2 Jul 2021 19:23:51 +0200 Subject: [PATCH 03/28] block/rbd: bump librbd requirement to luminous release Ceph Luminous (version 12.2.z) is almost 4 years old at this point. Bump the requirement to get rid of the ifdef'ry in the code. Qemu 6.1 dropped the support for RHEL-7 which was the last supported OS that required an older librbd. Signed-off-by: Peter Lieven Reviewed-by: Ilya Dryomov Message-Id: <20210702172356.11574-2-idryomov@gmail.com> Signed-off-by: Kevin Wolf --- block/rbd.c | 120 ++++------------------------------------------------ meson.build | 7 ++- 2 files changed, 13 insertions(+), 114 deletions(-) diff --git a/block/rbd.c b/block/rbd.c index f51adb3646..b4b928bbb9 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -55,24 +55,10 @@ * leading "\". */ -/* rbd_aio_discard added in 0.1.2 */ -#if LIBRBD_VERSION_CODE >= LIBRBD_VERSION(0, 1, 2) -#define LIBRBD_SUPPORTS_DISCARD -#else -#undef LIBRBD_SUPPORTS_DISCARD -#endif - #define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_OBJ_ORDER) #define RBD_MAX_SNAPS 100 -/* The LIBRBD_SUPPORTS_IOVEC is defined in librbd.h */ -#ifdef LIBRBD_SUPPORTS_IOVEC -#define LIBRBD_USE_IOVEC 1 -#else -#define LIBRBD_USE_IOVEC 0 -#endif - #define RBD_ENCRYPTION_LUKS_HEADER_VERIFICATION_LEN 8 static const char rbd_luks_header_verification[ @@ -96,7 +82,6 @@ typedef struct RBDAIOCB { BlockAIOCB common; int64_t ret; QEMUIOVector *qiov; - char *bounce; RBDAIOCmd cmd; int error; struct BDRVRBDState *s; @@ -106,7 +91,6 @@ typedef struct RADOSCB { RBDAIOCB *acb; struct BDRVRBDState *s; int64_t size; - char *buf; int64_t ret; } RADOSCB; @@ -354,13 +338,9 @@ static int qemu_rbd_set_keypairs(rados_t cluster, const char *keypairs_json, static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs) { - if (LIBRBD_USE_IOVEC) { - RBDAIOCB *acb = rcb->acb; - iov_memset(acb->qiov->iov, acb->qiov->niov, offs, 0, - acb->qiov->size - offs); - } else { - memset(rcb->buf + offs, 0, rcb->size - offs); - } + RBDAIOCB *acb = rcb->acb; + iov_memset(acb->qiov->iov, acb->qiov->niov, offs, 0, + acb->qiov->size - offs); } #ifdef LIBRBD_SUPPORTS_ENCRYPTION @@ -787,13 +767,6 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb) g_free(rcb); - if (!LIBRBD_USE_IOVEC) { - if (acb->cmd == RBD_AIO_READ) { - qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); - } - qemu_vfree(acb->bounce); - } - acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret)); qemu_aio_unref(acb); @@ -1174,28 +1147,6 @@ static void rbd_finish_aiocb(rbd_completion_t c, RADOSCB *rcb) rbd_finish_bh, rcb); } -static int rbd_aio_discard_wrapper(rbd_image_t image, - uint64_t off, - uint64_t len, - rbd_completion_t comp) -{ -#ifdef LIBRBD_SUPPORTS_DISCARD - return rbd_aio_discard(image, off, len, comp); -#else - return -ENOTSUP; -#endif -} - -static int rbd_aio_flush_wrapper(rbd_image_t image, - rbd_completion_t comp) -{ -#ifdef LIBRBD_SUPPORTS_AIO_FLUSH - return rbd_aio_flush(image, comp); -#else - return -ENOTSUP; -#endif -} - static BlockAIOCB *rbd_start_aio(BlockDriverState *bs, int64_t off, QEMUIOVector *qiov, @@ -1218,21 +1169,6 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs, rcb = g_new(RADOSCB, 1); - if (!LIBRBD_USE_IOVEC) { - if (cmd == RBD_AIO_DISCARD || cmd == RBD_AIO_FLUSH) { - acb->bounce = NULL; - } else { - acb->bounce = qemu_try_blockalign(bs, qiov->size); - if (acb->bounce == NULL) { - goto failed; - } - } - if (cmd == RBD_AIO_WRITE) { - qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); - } - rcb->buf = acb->bounce; - } - acb->ret = 0; acb->error = 0; acb->s = s; @@ -1246,7 +1182,7 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs, } switch (cmd) { - case RBD_AIO_WRITE: { + case RBD_AIO_WRITE: /* * RBD APIs don't allow us to write more than actual size, so in order * to support growing images, we resize the image before write @@ -1258,25 +1194,16 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs, goto failed_completion; } } -#ifdef LIBRBD_SUPPORTS_IOVEC - r = rbd_aio_writev(s->image, qiov->iov, qiov->niov, off, c); -#else - r = rbd_aio_write(s->image, off, size, rcb->buf, c); -#endif + r = rbd_aio_writev(s->image, qiov->iov, qiov->niov, off, c); break; - } case RBD_AIO_READ: -#ifdef LIBRBD_SUPPORTS_IOVEC - r = rbd_aio_readv(s->image, qiov->iov, qiov->niov, off, c); -#else - r = rbd_aio_read(s->image, off, size, rcb->buf, c); -#endif + r = rbd_aio_readv(s->image, qiov->iov, qiov->niov, off, c); break; case RBD_AIO_DISCARD: - r = rbd_aio_discard_wrapper(s->image, off, size, c); + r = rbd_aio_discard(s->image, off, size, c); break; case RBD_AIO_FLUSH: - r = rbd_aio_flush_wrapper(s->image, c); + r = rbd_aio_flush(s->image, c); break; default: r = -EINVAL; @@ -1291,9 +1218,6 @@ failed_completion: rbd_aio_release(c); failed: g_free(rcb); - if (!LIBRBD_USE_IOVEC) { - qemu_vfree(acb->bounce); - } qemu_aio_unref(acb); return NULL; @@ -1319,7 +1243,6 @@ static BlockAIOCB *qemu_rbd_aio_pwritev(BlockDriverState *bs, RBD_AIO_WRITE); } -#ifdef LIBRBD_SUPPORTS_AIO_FLUSH static BlockAIOCB *qemu_rbd_aio_flush(BlockDriverState *bs, BlockCompletionFunc *cb, void *opaque) @@ -1327,20 +1250,6 @@ static BlockAIOCB *qemu_rbd_aio_flush(BlockDriverState *bs, return rbd_start_aio(bs, 0, NULL, 0, cb, opaque, RBD_AIO_FLUSH); } -#else - -static int qemu_rbd_co_flush(BlockDriverState *bs) -{ -#if LIBRBD_VERSION_CODE >= LIBRBD_VERSION(0, 1, 1) - /* rbd_flush added in 0.1.1 */ - BDRVRBDState *s = bs->opaque; - return rbd_flush(s->image); -#else - return 0; -#endif -} -#endif - static int qemu_rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi) { BDRVRBDState *s = bs->opaque; @@ -1546,7 +1455,6 @@ static int qemu_rbd_snap_list(BlockDriverState *bs, return snap_count; } -#ifdef LIBRBD_SUPPORTS_DISCARD static BlockAIOCB *qemu_rbd_aio_pdiscard(BlockDriverState *bs, int64_t offset, int bytes, @@ -1556,9 +1464,7 @@ static BlockAIOCB *qemu_rbd_aio_pdiscard(BlockDriverState *bs, return rbd_start_aio(bs, offset, NULL, bytes, cb, opaque, RBD_AIO_DISCARD); } -#endif -#ifdef LIBRBD_SUPPORTS_INVALIDATE static void coroutine_fn qemu_rbd_co_invalidate_cache(BlockDriverState *bs, Error **errp) { @@ -1568,7 +1474,6 @@ static void coroutine_fn qemu_rbd_co_invalidate_cache(BlockDriverState *bs, error_setg_errno(errp, -r, "Failed to invalidate the cache"); } } -#endif static QemuOptsList qemu_rbd_create_opts = { .name = "rbd-create-opts", @@ -1643,23 +1548,14 @@ static BlockDriver bdrv_rbd = { .bdrv_aio_preadv = qemu_rbd_aio_preadv, .bdrv_aio_pwritev = qemu_rbd_aio_pwritev, -#ifdef LIBRBD_SUPPORTS_AIO_FLUSH .bdrv_aio_flush = qemu_rbd_aio_flush, -#else - .bdrv_co_flush_to_disk = qemu_rbd_co_flush, -#endif - -#ifdef LIBRBD_SUPPORTS_DISCARD .bdrv_aio_pdiscard = qemu_rbd_aio_pdiscard, -#endif .bdrv_snapshot_create = qemu_rbd_snap_create, .bdrv_snapshot_delete = qemu_rbd_snap_remove, .bdrv_snapshot_list = qemu_rbd_snap_list, .bdrv_snapshot_goto = qemu_rbd_snap_rollback, -#ifdef LIBRBD_SUPPORTS_INVALIDATE .bdrv_co_invalidate_cache = qemu_rbd_co_invalidate_cache, -#endif .strong_runtime_opts = qemu_rbd_strong_runtime_opts, }; diff --git a/meson.build b/meson.build index 7e12de01be..eb362ee5eb 100644 --- a/meson.build +++ b/meson.build @@ -710,13 +710,16 @@ if not get_option('rbd').auto() or have_block int main(void) { rados_t cluster; rados_create(&cluster, NULL); + #if LIBRBD_VERSION_CODE < LIBRBD_VERSION(1, 12, 0) + #error + #endif return 0; }''', dependencies: [librbd, librados]) rbd = declare_dependency(dependencies: [librbd, librados]) elif get_option('rbd').enabled() - error('could not link librados') + error('librbd >= 1.12.0 required') else - warning('could not link librados, disabling') + warning('librbd >= 1.12.0 not found, disabling') endif endif endif From 832a93dcb8a870f9debcc2513a040bb3dea858f2 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Fri, 2 Jul 2021 19:23:52 +0200 Subject: [PATCH 04/28] block/rbd: store object_size in BDRVRBDState Signed-off-by: Peter Lieven Reviewed-by: Ilya Dryomov Message-Id: <20210702172356.11574-3-idryomov@gmail.com> Signed-off-by: Kevin Wolf --- block/rbd.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/block/rbd.c b/block/rbd.c index b4b928bbb9..1ebf8f7e48 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -102,6 +102,7 @@ typedef struct BDRVRBDState { char *snap; char *namespace; uint64_t image_size; + uint64_t object_size; } BDRVRBDState; static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx, @@ -958,6 +959,7 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, const QDictEntry *e; Error *local_err = NULL; char *keypairs, *secretid; + rbd_image_info_t info; int r; keypairs = g_strdup(qdict_get_try_str(options, "=keyvalue-pairs")); @@ -1035,12 +1037,14 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, #endif } - r = rbd_get_size(s->image, &s->image_size); + r = rbd_stat(s->image, &info, sizeof(info)); if (r < 0) { - error_setg_errno(errp, -r, "error getting image size from %s", + error_setg_errno(errp, -r, "error getting image info from %s", s->image_name); goto failed_post_open; } + s->image_size = info.size; + s->object_size = info.obj_size; /* If we are using an rbd snapshot, we must be r/o, otherwise * leave as-is */ @@ -1253,15 +1257,7 @@ static BlockAIOCB *qemu_rbd_aio_flush(BlockDriverState *bs, static int qemu_rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi) { BDRVRBDState *s = bs->opaque; - rbd_image_info_t info; - int r; - - r = rbd_stat(s->image, &info, sizeof(info)); - if (r < 0) { - return r; - } - - bdi->cluster_size = info.obj_size; + bdi->cluster_size = s->object_size; return 0; } From 6d9214189e22205d42c4ad1fae5af265f0a94dd3 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Fri, 2 Jul 2021 19:23:53 +0200 Subject: [PATCH 05/28] block/rbd: update s->image_size in qemu_rbd_getlength While at it just call rbd_get_size and avoid rbd_image_info_t. Signed-off-by: Peter Lieven Reviewed-by: Ilya Dryomov Message-Id: <20210702172356.11574-4-idryomov@gmail.com> Signed-off-by: Kevin Wolf --- block/rbd.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/block/rbd.c b/block/rbd.c index 1ebf8f7e48..e2028d3db5 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -1304,15 +1304,14 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs, static int64_t qemu_rbd_getlength(BlockDriverState *bs) { BDRVRBDState *s = bs->opaque; - rbd_image_info_t info; int r; - r = rbd_stat(s->image, &info, sizeof(info)); + r = rbd_get_size(s->image, &s->image_size); if (r < 0) { return r; } - return info.size; + return s->image_size; } static int coroutine_fn qemu_rbd_co_truncate(BlockDriverState *bs, From c3e5fac534c6effc329b962162f79c799398f013 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Fri, 2 Jul 2021 19:23:54 +0200 Subject: [PATCH 06/28] block/rbd: migrate from aio to coroutines Signed-off-by: Peter Lieven Reviewed-by: Ilya Dryomov Message-Id: <20210702172356.11574-5-idryomov@gmail.com> Signed-off-by: Kevin Wolf --- block/rbd.c | 252 +++++++++++++++++++--------------------------------- 1 file changed, 90 insertions(+), 162 deletions(-) diff --git a/block/rbd.c b/block/rbd.c index e2028d3db5..380ad28861 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -78,22 +78,6 @@ typedef enum { RBD_AIO_FLUSH } RBDAIOCmd; -typedef struct RBDAIOCB { - BlockAIOCB common; - int64_t ret; - QEMUIOVector *qiov; - RBDAIOCmd cmd; - int error; - struct BDRVRBDState *s; -} RBDAIOCB; - -typedef struct RADOSCB { - RBDAIOCB *acb; - struct BDRVRBDState *s; - int64_t size; - int64_t ret; -} RADOSCB; - typedef struct BDRVRBDState { rados_t cluster; rados_ioctx_t io_ctx; @@ -105,6 +89,13 @@ typedef struct BDRVRBDState { uint64_t object_size; } BDRVRBDState; +typedef struct RBDTask { + BlockDriverState *bs; + Coroutine *co; + bool complete; + int64_t ret; +} RBDTask; + static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx, BlockdevOptionsRbd *opts, bool cache, const char *keypairs, const char *secretid, @@ -337,13 +328,6 @@ static int qemu_rbd_set_keypairs(rados_t cluster, const char *keypairs_json, return ret; } -static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs) -{ - RBDAIOCB *acb = rcb->acb; - iov_memset(acb->qiov->iov, acb->qiov->niov, offs, 0, - acb->qiov->size - offs); -} - #ifdef LIBRBD_SUPPORTS_ENCRYPTION static int qemu_rbd_convert_luks_options( RbdEncryptionOptionsLUKSBase *luks_opts, @@ -733,46 +717,6 @@ exit: return ret; } -/* - * This aio completion is being called from rbd_finish_bh() and runs in qemu - * BH context. - */ -static void qemu_rbd_complete_aio(RADOSCB *rcb) -{ - RBDAIOCB *acb = rcb->acb; - int64_t r; - - r = rcb->ret; - - if (acb->cmd != RBD_AIO_READ) { - if (r < 0) { - acb->ret = r; - acb->error = 1; - } else if (!acb->error) { - acb->ret = rcb->size; - } - } else { - if (r < 0) { - qemu_rbd_memset(rcb, 0); - acb->ret = r; - acb->error = 1; - } else if (r < rcb->size) { - qemu_rbd_memset(rcb, r); - if (!acb->error) { - acb->ret = rcb->size; - } - } else if (!acb->error) { - acb->ret = r; - } - } - - g_free(rcb); - - acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret)); - - qemu_aio_unref(acb); -} - static char *qemu_rbd_mon_host(BlockdevOptionsRbd *opts, Error **errp) { const char **vals; @@ -1122,89 +1066,59 @@ static int qemu_rbd_resize(BlockDriverState *bs, uint64_t size) return 0; } -static const AIOCBInfo rbd_aiocb_info = { - .aiocb_size = sizeof(RBDAIOCB), -}; - -static void rbd_finish_bh(void *opaque) +static void qemu_rbd_finish_bh(void *opaque) { - RADOSCB *rcb = opaque; - qemu_rbd_complete_aio(rcb); + RBDTask *task = opaque; + task->complete = 1; + aio_co_wake(task->co); } /* - * This is the callback function for rbd_aio_read and _write + * This is the completion callback function for all rbd aio calls + * started from qemu_rbd_start_co(). * * Note: this function is being called from a non qemu thread so * we need to be careful about what we do here. Generally we only * schedule a BH, and do the rest of the io completion handling - * from rbd_finish_bh() which runs in a qemu context. + * from qemu_rbd_finish_bh() which runs in a qemu context. */ -static void rbd_finish_aiocb(rbd_completion_t c, RADOSCB *rcb) +static void qemu_rbd_completion_cb(rbd_completion_t c, RBDTask *task) { - RBDAIOCB *acb = rcb->acb; - - rcb->ret = rbd_aio_get_return_value(c); + task->ret = rbd_aio_get_return_value(c); rbd_aio_release(c); - - replay_bh_schedule_oneshot_event(bdrv_get_aio_context(acb->common.bs), - rbd_finish_bh, rcb); + aio_bh_schedule_oneshot(bdrv_get_aio_context(task->bs), + qemu_rbd_finish_bh, task); } -static BlockAIOCB *rbd_start_aio(BlockDriverState *bs, - int64_t off, - QEMUIOVector *qiov, - int64_t size, - BlockCompletionFunc *cb, - void *opaque, - RBDAIOCmd cmd) +static int coroutine_fn qemu_rbd_start_co(BlockDriverState *bs, + uint64_t offset, + uint64_t bytes, + QEMUIOVector *qiov, + int flags, + RBDAIOCmd cmd) { - RBDAIOCB *acb; - RADOSCB *rcb = NULL; + BDRVRBDState *s = bs->opaque; + RBDTask task = { .bs = bs, .co = qemu_coroutine_self() }; rbd_completion_t c; int r; - BDRVRBDState *s = bs->opaque; + assert(!qiov || qiov->size == bytes); - acb = qemu_aio_get(&rbd_aiocb_info, bs, cb, opaque); - acb->cmd = cmd; - acb->qiov = qiov; - assert(!qiov || qiov->size == size); - - rcb = g_new(RADOSCB, 1); - - acb->ret = 0; - acb->error = 0; - acb->s = s; - - rcb->acb = acb; - rcb->s = acb->s; - rcb->size = size; - r = rbd_aio_create_completion(rcb, (rbd_callback_t) rbd_finish_aiocb, &c); + r = rbd_aio_create_completion(&task, + (rbd_callback_t) qemu_rbd_completion_cb, &c); if (r < 0) { - goto failed; + return r; } switch (cmd) { - case RBD_AIO_WRITE: - /* - * RBD APIs don't allow us to write more than actual size, so in order - * to support growing images, we resize the image before write - * operations that exceed the current size. - */ - if (off + size > s->image_size) { - r = qemu_rbd_resize(bs, off + size); - if (r < 0) { - goto failed_completion; - } - } - r = rbd_aio_writev(s->image, qiov->iov, qiov->niov, off, c); - break; case RBD_AIO_READ: - r = rbd_aio_readv(s->image, qiov->iov, qiov->niov, off, c); + r = rbd_aio_readv(s->image, qiov->iov, qiov->niov, offset, c); + break; + case RBD_AIO_WRITE: + r = rbd_aio_writev(s->image, qiov->iov, qiov->niov, offset, c); break; case RBD_AIO_DISCARD: - r = rbd_aio_discard(s->image, off, size, c); + r = rbd_aio_discard(s->image, offset, bytes, c); break; case RBD_AIO_FLUSH: r = rbd_aio_flush(s->image, c); @@ -1214,44 +1128,69 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs, } if (r < 0) { - goto failed_completion; + error_report("rbd request failed early: cmd %d offset %" PRIu64 + " bytes %" PRIu64 " flags %d r %d (%s)", cmd, offset, + bytes, flags, r, strerror(-r)); + rbd_aio_release(c); + return r; } - return &acb->common; -failed_completion: - rbd_aio_release(c); -failed: - g_free(rcb); + while (!task.complete) { + qemu_coroutine_yield(); + } - qemu_aio_unref(acb); - return NULL; + if (task.ret < 0) { + error_report("rbd request failed: cmd %d offset %" PRIu64 " bytes %" + PRIu64 " flags %d task.ret %" PRIi64 " (%s)", cmd, offset, + bytes, flags, task.ret, strerror(-task.ret)); + return task.ret; + } + + /* zero pad short reads */ + if (cmd == RBD_AIO_READ && task.ret < qiov->size) { + qemu_iovec_memset(qiov, task.ret, 0, qiov->size - task.ret); + } + + return 0; } -static BlockAIOCB *qemu_rbd_aio_preadv(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags, - BlockCompletionFunc *cb, - void *opaque) +static int +coroutine_fn qemu_rbd_co_preadv(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, + int flags) { - return rbd_start_aio(bs, offset, qiov, bytes, cb, opaque, - RBD_AIO_READ); + return qemu_rbd_start_co(bs, offset, bytes, qiov, flags, RBD_AIO_READ); } -static BlockAIOCB *qemu_rbd_aio_pwritev(BlockDriverState *bs, - uint64_t offset, uint64_t bytes, - QEMUIOVector *qiov, int flags, - BlockCompletionFunc *cb, - void *opaque) +static int +coroutine_fn qemu_rbd_co_pwritev(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, + int flags) { - return rbd_start_aio(bs, offset, qiov, bytes, cb, opaque, - RBD_AIO_WRITE); + BDRVRBDState *s = bs->opaque; + /* + * RBD APIs don't allow us to write more than actual size, so in order + * to support growing images, we resize the image before write + * operations that exceed the current size. + */ + if (offset + bytes > s->image_size) { + int r = qemu_rbd_resize(bs, offset + bytes); + if (r < 0) { + return r; + } + } + return qemu_rbd_start_co(bs, offset, bytes, qiov, flags, RBD_AIO_WRITE); } -static BlockAIOCB *qemu_rbd_aio_flush(BlockDriverState *bs, - BlockCompletionFunc *cb, - void *opaque) +static int coroutine_fn qemu_rbd_co_flush(BlockDriverState *bs) { - return rbd_start_aio(bs, 0, NULL, 0, cb, opaque, RBD_AIO_FLUSH); + return qemu_rbd_start_co(bs, 0, 0, NULL, 0, RBD_AIO_FLUSH); +} + +static int coroutine_fn qemu_rbd_co_pdiscard(BlockDriverState *bs, + int64_t offset, int count) +{ + return qemu_rbd_start_co(bs, offset, count, NULL, 0, RBD_AIO_DISCARD); } static int qemu_rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi) @@ -1450,16 +1389,6 @@ static int qemu_rbd_snap_list(BlockDriverState *bs, return snap_count; } -static BlockAIOCB *qemu_rbd_aio_pdiscard(BlockDriverState *bs, - int64_t offset, - int bytes, - BlockCompletionFunc *cb, - void *opaque) -{ - return rbd_start_aio(bs, offset, NULL, bytes, cb, opaque, - RBD_AIO_DISCARD); -} - static void coroutine_fn qemu_rbd_co_invalidate_cache(BlockDriverState *bs, Error **errp) { @@ -1540,11 +1469,10 @@ static BlockDriver bdrv_rbd = { .bdrv_co_truncate = qemu_rbd_co_truncate, .protocol_name = "rbd", - .bdrv_aio_preadv = qemu_rbd_aio_preadv, - .bdrv_aio_pwritev = qemu_rbd_aio_pwritev, - - .bdrv_aio_flush = qemu_rbd_aio_flush, - .bdrv_aio_pdiscard = qemu_rbd_aio_pdiscard, + .bdrv_co_preadv = qemu_rbd_co_preadv, + .bdrv_co_pwritev = qemu_rbd_co_pwritev, + .bdrv_co_flush_to_disk = qemu_rbd_co_flush, + .bdrv_co_pdiscard = qemu_rbd_co_pdiscard, .bdrv_snapshot_create = qemu_rbd_snap_create, .bdrv_snapshot_delete = qemu_rbd_snap_remove, From c56ac27d2ad583aa2db5382bb510a33b638a4ab5 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Fri, 2 Jul 2021 19:23:55 +0200 Subject: [PATCH 07/28] block/rbd: add write zeroes support This patch wittingly sets BDRV_REQ_NO_FALLBACK and silently ignores BDRV_REQ_MAY_UNMAP for older librbd versions. The rationale for this is as follows (citing Ilya Dryomov current RBD maintainer): ---8<--- a) remove the BDRV_REQ_MAY_UNMAP check in qemu_rbd_co_pwrite_zeroes() and as a consequence always unmap if librbd is too old It's not clear what qemu's expectation is but in general Write Zeroes is allowed to unmap. The only guarantee is that subsequent reads return zeroes, everything else is a hint. This is how it is specified in the kernel and in the NVMe spec. In particular, block/nvme.c implements it as follows: if (flags & BDRV_REQ_MAY_UNMAP) { cdw12 |= (1 << 25); } This sets the Deallocate bit. But if it's not set, the device may still deallocate: """ If the Deallocate bit (CDW12.DEAC) is set to '1' in a Write Zeroes command, and the namespace supports clearing all bytes to 0h in the values read (e.g., bits 2:0 in the DLFEAT field are set to 001b) from a deallocated logical block and its metadata (excluding protection information), then for each specified logical block, the controller: - should deallocate that logical block; ... If the Deallocate bit is cleared to '0' in a Write Zeroes command, and the namespace supports clearing all bytes to 0h in the values read (e.g., bits 2:0 in the DLFEAT field are set to 001b) from a deallocated logical block and its metadata (excluding protection information), then, for each specified logical block, the controller: - may deallocate that logical block; """ https://nvmexpress.org/wp-content/uploads/NVM-Express-NVM-Command-Set-Specification-2021.06.02-Ratified-1.pdf b) set BDRV_REQ_NO_FALLBACK in supported_zero_flags Again, it's not clear what qemu expects here, but without it we end up in a ridiculous situation where specifying the "don't allow slow fallback" switch immediately fails all efficient zeroing requests on a device where Write Zeroes is always efficient: $ qemu-io -c 'help write' | grep -- '-[zun]' -n, -- with -z, don't allow slow fallback -u, -- with -z, allow unmapping -z, -- write zeroes using blk_co_pwrite_zeroes $ qemu-io -f rbd -c 'write -z -u -n 0 1M' rbd:foo/bar write failed: Operation not supported --->8--- Signed-off-by: Peter Lieven Reviewed-by: Ilya Dryomov Message-Id: <20210702172356.11574-6-idryomov@gmail.com> Signed-off-by: Kevin Wolf --- block/rbd.c | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/block/rbd.c b/block/rbd.c index 380ad28861..3152bc8ba0 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -75,7 +75,8 @@ typedef enum { RBD_AIO_READ, RBD_AIO_WRITE, RBD_AIO_DISCARD, - RBD_AIO_FLUSH + RBD_AIO_FLUSH, + RBD_AIO_WRITE_ZEROES } RBDAIOCmd; typedef struct BDRVRBDState { @@ -999,6 +1000,10 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, } } +#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES + bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK; +#endif + /* When extending regular files, we get zeros from the OS */ bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE; @@ -1123,6 +1128,18 @@ static int coroutine_fn qemu_rbd_start_co(BlockDriverState *bs, case RBD_AIO_FLUSH: r = rbd_aio_flush(s->image, c); break; +#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES + case RBD_AIO_WRITE_ZEROES: { + int zero_flags = 0; +#ifdef RBD_WRITE_ZEROES_FLAG_THICK_PROVISION + if (!(flags & BDRV_REQ_MAY_UNMAP)) { + zero_flags = RBD_WRITE_ZEROES_FLAG_THICK_PROVISION; + } +#endif + r = rbd_aio_write_zeroes(s->image, offset, bytes, c, zero_flags, 0); + break; + } +#endif default: r = -EINVAL; } @@ -1193,6 +1210,16 @@ static int coroutine_fn qemu_rbd_co_pdiscard(BlockDriverState *bs, return qemu_rbd_start_co(bs, offset, count, NULL, 0, RBD_AIO_DISCARD); } +#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES +static int +coroutine_fn qemu_rbd_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, + int count, BdrvRequestFlags flags) +{ + return qemu_rbd_start_co(bs, offset, count, NULL, flags, + RBD_AIO_WRITE_ZEROES); +} +#endif + static int qemu_rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi) { BDRVRBDState *s = bs->opaque; @@ -1473,6 +1500,9 @@ static BlockDriver bdrv_rbd = { .bdrv_co_pwritev = qemu_rbd_co_pwritev, .bdrv_co_flush_to_disk = qemu_rbd_co_flush, .bdrv_co_pdiscard = qemu_rbd_co_pdiscard, +#ifdef LIBRBD_SUPPORTS_WRITE_ZEROES + .bdrv_co_pwrite_zeroes = qemu_rbd_co_pwrite_zeroes, +#endif .bdrv_snapshot_create = qemu_rbd_snap_create, .bdrv_snapshot_delete = qemu_rbd_snap_remove, From eb06cbab7e92caf15033c91dfcacd2ba5d7bc88a Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Fri, 2 Jul 2021 19:23:56 +0200 Subject: [PATCH 08/28] block/rbd: drop qemu_rbd_refresh_limits librbd supports 1 byte alignment for all aio operations. Currently, there is no API call to query limits from the Ceph ObjectStore backend. So drop the bdrv_refresh_limits completely until there is such an API call. Signed-off-by: Peter Lieven Reviewed-by: Ilya Dryomov Message-Id: <20210702172356.11574-7-idryomov@gmail.com> Signed-off-by: Kevin Wolf --- block/rbd.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/block/rbd.c b/block/rbd.c index 3152bc8ba0..01a7b94d62 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -240,14 +240,6 @@ done: return; } - -static void qemu_rbd_refresh_limits(BlockDriverState *bs, Error **errp) -{ - /* XXX Does RBD support AIO on less than 512-byte alignment? */ - bs->bl.request_alignment = 512; -} - - static int qemu_rbd_set_auth(rados_t cluster, BlockdevOptionsRbd *opts, Error **errp) { @@ -1482,7 +1474,6 @@ static BlockDriver bdrv_rbd = { .format_name = "rbd", .instance_size = sizeof(BDRVRBDState), .bdrv_parse_filename = qemu_rbd_parse_filename, - .bdrv_refresh_limits = qemu_rbd_refresh_limits, .bdrv_file_open = qemu_rbd_open, .bdrv_close = qemu_rbd_close, .bdrv_reopen_prepare = qemu_rbd_reopen_prepare, From c2615bdfbd6d5a1a48438102f8ab328fe30b8e27 Mon Sep 17 00:00:00 2001 From: Heinrich Schuchardt Date: Tue, 29 Jun 2021 08:36:02 +0200 Subject: [PATCH 09/28] util/uri: do not check argument of uri_free() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit uri_free() checks if its argument is NULL in uri_clean() and g_free(). There is no need to check the argument before the call. Signed-off-by: Heinrich Schuchardt Message-Id: <20210629063602.4239-1-xypron.glpk@gmx.de> Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Richard W.M. Jones Signed-off-by: Kevin Wolf --- block/nfs.c | 4 +--- block/ssh.c | 4 +--- util/uri.c | 22 ++++++---------------- 3 files changed, 8 insertions(+), 22 deletions(-) diff --git a/block/nfs.c b/block/nfs.c index 7dff64f489..9aeaefb364 100644 --- a/block/nfs.c +++ b/block/nfs.c @@ -147,9 +147,7 @@ out: if (qp) { query_params_free(qp); } - if (uri) { - uri_free(uri); - } + uri_free(uri); return ret; } diff --git a/block/ssh.c b/block/ssh.c index d008caf059..e0fbb4934b 100644 --- a/block/ssh.c +++ b/block/ssh.c @@ -237,9 +237,7 @@ static int parse_uri(const char *filename, QDict *options, Error **errp) return 0; err: - if (uri) { - uri_free(uri); - } + uri_free(uri); return -EINVAL; } diff --git a/util/uri.c b/util/uri.c index 8bdef84120..ff72c6005f 100644 --- a/util/uri.c +++ b/util/uri.c @@ -1340,7 +1340,7 @@ static void uri_clean(URI *uri) /** * uri_free: - * @uri: pointer to an URI + * @uri: pointer to an URI, NULL is ignored * * Free up the URI struct */ @@ -1939,15 +1939,9 @@ step_7: val = uri_to_string(res); done: - if (ref != NULL) { - uri_free(ref); - } - if (bas != NULL) { - uri_free(bas); - } - if (res != NULL) { - uri_free(res); - } + uri_free(ref); + uri_free(bas); + uri_free(res); return val; } @@ -2190,12 +2184,8 @@ done: if (remove_path != 0) { ref->path = NULL; } - if (ref != NULL) { - uri_free(ref); - } - if (bas != NULL) { - uri_free(bas); - } + uri_free(ref); + uri_free(bas); return val; } From 2c7dd057aa7bd7a875e9b1a53975c220d6380bc4 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 25 Jun 2021 16:23:12 +0200 Subject: [PATCH 10/28] export/fuse: Pass default_permissions for mount We do not do any permission checks in fuse_open(), so let the kernel do them. We already let fuse_getattr() report the proper UNIX permissions, so this should work the way we want. This causes a change in 308's reference output, because now opening a non-writable export with O_RDWR fails already, instead of only actually attempting to write to it. (That is an improvement.) Signed-off-by: Max Reitz Message-Id: <20210625142317.271673-2-mreitz@redhat.com> Signed-off-by: Kevin Wolf --- block/export/fuse.c | 8 ++++++-- tests/qemu-iotests/308 | 3 ++- tests/qemu-iotests/308.out | 2 +- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/block/export/fuse.c b/block/export/fuse.c index 38f74c94da..d0b88e8f80 100644 --- a/block/export/fuse.c +++ b/block/export/fuse.c @@ -153,8 +153,12 @@ static int setup_fuse_export(FuseExport *exp, const char *mountpoint, struct fuse_args fuse_args; int ret; - /* Needs to match what fuse_init() sets. Only max_read must be supplied. */ - mount_opts = g_strdup_printf("max_read=%zu", FUSE_MAX_BOUNCE_BYTES); + /* + * max_read needs to match what fuse_init() sets. + * max_write need not be supplied. + */ + mount_opts = g_strdup_printf("max_read=%zu,default_permissions", + FUSE_MAX_BOUNCE_BYTES); fuse_argv[0] = ""; /* Dummy program name */ fuse_argv[1] = "-o"; diff --git a/tests/qemu-iotests/308 b/tests/qemu-iotests/308 index f122065d0f..11c28a75f2 100755 --- a/tests/qemu-iotests/308 +++ b/tests/qemu-iotests/308 @@ -215,7 +215,8 @@ echo '=== Writable export ===' fuse_export_add 'export-mp' "'mountpoint': '$EXT_MP', 'writable': true" # Check that writing to the read-only export fails -$QEMU_IO -f raw -c 'write -P 42 1M 64k' "$TEST_IMG" | _filter_qemu_io +$QEMU_IO -f raw -c 'write -P 42 1M 64k' "$TEST_IMG" 2>&1 \ + | _filter_qemu_io | _filter_testdir | _filter_imgfmt # But here it should work $QEMU_IO -f raw -c 'write -P 42 1M 64k' "$EXT_MP" | _filter_qemu_io diff --git a/tests/qemu-iotests/308.out b/tests/qemu-iotests/308.out index 466e7e0267..0e9420645f 100644 --- a/tests/qemu-iotests/308.out +++ b/tests/qemu-iotests/308.out @@ -91,7 +91,7 @@ virtual size: 0 B (0 bytes) 'mountpoint': 'TEST_DIR/t.IMGFMT.fuse', 'writable': true } } {"return": {}} -write failed: Permission denied +qemu-io: can't open device TEST_DIR/t.IMGFMT: Could not open 'TEST_DIR/t.IMGFMT': Permission denied wrote 65536/65536 bytes at offset 1048576 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) wrote 65536/65536 bytes at offset 1048576 From 8fc54f9428b9763f8003bd5f5dd440946210fc80 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 25 Jun 2021 16:23:13 +0200 Subject: [PATCH 11/28] export/fuse: Add allow-other option Without the allow_other mount option, no user (not even root) but the one who started qemu/the storage daemon can access the export. Allow users to configure the export such that such accesses are possible. While allow_other is probably what users want, we cannot make it an unconditional default, because passing it is only possible (for non-root users) if the global fuse.conf configuration file allows it. Thus, the default is an 'auto' mode, in which we first try with allow_other, and then fall back to without. FuseExport.allow_other reports whether allow_other was actually used as a mount option or not. Currently, this information is not used, but a future patch will let this field decide whether e.g. an export's UID and GID can be changed through chmod. One notable thing about 'auto' mode is that libfuse may print error messages directly to stderr, and so may fusermount (which it executes). Our export code cannot really filter or hide them. Therefore, if 'auto' fails its first attempt and has to fall back, fusermount will print an error message that mounting with allow_other failed. This behavior necessitates a change to iotest 308, namely we need to filter out this error message (because if the first attempt at mounting with allow_other succeeds, there will be no such message). Furthermore, common.rc's _make_test_img should use allow-other=off for FUSE exports, because iotests generally do not need to access images from other users, so allow-other=on or allow-other=auto have no advantage. OTOH, allow-other=on will not work on systems where user_allow_other is disabled, and with allow-other=auto, we get said error message that we would need to filter out again. Just disabling allow-other is simplest. Signed-off-by: Max Reitz Message-Id: <20210625142317.271673-3-mreitz@redhat.com> Signed-off-by: Kevin Wolf --- block/export/fuse.c | 28 +++++++++++++++++++++++----- qapi/block-export.json | 33 ++++++++++++++++++++++++++++++++- tests/qemu-iotests/308 | 6 +++++- tests/qemu-iotests/common.rc | 6 +++++- 4 files changed, 65 insertions(+), 8 deletions(-) diff --git a/block/export/fuse.c b/block/export/fuse.c index d0b88e8f80..4068250241 100644 --- a/block/export/fuse.c +++ b/block/export/fuse.c @@ -46,6 +46,8 @@ typedef struct FuseExport { char *mountpoint; bool writable; bool growable; + /* Whether allow_other was used as a mount option or not */ + bool allow_other; } FuseExport; static GHashTable *exports; @@ -57,7 +59,7 @@ static void fuse_export_delete(BlockExport *exp); static void init_exports_table(void); static int setup_fuse_export(FuseExport *exp, const char *mountpoint, - Error **errp); + bool allow_other, Error **errp); static void read_from_fuse_export(void *opaque); static bool is_regular_file(const char *path, Error **errp); @@ -118,7 +120,22 @@ static int fuse_export_create(BlockExport *blk_exp, exp->writable = blk_exp_args->writable; exp->growable = args->growable; - ret = setup_fuse_export(exp, args->mountpoint, errp); + /* set default */ + if (!args->has_allow_other) { + args->allow_other = FUSE_EXPORT_ALLOW_OTHER_AUTO; + } + + if (args->allow_other == FUSE_EXPORT_ALLOW_OTHER_AUTO) { + /* Ignore errors on our first attempt */ + ret = setup_fuse_export(exp, args->mountpoint, true, NULL); + exp->allow_other = ret == 0; + if (ret < 0) { + ret = setup_fuse_export(exp, args->mountpoint, false, errp); + } + } else { + exp->allow_other = args->allow_other == FUSE_EXPORT_ALLOW_OTHER_ON; + ret = setup_fuse_export(exp, args->mountpoint, exp->allow_other, errp); + } if (ret < 0) { goto fail; } @@ -146,7 +163,7 @@ static void init_exports_table(void) * Create exp->fuse_session and mount it. */ static int setup_fuse_export(FuseExport *exp, const char *mountpoint, - Error **errp) + bool allow_other, Error **errp) { const char *fuse_argv[4]; char *mount_opts; @@ -157,8 +174,9 @@ static int setup_fuse_export(FuseExport *exp, const char *mountpoint, * max_read needs to match what fuse_init() sets. * max_write need not be supplied. */ - mount_opts = g_strdup_printf("max_read=%zu,default_permissions", - FUSE_MAX_BOUNCE_BYTES); + mount_opts = g_strdup_printf("max_read=%zu,default_permissions%s", + FUSE_MAX_BOUNCE_BYTES, + allow_other ? ",allow_other" : ""); fuse_argv[0] = ""; /* Dummy program name */ fuse_argv[1] = "-o"; diff --git a/qapi/block-export.json b/qapi/block-export.json index e819e70cac..0ed63442a8 100644 --- a/qapi/block-export.json +++ b/qapi/block-export.json @@ -120,6 +120,23 @@ '*logical-block-size': 'size', '*num-queues': 'uint16'} } +## +# @FuseExportAllowOther: +# +# Possible allow_other modes for FUSE exports. +# +# @off: Do not pass allow_other as a mount option. +# +# @on: Pass allow_other as a mount option. +# +# @auto: Try mounting with allow_other first, and if that fails, retry +# without allow_other. +# +# Since: 6.1 +## +{ 'enum': 'FuseExportAllowOther', + 'data': ['off', 'on', 'auto'] } + ## # @BlockExportOptionsFuse: # @@ -132,11 +149,25 @@ # @growable: Whether writes beyond the EOF should grow the block node # accordingly. (default: false) # +# @allow-other: If this is off, only qemu's user is allowed access to +# this export. That cannot be changed even with chmod or +# chown. +# Enabling this option will allow other users access to +# the export with the FUSE mount option "allow_other". +# Note that using allow_other as a non-root user requires +# user_allow_other to be enabled in the global fuse.conf +# configuration file. +# In auto mode (the default), the FUSE export driver will +# first attempt to mount the export with allow_other, and +# if that fails, try again without. +# (since 6.1; default: auto) +# # Since: 6.0 ## { 'struct': 'BlockExportOptionsFuse', 'data': { 'mountpoint': 'str', - '*growable': 'bool' }, + '*growable': 'bool', + '*allow-other': 'FuseExportAllowOther' }, 'if': 'defined(CONFIG_FUSE)' } ## diff --git a/tests/qemu-iotests/308 b/tests/qemu-iotests/308 index 11c28a75f2..d13a9a969c 100755 --- a/tests/qemu-iotests/308 +++ b/tests/qemu-iotests/308 @@ -58,6 +58,9 @@ _supported_os Linux # We need /dev/urandom # $4: Node to export (defaults to 'node-format') fuse_export_add() { + # The grep -v is a filter for errors when /etc/fuse.conf does not contain + # user_allow_other. (The error is benign, but it is printed by fusermount + # on the first mount attempt, so our export code cannot hide it.) _send_qemu_cmd $QEMU_HANDLE \ "{'execute': 'block-export-add', 'arguments': { @@ -67,7 +70,8 @@ fuse_export_add() $2 } }" \ "${3:-return}" \ - | _filter_imgfmt + | _filter_imgfmt \ + | grep -v 'option allow_other only allowed if' } # $1: Export ID diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc index cbbf6d7c7f..609d82de89 100644 --- a/tests/qemu-iotests/common.rc +++ b/tests/qemu-iotests/common.rc @@ -512,9 +512,13 @@ _make_test_img() # Usually, users would export formatted nodes. But we present fuse as a # protocol-level driver here, so we have to leave the format to the # client. + # Switch off allow-other, because in general we do not need it for + # iotests. The default allow-other=auto has the downside of printing a + # fusermount error on its first attempt if allow_other is not + # permissible, which we would need to filter. QSD_NEED_PID=y $QSD \ --blockdev file,node-name=export-node,filename=$img_name,discard=unmap \ - --export fuse,id=fuse-export,node-name=export-node,mountpoint="$export_mp",writable=on,growable=on \ + --export fuse,id=fuse-export,node-name=export-node,mountpoint="$export_mp",writable=on,growable=on,allow-other=off \ & pidfile="$QEMU_TEST_DIR/qemu-storage-daemon.pid" From 9bad96a8cc669a3b399b9d739b505fdc592acaa4 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 25 Jun 2021 16:23:14 +0200 Subject: [PATCH 12/28] export/fuse: Give SET_ATTR_SIZE its own branch In order to support changing other attributes than the file size in fuse_setattr(), we have to give each its own independent branch. This also applies to the only attribute we do support right now. Signed-off-by: Max Reitz Reviewed-by: Kevin Wolf Message-Id: <20210625142317.271673-4-mreitz@redhat.com> Signed-off-by: Kevin Wolf --- block/export/fuse.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/block/export/fuse.c b/block/export/fuse.c index 4068250241..26ad644cd7 100644 --- a/block/export/fuse.c +++ b/block/export/fuse.c @@ -417,20 +417,22 @@ static void fuse_setattr(fuse_req_t req, fuse_ino_t inode, struct stat *statbuf, FuseExport *exp = fuse_req_userdata(req); int ret; - if (!exp->writable) { - fuse_reply_err(req, EACCES); - return; - } - if (to_set & ~FUSE_SET_ATTR_SIZE) { fuse_reply_err(req, ENOTSUP); return; } - ret = fuse_do_truncate(exp, statbuf->st_size, true, PREALLOC_MODE_OFF); - if (ret < 0) { - fuse_reply_err(req, -ret); - return; + if (to_set & FUSE_SET_ATTR_SIZE) { + if (!exp->writable) { + fuse_reply_err(req, EACCES); + return; + } + + ret = fuse_do_truncate(exp, statbuf->st_size, true, PREALLOC_MODE_OFF); + if (ret < 0) { + fuse_reply_err(req, -ret); + return; + } } fuse_getattr(req, inode, fi); From 6aeeaed29ced7ef03e4211193f7e3f537eace871 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 25 Jun 2021 16:23:15 +0200 Subject: [PATCH 13/28] export/fuse: Let permissions be adjustable Allow changing the file mode, UID, and GID through SETATTR. Without allow_other, UID and GID are not allowed to be changed, because it would not make sense. Also, changing group or others' permissions is not allowed either. For read-only exports, +w cannot be set. Signed-off-by: Max Reitz Message-Id: <20210625142317.271673-5-mreitz@redhat.com> Signed-off-by: Kevin Wolf --- block/export/fuse.c | 73 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 62 insertions(+), 11 deletions(-) diff --git a/block/export/fuse.c b/block/export/fuse.c index 26ad644cd7..ada9e263eb 100644 --- a/block/export/fuse.c +++ b/block/export/fuse.c @@ -48,6 +48,10 @@ typedef struct FuseExport { bool growable; /* Whether allow_other was used as a mount option or not */ bool allow_other; + + mode_t st_mode; + uid_t st_uid; + gid_t st_gid; } FuseExport; static GHashTable *exports; @@ -125,6 +129,13 @@ static int fuse_export_create(BlockExport *blk_exp, args->allow_other = FUSE_EXPORT_ALLOW_OTHER_AUTO; } + exp->st_mode = S_IFREG | S_IRUSR; + if (exp->writable) { + exp->st_mode |= S_IWUSR; + } + exp->st_uid = getuid(); + exp->st_gid = getgid(); + if (args->allow_other == FUSE_EXPORT_ALLOW_OTHER_AUTO) { /* Ignore errors on our first attempt */ ret = setup_fuse_export(exp, args->mountpoint, true, NULL); @@ -338,7 +349,6 @@ static void fuse_getattr(fuse_req_t req, fuse_ino_t inode, int64_t length, allocated_blocks; time_t now = time(NULL); FuseExport *exp = fuse_req_userdata(req); - mode_t mode; length = blk_getlength(exp->common.blk); if (length < 0) { @@ -353,17 +363,12 @@ static void fuse_getattr(fuse_req_t req, fuse_ino_t inode, allocated_blocks = DIV_ROUND_UP(allocated_blocks, 512); } - mode = S_IFREG | S_IRUSR; - if (exp->writable) { - mode |= S_IWUSR; - } - statbuf = (struct stat) { .st_ino = inode, - .st_mode = mode, + .st_mode = exp->st_mode, .st_nlink = 1, - .st_uid = getuid(), - .st_gid = getgid(), + .st_uid = exp->st_uid, + .st_gid = exp->st_gid, .st_size = length, .st_blksize = blk_bs(exp->common.blk)->bl.request_alignment, .st_blocks = allocated_blocks, @@ -409,19 +414,52 @@ static int fuse_do_truncate(const FuseExport *exp, int64_t size, } /** - * Let clients set file attributes. Only resizing is supported. + * Let clients set file attributes. Only resizing and changing + * permissions (st_mode, st_uid, st_gid) is allowed. + * Changing permissions is only allowed as far as it will actually + * permit access: Read-only exports cannot be given +w, and exports + * without allow_other cannot be given a different UID or GID, and + * they cannot be given non-owner access. */ static void fuse_setattr(fuse_req_t req, fuse_ino_t inode, struct stat *statbuf, int to_set, struct fuse_file_info *fi) { FuseExport *exp = fuse_req_userdata(req); + int supported_attrs; int ret; - if (to_set & ~FUSE_SET_ATTR_SIZE) { + supported_attrs = FUSE_SET_ATTR_SIZE | FUSE_SET_ATTR_MODE; + if (exp->allow_other) { + supported_attrs |= FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID; + } + + if (to_set & ~supported_attrs) { fuse_reply_err(req, ENOTSUP); return; } + /* Do some argument checks first before committing to anything */ + if (to_set & FUSE_SET_ATTR_MODE) { + /* + * Without allow_other, non-owners can never access the export, so do + * not allow setting permissions for them + */ + if (!exp->allow_other && + (statbuf->st_mode & (S_IRWXG | S_IRWXO)) != 0) + { + fuse_reply_err(req, EPERM); + return; + } + + /* +w for read-only exports makes no sense, disallow it */ + if (!exp->writable && + (statbuf->st_mode & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0) + { + fuse_reply_err(req, EROFS); + return; + } + } + if (to_set & FUSE_SET_ATTR_SIZE) { if (!exp->writable) { fuse_reply_err(req, EACCES); @@ -435,6 +473,19 @@ static void fuse_setattr(fuse_req_t req, fuse_ino_t inode, struct stat *statbuf, } } + if (to_set & FUSE_SET_ATTR_MODE) { + /* Ignore FUSE-supplied file type, only change the mode */ + exp->st_mode = (statbuf->st_mode & 07777) | S_IFREG; + } + + if (to_set & FUSE_SET_ATTR_UID) { + exp->st_uid = statbuf->st_uid; + } + + if (to_set & FUSE_SET_ATTR_GID) { + exp->st_gid = statbuf->st_gid; + } + fuse_getattr(req, inode, fi); } From f29add26d412311926e8095952316d360bd51cbf Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 25 Jun 2021 16:23:16 +0200 Subject: [PATCH 14/28] iotests/308: Test +w on read-only FUSE exports Test that +w on read-only FUSE exports returns an EROFS error. u+x on the other hand should work. (There is no special reason to choose u+x here, it simply is like +w another flag that is not set by default.) Signed-off-by: Max Reitz Message-Id: <20210625142317.271673-6-mreitz@redhat.com> Signed-off-by: Kevin Wolf --- tests/qemu-iotests/308 | 11 +++++++++++ tests/qemu-iotests/308.out | 4 ++++ 2 files changed, 15 insertions(+) diff --git a/tests/qemu-iotests/308 b/tests/qemu-iotests/308 index d13a9a969c..6b386bd523 100755 --- a/tests/qemu-iotests/308 +++ b/tests/qemu-iotests/308 @@ -170,6 +170,17 @@ fuse_export_add 'export-mp' "'mountpoint': '$EXT_MP'" # Check that the export presents the same data as the original image $QEMU_IMG compare -f raw -F $IMGFMT -U "$EXT_MP" "$TEST_IMG" +# Some quick chmod tests +stat -c 'Permissions pre-chmod: %a' "$EXT_MP" + +# Verify that we cannot set +w +chmod u+w "$EXT_MP" 2>&1 | _filter_testdir | _filter_imgfmt +stat -c 'Permissions post-+w: %a' "$EXT_MP" + +# But that we can set, say, +x (if we are so inclined) +chmod u+x "$EXT_MP" 2>&1 | _filter_testdir | _filter_imgfmt +stat -c 'Permissions post-+x: %a' "$EXT_MP" + echo echo '=== Mount over existing file ===' diff --git a/tests/qemu-iotests/308.out b/tests/qemu-iotests/308.out index 0e9420645f..fc47bb11a2 100644 --- a/tests/qemu-iotests/308.out +++ b/tests/qemu-iotests/308.out @@ -50,6 +50,10 @@ wrote 67108864/67108864 bytes at offset 0 } } {"return": {}} Images are identical. +Permissions pre-chmod: 400 +chmod: changing permissions of 'TEST_DIR/t.IMGFMT.fuse': Read-only file system +Permissions post-+w: 400 +Permissions post-+x: 500 === Mount over existing file === {'execute': 'block-export-add', From d9f008e6235b50bc81d3c2c80eaec3065b7f04c6 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 25 Jun 2021 16:23:17 +0200 Subject: [PATCH 15/28] iotests/fuse-allow-other: Test allow-other Signed-off-by: Max Reitz Message-Id: <20210625142317.271673-7-mreitz@redhat.com> Signed-off-by: Kevin Wolf --- tests/qemu-iotests/tests/fuse-allow-other | 168 ++++++++++++++++++ tests/qemu-iotests/tests/fuse-allow-other.out | 88 +++++++++ 2 files changed, 256 insertions(+) create mode 100755 tests/qemu-iotests/tests/fuse-allow-other create mode 100644 tests/qemu-iotests/tests/fuse-allow-other.out diff --git a/tests/qemu-iotests/tests/fuse-allow-other b/tests/qemu-iotests/tests/fuse-allow-other new file mode 100755 index 0000000000..19f494aefb --- /dev/null +++ b/tests/qemu-iotests/tests/fuse-allow-other @@ -0,0 +1,168 @@ +#!/usr/bin/env bash +# group: rw +# +# Test FUSE exports' allow-other option +# +# Copyright (C) 2021 Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +seq=$(basename "$0") +echo "QA output created by $seq" + +status=1 # failure is the default! + +_cleanup() +{ + _cleanup_qemu + _cleanup_test_img + rm -f "$EXT_MP" +} +trap "_cleanup; exit \$status" 0 1 2 3 15 + +# get standard environment, filters and checks +. ../common.rc +. ../common.filter +. ../common.qemu + +_supported_fmt generic + +_supported_proto file # We create the FUSE export manually + +sudo -n -u nobody true || \ + _notrun 'Password-less sudo as nobody required to test allow_other' + +# $1: Export ID +# $2: Options (beyond the node-name and ID) +# $3: Expected return value (defaults to 'return') +# $4: Node to export (defaults to 'node-format') +fuse_export_add() +{ + allow_other_not_supported='option allow_other only allowed if' + + output=$( + success_or_failure=yes _send_qemu_cmd $QEMU_HANDLE \ + "{'execute': 'block-export-add', + 'arguments': { + 'type': 'fuse', + 'id': '$1', + 'node-name': '${4:-node-format}', + $2 + } }" \ + "${3:-return}" \ + "$allow_other_not_supported" \ + | _filter_imgfmt + ) + + if echo "$output" | grep -q "$allow_other_not_supported"; then + # Shut down qemu gracefully so it can unmount the export + _send_qemu_cmd $QEMU_HANDLE \ + "{'execute': 'quit'}" \ + 'return' + + wait=yes _cleanup_qemu + + _notrun "allow_other not supported" + fi + + echo "$output" +} + +EXT_MP="$TEST_DIR/fuse-export" + +_make_test_img 64k +touch "$EXT_MP" + +echo +echo '=== Test permissions ===' + +# $1: allow-other value ('on'/'off'/'auto') +run_permission_test() +{ + _launch_qemu \ + -blockdev \ + "$IMGFMT,node-name=node-format,file.driver=file,file.filename=$TEST_IMG" + + _send_qemu_cmd $QEMU_HANDLE \ + "{'execute': 'qmp_capabilities'}" \ + 'return' + + fuse_export_add 'export' \ + "'mountpoint': '$EXT_MP', + 'allow-other': '$1'" + + # Should always work + echo '(Removing all permissions)' + chmod 000 "$EXT_MP" 2>&1 | _filter_testdir | _filter_imgfmt + stat -c 'Permissions post-chmod: %a' "$EXT_MP" + + # Should always work + echo '(Granting u+r)' + chmod u+r "$EXT_MP" 2>&1 | _filter_testdir | _filter_imgfmt + stat -c 'Permissions post-chmod: %a' "$EXT_MP" + + # Should only work with allow-other: Otherwise, no permissions can be + # granted to the group or others + echo '(Granting read permissions for everyone)' + chmod 444 "$EXT_MP" 2>&1 | _filter_testdir | _filter_imgfmt + stat -c 'Permissions post-chmod: %a' "$EXT_MP" + + echo 'Doing operations as nobody:' + # Change to TEST_DIR, so nobody will not have to attempt a lookup + pushd "$TEST_DIR" >/dev/null + + # This is already prevented by the permissions (without allow-other, FUSE + # exports always have o-r), but test it anyway + sudo -n -u nobody cat fuse-export >/dev/null + + # If the only problem were the lack of permissions, we should still be able + # to stat the export as nobody; it should not work without allow-other, + # though + sudo -n -u nobody \ + stat -c 'Permissions seen by nobody: %a' fuse-export 2>&1 \ + | _filter_imgfmt + + # To prove the point, revoke read permissions for others and try again + chmod o-r fuse-export 2>&1 | _filter_testdir | _filter_imgfmt + + # Should fail + sudo -n -u nobody cat fuse-export >/dev/null + # Should work with allow_other + sudo -n -u nobody \ + stat -c 'Permissions seen by nobody: %a' fuse-export 2>&1 \ + | _filter_imgfmt + + popd >/dev/null + + _send_qemu_cmd $QEMU_HANDLE \ + "{'execute': 'quit'}" \ + 'return' + + wait=yes _cleanup_qemu +} + +# 'auto' should behave exactly like 'on', because 'on' tests that +# allow_other works (otherwise, this test is skipped) +for ao in off on auto; do + echo + echo "--- allow-other=$ao ---" + + run_permission_test "$ao" +done + +# success, all done +echo "*** done" +rm -f $seq.full +status=0 diff --git a/tests/qemu-iotests/tests/fuse-allow-other.out b/tests/qemu-iotests/tests/fuse-allow-other.out new file mode 100644 index 0000000000..543fa52a06 --- /dev/null +++ b/tests/qemu-iotests/tests/fuse-allow-other.out @@ -0,0 +1,88 @@ +QA output created by fuse-allow-other +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=65536 + +=== Test permissions === + +--- allow-other=off --- +{'execute': 'qmp_capabilities'} +{"return": {}} +{'execute': 'block-export-add', + 'arguments': { + 'type': 'fuse', + 'id': 'export', + 'node-name': 'node-format', + 'mountpoint': 'TEST_DIR/fuse-export', + 'allow-other': 'off' + } } +{"return": {}} +(Removing all permissions) +Permissions post-chmod: 0 +(Granting u+r) +Permissions post-chmod: 400 +(Granting read permissions for everyone) +chmod: changing permissions of 'TEST_DIR/fuse-export': Operation not permitted +Permissions post-chmod: 400 +Doing operations as nobody: +cat: fuse-export: Permission denied +stat: cannot statx 'fuse-export': Permission denied +cat: fuse-export: Permission denied +stat: cannot statx 'fuse-export': Permission denied +{'execute': 'quit'} +{"return": {}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_EXPORT_DELETED", "data": {"id": "export"}} + +--- allow-other=on --- +{'execute': 'qmp_capabilities'} +{"return": {}} +{'execute': 'block-export-add', + 'arguments': { + 'type': 'fuse', + 'id': 'export', + 'node-name': 'node-format', + 'mountpoint': 'TEST_DIR/fuse-export', + 'allow-other': 'on' + } } +{"return": {}} +(Removing all permissions) +Permissions post-chmod: 0 +(Granting u+r) +Permissions post-chmod: 400 +(Granting read permissions for everyone) +Permissions post-chmod: 444 +Doing operations as nobody: +Permissions seen by nobody: 444 +cat: fuse-export: Permission denied +Permissions seen by nobody: 440 +{'execute': 'quit'} +{"return": {}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_EXPORT_DELETED", "data": {"id": "export"}} + +--- allow-other=auto --- +{'execute': 'qmp_capabilities'} +{"return": {}} +{'execute': 'block-export-add', + 'arguments': { + 'type': 'fuse', + 'id': 'export', + 'node-name': 'node-format', + 'mountpoint': 'TEST_DIR/fuse-export', + 'allow-other': 'auto' + } } +{"return": {}} +(Removing all permissions) +Permissions post-chmod: 0 +(Granting u+r) +Permissions post-chmod: 400 +(Granting read permissions for everyone) +Permissions post-chmod: 444 +Doing operations as nobody: +Permissions seen by nobody: 444 +cat: fuse-export: Permission denied +Permissions seen by nobody: 440 +{'execute': 'quit'} +{"return": {}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_EXPORT_DELETED", "data": {"id": "export"}} +*** done From 64cc845bdb0908df247757ea864c1c72bf82b30c Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Wed, 7 Jul 2021 20:04:48 +0200 Subject: [PATCH 16/28] block/rbd: fix type of task->complete task->complete is a bool not an integer. Signed-off-by: Peter Lieven Message-Id: <20210707180449.32665-1-pl@kamp.de> Reviewed-by: Ilya Dryomov Signed-off-by: Kevin Wolf --- block/rbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/rbd.c b/block/rbd.c index 01a7b94d62..dcf82b15b8 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -1066,7 +1066,7 @@ static int qemu_rbd_resize(BlockDriverState *bs, uint64_t size) static void qemu_rbd_finish_bh(void *opaque) { RBDTask *task = opaque; - task->complete = 1; + task->complete = true; aio_co_wake(task->co); } From 6d2f38eb0e2be86af6904f1b768575cc64be37c2 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Wed, 7 Jul 2021 20:04:49 +0200 Subject: [PATCH 17/28] MAINTAINERS: add block/rbd.c reviewer adding myself as a designated reviewer. Signed-off-by: Peter Lieven Message-Id: <20210707180449.32665-2-pl@kamp.de> Acked-by: Ilya Dryomov Signed-off-by: Kevin Wolf --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 86ec36a062..0e9f338e32 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3061,6 +3061,7 @@ F: block/vmdk.c RBD M: Ilya Dryomov +R: Peter Lieven L: qemu-block@nongnu.org S: Supported F: block/rbd.c From 84affad1fd4c5251d7cccf4df43b29e9157983a9 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Mon, 5 Jul 2021 19:14:29 +0200 Subject: [PATCH 18/28] vhost-user: Fix backends without multiqueue support dev->max_queues was never initialised for backends that don't support VHOST_USER_PROTOCOL_F_MQ, so it would use 0 as the maximum number of queues to check against and consequently fail for any such backend. Set it to 1 if the backend doesn't have multiqueue support. Fixes: c90bd505a3e8210c23d69fecab9ee6f56ec4a161 Signed-off-by: Kevin Wolf Message-Id: <20210705171429.29286-1-kwolf@redhat.com> Reviewed-by: Cornelia Huck Reviewed-by: Raphael Norwitz Signed-off-by: Kevin Wolf --- hw/virtio/vhost-user.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 1ac4a2ebec..29ea2b4fce 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -1913,7 +1913,10 @@ static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque, if (err < 0) { return -EPROTO; } + } else { + dev->max_queues = 1; } + if (dev->num_queues && dev->max_queues < dev->num_queues) { error_setg(errp, "The maximum number of queues supported by the " "backend is %" PRIu64, dev->max_queues); From 2842ff2d81ddd86b1d0dd3b98c46b72bfa5cb4cb Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Tue, 8 Jun 2021 10:18:52 -0700 Subject: [PATCH 19/28] blockdev: fix drive-backup transaction endless drained section drive_backup_prepare() does bdrv_drained_begin() in hope that bdrv_drained_end() will be called in drive_backup_clean(). Still we need to set state->bs for this to work. That's done too late: a lot of failure paths in drive_backup_prepare() miss setting state->bs. Fix that. Fixes: 2288ccfac96281c316db942d10e3f921c1373064 Fixes: https://gitlab.com/qemu-project/qemu/-/issues/399 Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20210608171852.250775-1-vsementsov@virtuozzo.com> Reviewed-by: Eric Blake Signed-off-by: Kevin Wolf --- blockdev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/blockdev.c b/blockdev.c index f08192deda..094c085962 100644 --- a/blockdev.c +++ b/blockdev.c @@ -1714,6 +1714,7 @@ static void drive_backup_prepare(BlkActionState *common, Error **errp) aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); + state->bs = bs; /* Paired with .clean() */ bdrv_drained_begin(bs); @@ -1813,8 +1814,6 @@ static void drive_backup_prepare(BlkActionState *common, Error **errp) } } - state->bs = bs; - state->job = do_backup_common(qapi_DriveBackup_base(backup), bs, target_bs, aio_context, common->block_job_txn, errp); From 5a385bf5c5cb3069fab17c014cf4b4f629509f1e Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Mon, 3 May 2021 14:35:59 -0700 Subject: [PATCH 20/28] qcow2: Prohibit backing file changes in 'qemu-img amend' This was deprecated back in bc5ee6da7 (qcow2: Deprecate use of qemu-img amend to change backing file), and no one in the meantime has given any reasons why it should be supported. Time to make change attempts a hard error (but for convenience, specifying the _same_ backing chain is not forbidden). Update a couple of iotests to match. Signed-off-by: Eric Blake Message-Id: <20210503213600.569128-2-eblake@redhat.com> Reviewed-by: Connor Kuehl Signed-off-by: Kevin Wolf --- block/qcow2.c | 13 ++++--------- docs/system/deprecated.rst | 12 ------------ docs/system/removed-features.rst | 12 ++++++++++++ tests/qemu-iotests/061 | 3 +++ tests/qemu-iotests/061.out | 3 ++- tests/qemu-iotests/082.out | 6 ++++-- 6 files changed, 25 insertions(+), 24 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index ee4530cdbd..0cac2eda36 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -5620,15 +5620,10 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts, if (backing_file || backing_format) { if (g_strcmp0(backing_file, s->image_backing_file) || g_strcmp0(backing_format, s->image_backing_format)) { - warn_report("Deprecated use of amend to alter the backing file; " - "use qemu-img rebase instead"); - } - ret = qcow2_change_backing_file(bs, - backing_file ?: s->image_backing_file, - backing_format ?: s->image_backing_format); - if (ret < 0) { - error_setg_errno(errp, -ret, "Failed to change the backing file"); - return ret; + error_setg(errp, "Cannot amend the backing file"); + error_append_hint(errp, + "You can use 'qemu-img rebase' instead.\n"); + return -EINVAL; } } diff --git a/docs/system/deprecated.rst b/docs/system/deprecated.rst index 70e08baff6..9626a1fb57 100644 --- a/docs/system/deprecated.rst +++ b/docs/system/deprecated.rst @@ -282,18 +282,6 @@ this CPU is also deprecated. Related binaries ---------------- -qemu-img amend to adjust backing file (since 5.1) -''''''''''''''''''''''''''''''''''''''''''''''''' - -The use of ``qemu-img amend`` to modify the name or format of a qcow2 -backing image is deprecated; this functionality was never fully -documented or tested, and interferes with other amend operations that -need access to the original backing image (such as deciding whether a -v3 zero cluster may be left unallocated when converting to a v2 -image). Rather, any changes to the backing chain should be performed -with ``qemu-img rebase -u`` either before or after the remaining -changes being performed by amend, as appropriate. - qemu-img backing file without format (since 5.1) '''''''''''''''''''''''''''''''''''''''''''''''' diff --git a/docs/system/removed-features.rst b/docs/system/removed-features.rst index 2b21bd39ab..b64ea55194 100644 --- a/docs/system/removed-features.rst +++ b/docs/system/removed-features.rst @@ -491,6 +491,18 @@ topologies described with -smp include all possible cpus, i.e. The ``enforce-config-section`` property was replaced by the ``-global migration.send-configuration={on|off}`` option. +qemu-img amend to adjust backing file (removed in 6.1) +'''''''''''''''''''''''''''''''''''''''''''''''''''''' + +The use of ``qemu-img amend`` to modify the name or format of a qcow2 +backing image was never fully documented or tested, and interferes +with other amend operations that need access to the original backing +image (such as deciding whether a v3 zero cluster may be left +unallocated when converting to a v2 image). Any changes to the +backing chain should be performed with ``qemu-img rebase -u`` either +before or after the remaining changes being performed by amend, as +appropriate. + Block devices ------------- diff --git a/tests/qemu-iotests/061 b/tests/qemu-iotests/061 index e26d94a0df..9507c223bd 100755 --- a/tests/qemu-iotests/061 +++ b/tests/qemu-iotests/061 @@ -167,6 +167,9 @@ _make_test_img -o "compat=1.1" 64M TEST_IMG="$TEST_IMG.base" _make_test_img -o "compat=1.1" 64M $QEMU_IO -c "write -P 0x2a 0 128k" "$TEST_IMG.base" | _filter_qemu_io $QEMU_IO -c "read -P 0 0 128k" "$TEST_IMG" | _filter_qemu_io +$QEMU_IMG amend -o "backing_file=$TEST_IMG.base,backing_fmt=qcow2" \ + "$TEST_IMG" && echo "unexpected pass" +$QEMU_IMG rebase -u -b "$TEST_IMG.base" -F qcow2 "$TEST_IMG" $QEMU_IMG amend -o "backing_file=$TEST_IMG.base,backing_fmt=qcow2" "$TEST_IMG" $QEMU_IO -c "read -P 0x2a 0 128k" "$TEST_IMG" | _filter_qemu_io _check_test_img diff --git a/tests/qemu-iotests/061.out b/tests/qemu-iotests/061.out index ee30da2665..7ecbd4dea8 100644 --- a/tests/qemu-iotests/061.out +++ b/tests/qemu-iotests/061.out @@ -370,7 +370,8 @@ wrote 131072/131072 bytes at offset 0 128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) read 131072/131072 bytes at offset 0 128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -qemu-img: warning: Deprecated use of amend to alter the backing file; use qemu-img rebase instead +qemu-img: Cannot amend the backing file +You can use 'qemu-img rebase' instead. read 131072/131072 bytes at offset 0 128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) No errors were found on the image. diff --git a/tests/qemu-iotests/082.out b/tests/qemu-iotests/082.out index b70c12c139..077ed0f2c7 100644 --- a/tests/qemu-iotests/082.out +++ b/tests/qemu-iotests/082.out @@ -808,12 +808,14 @@ Amend options for 'qcow2': size= - Virtual disk size Testing: amend -f qcow2 -o backing_file=TEST_DIR/t.qcow2,,help TEST_DIR/t.qcow2 -qemu-img: warning: Deprecated use of amend to alter the backing file; use qemu-img rebase instead +qemu-img: Cannot amend the backing file +You can use 'qemu-img rebase' instead. Testing: rebase -u -b -f qcow2 TEST_DIR/t.qcow2 Testing: amend -f qcow2 -o backing_file=TEST_DIR/t.qcow2,,? TEST_DIR/t.qcow2 -qemu-img: warning: Deprecated use of amend to alter the backing file; use qemu-img rebase instead +qemu-img: Cannot amend the backing file +You can use 'qemu-img rebase' instead. Testing: rebase -u -b -f qcow2 TEST_DIR/t.qcow2 From 497a30dbb065937d67f6c43af6dd78492e1d6f6d Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Mon, 3 May 2021 14:36:00 -0700 Subject: [PATCH 21/28] qemu-img: Require -F with -b backing image Back in commit d9f059aa6c (qemu-img: Deprecate use of -b without -F), we deprecated the ability to create a file with a backing image that requires qemu to perform format probing. Qemu can still probe older files for backwards compatibility, but it is time to finish off the ability to create such images, due to the potential security risk they present. Update a couple of iotests affected by the change. Signed-off-by: Eric Blake Message-Id: <20210503213600.569128-3-eblake@redhat.com> Reviewed-by: Connor Kuehl Signed-off-by: Kevin Wolf --- block.c | 37 ++++++++++---------------------- docs/system/deprecated.rst | 20 ----------------- docs/system/removed-features.rst | 19 ++++++++++++++++ qemu-img.c | 6 ++++-- tests/qemu-iotests/040 | 4 ++-- tests/qemu-iotests/041 | 6 ++++-- tests/qemu-iotests/114 | 18 ++++++++-------- tests/qemu-iotests/114.out | 11 ++++------ tests/qemu-iotests/301 | 4 +--- tests/qemu-iotests/301.out | 16 ++------------ 10 files changed, 56 insertions(+), 85 deletions(-) diff --git a/block.c b/block.c index acd35cb0cb..ce96585575 100644 --- a/block.c +++ b/block.c @@ -5074,7 +5074,7 @@ int coroutine_fn bdrv_co_check(BlockDriverState *bs, * -ENOTSUP - format driver doesn't support changing the backing file */ int bdrv_change_backing_file(BlockDriverState *bs, const char *backing_file, - const char *backing_fmt, bool warn) + const char *backing_fmt, bool require) { BlockDriver *drv = bs->drv; int ret; @@ -5088,10 +5088,8 @@ int bdrv_change_backing_file(BlockDriverState *bs, const char *backing_file, return -EINVAL; } - if (warn && backing_file && !backing_fmt) { - warn_report("Deprecated use of backing file without explicit " - "backing format, use of this image requires " - "potentially unsafe format probing"); + if (require && backing_file && !backing_fmt) { + return -EINVAL; } if (drv->bdrv_change_backing_file != NULL) { @@ -6601,24 +6599,11 @@ void bdrv_img_create(const char *filename, const char *fmt, goto out; } else { if (!backing_fmt) { - warn_report("Deprecated use of backing file without explicit " - "backing format (detected format of %s)", - bs->drv->format_name); - if (bs->drv != &bdrv_raw) { - /* - * A probe of raw deserves the most attention: - * leaving the backing format out of the image - * will ensure bs->probed is set (ensuring we - * don't accidentally commit into the backing - * file), and allow more spots to warn the users - * to fix their toolchain when opening this image - * later. For other images, we can safely record - * the format that we probed. - */ - backing_fmt = bs->drv->format_name; - qemu_opt_set(opts, BLOCK_OPT_BACKING_FMT, backing_fmt, - NULL); - } + error_setg(&local_err, + "Backing file specified without backing format"); + error_append_hint(&local_err, "Detected format of %s.", + bs->drv->format_name); + goto out; } if (size == -1) { /* Opened BS, have no size */ @@ -6635,9 +6620,9 @@ void bdrv_img_create(const char *filename, const char *fmt, } /* (backing_file && !(flags & BDRV_O_NO_BACKING)) */ } else if (backing_file && !backing_fmt) { - warn_report("Deprecated use of unopened backing file without " - "explicit backing format, use of this image requires " - "potentially unsafe format probing"); + error_setg(&local_err, + "Backing file specified without backing format"); + goto out; } if (size == -1) { diff --git a/docs/system/deprecated.rst b/docs/system/deprecated.rst index 9626a1fb57..25d6c4c9a0 100644 --- a/docs/system/deprecated.rst +++ b/docs/system/deprecated.rst @@ -282,26 +282,6 @@ this CPU is also deprecated. Related binaries ---------------- -qemu-img backing file without format (since 5.1) -'''''''''''''''''''''''''''''''''''''''''''''''' - -The use of ``qemu-img create``, ``qemu-img rebase``, or ``qemu-img -convert`` to create or modify an image that depends on a backing file -now recommends that an explicit backing format be provided. This is -for safety: if QEMU probes a different format than what you thought, -the data presented to the guest will be corrupt; similarly, presenting -a raw image to a guest allows a potential security exploit if a future -probe sees a non-raw image based on guest writes. - -To avoid the warning message, or even future refusal to create an -unsafe image, you must pass ``-o backing_fmt=`` (or the shorthand -``-F`` during create) to specify the intended backing format. You may -use ``qemu-img rebase -u`` to retroactively add a backing format to an -existing image. However, be aware that there are already potential -security risks to blindly using ``qemu-img info`` to probe the format -of an untrusted backing image, when deciding what format to add into -an existing image. - Backwards compatibility ----------------------- diff --git a/docs/system/removed-features.rst b/docs/system/removed-features.rst index b64ea55194..28bb035043 100644 --- a/docs/system/removed-features.rst +++ b/docs/system/removed-features.rst @@ -503,6 +503,25 @@ backing chain should be performed with ``qemu-img rebase -u`` either before or after the remaining changes being performed by amend, as appropriate. +qemu-img backing file without format (removed in 6.1) +''''''''''''''''''''''''''''''''''''''''''''''''''''' + +The use of ``qemu-img create``, ``qemu-img rebase``, or ``qemu-img +convert`` to create or modify an image that depends on a backing file +now requires that an explicit backing format be provided. This is +for safety: if QEMU probes a different format than what you thought, +the data presented to the guest will be corrupt; similarly, presenting +a raw image to a guest allows a potential security exploit if a future +probe sees a non-raw image based on guest writes. + +To avoid creating unsafe backing chains, you must pass ``-o +backing_fmt=`` (or the shorthand ``-F`` during create) to specify the +intended backing format. You may use ``qemu-img rebase -u`` to +retroactively add a backing format to an existing image. However, be +aware that there are already potential security risks to blindly using +``qemu-img info`` to probe the format of an untrusted backing image, +when deciding what format to add into an existing image. + Block devices ------------- diff --git a/qemu-img.c b/qemu-img.c index 7956a89965..ec0e2fabe5 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -2508,8 +2508,10 @@ static int img_convert(int argc, char **argv) if (out_baseimg_param) { if (!qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT)) { - warn_report("Deprecated use of backing file without explicit " - "backing format"); + error_report("Use of backing file requires explicit " + "backing format"); + ret = -1; + goto out; } } diff --git a/tests/qemu-iotests/040 b/tests/qemu-iotests/040 index ba7cb34ce8..f3677de9df 100755 --- a/tests/qemu-iotests/040 +++ b/tests/qemu-iotests/040 @@ -920,8 +920,8 @@ class TestCommitWithOverriddenBacking(iotests.QMPTestCase): def setUp(self): qemu_img('create', '-f', iotests.imgfmt, self.img_base_a, '1M') qemu_img('create', '-f', iotests.imgfmt, self.img_base_b, '1M') - qemu_img('create', '-f', iotests.imgfmt, '-b', self.img_base_a, \ - self.img_top) + qemu_img('create', '-f', iotests.imgfmt, '-b', self.img_base_a, + '-F', iotests.imgfmt, self.img_top) self.vm = iotests.VM() self.vm.launch() diff --git a/tests/qemu-iotests/041 b/tests/qemu-iotests/041 index 5cc02b24fc..db9f5dc540 100755 --- a/tests/qemu-iotests/041 +++ b/tests/qemu-iotests/041 @@ -1295,8 +1295,10 @@ class TestReplaces(iotests.QMPTestCase): class TestFilters(iotests.QMPTestCase): def setUp(self): qemu_img('create', '-f', iotests.imgfmt, backing_img, '1M') - qemu_img('create', '-f', iotests.imgfmt, '-b', backing_img, test_img) - qemu_img('create', '-f', iotests.imgfmt, '-b', backing_img, target_img) + qemu_img('create', '-f', iotests.imgfmt, '-b', backing_img, + '-F', iotests.imgfmt, test_img) + qemu_img('create', '-f', iotests.imgfmt, '-b', backing_img, + '-F', iotests.imgfmt, target_img) qemu_io('-c', 'write -P 1 0 512k', backing_img) qemu_io('-c', 'write -P 2 512k 512k', test_img) diff --git a/tests/qemu-iotests/114 b/tests/qemu-iotests/114 index 43cb0bc6c3..de6fd327ee 100755 --- a/tests/qemu-iotests/114 +++ b/tests/qemu-iotests/114 @@ -44,16 +44,16 @@ _supported_os Linux # qcow2.py does not work too well with external data files _unsupported_imgopts data_file -# Intentionally specify backing file without backing format; demonstrate -# the difference in warning messages when backing file could be probed. -# Note that only a non-raw probe result will affect the resulting image. +# Older qemu-img could set up backing file without backing format; modern +# qemu can't but we can use qcow2.py to simulate older files. truncate -s $((64 * 1024 * 1024)) "$TEST_IMG.orig" -_make_test_img -b "$TEST_IMG.orig" 64M +_make_test_img -b "$TEST_IMG.orig" -F raw 64M +$PYTHON qcow2.py "$TEST_IMG" del-header-ext 0xE2792ACA TEST_IMG="$TEST_IMG.base" _make_test_img 64M $QEMU_IMG convert -O qcow2 -B "$TEST_IMG.orig" "$TEST_IMG.orig" "$TEST_IMG" -_make_test_img -b "$TEST_IMG.base" 64M -_make_test_img -u -b "$TEST_IMG.base" 64M +_make_test_img -b "$TEST_IMG.base" -F $IMGFMT 64M +_make_test_img -u -b "$TEST_IMG.base" -F $IMGFMT 64M # Set an invalid backing file format $PYTHON qcow2.py "$TEST_IMG" add-header-ext 0xE2792ACA "foo" @@ -64,9 +64,9 @@ _img_info $QEMU_IO -c "open $TEST_IMG" -c "read 0 4k" 2>&1 | _filter_qemu_io | _filter_testdir $QEMU_IO -c "open -o backing.driver=$IMGFMT $TEST_IMG" -c "read 0 4k" | _filter_qemu_io -# Rebase the image, to show that omitting backing format triggers a warning, -# but probing now lets us use the backing file. -$QEMU_IMG rebase -u -b "$TEST_IMG.base" "$TEST_IMG" +# Rebase the image, to show that backing format is required. +($QEMU_IMG rebase -u -b "$TEST_IMG.base" "$TEST_IMG" 2>&1 && echo "unexpected pass") | _filter_testdir +$QEMU_IMG rebase -u -b "$TEST_IMG.base" -F $IMGFMT "$TEST_IMG" $QEMU_IO -c "open $TEST_IMG" -c "read 0 4k" 2>&1 | _filter_qemu_io | _filter_testdir # success, all done diff --git a/tests/qemu-iotests/114.out b/tests/qemu-iotests/114.out index 0a37d20c82..6d638da266 100644 --- a/tests/qemu-iotests/114.out +++ b/tests/qemu-iotests/114.out @@ -1,12 +1,9 @@ QA output created by 114 -qemu-img: warning: Deprecated use of backing file without explicit backing format (detected format of raw) -Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 backing_file=TEST_DIR/t.IMGFMT.orig +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 backing_file=TEST_DIR/t.IMGFMT.orig backing_fmt=raw Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=67108864 -qemu-img: warning: Deprecated use of backing file without explicit backing format -qemu-img: warning: Deprecated use of backing file without explicit backing format (detected format of IMGFMT) +qemu-img: Use of backing file requires explicit backing format +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 backing_file=TEST_DIR/t.IMGFMT.base backing_fmt=IMGFMT Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 backing_file=TEST_DIR/t.IMGFMT.base backing_fmt=IMGFMT -qemu-img: warning: Deprecated use of unopened backing file without explicit backing format, use of this image requires potentially unsafe format probing -Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 backing_file=TEST_DIR/t.IMGFMT.base image: TEST_DIR/t.IMGFMT file format: IMGFMT virtual size: 64 MiB (67108864 bytes) @@ -17,7 +14,7 @@ qemu-io: can't open device TEST_DIR/t.qcow2: Could not open backing file: Unknow no file open, try 'help open' read 4096/4096 bytes at offset 0 4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -qemu-img: warning: Deprecated use of backing file without explicit backing format, use of this image requires potentially unsafe format probing +qemu-img: Could not change the backing file to 'TEST_DIR/t.qcow2.base': Invalid argument read 4096/4096 bytes at offset 0 4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) *** done diff --git a/tests/qemu-iotests/301 b/tests/qemu-iotests/301 index 9f943cadbe..220de1043f 100755 --- a/tests/qemu-iotests/301 +++ b/tests/qemu-iotests/301 @@ -3,7 +3,7 @@ # # Test qcow backing file warnings # -# Copyright (C) 2020 Red Hat, Inc. +# Copyright (C) 2020-2021 Red Hat, Inc. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -46,7 +46,6 @@ echo "== qcow backed by qcow ==" TEST_IMG="$TEST_IMG.base" _make_test_img $size _make_test_img -b "$TEST_IMG.base" $size -_img_info _make_test_img -b "$TEST_IMG.base" -F $IMGFMT $size _img_info @@ -71,7 +70,6 @@ echo "== qcow backed by raw ==" rm "$TEST_IMG.base" truncate --size=$size "$TEST_IMG.base" _make_test_img -b "$TEST_IMG.base" $size -_img_info _make_test_img -b "$TEST_IMG.base" -F raw $size _img_info diff --git a/tests/qemu-iotests/301.out b/tests/qemu-iotests/301.out index 9004dad639..e280658191 100644 --- a/tests/qemu-iotests/301.out +++ b/tests/qemu-iotests/301.out @@ -2,13 +2,7 @@ QA output created by 301 == qcow backed by qcow == Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=33554432 -qemu-img: warning: Deprecated use of backing file without explicit backing format (detected format of IMGFMT) -Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=33554432 backing_file=TEST_DIR/t.IMGFMT.base backing_fmt=IMGFMT -image: TEST_DIR/t.IMGFMT -file format: IMGFMT -virtual size: 32 MiB (33554432 bytes) -cluster_size: 512 -backing file: TEST_DIR/t.IMGFMT.base +qemu-img: TEST_DIR/t.IMGFMT: Backing file specified without backing format Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=33554432 backing_file=TEST_DIR/t.IMGFMT.base backing_fmt=IMGFMT image: TEST_DIR/t.IMGFMT file format: IMGFMT @@ -36,13 +30,7 @@ cluster_size: 512 backing file: TEST_DIR/t.IMGFMT.base == qcow backed by raw == -qemu-img: warning: Deprecated use of backing file without explicit backing format (detected format of raw) -Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=33554432 backing_file=TEST_DIR/t.IMGFMT.base -image: TEST_DIR/t.IMGFMT -file format: IMGFMT -virtual size: 32 MiB (33554432 bytes) -cluster_size: 512 -backing file: TEST_DIR/t.IMGFMT.base +qemu-img: TEST_DIR/t.IMGFMT: Backing file specified without backing format Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=33554432 backing_file=TEST_DIR/t.IMGFMT.base backing_fmt=raw image: TEST_DIR/t.IMGFMT file format: IMGFMT From a7cd44bef3d9380181734a93977c3d1df3eef2cf Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Thu, 8 Jul 2021 10:52:28 -0500 Subject: [PATCH 22/28] qemu-img: Improve error for rebase without backing format When removeing support for qemu-img being able to create backing chains without embedded backing formats, we caused a poor error message as caught by iotest 114. Improve the situation to inform the user what went wrong. Suggested-by: Kevin Wolf Signed-off-by: Eric Blake Message-Id: <20210708155228.2666172-1-eblake@redhat.com> Signed-off-by: Kevin Wolf --- qemu-img.c | 3 +++ tests/qemu-iotests/114.out | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/qemu-img.c b/qemu-img.c index ec0e2fabe5..7c4fc60312 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -3767,6 +3767,9 @@ static int img_rebase(int argc, char **argv) if (ret == -ENOSPC) { error_report("Could not change the backing file to '%s': No " "space left in the file header", out_baseimg); + } else if (ret == -EINVAL && out_baseimg && !out_basefmt) { + error_report("Could not change the backing file to '%s': backing " + "format must be specified", out_baseimg); } else if (ret < 0) { error_report("Could not change the backing file to '%s': %s", out_baseimg, strerror(-ret)); diff --git a/tests/qemu-iotests/114.out b/tests/qemu-iotests/114.out index 6d638da266..f51dd9d20a 100644 --- a/tests/qemu-iotests/114.out +++ b/tests/qemu-iotests/114.out @@ -14,7 +14,7 @@ qemu-io: can't open device TEST_DIR/t.qcow2: Could not open backing file: Unknow no file open, try 'help open' read 4096/4096 bytes at offset 0 4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -qemu-img: Could not change the backing file to 'TEST_DIR/t.qcow2.base': Invalid argument +qemu-img: Could not change the backing file to 'TEST_DIR/t.qcow2.base': backing format must be specified read 4096/4096 bytes at offset 0 4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) *** done From bcfd86d6a6432be75fd8700c7c1aabb243adf469 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 8 Jul 2021 13:47:04 +0200 Subject: [PATCH 23/28] qcow2: Fix dangling pointer after reopen for 'file' Without an external data file, s->data_file is a second pointer with the same value as bs->file. When changing bs->file to a different BdrvChild and freeing the old BdrvChild, s->data_file must also be updated, otherwise it points to freed memory and causes crashes. This problem was caught by iotests case 245. Fixes: df2b7086f169239ebad5d150efa29c9bb6d4f820 Signed-off-by: Kevin Wolf Message-Id: <20210708114709.206487-2-kwolf@redhat.com> Reviewed-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Kevin Wolf --- block/qcow2.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/block/qcow2.c b/block/qcow2.c index 0cac2eda36..9f1b6461c8 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1926,6 +1926,7 @@ static void qcow2_refresh_limits(BlockDriverState *bs, Error **errp) static int qcow2_reopen_prepare(BDRVReopenState *state, BlockReopenQueue *queue, Error **errp) { + BDRVQcow2State *s = state->bs->opaque; Qcow2ReopenState *r; int ret; @@ -1956,6 +1957,16 @@ static int qcow2_reopen_prepare(BDRVReopenState *state, } } + /* + * Without an external data file, s->data_file points to the same BdrvChild + * as bs->file. It needs to be resynced after reopen because bs->file may + * be changed. We can't use it in the meantime. + */ + if (!has_data_file(state->bs)) { + assert(s->data_file == state->bs->file); + s->data_file = NULL; + } + return 0; fail: @@ -1966,7 +1977,16 @@ fail: static void qcow2_reopen_commit(BDRVReopenState *state) { + BDRVQcow2State *s = state->bs->opaque; + qcow2_update_options_commit(state->bs, state->opaque); + if (!s->data_file) { + /* + * If we don't have an external data file, s->data_file was cleared by + * qcow2_reopen_prepare() and needs to be updated. + */ + s->data_file = state->bs->file; + } g_free(state->opaque); } @@ -1990,6 +2010,15 @@ static void qcow2_reopen_commit_post(BDRVReopenState *state) static void qcow2_reopen_abort(BDRVReopenState *state) { + BDRVQcow2State *s = state->bs->opaque; + + if (!s->data_file) { + /* + * If we don't have an external data file, s->data_file was cleared by + * qcow2_reopen_prepare() and needs to be restored. + */ + s->data_file = state->bs->file; + } qcow2_update_options_abort(state->bs, state->opaque); g_free(state->opaque); } From ab5b522879e2a7880418cbd29340675e5427572f Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Thu, 8 Jul 2021 13:47:05 +0200 Subject: [PATCH 24/28] block: Add bdrv_reopen_queue_free() Move the code to free a BlockReopenQueue to a separate function. It will be used in a subsequent patch. [ kwolf: Also free explicit_options and options, and explicitly qobject_ref() the value when it continues to be used. This makes future memory leaks less likely. ] Signed-off-by: Alberto Garcia Signed-off-by: Kevin Wolf Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <20210708114709.206487-3-kwolf@redhat.com> Signed-off-by: Kevin Wolf --- block.c | 22 ++++++++++++++++------ include/block/block.h | 1 + 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/block.c b/block.c index ce96585575..a26465e3da 100644 --- a/block.c +++ b/block.c @@ -4095,6 +4095,19 @@ BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue, NULL, 0, keep_old_opts); } +void bdrv_reopen_queue_free(BlockReopenQueue *bs_queue) +{ + if (bs_queue) { + BlockReopenQueueEntry *bs_entry, *next; + QTAILQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) { + qobject_unref(bs_entry->state.explicit_options); + qobject_unref(bs_entry->state.options); + g_free(bs_entry); + } + g_free(bs_queue); + } +} + /* * Reopen multiple BlockDriverStates atomically & transactionally. * @@ -4197,15 +4210,10 @@ abort: if (bs_entry->prepared) { bdrv_reopen_abort(&bs_entry->state); } - qobject_unref(bs_entry->state.explicit_options); - qobject_unref(bs_entry->state.options); } cleanup: - QTAILQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) { - g_free(bs_entry); - } - g_free(bs_queue); + bdrv_reopen_queue_free(bs_queue); return ret; } @@ -4573,6 +4581,8 @@ static void bdrv_reopen_commit(BDRVReopenState *reopen_state) /* set BDS specific flags now */ qobject_unref(bs->explicit_options); qobject_unref(bs->options); + qobject_ref(reopen_state->explicit_options); + qobject_ref(reopen_state->options); bs->explicit_options = reopen_state->explicit_options; bs->options = reopen_state->options; diff --git a/include/block/block.h b/include/block/block.h index 7ec77ecb1a..6d42992985 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -386,6 +386,7 @@ BlockDriverState *bdrv_new_open_driver(BlockDriver *drv, const char *node_name, BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue, BlockDriverState *bs, QDict *options, bool keep_old_opts); +void bdrv_reopen_queue_free(BlockReopenQueue *bs_queue); int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp); int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only, Error **errp); From 6cf42ca2f9782f0335abf3e6b611fbced40cd099 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Thu, 8 Jul 2021 13:47:06 +0200 Subject: [PATCH 25/28] block: Acquire AioContexts during bdrv_reopen_multiple() As the BlockReopenQueue can contain nodes in multiple AioContexts, only one of which may be locked when AIO_WAIT_WHILE() can be called, we can't let the caller lock the right contexts. Instead, individually lock the AioContext of a single node when iterating the queue. Reintroduce bdrv_reopen() as a wrapper for reopening a single node that drains the node and temporarily drops the AioContext lock for bdrv_reopen_multiple(). Signed-off-by: Kevin Wolf Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <20210708114709.206487-4-kwolf@redhat.com> Signed-off-by: Kevin Wolf --- block.c | 51 ++++++++++++++++++++++++++++++++++++------- block/replication.c | 7 ++++++ blockdev.c | 5 +++++ include/block/block.h | 2 ++ qemu-io-cmds.c | 7 +----- 5 files changed, 58 insertions(+), 14 deletions(-) diff --git a/block.c b/block.c index a26465e3da..be083f389e 100644 --- a/block.c +++ b/block.c @@ -4124,19 +4124,26 @@ void bdrv_reopen_queue_free(BlockReopenQueue *bs_queue) * * All affected nodes must be drained between bdrv_reopen_queue() and * bdrv_reopen_multiple(). + * + * To be called from the main thread, with all other AioContexts unlocked. */ int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp) { int ret = -1; BlockReopenQueueEntry *bs_entry, *next; + AioContext *ctx; Transaction *tran = tran_new(); g_autoptr(GHashTable) found = NULL; g_autoptr(GSList) refresh_list = NULL; + assert(qemu_get_current_aio_context() == qemu_get_aio_context()); assert(bs_queue != NULL); QTAILQ_FOREACH(bs_entry, bs_queue, entry) { + ctx = bdrv_get_aio_context(bs_entry->state.bs); + aio_context_acquire(ctx); ret = bdrv_flush(bs_entry->state.bs); + aio_context_release(ctx); if (ret < 0) { error_setg_errno(errp, -ret, "Error flushing drive"); goto abort; @@ -4145,7 +4152,10 @@ int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp) QTAILQ_FOREACH(bs_entry, bs_queue, entry) { assert(bs_entry->state.bs->quiesce_counter > 0); + ctx = bdrv_get_aio_context(bs_entry->state.bs); + aio_context_acquire(ctx); ret = bdrv_reopen_prepare(&bs_entry->state, bs_queue, tran, errp); + aio_context_release(ctx); if (ret < 0) { goto abort; } @@ -4188,7 +4198,10 @@ int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp) * to first element. */ QTAILQ_FOREACH_REVERSE(bs_entry, bs_queue, entry) { + ctx = bdrv_get_aio_context(bs_entry->state.bs); + aio_context_acquire(ctx); bdrv_reopen_commit(&bs_entry->state); + aio_context_release(ctx); } tran_commit(tran); @@ -4197,7 +4210,10 @@ int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp) BlockDriverState *bs = bs_entry->state.bs; if (bs->drv->bdrv_reopen_commit_post) { + ctx = bdrv_get_aio_context(bs); + aio_context_acquire(ctx); bs->drv->bdrv_reopen_commit_post(&bs_entry->state); + aio_context_release(ctx); } } @@ -4208,7 +4224,10 @@ abort: tran_abort(tran); QTAILQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) { if (bs_entry->prepared) { + ctx = bdrv_get_aio_context(bs_entry->state.bs); + aio_context_acquire(ctx); bdrv_reopen_abort(&bs_entry->state); + aio_context_release(ctx); } } @@ -4218,21 +4237,37 @@ cleanup: return ret; } +int bdrv_reopen(BlockDriverState *bs, QDict *opts, bool keep_old_opts, + Error **errp) +{ + AioContext *ctx = bdrv_get_aio_context(bs); + BlockReopenQueue *queue; + int ret; + + bdrv_subtree_drained_begin(bs); + if (ctx != qemu_get_aio_context()) { + aio_context_release(ctx); + } + + queue = bdrv_reopen_queue(NULL, bs, opts, keep_old_opts); + ret = bdrv_reopen_multiple(queue, errp); + + if (ctx != qemu_get_aio_context()) { + aio_context_acquire(ctx); + } + bdrv_subtree_drained_end(bs); + + return ret; +} + int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only, Error **errp) { - int ret; - BlockReopenQueue *queue; QDict *opts = qdict_new(); qdict_put_bool(opts, BDRV_OPT_READ_ONLY, read_only); - bdrv_subtree_drained_begin(bs); - queue = bdrv_reopen_queue(NULL, bs, opts, true); - ret = bdrv_reopen_multiple(queue, errp); - bdrv_subtree_drained_end(bs); - - return ret; + return bdrv_reopen(bs, opts, true, errp); } /* diff --git a/block/replication.c b/block/replication.c index 52163f2d1f..774e15df16 100644 --- a/block/replication.c +++ b/block/replication.c @@ -390,7 +390,14 @@ static void reopen_backing_file(BlockDriverState *bs, bool writable, } if (reopen_queue) { + AioContext *ctx = bdrv_get_aio_context(bs); + if (ctx != qemu_get_aio_context()) { + aio_context_release(ctx); + } bdrv_reopen_multiple(reopen_queue, errp); + if (ctx != qemu_get_aio_context()) { + aio_context_acquire(ctx); + } } bdrv_subtree_drained_end(s->hidden_disk->bs); diff --git a/blockdev.c b/blockdev.c index 094c085962..0acbace8fd 100644 --- a/blockdev.c +++ b/blockdev.c @@ -3592,8 +3592,13 @@ void qmp_x_blockdev_reopen(BlockdevOptions *options, Error **errp) ctx = bdrv_get_aio_context(bs); aio_context_acquire(ctx); bdrv_subtree_drained_begin(bs); + aio_context_release(ctx); + queue = bdrv_reopen_queue(NULL, bs, qdict, false); bdrv_reopen_multiple(queue, errp); + + ctx = bdrv_get_aio_context(bs); + aio_context_acquire(ctx); bdrv_subtree_drained_end(bs); aio_context_release(ctx); diff --git a/include/block/block.h b/include/block/block.h index 6d42992985..3477290f9a 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -388,6 +388,8 @@ BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue, bool keep_old_opts); void bdrv_reopen_queue_free(BlockReopenQueue *bs_queue); int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp); +int bdrv_reopen(BlockDriverState *bs, QDict *opts, bool keep_old_opts, + Error **errp); int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only, Error **errp); int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset, diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c index e8d862a426..46593d632d 100644 --- a/qemu-io-cmds.c +++ b/qemu-io-cmds.c @@ -2116,8 +2116,6 @@ static int reopen_f(BlockBackend *blk, int argc, char **argv) bool writethrough = !blk_enable_write_cache(blk); bool has_rw_option = false; bool has_cache_option = false; - - BlockReopenQueue *brq; Error *local_err = NULL; while ((c = getopt(argc, argv, "c:o:rw")) != -1) { @@ -2210,10 +2208,7 @@ static int reopen_f(BlockBackend *blk, int argc, char **argv) qdict_put_bool(opts, BDRV_OPT_CACHE_NO_FLUSH, flags & BDRV_O_NO_FLUSH); } - bdrv_subtree_drained_begin(bs); - brq = bdrv_reopen_queue(NULL, bs, opts, true); - bdrv_reopen_multiple(brq, &local_err); - bdrv_subtree_drained_end(bs); + bdrv_reopen(bs, opts, true, &local_err); if (local_err) { error_report_err(local_err); From 3908b7a8994fa5ef7a89aa58cd5a02fc58141592 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Thu, 8 Jul 2021 13:47:07 +0200 Subject: [PATCH 26/28] block: Support multiple reopening with x-blockdev-reopen [ kwolf: Fixed AioContext locking ] Signed-off-by: Alberto Garcia Signed-off-by: Kevin Wolf Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <20210708114709.206487-5-kwolf@redhat.com> Signed-off-by: Kevin Wolf --- blockdev.c | 83 ++++++++++--------- qapi/block-core.json | 18 ++-- tests/qemu-iotests/155 | 9 +- tests/qemu-iotests/165 | 4 +- tests/qemu-iotests/245 | 27 +++--- tests/qemu-iotests/248 | 2 +- tests/qemu-iotests/248.out | 2 +- tests/qemu-iotests/296 | 9 +- tests/qemu-iotests/298 | 4 +- .../tests/remove-bitmap-from-backing | 18 ++-- 10 files changed, 100 insertions(+), 76 deletions(-) diff --git a/blockdev.c b/blockdev.c index 0acbace8fd..5ad0e9070e 100644 --- a/blockdev.c +++ b/blockdev.c @@ -3559,51 +3559,60 @@ fail: visit_free(v); } -void qmp_x_blockdev_reopen(BlockdevOptions *options, Error **errp) +void qmp_x_blockdev_reopen(BlockdevOptionsList *reopen_list, Error **errp) { - BlockDriverState *bs; - AioContext *ctx; - QObject *obj; - Visitor *v = qobject_output_visitor_new(&obj); - BlockReopenQueue *queue; - QDict *qdict; + BlockReopenQueue *queue = NULL; + GSList *drained = NULL; - /* Check for the selected node name */ - if (!options->has_node_name) { - error_setg(errp, "node-name not specified"); - goto fail; + /* Add each one of the BDS that we want to reopen to the queue */ + for (; reopen_list != NULL; reopen_list = reopen_list->next) { + BlockdevOptions *options = reopen_list->value; + BlockDriverState *bs; + AioContext *ctx; + QObject *obj; + Visitor *v; + QDict *qdict; + + /* Check for the selected node name */ + if (!options->has_node_name) { + error_setg(errp, "node-name not specified"); + goto fail; + } + + bs = bdrv_find_node(options->node_name); + if (!bs) { + error_setg(errp, "Failed to find node with node-name='%s'", + options->node_name); + goto fail; + } + + /* Put all options in a QDict and flatten it */ + v = qobject_output_visitor_new(&obj); + visit_type_BlockdevOptions(v, NULL, &options, &error_abort); + visit_complete(v, &obj); + visit_free(v); + + qdict = qobject_to(QDict, obj); + + qdict_flatten(qdict); + + ctx = bdrv_get_aio_context(bs); + aio_context_acquire(ctx); + + bdrv_subtree_drained_begin(bs); + queue = bdrv_reopen_queue(queue, bs, qdict, false); + drained = g_slist_prepend(drained, bs); + + aio_context_release(ctx); } - bs = bdrv_find_node(options->node_name); - if (!bs) { - error_setg(errp, "Failed to find node with node-name='%s'", - options->node_name); - goto fail; - } - - /* Put all options in a QDict and flatten it */ - visit_type_BlockdevOptions(v, NULL, &options, &error_abort); - visit_complete(v, &obj); - qdict = qobject_to(QDict, obj); - - qdict_flatten(qdict); - /* Perform the reopen operation */ - ctx = bdrv_get_aio_context(bs); - aio_context_acquire(ctx); - bdrv_subtree_drained_begin(bs); - aio_context_release(ctx); - - queue = bdrv_reopen_queue(NULL, bs, qdict, false); bdrv_reopen_multiple(queue, errp); - - ctx = bdrv_get_aio_context(bs); - aio_context_acquire(ctx); - bdrv_subtree_drained_end(bs); - aio_context_release(ctx); + queue = NULL; fail: - visit_free(v); + bdrv_reopen_queue_free(queue); + g_slist_free_full(drained, (GDestroyNotify) bdrv_subtree_drained_end); } void qmp_blockdev_del(const char *node_name, Error **errp) diff --git a/qapi/block-core.json b/qapi/block-core.json index 4a46552816..052520331e 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -4221,13 +4221,15 @@ ## # @x-blockdev-reopen: # -# Reopens a block device using the given set of options. Any option -# not specified will be reset to its default value regardless of its -# previous status. If an option cannot be changed or a particular +# Reopens one or more block devices using the given set of options. +# Any option not specified will be reset to its default value regardless +# of its previous status. If an option cannot be changed or a particular # driver does not support reopening then the command will return an -# error. +# error. All devices in the list are reopened in one transaction, so +# if one of them fails then the whole transaction is cancelled. # -# The top-level @node-name option (from BlockdevOptions) must be +# The command receives a list of block devices to reopen. For each one +# of them, the top-level @node-name option (from BlockdevOptions) must be # specified and is used to select the block device to be reopened. # Other @node-name options must be either omitted or set to the # current name of the appropriate node. This command won't change any @@ -4247,8 +4249,8 @@ # # 4) NULL: the current child (if any) is detached. # -# Options (1) and (2) are supported in all cases, but at the moment -# only @backing allows replacing or detaching an existing child. +# Options (1) and (2) are supported in all cases. Option (3) is +# supported for @file and @backing, and option (4) for @backing only. # # Unlike with blockdev-add, the @backing option must always be present # unless the node being reopened does not have a backing file and its @@ -4258,7 +4260,7 @@ # Since: 4.0 ## { 'command': 'x-blockdev-reopen', - 'data': 'BlockdevOptions', 'boxed': true } + 'data': { 'options': ['BlockdevOptions'] } } ## # @blockdev-del: diff --git a/tests/qemu-iotests/155 b/tests/qemu-iotests/155 index bafef9dd9a..2947bfb81a 100755 --- a/tests/qemu-iotests/155 +++ b/tests/qemu-iotests/155 @@ -261,9 +261,12 @@ class TestBlockdevMirrorReopen(MirrorBaseClass): result = self.vm.qmp('blockdev-add', node_name="backing", driver="null-co") self.assert_qmp(result, 'return', {}) - result = self.vm.qmp('x-blockdev-reopen', node_name="target", - driver=iotests.imgfmt, file="target-file", - backing="backing") + result = self.vm.qmp('x-blockdev-reopen', options=[{ + 'node-name': "target", + 'driver': iotests.imgfmt, + 'file': "target-file", + 'backing': "backing" + }]) self.assert_qmp(result, 'return', {}) class TestBlockdevMirrorReopenIothread(TestBlockdevMirrorReopen): diff --git a/tests/qemu-iotests/165 b/tests/qemu-iotests/165 index abc4ffadd5..ef4cf14516 100755 --- a/tests/qemu-iotests/165 +++ b/tests/qemu-iotests/165 @@ -137,7 +137,7 @@ class TestPersistentDirtyBitmap(iotests.QMPTestCase): assert sha256_1 == self.getSha256() # Reopen to RW - result = self.vm.qmp('x-blockdev-reopen', **{ + result = self.vm.qmp('x-blockdev-reopen', options=[{ 'node-name': 'node0', 'driver': iotests.imgfmt, 'file': { @@ -145,7 +145,7 @@ class TestPersistentDirtyBitmap(iotests.QMPTestCase): 'filename': disk }, 'read-only': False - }) + }]) self.assert_qmp(result, 'return', {}) # Check that bitmap is reopened to RW and we can write to it. diff --git a/tests/qemu-iotests/245 b/tests/qemu-iotests/245 index 0295129cbb..ca08955207 100755 --- a/tests/qemu-iotests/245 +++ b/tests/qemu-iotests/245 @@ -85,8 +85,18 @@ class TestBlockdevReopen(iotests.QMPTestCase): "Expected output of %d qemu-io commands, found %d" % (found, self.total_io_cmds)) - # Run x-blockdev-reopen with 'opts' but applying 'newopts' - # on top of it. The original 'opts' dict is unmodified + # Run x-blockdev-reopen on a list of block devices + def reopenMultiple(self, opts, errmsg = None): + result = self.vm.qmp('x-blockdev-reopen', conv_keys=False, options=opts) + if errmsg: + self.assert_qmp(result, 'error/class', 'GenericError') + self.assert_qmp(result, 'error/desc', errmsg) + else: + self.assert_qmp(result, 'return', {}) + + # Run x-blockdev-reopen on a single block device (specified by + # 'opts') but applying 'newopts' on top of it. The original 'opts' + # dict is unmodified def reopen(self, opts, newopts = {}, errmsg = None): opts = copy.deepcopy(opts) @@ -101,12 +111,7 @@ class TestBlockdevReopen(iotests.QMPTestCase): subdict = opts[prefix] subdict[key] = value - result = self.vm.qmp('x-blockdev-reopen', conv_keys = False, **opts) - if errmsg: - self.assert_qmp(result, 'error/class', 'GenericError') - self.assert_qmp(result, 'error/desc', errmsg) - else: - self.assert_qmp(result, 'return', {}) + self.reopenMultiple([ opts ], errmsg) # Run query-named-block-nodes and return the specified entry @@ -142,10 +147,10 @@ class TestBlockdevReopen(iotests.QMPTestCase): # We cannot change any of these self.reopen(opts, {'node-name': 'not-found'}, "Failed to find node with node-name='not-found'") self.reopen(opts, {'node-name': ''}, "Failed to find node with node-name=''") - self.reopen(opts, {'node-name': None}, "Invalid parameter type for 'node-name', expected: string") + self.reopen(opts, {'node-name': None}, "Invalid parameter type for 'options[0].node-name', expected: string") self.reopen(opts, {'driver': 'raw'}, "Cannot change the option 'driver'") self.reopen(opts, {'driver': ''}, "Invalid parameter ''") - self.reopen(opts, {'driver': None}, "Invalid parameter type for 'driver', expected: string") + self.reopen(opts, {'driver': None}, "Invalid parameter type for 'options[0].driver', expected: string") self.reopen(opts, {'file': 'not-found'}, "Cannot find device='' nor node-name='not-found'") self.reopen(opts, {'file': ''}, "Cannot find device='' nor node-name=''") self.reopen(opts, {'file': None}, "Invalid parameter type for 'file', expected: BlockdevRef") @@ -154,7 +159,7 @@ class TestBlockdevReopen(iotests.QMPTestCase): self.reopen(opts, {'file.filename': hd_path[1]}, "Cannot change the option 'filename'") self.reopen(opts, {'file.aio': 'native'}, "Cannot change the option 'aio'") self.reopen(opts, {'file.locking': 'off'}, "Cannot change the option 'locking'") - self.reopen(opts, {'file.filename': None}, "Invalid parameter type for 'file.filename', expected: string") + self.reopen(opts, {'file.filename': None}, "Invalid parameter type for 'options[0].file.filename', expected: string") # node-name is optional in BlockdevOptions, but x-blockdev-reopen needs it del opts['node-name'] diff --git a/tests/qemu-iotests/248 b/tests/qemu-iotests/248 index 4daaed1530..03911333c4 100755 --- a/tests/qemu-iotests/248 +++ b/tests/qemu-iotests/248 @@ -63,7 +63,7 @@ vm.get_qmp_events() del blockdev_opts['file']['size'] vm.qmp_log('x-blockdev-reopen', filters=[filter_qmp_testfiles], - **blockdev_opts) + options = [ blockdev_opts ]) vm.qmp_log('block-job-resume', device='drive0') vm.event_wait('JOB_STATUS_CHANGE', timeout=1.0, diff --git a/tests/qemu-iotests/248.out b/tests/qemu-iotests/248.out index 369b25bf26..893f625347 100644 --- a/tests/qemu-iotests/248.out +++ b/tests/qemu-iotests/248.out @@ -2,7 +2,7 @@ {"return": {}} {"execute": "blockdev-mirror", "arguments": {"device": "drive0", "on-target-error": "enospc", "sync": "full", "target": "target"}} {"return": {}} -{"execute": "x-blockdev-reopen", "arguments": {"driver": "qcow2", "file": {"driver": "raw", "file": {"driver": "file", "filename": "TEST_DIR/PID-target"}}, "node-name": "target"}} +{"execute": "x-blockdev-reopen", "arguments": {"options": [{"driver": "qcow2", "file": {"driver": "raw", "file": {"driver": "file", "filename": "TEST_DIR/PID-target"}}, "node-name": "target"}]}} {"return": {}} {"execute": "block-job-resume", "arguments": {"device": "drive0"}} {"return": {}} diff --git a/tests/qemu-iotests/296 b/tests/qemu-iotests/296 index 7c65e987a1..fa07b98bb4 100755 --- a/tests/qemu-iotests/296 +++ b/tests/qemu-iotests/296 @@ -120,8 +120,7 @@ class EncryptionSetupTestCase(iotests.QMPTestCase): command = 'x-blockdev-reopen' if reOpen else 'blockdev-add' - result = vm.qmp(command, ** - { + opts = { 'driver': iotests.imgfmt, 'node-name': id, 'read-only': readOnly, @@ -131,7 +130,11 @@ class EncryptionSetupTestCase(iotests.QMPTestCase): 'filename': test_img, } } - ) + + if reOpen: + result = vm.qmp(command, options=[opts]) + else: + result = vm.qmp(command, **opts) self.assert_qmp(result, 'return', {}) diff --git a/tests/qemu-iotests/298 b/tests/qemu-iotests/298 index d535946b5f..a5b0d91224 100755 --- a/tests/qemu-iotests/298 +++ b/tests/qemu-iotests/298 @@ -98,7 +98,7 @@ class TestPreallocateFilter(TestPreallocateBase): self.check_big() def test_reopen_opts(self): - result = self.vm.qmp('x-blockdev-reopen', **{ + result = self.vm.qmp('x-blockdev-reopen', options=[{ 'node-name': 'disk', 'driver': iotests.imgfmt, 'file': { @@ -112,7 +112,7 @@ class TestPreallocateFilter(TestPreallocateBase): 'filename': disk } } - }) + }]) self.assert_qmp(result, 'return', {}) self.vm.hmp_qemu_io('drive0', 'write 0 1M') diff --git a/tests/qemu-iotests/tests/remove-bitmap-from-backing b/tests/qemu-iotests/tests/remove-bitmap-from-backing index 0ea4c36507..0b07f7e836 100755 --- a/tests/qemu-iotests/tests/remove-bitmap-from-backing +++ b/tests/qemu-iotests/tests/remove-bitmap-from-backing @@ -41,13 +41,15 @@ log('Trying to remove persistent bitmap from r-o base node, should fail:') vm.qmp_log('block-dirty-bitmap-remove', node='base', name='bitmap0') new_base_opts = { - 'node-name': 'base', - 'driver': 'qcow2', - 'file': { - 'driver': 'file', - 'filename': base - }, - 'read-only': False + 'options': [{ + 'node-name': 'base', + 'driver': 'qcow2', + 'file': { + 'driver': 'file', + 'filename': base + }, + 'read-only': False + }] } # Don't want to bother with filtering qmp_log for reopen command @@ -58,7 +60,7 @@ if result != {'return': {}}: log('Remove persistent bitmap from base node reopened to RW:') vm.qmp_log('block-dirty-bitmap-remove', node='base', name='bitmap0') -new_base_opts['read-only'] = True +new_base_opts['options'][0]['read-only'] = True result = vm.qmp('x-blockdev-reopen', **new_base_opts) if result != {'return': {}}: log('Failed to reopen: ' + str(result)) From 246ebc2d6a3adb8eb7514155daa7e6369ae8d654 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Thu, 8 Jul 2021 13:47:08 +0200 Subject: [PATCH 27/28] iotests: Test reopening multiple devices at the same time This test swaps the images used by two active block devices. This is now possible thanks to the new ability to run x-blockdev-reopen on multiple devices at the same time. Signed-off-by: Alberto Garcia Signed-off-by: Kevin Wolf Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <20210708114709.206487-6-kwolf@redhat.com> Signed-off-by: Kevin Wolf --- tests/qemu-iotests/245 | 47 ++++++++++++++++++++++++++++++++++++++ tests/qemu-iotests/245.out | 4 ++-- 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/tests/qemu-iotests/245 b/tests/qemu-iotests/245 index ca08955207..8bc8101e6d 100755 --- a/tests/qemu-iotests/245 +++ b/tests/qemu-iotests/245 @@ -649,6 +649,53 @@ class TestBlockdevReopen(iotests.QMPTestCase): '-c', 'read -P 0x40 0x40008 1', '-c', 'read -P 0x80 0x40010 1', hd_path[0]) + # Swap the disk images of two active block devices + def test_swap_files(self): + # Add hd0 and hd2 (none of them with backing files) + opts0 = hd_opts(0) + result = self.vm.qmp('blockdev-add', conv_keys = False, **opts0) + self.assert_qmp(result, 'return', {}) + + opts2 = hd_opts(2) + result = self.vm.qmp('blockdev-add', conv_keys = False, **opts2) + self.assert_qmp(result, 'return', {}) + + # Write different data to both block devices + self.run_qemu_io("hd0", "write -P 0xa0 0 1k") + self.run_qemu_io("hd2", "write -P 0xa2 0 1k") + + # Check that the data reads correctly + self.run_qemu_io("hd0", "read -P 0xa0 0 1k") + self.run_qemu_io("hd2", "read -P 0xa2 0 1k") + + # It's not possible to make a block device use an image that + # is already being used by the other device. + self.reopen(opts0, {'file': 'hd2-file'}, + "Permission conflict on node 'hd2-file': permissions " + "'write, resize' are both required by node 'hd2' (uses " + "node 'hd2-file' as 'file' child) and unshared by node " + "'hd0' (uses node 'hd2-file' as 'file' child).") + self.reopen(opts2, {'file': 'hd0-file'}, + "Permission conflict on node 'hd0-file': permissions " + "'write, resize' are both required by node 'hd0' (uses " + "node 'hd0-file' as 'file' child) and unshared by node " + "'hd2' (uses node 'hd0-file' as 'file' child).") + + # But we can swap the images if we reopen both devices at the + # same time + opts0['file'] = 'hd2-file' + opts2['file'] = 'hd0-file' + self.reopenMultiple([opts0, opts2]) + self.run_qemu_io("hd0", "read -P 0xa2 0 1k") + self.run_qemu_io("hd2", "read -P 0xa0 0 1k") + + # And we can of course come back to the original state + opts0['file'] = 'hd0-file' + opts2['file'] = 'hd2-file' + self.reopenMultiple([opts0, opts2]) + self.run_qemu_io("hd0", "read -P 0xa0 0 1k") + self.run_qemu_io("hd2", "read -P 0xa2 0 1k") + # Misc reopen tests with different block drivers @iotests.skip_if_unsupported(['quorum', 'throttle']) def test_misc_drivers(self): diff --git a/tests/qemu-iotests/245.out b/tests/qemu-iotests/245.out index daf1e51922..4eced19294 100644 --- a/tests/qemu-iotests/245.out +++ b/tests/qemu-iotests/245.out @@ -17,8 +17,8 @@ read 1/1 bytes at offset 262152 read 1/1 bytes at offset 262160 1 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -.............. +............... ---------------------------------------------------------------------- -Ran 24 tests +Ran 25 tests OK From e60edf69e2f64e818466019313517a2e6d6b63f4 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Thu, 8 Jul 2021 13:47:09 +0200 Subject: [PATCH 28/28] block: Make blockdev-reopen stable API This patch drops the 'x-' prefix from x-blockdev-reopen. Signed-off-by: Alberto Garcia Signed-off-by: Kevin Wolf Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <20210708114709.206487-7-kwolf@redhat.com> Signed-off-by: Kevin Wolf --- blockdev.c | 2 +- qapi/block-core.json | 6 +++--- tests/qemu-iotests/155 | 2 +- tests/qemu-iotests/165 | 2 +- tests/qemu-iotests/245 | 10 +++++----- tests/qemu-iotests/248 | 2 +- tests/qemu-iotests/248.out | 2 +- tests/qemu-iotests/296 | 2 +- tests/qemu-iotests/298 | 2 +- tests/qemu-iotests/tests/remove-bitmap-from-backing | 4 ++-- 10 files changed, 17 insertions(+), 17 deletions(-) diff --git a/blockdev.c b/blockdev.c index 5ad0e9070e..3d8ac368a1 100644 --- a/blockdev.c +++ b/blockdev.c @@ -3559,7 +3559,7 @@ fail: visit_free(v); } -void qmp_x_blockdev_reopen(BlockdevOptionsList *reopen_list, Error **errp) +void qmp_blockdev_reopen(BlockdevOptionsList *reopen_list, Error **errp) { BlockReopenQueue *queue = NULL; GSList *drained = NULL; diff --git a/qapi/block-core.json b/qapi/block-core.json index 052520331e..c7a311798a 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -4219,7 +4219,7 @@ { 'command': 'blockdev-add', 'data': 'BlockdevOptions', 'boxed': true } ## -# @x-blockdev-reopen: +# @blockdev-reopen: # # Reopens one or more block devices using the given set of options. # Any option not specified will be reset to its default value regardless @@ -4257,9 +4257,9 @@ # image does not have a default backing file name as part of its # metadata. # -# Since: 4.0 +# Since: 6.1 ## -{ 'command': 'x-blockdev-reopen', +{ 'command': 'blockdev-reopen', 'data': { 'options': ['BlockdevOptions'] } } ## diff --git a/tests/qemu-iotests/155 b/tests/qemu-iotests/155 index 2947bfb81a..eadda52615 100755 --- a/tests/qemu-iotests/155 +++ b/tests/qemu-iotests/155 @@ -261,7 +261,7 @@ class TestBlockdevMirrorReopen(MirrorBaseClass): result = self.vm.qmp('blockdev-add', node_name="backing", driver="null-co") self.assert_qmp(result, 'return', {}) - result = self.vm.qmp('x-blockdev-reopen', options=[{ + result = self.vm.qmp('blockdev-reopen', options=[{ 'node-name': "target", 'driver': iotests.imgfmt, 'file': "target-file", diff --git a/tests/qemu-iotests/165 b/tests/qemu-iotests/165 index ef4cf14516..ce499946b8 100755 --- a/tests/qemu-iotests/165 +++ b/tests/qemu-iotests/165 @@ -137,7 +137,7 @@ class TestPersistentDirtyBitmap(iotests.QMPTestCase): assert sha256_1 == self.getSha256() # Reopen to RW - result = self.vm.qmp('x-blockdev-reopen', options=[{ + result = self.vm.qmp('blockdev-reopen', options=[{ 'node-name': 'node0', 'driver': iotests.imgfmt, 'file': { diff --git a/tests/qemu-iotests/245 b/tests/qemu-iotests/245 index 8bc8101e6d..bf8261eec0 100755 --- a/tests/qemu-iotests/245 +++ b/tests/qemu-iotests/245 @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # group: rw # -# Test cases for the QMP 'x-blockdev-reopen' command +# Test cases for the QMP 'blockdev-reopen' command # # Copyright (C) 2018-2019 Igalia, S.L. # Author: Alberto Garcia @@ -85,16 +85,16 @@ class TestBlockdevReopen(iotests.QMPTestCase): "Expected output of %d qemu-io commands, found %d" % (found, self.total_io_cmds)) - # Run x-blockdev-reopen on a list of block devices + # Run blockdev-reopen on a list of block devices def reopenMultiple(self, opts, errmsg = None): - result = self.vm.qmp('x-blockdev-reopen', conv_keys=False, options=opts) + result = self.vm.qmp('blockdev-reopen', conv_keys=False, options=opts) if errmsg: self.assert_qmp(result, 'error/class', 'GenericError') self.assert_qmp(result, 'error/desc', errmsg) else: self.assert_qmp(result, 'return', {}) - # Run x-blockdev-reopen on a single block device (specified by + # Run blockdev-reopen on a single block device (specified by # 'opts') but applying 'newopts' on top of it. The original 'opts' # dict is unmodified def reopen(self, opts, newopts = {}, errmsg = None): @@ -161,7 +161,7 @@ class TestBlockdevReopen(iotests.QMPTestCase): self.reopen(opts, {'file.locking': 'off'}, "Cannot change the option 'locking'") self.reopen(opts, {'file.filename': None}, "Invalid parameter type for 'options[0].file.filename', expected: string") - # node-name is optional in BlockdevOptions, but x-blockdev-reopen needs it + # node-name is optional in BlockdevOptions, but blockdev-reopen needs it del opts['node-name'] self.reopen(opts, {}, "node-name not specified") diff --git a/tests/qemu-iotests/248 b/tests/qemu-iotests/248 index 03911333c4..2ec2416e8a 100755 --- a/tests/qemu-iotests/248 +++ b/tests/qemu-iotests/248 @@ -62,7 +62,7 @@ vm.event_wait('JOB_STATUS_CHANGE', timeout=3.0, vm.get_qmp_events() del blockdev_opts['file']['size'] -vm.qmp_log('x-blockdev-reopen', filters=[filter_qmp_testfiles], +vm.qmp_log('blockdev-reopen', filters=[filter_qmp_testfiles], options = [ blockdev_opts ]) vm.qmp_log('block-job-resume', device='drive0') diff --git a/tests/qemu-iotests/248.out b/tests/qemu-iotests/248.out index 893f625347..66e94ccd7e 100644 --- a/tests/qemu-iotests/248.out +++ b/tests/qemu-iotests/248.out @@ -2,7 +2,7 @@ {"return": {}} {"execute": "blockdev-mirror", "arguments": {"device": "drive0", "on-target-error": "enospc", "sync": "full", "target": "target"}} {"return": {}} -{"execute": "x-blockdev-reopen", "arguments": {"options": [{"driver": "qcow2", "file": {"driver": "raw", "file": {"driver": "file", "filename": "TEST_DIR/PID-target"}}, "node-name": "target"}]}} +{"execute": "blockdev-reopen", "arguments": {"options": [{"driver": "qcow2", "file": {"driver": "raw", "file": {"driver": "file", "filename": "TEST_DIR/PID-target"}}, "node-name": "target"}]}} {"return": {}} {"execute": "block-job-resume", "arguments": {"device": "drive0"}} {"return": {}} diff --git a/tests/qemu-iotests/296 b/tests/qemu-iotests/296 index fa07b98bb4..099a3eeaa5 100755 --- a/tests/qemu-iotests/296 +++ b/tests/qemu-iotests/296 @@ -118,7 +118,7 @@ class EncryptionSetupTestCase(iotests.QMPTestCase): def openImageQmp(self, vm, id, file, secret, readOnly = False, reOpen = False): - command = 'x-blockdev-reopen' if reOpen else 'blockdev-add' + command = 'blockdev-reopen' if reOpen else 'blockdev-add' opts = { 'driver': iotests.imgfmt, diff --git a/tests/qemu-iotests/298 b/tests/qemu-iotests/298 index a5b0d91224..fae72211b1 100755 --- a/tests/qemu-iotests/298 +++ b/tests/qemu-iotests/298 @@ -98,7 +98,7 @@ class TestPreallocateFilter(TestPreallocateBase): self.check_big() def test_reopen_opts(self): - result = self.vm.qmp('x-blockdev-reopen', options=[{ + result = self.vm.qmp('blockdev-reopen', options=[{ 'node-name': 'disk', 'driver': iotests.imgfmt, 'file': { diff --git a/tests/qemu-iotests/tests/remove-bitmap-from-backing b/tests/qemu-iotests/tests/remove-bitmap-from-backing index 0b07f7e836..8d48fc0f3c 100755 --- a/tests/qemu-iotests/tests/remove-bitmap-from-backing +++ b/tests/qemu-iotests/tests/remove-bitmap-from-backing @@ -53,7 +53,7 @@ new_base_opts = { } # Don't want to bother with filtering qmp_log for reopen command -result = vm.qmp('x-blockdev-reopen', **new_base_opts) +result = vm.qmp('blockdev-reopen', **new_base_opts) if result != {'return': {}}: log('Failed to reopen: ' + str(result)) @@ -61,7 +61,7 @@ log('Remove persistent bitmap from base node reopened to RW:') vm.qmp_log('block-dirty-bitmap-remove', node='base', name='bitmap0') new_base_opts['options'][0]['read-only'] = True -result = vm.qmp('x-blockdev-reopen', **new_base_opts) +result = vm.qmp('blockdev-reopen', **new_base_opts) if result != {'return': {}}: log('Failed to reopen: ' + str(result))