From a6f8f9f82c857cf31cda702e6c62d623e8ddedd5 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Tue, 17 Sep 2019 11:19:58 +0200
Subject: [PATCH 01/69] iotests: Prefer null-co over null-aio

We use null-co basically everywhere in the iotests.  Unless we want to
test null-aio specifically, we should use it instead (for consistency).

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Message-id: 20190917092004.999-2-mreitz@redhat.com
Reviewed-by: Andrey Shinkevich <andrey.shinkevich@virtuozzo.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/093 | 7 +++----
 tests/qemu-iotests/245 | 2 +-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/tests/qemu-iotests/093 b/tests/qemu-iotests/093
index 3c4f5173ce..50c1e7f2ec 100755
--- a/tests/qemu-iotests/093
+++ b/tests/qemu-iotests/093
@@ -267,13 +267,12 @@ class ThrottleTestCoroutine(ThrottleTestCase):
     test_img = "null-co://"
 
 class ThrottleTestGroupNames(iotests.QMPTestCase):
-    test_img = "null-aio://"
     max_drives = 3
 
     def setUp(self):
         self.vm = iotests.VM()
         for i in range(0, self.max_drives):
-            self.vm.add_drive(self.test_img,
+            self.vm.add_drive("null-co://",
                               "throttling.iops-total=100,file.read-zeroes=on")
         self.vm.launch()
 
@@ -376,10 +375,10 @@ class ThrottleTestRemovableMedia(iotests.QMPTestCase):
 
     def test_removable_media(self):
         # Add a couple of dummy nodes named cd0 and cd1
-        result = self.vm.qmp("blockdev-add", driver="null-aio",
+        result = self.vm.qmp("blockdev-add", driver="null-co",
                              read_zeroes=True, node_name="cd0")
         self.assert_qmp(result, 'return', {})
-        result = self.vm.qmp("blockdev-add", driver="null-aio",
+        result = self.vm.qmp("blockdev-add", driver="null-co",
                              read_zeroes=True, node_name="cd1")
         self.assert_qmp(result, 'return', {})
 
diff --git a/tests/qemu-iotests/245 b/tests/qemu-iotests/245
index 41218d5f1d..e66a23c5f0 100644
--- a/tests/qemu-iotests/245
+++ b/tests/qemu-iotests/245
@@ -598,7 +598,7 @@ class TestBlockdevReopen(iotests.QMPTestCase):
         ##################
         ###### null ######
         ##################
-        opts = {'driver': 'null-aio', 'node-name': 'root', 'size': 1024}
+        opts = {'driver': 'null-co', 'node-name': 'root', 'size': 1024}
 
         result = self.vm.qmp('blockdev-add', conv_keys = False, **opts)
         self.assert_qmp(result, 'return', {})

From 6be012252018249d3a2843a0143ee8cca41c276d Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Tue, 17 Sep 2019 11:19:59 +0200
Subject: [PATCH 02/69] iotests: Allow skipping test cases

case_notrun() does not actually skip the current test case.  It just
adds a "notrun" note and then returns to the caller, who manually has to
skip the test.  Generally, skipping a test case is as simple as
returning from the current function, but not always: For example, this
model does not allow skipping tests already in the setUp() function.

Thus, add a QMPTestCase.case_skip() function that invokes case_notrun()
and then self.skipTest().  To make this work, we need to filter the
information on how many test cases were skipped from the unittest
output.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Andrey Shinkevich <andrey.shinkevich@virtuozzo.com>
Message-id: 20190917092004.999-3-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/iotests.py | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
index 709def4d5d..6d5de6e504 100644
--- a/tests/qemu-iotests/iotests.py
+++ b/tests/qemu-iotests/iotests.py
@@ -838,6 +838,11 @@ class QMPTestCase(unittest.TestCase):
             return self.pause_wait(job_id)
         return result
 
+    def case_skip(self, reason):
+        '''Skip this test case'''
+        case_notrun(reason)
+        self.skipTest(reason)
+
 
 def notrun(reason):
     '''Skip this test suite'''
@@ -849,7 +854,11 @@ def notrun(reason):
     sys.exit(0)
 
 def case_notrun(reason):
-    '''Skip this test case'''
+    '''Mark this test case as not having been run (without actually
+    skipping it, that is left to the caller).  See
+    QMPTestCase.case_skip() for a variant that actually skips the
+    current test case.'''
+
     # Each test in qemu-iotests has a number ("seq")
     seq = os.path.basename(sys.argv[0])
 
@@ -950,8 +959,15 @@ def execute_unittest(output, verbosity, debug):
         unittest.main(testRunner=runner)
     finally:
         if not debug:
-            sys.stderr.write(re.sub(r'Ran (\d+) tests? in [\d.]+s',
-                                    r'Ran \1 tests', output.getvalue()))
+            out = output.getvalue()
+            out = re.sub(r'Ran (\d+) tests? in [\d.]+s', r'Ran \1 tests', out)
+
+            # Hide skipped tests from the reference output
+            out = re.sub(r'OK \(skipped=\d+\)', 'OK', out)
+            out_first_line, out_rest = out.split('\n', 1)
+            out = out_first_line.replace('s', '.') + '\n' + out_rest
+
+            sys.stderr.write(out)
 
 def execute_test(test_function=None,
                  supported_fmts=[], supported_oses=['linux'],

From e6067a950c44809bbeeb340b2595a5ab844635fd Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Tue, 17 Sep 2019 11:20:00 +0200
Subject: [PATCH 03/69] iotests: Use case_skip() in skip_if_unsupported()

skip_if_unsupported() should use the stronger variant case_skip(),
because this allows it to be used even with setUp() (in a meaningful
way).

In the process, make it explicit what we expect the first argument of
the func_wrapper to be (namely something derived of QMPTestCase).

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190917092004.999-4-mreitz@redhat.com
Reviewed-by: Andrey Shinkevich <andrey.shinkevich@virtuozzo.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/iotests.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
index 6d5de6e504..bd867d7e02 100644
--- a/tests/qemu-iotests/iotests.py
+++ b/tests/qemu-iotests/iotests.py
@@ -929,14 +929,14 @@ def skip_if_unsupported(required_formats=[], read_only=False):
     '''Skip Test Decorator
        Runs the test if all the required formats are whitelisted'''
     def skip_test_decorator(func):
-        def func_wrapper(*args, **kwargs):
+        def func_wrapper(test_case: QMPTestCase, *args, **kwargs):
             usf_list = list(set(required_formats) -
                             set(supported_formats(read_only)))
             if usf_list:
-                case_notrun('{}: formats {} are not whitelisted'.format(
-                    args[0], usf_list))
+                test_case.case_skip('{}: formats {} are not whitelisted'.format(
+                    test_case, usf_list))
             else:
-                return func(*args, **kwargs)
+                return func(test_case, *args, **kwargs)
         return func_wrapper
     return skip_test_decorator
 

From 7448be831a6b3a3a7f91d873a52bb2d1ce0bd3d9 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Tue, 17 Sep 2019 11:20:01 +0200
Subject: [PATCH 04/69] iotests: Let skip_if_unsupported accept a function

This lets tests use skip_if_unsupported() with a potentially variable
list of required formats.

Suggested-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190917092004.999-5-mreitz@redhat.com
Reviewed-by: Andrey Shinkevich <andrey.shinkevich@virtuozzo.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/iotests.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
index bd867d7e02..936d33df61 100644
--- a/tests/qemu-iotests/iotests.py
+++ b/tests/qemu-iotests/iotests.py
@@ -930,8 +930,12 @@ def skip_if_unsupported(required_formats=[], read_only=False):
        Runs the test if all the required formats are whitelisted'''
     def skip_test_decorator(func):
         def func_wrapper(test_case: QMPTestCase, *args, **kwargs):
-            usf_list = list(set(required_formats) -
-                            set(supported_formats(read_only)))
+            if callable(required_formats):
+                fmts = required_formats(test_case)
+            else:
+                fmts = required_formats
+
+            usf_list = list(set(fmts) - set(supported_formats(read_only)))
             if usf_list:
                 test_case.case_skip('{}: formats {} are not whitelisted'.format(
                     test_case, usf_list))

From a377dd5170f2327cda746145027540dc823009fb Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Tue, 17 Sep 2019 11:20:02 +0200
Subject: [PATCH 05/69] iotests: Test driver whitelisting in 093

null-aio may not be whitelisted.  Skip all test cases that require it.

(And skip the whole test if null-co is not whitelisted.)

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190917092004.999-6-mreitz@redhat.com
Reviewed-by: Andrey Shinkevich <andrey.shinkevich@virtuozzo.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/093 | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/tests/qemu-iotests/093 b/tests/qemu-iotests/093
index 50c1e7f2ec..f03fa24a07 100755
--- a/tests/qemu-iotests/093
+++ b/tests/qemu-iotests/093
@@ -24,7 +24,7 @@ import iotests
 nsec_per_sec = 1000000000
 
 class ThrottleTestCase(iotests.QMPTestCase):
-    test_img = "null-aio://"
+    test_driver = "null-aio"
     max_drives = 3
 
     def blockstats(self, device):
@@ -35,10 +35,14 @@ class ThrottleTestCase(iotests.QMPTestCase):
                 return stat['rd_bytes'], stat['rd_operations'], stat['wr_bytes'], stat['wr_operations']
         raise Exception("Device not found for blockstats: %s" % device)
 
+    def required_drivers(self):
+        return [self.test_driver]
+
+    @iotests.skip_if_unsupported(required_drivers)
     def setUp(self):
         self.vm = iotests.VM()
         for i in range(0, self.max_drives):
-            self.vm.add_drive(self.test_img, "file.read-zeroes=on")
+            self.vm.add_drive(self.test_driver + "://", "file.read-zeroes=on")
         self.vm.launch()
 
     def tearDown(self):
@@ -264,7 +268,7 @@ class ThrottleTestCase(iotests.QMPTestCase):
         self.assertEqual(self.blockstats('drive1')[0], 4096)
 
 class ThrottleTestCoroutine(ThrottleTestCase):
-    test_img = "null-co://"
+    test_driver = "null-co"
 
 class ThrottleTestGroupNames(iotests.QMPTestCase):
     max_drives = 3
@@ -425,4 +429,6 @@ class ThrottleTestRemovableMedia(iotests.QMPTestCase):
 
 
 if __name__ == '__main__':
+    if 'null-co' not in iotests.supported_formats():
+        iotests.notrun('null-co driver support missing')
     iotests.main(supported_fmts=["raw"])

From 753b31b5f3248be68f2e02b08b4796358b5e7603 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Tue, 17 Sep 2019 11:20:03 +0200
Subject: [PATCH 06/69] iotests: Test driver whitelisting in 136

null-aio may not be whitelisted.  Skip all test cases that require it.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190917092004.999-7-mreitz@redhat.com
Reviewed-by: Andrey Shinkevich <andrey.shinkevich@virtuozzo.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/136 | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/tests/qemu-iotests/136 b/tests/qemu-iotests/136
index a46a7b7630..012ea111ac 100755
--- a/tests/qemu-iotests/136
+++ b/tests/qemu-iotests/136
@@ -30,7 +30,7 @@ bad_offset = bad_sector * 512
 blkdebug_file = os.path.join(iotests.test_dir, 'blkdebug.conf')
 
 class BlockDeviceStatsTestCase(iotests.QMPTestCase):
-    test_img = "null-aio://"
+    test_driver = "null-aio"
     total_rd_bytes = 0
     total_rd_ops = 0
     total_wr_bytes = 0
@@ -67,6 +67,10 @@ sector = "%d"
 ''' % (bad_sector, bad_sector))
         file.close()
 
+    def required_drivers(self):
+        return [self.test_driver]
+
+    @iotests.skip_if_unsupported(required_drivers)
     def setUp(self):
         drive_args = []
         drive_args.append("stats-intervals.0=%d" % interval_length)
@@ -76,8 +80,8 @@ sector = "%d"
                           (self.account_failed and "on" or "off"))
         drive_args.append("file.image.read-zeroes=on")
         self.create_blkdebug_file()
-        self.vm = iotests.VM().add_drive('blkdebug:%s:%s' %
-                                         (blkdebug_file, self.test_img),
+        self.vm = iotests.VM().add_drive('blkdebug:%s:%s://' %
+                                         (blkdebug_file, self.test_driver),
                                          ','.join(drive_args))
         self.vm.launch()
         # Set an initial value for the clock
@@ -337,7 +341,9 @@ class BlockDeviceStatsTestAccountBoth(BlockDeviceStatsTestCase):
     account_failed = True
 
 class BlockDeviceStatsTestCoroutine(BlockDeviceStatsTestCase):
-    test_img = "null-co://"
+    test_driver = "null-co"
 
 if __name__ == '__main__':
+    if 'null-co' not in iotests.supported_formats():
+        iotests.notrun('null-co driver support missing')
     iotests.main(supported_fmts=["raw"])

From 767de537b1268d20dca363a74ba8c8931212d243 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Tue, 17 Sep 2019 11:20:04 +0200
Subject: [PATCH 07/69] iotests: Cache supported_formats()

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Message-id: 20190917092004.999-8-mreitz@redhat.com
Reviewed-by: Andrey Shinkevich <andrey.shinkevich@virtuozzo.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/iotests.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
index 936d33df61..a4a2238b42 100644
--- a/tests/qemu-iotests/iotests.py
+++ b/tests/qemu-iotests/iotests.py
@@ -921,9 +921,17 @@ def qemu_pipe(*args):
 def supported_formats(read_only=False):
     '''Set 'read_only' to True to check ro-whitelist
        Otherwise, rw-whitelist is checked'''
-    format_message = qemu_pipe("-drive", "format=help")
-    line = 1 if read_only else 0
-    return format_message.splitlines()[line].split(":")[1].split()
+
+    if not hasattr(supported_formats, "formats"):
+        supported_formats.formats = {}
+
+    if read_only not in supported_formats.formats:
+        format_message = qemu_pipe("-drive", "format=help")
+        line = 1 if read_only else 0
+        supported_formats.formats[read_only] = \
+            format_message.splitlines()[line].split(":")[1].split()
+
+    return supported_formats.formats[read_only]
 
 def skip_if_unsupported(required_formats=[], read_only=False):
     '''Skip Test Decorator

From fed33bd175f663cc8c13f8a490a4f35a19756cfe Mon Sep 17 00:00:00 2001
From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Date: Fri, 11 Oct 2019 12:07:07 +0300
Subject: [PATCH 08/69] hbitmap: handle set/reset with zero length

Passing zero length to these functions leads to unpredicted results.
Zero-length set/reset may occur in active-mirror, on zero-length write
(which is unlikely, but not guaranteed to never happen).

Let's just do nothing on zero-length request.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-id: 20191011090711.19940-2-vsementsov@virtuozzo.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Cc: qemu-stable@nongnu.org
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 util/hbitmap.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/util/hbitmap.c b/util/hbitmap.c
index 66db87c6ff..242c6e519c 100644
--- a/util/hbitmap.c
+++ b/util/hbitmap.c
@@ -387,6 +387,10 @@ void hbitmap_set(HBitmap *hb, uint64_t start, uint64_t count)
     uint64_t first, n;
     uint64_t last = start + count - 1;
 
+    if (count == 0) {
+        return;
+    }
+
     trace_hbitmap_set(hb, start, count,
                       start >> hb->granularity, last >> hb->granularity);
 
@@ -478,6 +482,10 @@ void hbitmap_reset(HBitmap *hb, uint64_t start, uint64_t count)
     uint64_t last = start + count - 1;
     uint64_t gran = 1ULL << hb->granularity;
 
+    if (count == 0) {
+        return;
+    }
+
     assert(QEMU_IS_ALIGNED(start, gran));
     assert(QEMU_IS_ALIGNED(count, gran) || (start + count == hb->orig_size));
 

From 5c511ac375a859dd12a4c82b56549791570442d3 Mon Sep 17 00:00:00 2001
From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Date: Fri, 11 Oct 2019 12:07:08 +0300
Subject: [PATCH 09/69] block/mirror: simplify do_sync_target_write

do_sync_target_write is called from bdrv_mirror_top_do_write after
write/discard operation, all inside active_write/active_write_settle
protecting us from mirror iteration. So the whole area is dirty for
sure, no reason to examine dirty bitmap.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 20191011090711.19940-3-vsementsov@virtuozzo.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/mirror.c | 103 +++++++++++++++----------------------------------
 1 file changed, 32 insertions(+), 71 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index a6c50caea4..351faf9367 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -1181,84 +1181,45 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod method,
                      uint64_t offset, uint64_t bytes,
                      QEMUIOVector *qiov, int flags)
 {
-    QEMUIOVector target_qiov;
-    uint64_t dirty_offset = offset;
-    uint64_t dirty_bytes;
+    int ret;
 
-    if (qiov) {
-        qemu_iovec_init(&target_qiov, qiov->niov);
+    bdrv_reset_dirty_bitmap(job->dirty_bitmap, offset, bytes);
+
+    job_progress_increase_remaining(&job->common.job, bytes);
+
+    switch (method) {
+    case MIRROR_METHOD_COPY:
+        ret = blk_co_pwritev(job->target, offset, bytes, qiov, flags);
+        break;
+
+    case MIRROR_METHOD_ZERO:
+        assert(!qiov);
+        ret = blk_co_pwrite_zeroes(job->target, offset, bytes, flags);
+        break;
+
+    case MIRROR_METHOD_DISCARD:
+        assert(!qiov);
+        ret = blk_co_pdiscard(job->target, offset, bytes);
+        break;
+
+    default:
+        abort();
     }
 
-    while (true) {
-        bool valid_area;
-        int ret;
+    if (ret >= 0) {
+        job_progress_update(&job->common.job, bytes);
+    } else {
+        BlockErrorAction action;
 
-        bdrv_dirty_bitmap_lock(job->dirty_bitmap);
-        dirty_bytes = MIN(offset + bytes - dirty_offset, INT_MAX);
-        valid_area = bdrv_dirty_bitmap_next_dirty_area(job->dirty_bitmap,
-                                                       &dirty_offset,
-                                                       &dirty_bytes);
-        if (!valid_area) {
-            bdrv_dirty_bitmap_unlock(job->dirty_bitmap);
-            break;
-        }
+        bdrv_set_dirty_bitmap(job->dirty_bitmap, offset, bytes);
+        job->actively_synced = false;
 
-        bdrv_reset_dirty_bitmap_locked(job->dirty_bitmap,
-                                       dirty_offset, dirty_bytes);
-        bdrv_dirty_bitmap_unlock(job->dirty_bitmap);
-
-        job_progress_increase_remaining(&job->common.job, dirty_bytes);
-
-        assert(dirty_offset - offset <= SIZE_MAX);
-        if (qiov) {
-            qemu_iovec_reset(&target_qiov);
-            qemu_iovec_concat(&target_qiov, qiov,
-                              dirty_offset - offset, dirty_bytes);
-        }
-
-        switch (method) {
-        case MIRROR_METHOD_COPY:
-            ret = blk_co_pwritev(job->target, dirty_offset, dirty_bytes,
-                                 qiov ? &target_qiov : NULL, flags);
-            break;
-
-        case MIRROR_METHOD_ZERO:
-            assert(!qiov);
-            ret = blk_co_pwrite_zeroes(job->target, dirty_offset, dirty_bytes,
-                                       flags);
-            break;
-
-        case MIRROR_METHOD_DISCARD:
-            assert(!qiov);
-            ret = blk_co_pdiscard(job->target, dirty_offset, dirty_bytes);
-            break;
-
-        default:
-            abort();
-        }
-
-        if (ret >= 0) {
-            job_progress_update(&job->common.job, dirty_bytes);
-        } else {
-            BlockErrorAction action;
-
-            bdrv_set_dirty_bitmap(job->dirty_bitmap, dirty_offset, dirty_bytes);
-            job->actively_synced = false;
-
-            action = mirror_error_action(job, false, -ret);
-            if (action == BLOCK_ERROR_ACTION_REPORT) {
-                if (!job->ret) {
-                    job->ret = ret;
-                }
-                break;
+        action = mirror_error_action(job, false, -ret);
+        if (action == BLOCK_ERROR_ACTION_REPORT) {
+            if (!job->ret) {
+                job->ret = ret;
             }
         }
-
-        dirty_offset += dirty_bytes;
-    }
-
-    if (qiov) {
-        qemu_iovec_destroy(&target_qiov);
     }
 }
 

From b30168647fea0fc173a0e244d2943fc5575ca511 Mon Sep 17 00:00:00 2001
From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Date: Fri, 11 Oct 2019 12:07:09 +0300
Subject: [PATCH 10/69] block/block-backend: add blk_co_pwritev_part

Add blk write function with qiov_offset parameter. It's needed for the
following commit.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 20191011090711.19940-4-vsementsov@virtuozzo.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/block-backend.c          | 17 +++++++++++++----
 include/sysemu/block-backend.h |  4 ++++
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index eb22ff306e..912c50678d 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1178,9 +1178,10 @@ int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
     return ret;
 }
 
-int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
-                                unsigned int bytes, QEMUIOVector *qiov,
-                                BdrvRequestFlags flags)
+int coroutine_fn blk_co_pwritev_part(BlockBackend *blk, int64_t offset,
+                                     unsigned int bytes,
+                                     QEMUIOVector *qiov, size_t qiov_offset,
+                                     BdrvRequestFlags flags)
 {
     int ret;
     BlockDriverState *bs;
@@ -1207,11 +1208,19 @@ int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
         flags |= BDRV_REQ_FUA;
     }
 
-    ret = bdrv_co_pwritev(blk->root, offset, bytes, qiov, flags);
+    ret = bdrv_co_pwritev_part(blk->root, offset, bytes, qiov, qiov_offset,
+                               flags);
     bdrv_dec_in_flight(bs);
     return ret;
 }
 
+int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
+                                unsigned int bytes, QEMUIOVector *qiov,
+                                BdrvRequestFlags flags)
+{
+    return blk_co_pwritev_part(blk, offset, bytes, qiov, 0, flags);
+}
+
 typedef struct BlkRwCo {
     BlockBackend *blk;
     int64_t offset;
diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
index 368d53af77..73f2cef7fe 100644
--- a/include/sysemu/block-backend.h
+++ b/include/sysemu/block-backend.h
@@ -121,6 +121,10 @@ void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops, void *opaque);
 int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
                                unsigned int bytes, QEMUIOVector *qiov,
                                BdrvRequestFlags flags);
+int coroutine_fn blk_co_pwritev_part(BlockBackend *blk, int64_t offset,
+                                     unsigned int bytes,
+                                     QEMUIOVector *qiov, size_t qiov_offset,
+                                     BdrvRequestFlags flags);
 int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
                                unsigned int bytes, QEMUIOVector *qiov,
                                BdrvRequestFlags flags);

From dbdf699cad505be6ba2d4c93066f1dd7aff15aeb Mon Sep 17 00:00:00 2001
From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Date: Fri, 11 Oct 2019 12:07:10 +0300
Subject: [PATCH 11/69] block/mirror: support unaligned write in active mirror

Prior 9adc1cb49af8d do_sync_target_write had a bug: it reset aligned-up
region in the dirty bitmap, which means that we may not copy some bytes
and assume them copied, which actually leads to producing corrupted
target.

So 9adc1cb49af8d forced dirty bitmap granularity to be
request_alignment for mirror-top filter, so we are not working with
unaligned requests. However forcing large alignment obviously decreases
performance of unaligned requests.

This commit provides another solution for the problem: if unaligned
padding is already dirty, we can safely ignore it, as
1. It's dirty, it will be copied by mirror_iteration anyway
2. It's dirty, so skipping it now we don't increase dirtiness of the
   bitmap and therefore don't damage "synchronicity" of the
   write-blocking mirror.

If unaligned padding is not dirty, we just write it, no reason to touch
dirty bitmap if we succeed (on failure we'll set the whole region
ofcourse, but we loss "synchronicity" on failure anyway).

Note: we need to disable dirty_bitmap, otherwise we will not be able to
see in do_sync_target_write bitmap state before current operation. We
may of course check dirty bitmap before the operation in
bdrv_mirror_top_do_write and remember it, but we don't need active
dirty bitmap for write-blocking mirror anyway.

New code-path is unused until the following commit reverts
9adc1cb49af8d.

Suggested-by: Denis V. Lunev <den@openvz.org>
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-id: 20191011090711.19940-5-vsementsov@virtuozzo.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/mirror.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 68 insertions(+), 3 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index 351faf9367..11d4d66f43 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -1182,14 +1182,67 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod method,
                      QEMUIOVector *qiov, int flags)
 {
     int ret;
+    size_t qiov_offset = 0;
+    int64_t bitmap_offset, bitmap_end;
 
-    bdrv_reset_dirty_bitmap(job->dirty_bitmap, offset, bytes);
+    if (!QEMU_IS_ALIGNED(offset, job->granularity) &&
+        bdrv_dirty_bitmap_get(job->dirty_bitmap, offset))
+    {
+            /*
+             * Dirty unaligned padding: ignore it.
+             *
+             * Reasoning:
+             * 1. If we copy it, we can't reset corresponding bit in
+             *    dirty_bitmap as there may be some "dirty" bytes still not
+             *    copied.
+             * 2. It's already dirty, so skipping it we don't diverge mirror
+             *    progress.
+             *
+             * Note, that because of this, guest write may have no contribution
+             * into mirror converge, but that's not bad, as we have background
+             * process of mirroring. If under some bad circumstances (high guest
+             * IO load) background process starve, we will not converge anyway,
+             * even if each write will contribute, as guest is not guaranteed to
+             * rewrite the whole disk.
+             */
+            qiov_offset = QEMU_ALIGN_UP(offset, job->granularity) - offset;
+            if (bytes <= qiov_offset) {
+                /* nothing to do after shrink */
+                return;
+            }
+            offset += qiov_offset;
+            bytes -= qiov_offset;
+    }
+
+    if (!QEMU_IS_ALIGNED(offset + bytes, job->granularity) &&
+        bdrv_dirty_bitmap_get(job->dirty_bitmap, offset + bytes - 1))
+    {
+        uint64_t tail = (offset + bytes) % job->granularity;
+
+        if (bytes <= tail) {
+            /* nothing to do after shrink */
+            return;
+        }
+        bytes -= tail;
+    }
+
+    /*
+     * Tails are either clean or shrunk, so for bitmap resetting
+     * we safely align the range down.
+     */
+    bitmap_offset = QEMU_ALIGN_UP(offset, job->granularity);
+    bitmap_end = QEMU_ALIGN_DOWN(offset + bytes, job->granularity);
+    if (bitmap_offset < bitmap_end) {
+        bdrv_reset_dirty_bitmap(job->dirty_bitmap, bitmap_offset,
+                                bitmap_end - bitmap_offset);
+    }
 
     job_progress_increase_remaining(&job->common.job, bytes);
 
     switch (method) {
     case MIRROR_METHOD_COPY:
-        ret = blk_co_pwritev(job->target, offset, bytes, qiov, flags);
+        ret = blk_co_pwritev_part(job->target, offset, bytes,
+                                  qiov, qiov_offset, flags);
         break;
 
     case MIRROR_METHOD_ZERO:
@@ -1211,7 +1264,16 @@ do_sync_target_write(MirrorBlockJob *job, MirrorMethod method,
     } else {
         BlockErrorAction action;
 
-        bdrv_set_dirty_bitmap(job->dirty_bitmap, offset, bytes);
+        /*
+         * We failed, so we should mark dirty the whole area, aligned up.
+         * Note that we don't care about shrunk tails if any: they were dirty
+         * at function start, and they must be still dirty, as we've locked
+         * the region for in-flight op.
+         */
+        bitmap_offset = QEMU_ALIGN_DOWN(offset, job->granularity);
+        bitmap_end = QEMU_ALIGN_UP(offset + bytes, job->granularity);
+        bdrv_set_dirty_bitmap(job->dirty_bitmap, bitmap_offset,
+                              bitmap_end - bitmap_offset);
         job->actively_synced = false;
 
         action = mirror_error_action(job, false, -ret);
@@ -1618,6 +1680,9 @@ static BlockJob *mirror_start_job(
     if (!s->dirty_bitmap) {
         goto fail;
     }
+    if (s->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING) {
+        bdrv_disable_dirty_bitmap(s->dirty_bitmap);
+    }
 
     ret = block_job_add_bdrv(&s->common, "source", bs, 0,
                              BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE |

From 994b44ab2025c9662f238867e871c8833fd52561 Mon Sep 17 00:00:00 2001
From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Date: Fri, 11 Oct 2019 12:07:11 +0300
Subject: [PATCH 12/69] Revert "mirror: Only mirror granularity-aligned chunks"

This reverts commit 9adc1cb49af8d4e54f57980b1eed5c0a4b2dafa6.
    "mirror: Only mirror granularity-aligned chunks"

Since previous commit unaligned chunks are supported by
do_sync_target_write.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 20191011090711.19940-6-vsementsov@virtuozzo.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/mirror.c | 29 -----------------------------
 1 file changed, 29 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index 11d4d66f43..454365ce00 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -1488,15 +1488,6 @@ static void bdrv_mirror_top_child_perm(BlockDriverState *bs, BdrvChild *c,
     *nshared = BLK_PERM_ALL;
 }
 
-static void bdrv_mirror_top_refresh_limits(BlockDriverState *bs, Error **errp)
-{
-    MirrorBDSOpaque *s = bs->opaque;
-
-    if (s && s->job && s->job->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING) {
-        bs->bl.request_alignment = s->job->granularity;
-    }
-}
-
 /* Dummy node that provides consistent read to its users without requiring it
  * from its backing file and that allows writes on the backing file chain. */
 static BlockDriver bdrv_mirror_top = {
@@ -1509,7 +1500,6 @@ static BlockDriver bdrv_mirror_top = {
     .bdrv_co_block_status       = bdrv_co_block_status_from_backing,
     .bdrv_refresh_filename      = bdrv_mirror_top_refresh_filename,
     .bdrv_child_perm            = bdrv_mirror_top_child_perm,
-    .bdrv_refresh_limits        = bdrv_mirror_top_refresh_limits,
 };
 
 static BlockJob *mirror_start_job(
@@ -1657,25 +1647,6 @@ static BlockJob *mirror_start_job(
         s->should_complete = true;
     }
 
-    /*
-     * Must be called before we start tracking writes, but after
-     *
-     *     ((MirrorBlockJob *)
-     *         ((MirrorBDSOpaque *)
-     *             mirror_top_bs->opaque
-     *         )->job
-     *     )->copy_mode
-     *
-     * has the correct value.
-     * (We start tracking writes as of the following
-     * bdrv_create_dirty_bitmap() call.)
-     */
-    bdrv_refresh_limits(mirror_top_bs, &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
-        goto fail;
-    }
-
     s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
     if (!s->dirty_bitmap) {
         goto fail;

From c7df3f19d22dc3c13f7ca64d7dc15a4389ee63b8 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Thu, 17 Oct 2019 15:31:33 +0200
Subject: [PATCH 13/69] iotests: Introduce $SOCK_DIR

Unix sockets generally have a maximum path length.  Depending on your
$TEST_DIR, it may be exceeded and then all tests that create and use
Unix sockets there may fail.

Circumvent this by adding a new scratch directory specifically for
Unix socket files.  It defaults to a temporary directory (mktemp -d)
that is completely removed after the iotests are done.

(By default, mktemp -d creates a /tmp/tmp.XXXXXXXXXX directory, which
should be short enough for our use cases.)

Use mkdir -p to create the directory (because it seems right), and do
the same for $TEST_DIR (because there is no reason for that to be
created in any different way).

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20191017133155.5327-2-mreitz@redhat.com
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/check | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/tests/qemu-iotests/check b/tests/qemu-iotests/check
index 588c453a94..71fe38834e 100755
--- a/tests/qemu-iotests/check
+++ b/tests/qemu-iotests/check
@@ -97,6 +97,7 @@ IMGFMT        -- $FULL_IMGFMT_DETAILS
 IMGPROTO      -- $IMGPROTO
 PLATFORM      -- $FULL_HOST_DETAILS
 TEST_DIR      -- $TEST_DIR
+SOCK_DIR      -- $SOCK_DIR
 SOCKET_SCM_HELPER -- $SOCKET_SCM_HELPER
 
 EOF
@@ -116,10 +117,14 @@ set_prog_path()
 if [ -z "$TEST_DIR" ]; then
         TEST_DIR=$PWD/scratch
 fi
+mkdir -p "$TEST_DIR" || _init_error 'Failed to create TEST_DIR'
 
-if [ ! -e "$TEST_DIR" ]; then
-        mkdir "$TEST_DIR"
+tmp_sock_dir=false
+if [ -z "$SOCK_DIR" ]; then
+    SOCK_DIR=$(mktemp -d)
+    tmp_sock_dir=true
 fi
+mkdir -p "$SOCK_DIR" || _init_error 'Failed to create SOCK_DIR'
 
 diff="diff -u"
 verbose=false
@@ -534,6 +539,7 @@ if [ -z "$SAMPLE_IMG_DIR" ]; then
 fi
 
 export TEST_DIR
+export SOCK_DIR
 export SAMPLE_IMG_DIR
 
 if [ -s $tmp.list ]
@@ -716,6 +722,11 @@ END        { if (NR > 0) {
     rm -f "${TEST_DIR}"/*.out "${TEST_DIR}"/*.err "${TEST_DIR}"/*.time
     rm -f "${TEST_DIR}"/check.pid "${TEST_DIR}"/check.sts
     rm -f $tmp.*
+
+    if $tmp_sock_dir
+    then
+        rm -rf "$SOCK_DIR"
+    fi
 }
 
 trap "_wrapup; exit \$status" 0 1 2 3 15

From 32558ce7a43dc7e8e3d065ac26eb2cdbfa34b36b Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Thu, 17 Oct 2019 15:31:34 +0200
Subject: [PATCH 14/69] iotests.py: Store socket files in $SOCK_DIR

iotests.py itself does not store socket files, but machine.py and
qtest.py do.  iotests.py needs to pass the respective path to them, and
they need to adhere to it.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Message-id: 20191017133155.5327-3-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 python/qemu/machine.py        | 15 ++++++++++++---
 python/qemu/qtest.py          |  9 ++++++---
 tests/qemu-iotests/iotests.py |  4 +++-
 3 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/python/qemu/machine.py b/python/qemu/machine.py
index 128a3d1dc2..2024e8b1b1 100644
--- a/python/qemu/machine.py
+++ b/python/qemu/machine.py
@@ -71,7 +71,7 @@ class QEMUMachine(object):
 
     def __init__(self, binary, args=None, wrapper=None, name=None,
                  test_dir="/var/tmp", monitor_address=None,
-                 socket_scm_helper=None):
+                 socket_scm_helper=None, sock_dir=None):
         '''
         Initialize a QEMUMachine
 
@@ -90,6 +90,8 @@ class QEMUMachine(object):
             wrapper = []
         if name is None:
             name = "qemu-%d" % os.getpid()
+        if sock_dir is None:
+            sock_dir = test_dir
         self._name = name
         self._monitor_address = monitor_address
         self._vm_monitor = None
@@ -106,12 +108,14 @@ class QEMUMachine(object):
         self._qemu_full_args = None
         self._test_dir = test_dir
         self._temp_dir = None
+        self._sock_dir = sock_dir
         self._launched = False
         self._machine = None
         self._console_set = False
         self._console_device_type = None
         self._console_address = None
         self._console_socket = None
+        self._remove_files = []
 
         # just in case logging wasn't configured by the main script:
         logging.basicConfig()
@@ -236,8 +240,9 @@ class QEMUMachine(object):
         if self._machine is not None:
             args.extend(['-machine', self._machine])
         if self._console_set:
-            self._console_address = os.path.join(self._temp_dir,
+            self._console_address = os.path.join(self._sock_dir,
                                                  self._name + "-console.sock")
+            self._remove_files.append(self._console_address)
             chardev = ('socket,id=console,path=%s,server,nowait' %
                        self._console_address)
             args.extend(['-chardev', chardev])
@@ -253,8 +258,9 @@ class QEMUMachine(object):
         if self._monitor_address is not None:
             self._vm_monitor = self._monitor_address
         else:
-            self._vm_monitor = os.path.join(self._temp_dir,
+            self._vm_monitor = os.path.join(self._sock_dir,
                                             self._name + "-monitor.sock")
+            self._remove_files.append(self._vm_monitor)
         self._qemu_log_path = os.path.join(self._temp_dir, self._name + ".log")
         self._qemu_log_file = open(self._qemu_log_path, 'wb')
 
@@ -279,6 +285,9 @@ class QEMUMachine(object):
             shutil.rmtree(self._temp_dir)
             self._temp_dir = None
 
+        while len(self._remove_files) > 0:
+            self._remove_if_exists(self._remove_files.pop())
+
     def launch(self):
         """
         Launch the VM and make sure we cleanup and expose the
diff --git a/python/qemu/qtest.py b/python/qemu/qtest.py
index 3f1d2cb325..d24ad04256 100644
--- a/python/qemu/qtest.py
+++ b/python/qemu/qtest.py
@@ -84,14 +84,17 @@ class QEMUQtestMachine(QEMUMachine):
     '''A QEMU VM'''
 
     def __init__(self, binary, args=None, name=None, test_dir="/var/tmp",
-                 socket_scm_helper=None):
+                 socket_scm_helper=None, sock_dir=None):
         if name is None:
             name = "qemu-%d" % os.getpid()
+        if sock_dir is None:
+            sock_dir = test_dir
         super(QEMUQtestMachine,
               self).__init__(binary, args, name=name, test_dir=test_dir,
-                             socket_scm_helper=socket_scm_helper)
+                             socket_scm_helper=socket_scm_helper,
+                             sock_dir=sock_dir)
         self._qtest = None
-        self._qtest_path = os.path.join(test_dir, name + "-qtest.sock")
+        self._qtest_path = os.path.join(sock_dir, name + "-qtest.sock")
 
     def _base_args(self):
         args = super(QEMUQtestMachine, self)._base_args()
diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
index a4a2238b42..9bfcef1f14 100644
--- a/tests/qemu-iotests/iotests.py
+++ b/tests/qemu-iotests/iotests.py
@@ -57,6 +57,7 @@ qemu_opts = os.environ.get('QEMU_OPTIONS', '').strip().split(' ')
 imgfmt = os.environ.get('IMGFMT', 'raw')
 imgproto = os.environ.get('IMGPROTO', 'file')
 test_dir = os.environ.get('TEST_DIR')
+sock_dir = os.environ.get('SOCK_DIR')
 output_dir = os.environ.get('OUTPUT_DIR', '.')
 cachemode = os.environ.get('CACHEMODE')
 qemu_default_machine = os.environ.get('QEMU_DEFAULT_MACHINE')
@@ -456,7 +457,8 @@ class VM(qtest.QEMUQtestMachine):
         name = "qemu%s-%d" % (path_suffix, os.getpid())
         super(VM, self).__init__(qemu_prog, qemu_opts, name=name,
                                  test_dir=test_dir,
-                                 socket_scm_helper=socket_scm_helper)
+                                 socket_scm_helper=socket_scm_helper,
+                                 sock_dir=sock_dir)
         self._num_drives = 0
 
     def add_object(self, opts):

From 93b78ea5f698a4f547159fe3700981b2b8c13e78 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Thu, 17 Oct 2019 15:31:35 +0200
Subject: [PATCH 15/69] iotests.py: Add @base_dir to FilePaths etc.

Specifying this optional parameter allows creating temporary files in
other directories than the test_dir; for example in sock_dir.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20191017133155.5327-4-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/iotests.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
index 9bfcef1f14..075f4739da 100644
--- a/tests/qemu-iotests/iotests.py
+++ b/tests/qemu-iotests/iotests.py
@@ -386,10 +386,10 @@ class FilePaths(object):
             qemu_img('create', img_path, '1G')
         # migration_sock_path is automatically deleted
     """
-    def __init__(self, names):
+    def __init__(self, names, base_dir=test_dir):
         self.paths = []
         for name in names:
-            self.paths.append(os.path.join(test_dir, file_pattern(name)))
+            self.paths.append(os.path.join(base_dir, file_pattern(name)))
 
     def __enter__(self):
         return self.paths
@@ -406,8 +406,8 @@ class FilePath(FilePaths):
     """
     FilePath is a specialization of FilePaths that takes a single filename.
     """
-    def __init__(self, name):
-        super(FilePath, self).__init__([name])
+    def __init__(self, name, base_dir=test_dir):
+        super(FilePath, self).__init__([name], base_dir)
 
     def __enter__(self):
         return self.paths[0]
@@ -420,7 +420,7 @@ def file_path_remover():
             pass
 
 
-def file_path(*names):
+def file_path(*names, base_dir=test_dir):
     ''' Another way to get auto-generated filename that cleans itself up.
 
     Use is as simple as:
@@ -436,7 +436,7 @@ def file_path(*names):
     paths = []
     for name in names:
         filename = file_pattern(name)
-        path = os.path.join(test_dir, filename)
+        path = os.path.join(base_dir, filename)
         file_path_remover.paths.append(path)
         paths.append(path)
 

From dc48bfdf9f38d0a3efeb681161a7636f65dcc652 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Thu, 17 Oct 2019 15:31:36 +0200
Subject: [PATCH 16/69] iotests: Filter $SOCK_DIR

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20191017133155.5327-5-mreitz@redhat.com
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/common.filter | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/common.filter b/tests/qemu-iotests/common.filter
index 9f418b4881..0ee6042524 100644
--- a/tests/qemu-iotests/common.filter
+++ b/tests/qemu-iotests/common.filter
@@ -43,7 +43,8 @@ _filter_qom_path()
 # replace occurrences of the actual TEST_DIR value with TEST_DIR
 _filter_testdir()
 {
-    $SED -e "s#$TEST_DIR/#TEST_DIR/#g"
+    $SED -e "s#$TEST_DIR/#TEST_DIR/#g" \
+         -e "s#$SOCK_DIR/#SOCK_DIR/#g"
 }
 
 # replace occurrences of the actual IMGFMT value with IMGFMT
@@ -124,6 +125,7 @@ _filter_img_create()
     $SED -e "s#$REMOTE_TEST_DIR#TEST_DIR#g" \
         -e "s#$IMGPROTO:$TEST_DIR#TEST_DIR#g" \
         -e "s#$TEST_DIR#TEST_DIR#g" \
+        -e "s#$SOCK_DIR#SOCK_DIR#g" \
         -e "s#$IMGFMT#IMGFMT#g" \
         -e 's#nbd:127.0.0.1:10810#TEST_DIR/t.IMGFMT#g' \
         -e "s# encryption=off##g" \
@@ -160,6 +162,7 @@ _filter_img_info()
     $SED -e "s#$REMOTE_TEST_DIR#TEST_DIR#g" \
         -e "s#$IMGPROTO:$TEST_DIR#TEST_DIR#g" \
         -e "s#$TEST_DIR#TEST_DIR#g" \
+        -e "s#$SOCK_DIR#SOCK_DIR#g" \
         -e "s#$IMGFMT#IMGFMT#g" \
         -e 's#nbd://127.0.0.1:10810$#TEST_DIR/t.IMGFMT#g' \
         -e 's#json.*vdisk-id.*vxhs"}}#TEST_DIR/t.IMGFMT#' \
@@ -219,6 +222,7 @@ _filter_nbd()
     $SED -e '/nbd\/.*\.c:/d' \
         -e 's#127\.0\.0\.1:[0-9]*#127.0.0.1:PORT#g' \
         -e "s#?socket=$TEST_DIR#?socket=TEST_DIR#g" \
+        -e "s#?socket=$SOCK_DIR#?socket=SOCK_DIR#g" \
         -e 's#\(foo\|PORT/\?\|.sock\): Failed to .*$#\1#'
 }
 

From 5759322ab0414d277e6f340b2487f37d0f1f3c1e Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Thu, 17 Oct 2019 15:31:37 +0200
Subject: [PATCH 17/69] iotests: Let common.nbd create socket in $SOCK_DIR

In addition, drop the nbd_unix_socket assignment in 241 because it does
not really do anything.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20191017133155.5327-6-mreitz@redhat.com
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/241        | 2 --
 tests/qemu-iotests/common.nbd | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/qemu-iotests/241 b/tests/qemu-iotests/241
index 58b64ebf41..8dae8d39e4 100755
--- a/tests/qemu-iotests/241
+++ b/tests/qemu-iotests/241
@@ -23,8 +23,6 @@ echo "QA output created by $seq"
 
 status=1 # failure is the default!
 
-nbd_unix_socket=$TEST_DIR/test_qemu_nbd_socket
-
 _cleanup()
 {
     _cleanup_test_img
diff --git a/tests/qemu-iotests/common.nbd b/tests/qemu-iotests/common.nbd
index 24b01b60aa..a8cae8fe2c 100644
--- a/tests/qemu-iotests/common.nbd
+++ b/tests/qemu-iotests/common.nbd
@@ -19,7 +19,7 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-nbd_unix_socket="${TEST_DIR}/qemu-nbd.sock"
+nbd_unix_socket="${SOCK_DIR}/qemu-nbd.sock"
 nbd_tcp_addr="127.0.0.1"
 nbd_pid_file="${TEST_DIR}/qemu-nbd.pid"
 nbd_stderr_fifo="${TEST_DIR}/qemu-nbd.fifo"

From a7552b52331a9eedbc92ba75b47fbacb9f0575f1 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Thu, 17 Oct 2019 15:31:38 +0200
Subject: [PATCH 18/69] iotests/083: Create socket in $SOCK_DIR

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Message-id: 20191017133155.5327-7-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/083     |  6 +++---
 tests/qemu-iotests/083.out | 34 +++++++++++++++++-----------------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/tests/qemu-iotests/083 b/tests/qemu-iotests/083
index b270550d3e..10fdfc8ebb 100755
--- a/tests/qemu-iotests/083
+++ b/tests/qemu-iotests/083
@@ -28,7 +28,7 @@ status=1	# failure is the default!
 
 _cleanup()
 {
-	rm -f nbd.sock
+	rm -f "$SOCK_DIR/nbd.sock"
 	rm -f nbd-fault-injector.out
 	rm -f nbd-fault-injector.conf
 }
@@ -80,10 +80,10 @@ EOF
 	if [ "$proto" = "tcp" ]; then
 		nbd_addr="127.0.0.1:0"
 	else
-		nbd_addr="$TEST_DIR/nbd.sock"
+		nbd_addr="$SOCK_DIR/nbd.sock"
 	fi
 
-	rm -f "$TEST_DIR/nbd.sock"
+	rm -f "$SOCK_DIR/nbd.sock"
 
         echo > "$TEST_DIR/nbd-fault-injector.out"
 	$PYTHON nbd-fault-injector.py $extra_args "$nbd_addr" "$TEST_DIR/nbd-fault-injector.conf" >"$TEST_DIR/nbd-fault-injector.out" 2>&1 &
diff --git a/tests/qemu-iotests/083.out b/tests/qemu-iotests/083.out
index eee6dd1379..2090ee693c 100644
--- a/tests/qemu-iotests/083.out
+++ b/tests/qemu-iotests/083.out
@@ -110,43 +110,43 @@ read failed: Input/output error
 
 === Check disconnect before neg1 ===
 
-qemu-io: can't open device nbd+unix:///foo?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///foo?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect after neg1 ===
 
-qemu-io: can't open device nbd+unix:///foo?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///foo?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect 8 neg1 ===
 
-qemu-io: can't open device nbd+unix:///foo?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///foo?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect 16 neg1 ===
 
-qemu-io: can't open device nbd+unix:///foo?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///foo?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect before export ===
 
-qemu-io: can't open device nbd+unix:///foo?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///foo?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect after export ===
 
-qemu-io: can't open device nbd+unix:///foo?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///foo?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect 4 export ===
 
-qemu-io: can't open device nbd+unix:///foo?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///foo?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect 12 export ===
 
-qemu-io: can't open device nbd+unix:///foo?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///foo?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect 16 export ===
 
-qemu-io: can't open device nbd+unix:///foo?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///foo?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect before neg2 ===
 
-qemu-io: can't open device nbd+unix:///foo?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///foo?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect after neg2 ===
 
@@ -154,11 +154,11 @@ read failed: Input/output error
 
 === Check disconnect 8 neg2 ===
 
-qemu-io: can't open device nbd+unix:///foo?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///foo?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect 10 neg2 ===
 
-qemu-io: can't open device nbd+unix:///foo?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///foo?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect before request ===
 
@@ -195,23 +195,23 @@ read 512/512 bytes at offset 0
 
 === Check disconnect before neg-classic ===
 
-qemu-io: can't open device nbd+unix:///?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect 8 neg-classic ===
 
-qemu-io: can't open device nbd+unix:///?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect 16 neg-classic ===
 
-qemu-io: can't open device nbd+unix:///?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect 24 neg-classic ===
 
-qemu-io: can't open device nbd+unix:///?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect 28 neg-classic ===
 
-qemu-io: can't open device nbd+unix:///?socket=TEST_DIR/nbd.sock
+qemu-io: can't open device nbd+unix:///?socket=SOCK_DIR/nbd.sock
 
 === Check disconnect after neg-classic ===
 

From 46cabce6c2d30c1ff2f3c1a4178f786dcc8ed672 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Thu, 17 Oct 2019 15:31:39 +0200
Subject: [PATCH 19/69] iotests/140: Create socket in $SOCK_DIR

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Message-id: 20191017133155.5327-8-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/140     | 8 ++++----
 tests/qemu-iotests/140.out | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/qemu-iotests/140 b/tests/qemu-iotests/140
index b965b1dd5d..8d2ce5d9e3 100755
--- a/tests/qemu-iotests/140
+++ b/tests/qemu-iotests/140
@@ -34,7 +34,7 @@ _cleanup()
 {
     _cleanup_qemu
     _cleanup_test_img
-    rm -f "$TEST_DIR/nbd"
+    rm -f "$SOCK_DIR/nbd"
 }
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
@@ -69,7 +69,7 @@ _send_qemu_cmd $QEMU_HANDLE \
 _send_qemu_cmd $QEMU_HANDLE \
     "{ 'execute': 'nbd-server-start',
        'arguments': { 'addr': { 'type': 'unix',
-                                'data': { 'path': '$TEST_DIR/nbd' }}}}" \
+                                'data': { 'path': '$SOCK_DIR/nbd' }}}}" \
     'return'
 
 _send_qemu_cmd $QEMU_HANDLE \
@@ -78,7 +78,7 @@ _send_qemu_cmd $QEMU_HANDLE \
     'return'
 
 $QEMU_IO_PROG -f raw -r -c 'read -P 42 0 64k' \
-    "nbd+unix:///drv?socket=$TEST_DIR/nbd" 2>&1 \
+    "nbd+unix:///drv?socket=$SOCK_DIR/nbd" 2>&1 \
     | _filter_qemu_io | _filter_nbd
 
 _send_qemu_cmd $QEMU_HANDLE \
@@ -87,7 +87,7 @@ _send_qemu_cmd $QEMU_HANDLE \
     'return'
 
 $QEMU_IO_PROG -f raw -r -c close \
-    "nbd+unix:///drv?socket=$TEST_DIR/nbd" 2>&1 \
+    "nbd+unix:///drv?socket=$SOCK_DIR/nbd" 2>&1 \
     | _filter_qemu_io | _filter_nbd
 
 _send_qemu_cmd $QEMU_HANDLE \
diff --git a/tests/qemu-iotests/140.out b/tests/qemu-iotests/140.out
index 67fe44a3e3..2511eb7369 100644
--- a/tests/qemu-iotests/140.out
+++ b/tests/qemu-iotests/140.out
@@ -8,7 +8,7 @@ wrote 65536/65536 bytes at offset 0
 read 65536/65536 bytes at offset 0
 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 {"return": {}}
-qemu-io: can't open device nbd+unix:///drv?socket=TEST_DIR/nbd: Requested export not available
+qemu-io: can't open device nbd+unix:///drv?socket=SOCK_DIR/nbd: Requested export not available
 server reported: export 'drv' not present
 {"return": {}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}}

From 9a9c7c8f9834f18204a6586e6ddd902dab2b0bf9 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Thu, 17 Oct 2019 15:31:40 +0200
Subject: [PATCH 20/69] iotests/143: Create socket in $SOCK_DIR

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Message-id: 20191017133155.5327-9-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/143     | 6 +++---
 tests/qemu-iotests/143.out | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/qemu-iotests/143 b/tests/qemu-iotests/143
index 92249ac8da..f649b36195 100755
--- a/tests/qemu-iotests/143
+++ b/tests/qemu-iotests/143
@@ -29,7 +29,7 @@ status=1	# failure is the default!
 _cleanup()
 {
     _cleanup_qemu
-    rm -f "$TEST_DIR/nbd"
+    rm -f "$SOCK_DIR/nbd"
 }
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
@@ -51,12 +51,12 @@ _send_qemu_cmd $QEMU_HANDLE \
 _send_qemu_cmd $QEMU_HANDLE \
     "{ 'execute': 'nbd-server-start',
        'arguments': { 'addr': { 'type': 'unix',
-                                'data': { 'path': '$TEST_DIR/nbd' }}}}" \
+                                'data': { 'path': '$SOCK_DIR/nbd' }}}}" \
     'return'
 
 # This should just result in a client error, not in the server crashing
 $QEMU_IO_PROG -f raw -c quit \
-    "nbd+unix:///no_such_export?socket=$TEST_DIR/nbd" 2>&1 \
+    "nbd+unix:///no_such_export?socket=$SOCK_DIR/nbd" 2>&1 \
     | _filter_qemu_io | _filter_nbd
 
 _send_qemu_cmd $QEMU_HANDLE \
diff --git a/tests/qemu-iotests/143.out b/tests/qemu-iotests/143.out
index ee71b5aa42..037d34a409 100644
--- a/tests/qemu-iotests/143.out
+++ b/tests/qemu-iotests/143.out
@@ -1,7 +1,7 @@
 QA output created by 143
 {"return": {}}
 {"return": {}}
-qemu-io: can't open device nbd+unix:///no_such_export?socket=TEST_DIR/nbd: Requested export not available
+qemu-io: can't open device nbd+unix:///no_such_export?socket=SOCK_DIR/nbd: Requested export not available
 server reported: export 'no_such_export' not present
 {"return": {}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}}

From 610dffaa3948c762a1e287410bead51fc876c1a6 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Thu, 17 Oct 2019 15:31:41 +0200
Subject: [PATCH 21/69] iotests/147: Create socket in $SOCK_DIR

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Message-id: 20191017133155.5327-10-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/147 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/147 b/tests/qemu-iotests/147
index ab8480b9a4..03fc2fabcf 100755
--- a/tests/qemu-iotests/147
+++ b/tests/qemu-iotests/147
@@ -32,7 +32,7 @@ NBD_IPV6_PORT_START = NBD_PORT_END
 NBD_IPV6_PORT_END   = NBD_IPV6_PORT_START + 1024
 
 test_img = os.path.join(iotests.test_dir, 'test.img')
-unix_socket = os.path.join(iotests.test_dir, 'nbd.socket')
+unix_socket = os.path.join(iotests.sock_dir, 'nbd.socket')
 
 
 def flatten_sock_addr(crumpled_address):

From 14fa704577c9acbdd8c9e117349fededeed5d9c9 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Thu, 17 Oct 2019 15:31:42 +0200
Subject: [PATCH 22/69] iotests/181: Create socket in $SOCK_DIR

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Message-id: 20191017133155.5327-11-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/181 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/181 b/tests/qemu-iotests/181
index e317e63422..378c2899d1 100755
--- a/tests/qemu-iotests/181
+++ b/tests/qemu-iotests/181
@@ -26,7 +26,7 @@ echo "QA output created by $seq"
 
 status=1	# failure is the default!
 
-MIG_SOCKET="${TEST_DIR}/migrate"
+MIG_SOCKET="${SOCK_DIR}/migrate"
 
 _cleanup()
 {

From 748f831b2a4e6bcebd99b6fd02612b7d708e61dc Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Thu, 17 Oct 2019 15:31:43 +0200
Subject: [PATCH 23/69] iotests/182: Create socket in $SOCK_DIR

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Message-id: 20191017133155.5327-12-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/182 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/qemu-iotests/182 b/tests/qemu-iotests/182
index 7f494eb9bb..1ccb850055 100755
--- a/tests/qemu-iotests/182
+++ b/tests/qemu-iotests/182
@@ -31,7 +31,7 @@ _cleanup()
 {
     _cleanup_test_img
     rm -f "$TEST_IMG.overlay"
-    rm -f "$TEST_DIR/nbd.socket"
+    rm -f "$SOCK_DIR/nbd.socket"
 }
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
@@ -133,7 +133,7 @@ success_or_failure=y _send_qemu_cmd $QEMU_HANDLE \
           'addr': {
               'type': 'unix',
               'data': {
-                  'path': '$TEST_DIR/nbd.socket'
+                  'path': '$SOCK_DIR/nbd.socket'
               } } } }" \
     'return' \
     'error'

From 6ab72e5865529654cf357f56e3466247286bc3a7 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Thu, 17 Oct 2019 15:31:44 +0200
Subject: [PATCH 24/69] iotests/183: Create socket in $SOCK_DIR

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Message-id: 20191017133155.5327-13-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/183 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/183 b/tests/qemu-iotests/183
index 04fb344d08..bced83fae0 100755
--- a/tests/qemu-iotests/183
+++ b/tests/qemu-iotests/183
@@ -26,7 +26,7 @@ echo "QA output created by $seq"
 
 status=1 # failure is the default!
 
-MIG_SOCKET="${TEST_DIR}/migrate"
+MIG_SOCKET="${SOCK_DIR}/migrate"
 
 _cleanup()
 {

From 7310e0bd64026223cc74701bcd20d58937e3578e Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Thu, 17 Oct 2019 15:31:45 +0200
Subject: [PATCH 25/69] iotests/192: Create socket in $SOCK_DIR

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20191017133155.5327-14-mreitz@redhat.com
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/192     | 4 ++--
 tests/qemu-iotests/192.out | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/qemu-iotests/192 b/tests/qemu-iotests/192
index 034432272f..d2ba55dd90 100755
--- a/tests/qemu-iotests/192
+++ b/tests/qemu-iotests/192
@@ -31,7 +31,7 @@ _cleanup()
 {
     _cleanup_qemu
     _cleanup_test_img
-    rm -f "$TEST_DIR/nbd"
+    rm -f "$SOCK_DIR/nbd"
 }
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
@@ -66,7 +66,7 @@ else
     QEMU_COMM_TIMEOUT=1
 fi
 
-_send_qemu_cmd $h "nbd_server_start unix:$TEST_DIR/nbd" "(qemu)"
+_send_qemu_cmd $h "nbd_server_start unix:$SOCK_DIR/nbd" "(qemu)"
 _send_qemu_cmd $h "nbd_server_add -w drive0" "(qemu)"
 _send_qemu_cmd $h "q" "(qemu)"
 
diff --git a/tests/qemu-iotests/192.out b/tests/qemu-iotests/192.out
index 1e0be4c4d7..b9429dbe36 100644
--- a/tests/qemu-iotests/192.out
+++ b/tests/qemu-iotests/192.out
@@ -1,7 +1,7 @@
 QA output created by 192
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864
 QEMU X.Y.Z monitor - type 'help' for more information
-(qemu) nbd_server_start unix:TEST_DIR/nbd
+(qemu) nbd_server_start unix:SOCK_DIR/nbd
 (qemu) nbd_server_add -w drive0
 (qemu) q
 *** done

From 4b4d34f4f79f16f1135705161bb2c2991df9049e Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Thu, 17 Oct 2019 15:31:46 +0200
Subject: [PATCH 26/69] iotests/194: Create sockets in $SOCK_DIR

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20191017133155.5327-15-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/194 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/qemu-iotests/194 b/tests/qemu-iotests/194
index d746ab1e21..72e47e8833 100755
--- a/tests/qemu-iotests/194
+++ b/tests/qemu-iotests/194
@@ -26,8 +26,8 @@ iotests.verify_platform(['linux'])
 
 with iotests.FilePath('source.img') as source_img_path, \
      iotests.FilePath('dest.img') as dest_img_path, \
-     iotests.FilePath('migration.sock') as migration_sock_path, \
-     iotests.FilePath('nbd.sock') as nbd_sock_path, \
+     iotests.FilePaths(['migration.sock', 'nbd.sock'], iotests.sock_dir) as \
+         [migration_sock_path, nbd_sock_path], \
      iotests.VM('source') as source_vm, \
      iotests.VM('dest') as dest_vm:
 

From 2b4af4650c5a668b3a713fa2c54f0abdb7ef54eb Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Thu, 17 Oct 2019 15:31:47 +0200
Subject: [PATCH 27/69] iotests/201: Create socket in $SOCK_DIR

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20191017133155.5327-16-mreitz@redhat.com
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/201 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/201 b/tests/qemu-iotests/201
index 7abf740fe4..86fa37e714 100755
--- a/tests/qemu-iotests/201
+++ b/tests/qemu-iotests/201
@@ -24,7 +24,7 @@ echo "QA output created by $seq"
 
 status=1	# failure is the default!
 
-MIG_SOCKET="${TEST_DIR}/migrate"
+MIG_SOCKET="${SOCK_DIR}/migrate"
 
 # get standard environment, filters and checks
 . ./common.rc

From 2683ff77dc7f5dac830ba07ab09b0490f49486a6 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Thu, 17 Oct 2019 15:31:48 +0200
Subject: [PATCH 28/69] iotests/205: Create socket in $SOCK_DIR

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20191017133155.5327-17-mreitz@redhat.com
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/205 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/205 b/tests/qemu-iotests/205
index 76f6c5fa2b..4bb2c21e8b 100755
--- a/tests/qemu-iotests/205
+++ b/tests/qemu-iotests/205
@@ -24,7 +24,7 @@ import iotests
 import time
 from iotests import qemu_img_create, qemu_io, filter_qemu_io, QemuIoInteractive
 
-nbd_sock = os.path.join(iotests.test_dir, 'nbd_sock')
+nbd_sock = os.path.join(iotests.sock_dir, 'nbd_sock')
 nbd_uri = 'nbd+unix:///exp?socket=' + nbd_sock
 disk = os.path.join(iotests.test_dir, 'disk')
 

From 9a1c51e11aaa0be62ac31bd6bca855b1af7d979a Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Thu, 17 Oct 2019 15:31:49 +0200
Subject: [PATCH 29/69] iotests/208: Create socket in $SOCK_DIR

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20191017133155.5327-18-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/208 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/208 b/tests/qemu-iotests/208
index 1e202388dc..546eb1de3e 100755
--- a/tests/qemu-iotests/208
+++ b/tests/qemu-iotests/208
@@ -26,7 +26,7 @@ iotests.verify_image_format(supported_fmts=['generic'])
 
 with iotests.FilePath('disk.img') as disk_img_path, \
      iotests.FilePath('disk-snapshot.img') as disk_snapshot_img_path, \
-     iotests.FilePath('nbd.sock') as nbd_sock_path, \
+     iotests.FilePath('nbd.sock', iotests.sock_dir) as nbd_sock_path, \
      iotests.VM() as vm:
 
     img_size = '10M'

From 73752070be2f07a73a17a398c36223f030ef59ca Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Thu, 17 Oct 2019 15:31:50 +0200
Subject: [PATCH 30/69] iotests/209: Create socket in $SOCK_DIR

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20191017133155.5327-19-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/209 | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/209 b/tests/qemu-iotests/209
index 259e991ec6..e0f464bcbe 100755
--- a/tests/qemu-iotests/209
+++ b/tests/qemu-iotests/209
@@ -24,7 +24,8 @@ from iotests import qemu_img_create, qemu_io, qemu_img_verbose, qemu_nbd, \
 
 iotests.verify_image_format(supported_fmts=['qcow2'])
 
-disk, nbd_sock = file_path('disk', 'nbd-sock')
+disk = file_path('disk')
+nbd_sock = file_path('nbd-sock', base_dir=iotests.sock_dir)
 nbd_uri = 'nbd+unix:///exp?socket=' + nbd_sock
 
 qemu_img_create('-f', iotests.imgfmt, disk, '1M')

From 9ea16864f47b55fb5ec787c974c0bd1f659cd498 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Thu, 17 Oct 2019 15:31:51 +0200
Subject: [PATCH 31/69] iotests/222: Create socket in $SOCK_DIR

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20191017133155.5327-20-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/222 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/222 b/tests/qemu-iotests/222
index 0ead56d574..3f9f934ad8 100644
--- a/tests/qemu-iotests/222
+++ b/tests/qemu-iotests/222
@@ -48,7 +48,7 @@ remainder = [("0xd5", "0x108000",  "32k"), # Right-end of partial-left [1]
 
 with iotests.FilePath('base.img') as base_img_path, \
      iotests.FilePath('fleece.img') as fleece_img_path, \
-     iotests.FilePath('nbd.sock') as nbd_sock_path, \
+     iotests.FilePath('nbd.sock', iotests.sock_dir) as nbd_sock_path, \
      iotests.VM() as vm:
 
     log('--- Setting up images ---')

From 135a46630683ef79a1c4d4b8f195b65ca65385a5 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Thu, 17 Oct 2019 15:31:52 +0200
Subject: [PATCH 32/69] iotests/223: Create socket in $SOCK_DIR

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20191017133155.5327-21-mreitz@redhat.com
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/223 | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/qemu-iotests/223 b/tests/qemu-iotests/223
index 2ba3d8124b..b5a80e50bb 100755
--- a/tests/qemu-iotests/223
+++ b/tests/qemu-iotests/223
@@ -28,7 +28,7 @@ _cleanup()
     nbd_server_stop
     _cleanup_test_img
     _cleanup_qemu
-    rm -f "$TEST_DIR/nbd"
+    rm -f "$SOCK_DIR/nbd"
 }
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
@@ -125,11 +125,11 @@ _send_qemu_cmd $QEMU_HANDLE '{"execute":"nbd-server-add",
   "arguments":{"device":"n"}}' "error" # Attempt add without server
 _send_qemu_cmd $QEMU_HANDLE '{"execute":"nbd-server-start",
   "arguments":{"addr":{"type":"unix",
-    "data":{"path":"'"$TEST_DIR/nbd"'"}}}}' "return"
+    "data":{"path":"'"$SOCK_DIR/nbd"'"}}}}' "return"
 _send_qemu_cmd $QEMU_HANDLE '{"execute":"nbd-server-start",
   "arguments":{"addr":{"type":"unix",
-    "data":{"path":"'"$TEST_DIR/nbd"1'"}}}}' "error" # Attempt second server
-$QEMU_NBD_PROG -L -k "$TEST_DIR/nbd"
+    "data":{"path":"'"$SOCK_DIR/nbd"1'"}}}}' "error" # Attempt second server
+$QEMU_NBD_PROG -L -k "$SOCK_DIR/nbd"
 _send_qemu_cmd $QEMU_HANDLE '{"execute":"nbd-server-add",
   "arguments":{"device":"n", "bitmap":"b"}}' "return"
 _send_qemu_cmd $QEMU_HANDLE '{"execute":"nbd-server-add",
@@ -145,14 +145,14 @@ _send_qemu_cmd $QEMU_HANDLE '{"execute":"nbd-server-add",
 _send_qemu_cmd $QEMU_HANDLE '{"execute":"nbd-server-add",
   "arguments":{"device":"n", "name":"n2", "writable":true,
   "bitmap":"b2"}}' "return"
-$QEMU_NBD_PROG -L -k "$TEST_DIR/nbd"
+$QEMU_NBD_PROG -L -k "$SOCK_DIR/nbd"
 
 echo
 echo "=== Contrast normal status to large granularity dirty-bitmap ==="
 echo
 
 QEMU_IO_OPTIONS=$QEMU_IO_OPTIONS_NO_FMT
-IMG="driver=nbd,export=n,server.type=unix,server.path=$TEST_DIR/nbd"
+IMG="driver=nbd,export=n,server.type=unix,server.path=$SOCK_DIR/nbd"
 $QEMU_IO -r -c 'r -P 0x22 512 512' -c 'r -P 0 512k 512k' -c 'r -P 0x11 1m 1m' \
   -c 'r -P 0x33 2m 2m' --image-opts "$IMG" | _filter_qemu_io
 $QEMU_IMG map --output=json --image-opts \
@@ -164,7 +164,7 @@ echo
 echo "=== Contrast to small granularity dirty-bitmap ==="
 echo
 
-IMG="driver=nbd,export=n2,server.type=unix,server.path=$TEST_DIR/nbd"
+IMG="driver=nbd,export=n2,server.type=unix,server.path=$SOCK_DIR/nbd"
 $QEMU_IMG map --output=json --image-opts \
   "$IMG,x-dirty-bitmap=qemu:dirty-bitmap:b2" | _filter_qemu_img_map
 

From 5b2da7f7cc5c03e611ca6c70151fbe7275ea3d52 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Thu, 17 Oct 2019 15:31:53 +0200
Subject: [PATCH 33/69] iotests/240: Create socket in $SOCK_DIR

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20191017133155.5327-22-mreitz@redhat.com
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/240 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/qemu-iotests/240 b/tests/qemu-iotests/240
index f73bc07d80..8b4337b58d 100755
--- a/tests/qemu-iotests/240
+++ b/tests/qemu-iotests/240
@@ -29,7 +29,7 @@ status=1	# failure is the default!
 
 _cleanup()
 {
-    rm -f "$TEST_DIR/nbd"
+    rm -f "$SOCK_DIR/nbd"
 }
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
@@ -135,7 +135,7 @@ echo
 run_qemu <<EOF
 { "execute": "qmp_capabilities" }
 { "execute": "blockdev-add", "arguments": {"driver": "null-co", "read-zeroes": true, "node-name": "hd0", "read-only": true}}
-{ "execute": "nbd-server-start", "arguments": {"addr":{"type":"unix","data":{"path":"$TEST_DIR/nbd"}}}}
+{ "execute": "nbd-server-start", "arguments": {"addr":{"type":"unix","data":{"path":"$SOCK_DIR/nbd"}}}}
 { "execute": "nbd-server-add", "arguments": {"device":"hd0"}}
 { "execute": "object-add", "arguments": {"qom-type": "iothread", "id": "iothread0"}}
 { "execute": "device_add", "arguments": {"id": "scsi0", "driver": "${virtio_scsi}", "iothread": "iothread0"}}

From f0e24942fc93708aa971bf4142b97b40d198700c Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Thu, 17 Oct 2019 15:31:54 +0200
Subject: [PATCH 34/69] iotests/267: Create socket in $SOCK_DIR

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20191017133155.5327-23-mreitz@redhat.com
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/267     | 4 ++--
 tests/qemu-iotests/267.out | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/qemu-iotests/267 b/tests/qemu-iotests/267
index d37a67c012..170e173c0a 100755
--- a/tests/qemu-iotests/267
+++ b/tests/qemu-iotests/267
@@ -29,7 +29,7 @@ status=1	# failure is the default!
 _cleanup()
 {
     _cleanup_test_img
-    rm -f "$TEST_DIR/nbd"
+    rm -f "$SOCK_DIR/nbd"
 }
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
@@ -143,7 +143,7 @@ echo
 
 IMGOPTS="backing_file=$TEST_IMG.base" _make_test_img $size
 cat <<EOF |
-nbd_server_start unix:$TEST_DIR/nbd
+nbd_server_start unix:$SOCK_DIR/nbd
 nbd_server_add -w backing-fmt
 savevm snap0
 info snapshots
diff --git a/tests/qemu-iotests/267.out b/tests/qemu-iotests/267.out
index 9d812e3c72..8dddb4baa4 100644
--- a/tests/qemu-iotests/267.out
+++ b/tests/qemu-iotests/267.out
@@ -161,7 +161,7 @@ Internal snapshots on backing file:
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/t.IMGFMT.base
 Testing: -blockdev driver=file,filename=TEST_DIR/t.IMGFMT.base,node-name=backing-file -blockdev driver=IMGFMT,file=backing-file,node-name=backing-fmt -blockdev driver=file,filename=TEST_DIR/t.IMGFMT,node-name=file -blockdev driver=IMGFMT,file=file,backing=backing-fmt,node-name=fmt
 QEMU X.Y.Z monitor - type 'help' for more information
-(qemu) nbd_server_start unix:TEST_DIR/nbd
+(qemu) nbd_server_start unix:SOCK_DIR/nbd
 (qemu) nbd_server_add -w backing-fmt
 (qemu) savevm snap0
 (qemu) info snapshots

From ecc47cb47ccd3da0ad1375d2696997192f953ca3 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Thu, 17 Oct 2019 15:31:55 +0200
Subject: [PATCH 35/69] iotests: Drop TEST_DIR filter from _filter_nbd

Sockets should be placed into $SOCK_DIR instead of $TEST_DIR, so remove
the $TEST_DIR filter from _filter_nbd.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20191017133155.5327-24-mreitz@redhat.com
Reviewed-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/common.filter | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/qemu-iotests/common.filter b/tests/qemu-iotests/common.filter
index 0ee6042524..f870e00e44 100644
--- a/tests/qemu-iotests/common.filter
+++ b/tests/qemu-iotests/common.filter
@@ -221,7 +221,6 @@ _filter_nbd()
     # Filter out the TCP port number since this changes between runs.
     $SED -e '/nbd\/.*\.c:/d' \
         -e 's#127\.0\.0\.1:[0-9]*#127.0.0.1:PORT#g' \
-        -e "s#?socket=$TEST_DIR#?socket=TEST_DIR#g" \
         -e "s#?socket=$SOCK_DIR#?socket=SOCK_DIR#g" \
         -e 's#\(foo\|PORT/\?\|.sock\): Failed to .*$#\1#'
 }

From 3816edd2cb8391a782bde365d7f14705fdb2834d Mon Sep 17 00:00:00 2001
From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Date: Tue, 22 Oct 2019 14:18:00 +0300
Subject: [PATCH 36/69] block/block-copy: allocate buffer in
 block_copy_with_bounce_buffer

Move bounce_buffer allocation block_copy_with_bounce_buffer. This
commit simplifies further work on implementing copying by larger chunks
(of different size) and further asynchronous handling of block_copy
iterations (with help of block/aio_task API).

Allocation works fast, a lot faster than disk io, so it's not a problem
that we now allocate/free bounce_buffer more times. And we anyway will
have to allocate several bounce_buffers for parallel execution of loop
iterations in future.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 20191022111805.3432-2-vsementsov@virtuozzo.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/block-copy.c | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/block/block-copy.c b/block/block-copy.c
index 066e3a7274..ecd086010e 100644
--- a/block/block-copy.c
+++ b/block/block-copy.c
@@ -126,20 +126,17 @@ void block_copy_set_callbacks(
 static int coroutine_fn block_copy_with_bounce_buffer(BlockCopyState *s,
                                                       int64_t start,
                                                       int64_t end,
-                                                      bool *error_is_read,
-                                                      void **bounce_buffer)
+                                                      bool *error_is_read)
 {
     int ret;
     int nbytes;
+    void *bounce_buffer = qemu_blockalign(s->source->bs, s->cluster_size);
 
     assert(QEMU_IS_ALIGNED(start, s->cluster_size));
     bdrv_reset_dirty_bitmap(s->copy_bitmap, start, s->cluster_size);
     nbytes = MIN(s->cluster_size, s->len - start);
-    if (!*bounce_buffer) {
-        *bounce_buffer = qemu_blockalign(s->source->bs, s->cluster_size);
-    }
 
-    ret = bdrv_co_pread(s->source, start, nbytes, *bounce_buffer, 0);
+    ret = bdrv_co_pread(s->source, start, nbytes, bounce_buffer, 0);
     if (ret < 0) {
         trace_block_copy_with_bounce_buffer_read_fail(s, start, ret);
         if (error_is_read) {
@@ -148,7 +145,7 @@ static int coroutine_fn block_copy_with_bounce_buffer(BlockCopyState *s,
         goto fail;
     }
 
-    ret = bdrv_co_pwrite(s->target, start, nbytes, *bounce_buffer,
+    ret = bdrv_co_pwrite(s->target, start, nbytes, bounce_buffer,
                          s->write_flags);
     if (ret < 0) {
         trace_block_copy_with_bounce_buffer_write_fail(s, start, ret);
@@ -158,8 +155,11 @@ static int coroutine_fn block_copy_with_bounce_buffer(BlockCopyState *s,
         goto fail;
     }
 
+    qemu_vfree(bounce_buffer);
+
     return nbytes;
 fail:
+    qemu_vfree(bounce_buffer);
     bdrv_set_dirty_bitmap(s->copy_bitmap, start, s->cluster_size);
     return ret;
 
@@ -271,7 +271,6 @@ int coroutine_fn block_copy(BlockCopyState *s,
 {
     int ret = 0;
     int64_t end = bytes + start; /* bytes */
-    void *bounce_buffer = NULL;
     int64_t status_bytes;
     BlockCopyInFlightReq req;
 
@@ -324,7 +323,7 @@ int coroutine_fn block_copy(BlockCopyState *s,
         }
         if (!s->use_copy_range) {
             ret = block_copy_with_bounce_buffer(s, start, dirty_end,
-                                                error_is_read, &bounce_buffer);
+                                                error_is_read);
         }
         if (ret < 0) {
             break;
@@ -335,10 +334,6 @@ int coroutine_fn block_copy(BlockCopyState *s,
         ret = 0;
     }
 
-    if (bounce_buffer) {
-        qemu_vfree(bounce_buffer);
-    }
-
     block_copy_inflight_req_end(&req);
 
     return ret;

From b3b7036afbf564268ad7edc10fb28501af0709e9 Mon Sep 17 00:00:00 2001
From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Date: Tue, 22 Oct 2019 14:18:01 +0300
Subject: [PATCH 37/69] block/block-copy: limit copy_range_size to 16 MiB

Large copy range may imply memory allocation and large io effort, so
using 2G copy range request may be bad idea. Let's limit it to 16 MiB.
It also helps the following patch to refactor copy-with-offload
fallback to copy-with-bounce-buffer.

Note, that total memory usage of backup is still not limited, it will
be fixed in further commit.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 20191022111805.3432-3-vsementsov@virtuozzo.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/block-copy.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/block/block-copy.c b/block/block-copy.c
index ecd086010e..cd45b373a9 100644
--- a/block/block-copy.c
+++ b/block/block-copy.c
@@ -18,6 +18,9 @@
 #include "qapi/error.h"
 #include "block/block-copy.h"
 #include "sysemu/block-backend.h"
+#include "qemu/units.h"
+
+#define BLOCK_COPY_MAX_COPY_RANGE (16 * MiB)
 
 static void coroutine_fn block_copy_wait_inflight_reqs(BlockCopyState *s,
                                                        int64_t start,
@@ -70,9 +73,12 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
 {
     BlockCopyState *s;
     BdrvDirtyBitmap *copy_bitmap;
+
+    /* Ignore BLOCK_COPY_MAX_COPY_RANGE if requested cluster_size is larger */
     uint32_t max_transfer =
-            MIN_NON_ZERO(INT_MAX, MIN_NON_ZERO(source->bs->bl.max_transfer,
-                                               target->bs->bl.max_transfer));
+            MIN_NON_ZERO(MAX(cluster_size, BLOCK_COPY_MAX_COPY_RANGE),
+                         MIN_NON_ZERO(source->bs->bl.max_transfer,
+                                      target->bs->bl.max_transfer));
 
     copy_bitmap = bdrv_create_dirty_bitmap(source->bs, cluster_size, NULL,
                                            errp);

From e332a726da322c5c68bc34ffb59384e31e36d0ff Mon Sep 17 00:00:00 2001
From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Date: Tue, 22 Oct 2019 14:18:02 +0300
Subject: [PATCH 38/69] block/block-copy: refactor copying

Merge copying code into one function block_copy_do_copy, which only
calls bdrv_ io functions and don't do any synchronization (like dirty
bitmap set/reset).

Refactor block_copy() function so that it takes full decision about
size of chunk to be copied and does all the synchronization (checking
intersecting requests, set/reset dirty bitmaps).

It will help:
 - introduce parallel processing of block_copy iterations: we need to
   calculate chunk size, start async chunk copying and go to the next
   iteration
 - simplify synchronization improvement (like memory limiting in
   further commit and reducing critical section (now we lock the whole
   requested range, when actually we need to lock only dirty region
   which we handle at the moment))

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 20191022111805.3432-4-vsementsov@virtuozzo.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/block-copy.c | 118 ++++++++++++++++++++-------------------------
 block/trace-events |   6 +--
 2 files changed, 54 insertions(+), 70 deletions(-)

diff --git a/block/block-copy.c b/block/block-copy.c
index cd45b373a9..845b2423dc 100644
--- a/block/block-copy.c
+++ b/block/block-copy.c
@@ -126,79 +126,64 @@ void block_copy_set_callbacks(
 }
 
 /*
- * Copy range to target with a bounce buffer and return the bytes copied. If
- * error occurred, return a negative error number
+ * block_copy_do_copy
+ *
+ * Do copy of cluser-aligned chunk. @end is allowed to exceed s->len only to
+ * cover last cluster when s->len is not aligned to clusters.
+ *
+ * No sync here: nor bitmap neighter intersecting requests handling, only copy.
+ *
+ * Returns 0 on success.
  */
-static int coroutine_fn block_copy_with_bounce_buffer(BlockCopyState *s,
-                                                      int64_t start,
-                                                      int64_t end,
-                                                      bool *error_is_read)
+static int coroutine_fn block_copy_do_copy(BlockCopyState *s,
+                                           int64_t start, int64_t end,
+                                           bool *error_is_read)
 {
     int ret;
-    int nbytes;
-    void *bounce_buffer = qemu_blockalign(s->source->bs, s->cluster_size);
+    int nbytes = MIN(end, s->len) - start;
+    void *bounce_buffer = NULL;
 
     assert(QEMU_IS_ALIGNED(start, s->cluster_size));
-    bdrv_reset_dirty_bitmap(s->copy_bitmap, start, s->cluster_size);
-    nbytes = MIN(s->cluster_size, s->len - start);
+    assert(QEMU_IS_ALIGNED(end, s->cluster_size));
+    assert(end < s->len || end == QEMU_ALIGN_UP(s->len, s->cluster_size));
+
+    if (s->use_copy_range) {
+        ret = bdrv_co_copy_range(s->source, start, s->target, start, nbytes,
+                                 0, s->write_flags);
+        if (ret < 0) {
+            trace_block_copy_copy_range_fail(s, start, ret);
+            s->use_copy_range = false;
+            /* Fallback to read+write with allocated buffer */
+        } else {
+            goto out;
+        }
+    }
+
+    bounce_buffer = qemu_blockalign(s->source->bs, nbytes);
 
     ret = bdrv_co_pread(s->source, start, nbytes, bounce_buffer, 0);
     if (ret < 0) {
-        trace_block_copy_with_bounce_buffer_read_fail(s, start, ret);
+        trace_block_copy_read_fail(s, start, ret);
         if (error_is_read) {
             *error_is_read = true;
         }
-        goto fail;
+        goto out;
     }
 
     ret = bdrv_co_pwrite(s->target, start, nbytes, bounce_buffer,
                          s->write_flags);
     if (ret < 0) {
-        trace_block_copy_with_bounce_buffer_write_fail(s, start, ret);
+        trace_block_copy_write_fail(s, start, ret);
         if (error_is_read) {
             *error_is_read = false;
         }
-        goto fail;
+        goto out;
     }
 
+out:
     qemu_vfree(bounce_buffer);
 
-    return nbytes;
-fail:
-    qemu_vfree(bounce_buffer);
-    bdrv_set_dirty_bitmap(s->copy_bitmap, start, s->cluster_size);
     return ret;
-
-}
-
-/*
- * Copy range to target and return the bytes copied. If error occurred, return a
- * negative error number.
- */
-static int coroutine_fn block_copy_with_offload(BlockCopyState *s,
-                                                int64_t start,
-                                                int64_t end)
-{
-    int ret;
-    int nr_clusters;
-    int nbytes;
-
-    assert(QEMU_IS_ALIGNED(s->copy_range_size, s->cluster_size));
-    assert(QEMU_IS_ALIGNED(start, s->cluster_size));
-    nbytes = MIN(s->copy_range_size, MIN(end, s->len) - start);
-    nr_clusters = DIV_ROUND_UP(nbytes, s->cluster_size);
-    bdrv_reset_dirty_bitmap(s->copy_bitmap, start,
-                            s->cluster_size * nr_clusters);
-    ret = bdrv_co_copy_range(s->source, start, s->target, start, nbytes,
-                             0, s->write_flags);
-    if (ret < 0) {
-        trace_block_copy_with_offload_fail(s, start, ret);
-        bdrv_set_dirty_bitmap(s->copy_bitmap, start,
-                              s->cluster_size * nr_clusters);
-        return ret;
-    }
-
-    return nbytes;
 }
 
 /*
@@ -294,7 +279,7 @@ int coroutine_fn block_copy(BlockCopyState *s,
     block_copy_inflight_req_begin(s, &req, start, end);
 
     while (start < end) {
-        int64_t dirty_end;
+        int64_t next_zero, chunk_end;
 
         if (!bdrv_dirty_bitmap_get(s->copy_bitmap, start)) {
             trace_block_copy_skip(s, start);
@@ -302,10 +287,15 @@ int coroutine_fn block_copy(BlockCopyState *s,
             continue; /* already copied */
         }
 
-        dirty_end = bdrv_dirty_bitmap_next_zero(s->copy_bitmap, start,
-                                                (end - start));
-        if (dirty_end < 0) {
-            dirty_end = end;
+        chunk_end = MIN(end, start + (s->use_copy_range ?
+                                      s->copy_range_size : s->cluster_size));
+
+        next_zero = bdrv_dirty_bitmap_next_zero(s->copy_bitmap, start,
+                                                chunk_end - start);
+        if (next_zero >= 0) {
+            assert(next_zero > start); /* start is dirty */
+            assert(next_zero < chunk_end); /* no need to do MIN() */
+            chunk_end = next_zero;
         }
 
         if (s->skip_unallocated) {
@@ -316,27 +306,21 @@ int coroutine_fn block_copy(BlockCopyState *s,
                 continue;
             }
             /* Clamp to known allocated region */
-            dirty_end = MIN(dirty_end, start + status_bytes);
+            chunk_end = MIN(chunk_end, start + status_bytes);
         }
 
         trace_block_copy_process(s, start);
 
-        if (s->use_copy_range) {
-            ret = block_copy_with_offload(s, start, dirty_end);
-            if (ret < 0) {
-                s->use_copy_range = false;
-            }
-        }
-        if (!s->use_copy_range) {
-            ret = block_copy_with_bounce_buffer(s, start, dirty_end,
-                                                error_is_read);
-        }
+        bdrv_reset_dirty_bitmap(s->copy_bitmap, start, chunk_end - start);
+
+        ret = block_copy_do_copy(s, start, chunk_end, error_is_read);
         if (ret < 0) {
+            bdrv_set_dirty_bitmap(s->copy_bitmap, start, chunk_end - start);
             break;
         }
 
-        start += ret;
-        s->progress_bytes_callback(ret, s->progress_opaque);
+        s->progress_bytes_callback(chunk_end - start, s->progress_opaque);
+        start = chunk_end;
         ret = 0;
     }
 
diff --git a/block/trace-events b/block/trace-events
index b8d70f5242..ccde15a14c 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -45,9 +45,9 @@ backup_do_cow_return(void *job, int64_t offset, uint64_t bytes, int ret) "job %p
 block_copy_skip(void *bcs, int64_t start) "bcs %p start %"PRId64
 block_copy_skip_range(void *bcs, int64_t start, uint64_t bytes) "bcs %p start %"PRId64" bytes %"PRId64
 block_copy_process(void *bcs, int64_t start) "bcs %p start %"PRId64
-block_copy_with_bounce_buffer_read_fail(void *bcs, int64_t start, int ret) "bcs %p start %"PRId64" ret %d"
-block_copy_with_bounce_buffer_write_fail(void *bcs, int64_t start, int ret) "bcs %p start %"PRId64" ret %d"
-block_copy_with_offload_fail(void *bcs, int64_t start, int ret) "bcs %p start %"PRId64" ret %d"
+block_copy_copy_range_fail(void *bcs, int64_t start, int ret) "bcs %p start %"PRId64" ret %d"
+block_copy_read_fail(void *bcs, int64_t start, int ret) "bcs %p start %"PRId64" ret %d"
+block_copy_write_fail(void *bcs, int64_t start, int ret) "bcs %p start %"PRId64" ret %d"
 
 # ../blockdev.c
 qmp_block_job_cancel(void *job) "job %p"

From f16ba00de946517e5cca4edeeec3e21f8f77132d Mon Sep 17 00:00:00 2001
From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Date: Tue, 22 Oct 2019 14:18:03 +0300
Subject: [PATCH 39/69] util: introduce SharedResource

Introduce an API for some shared splittable resource, like memory.
It's going to be used by backup. Backup uses both read/write io and
copy_range. copy_range may consume memory implictly, so the new API is
abstract: it doesn't allocate any real memory but only hands out
tickets.

The idea is that we have some total amount of something and callers
should wait in coroutine queue if there is not enough of the resource
at the moment.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 20191022111805.3432-5-vsementsov@virtuozzo.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 include/qemu/co-shared-resource.h | 71 +++++++++++++++++++++++++++++
 util/Makefile.objs                |  1 +
 util/qemu-co-shared-resource.c    | 76 +++++++++++++++++++++++++++++++
 3 files changed, 148 insertions(+)
 create mode 100644 include/qemu/co-shared-resource.h
 create mode 100644 util/qemu-co-shared-resource.c

diff --git a/include/qemu/co-shared-resource.h b/include/qemu/co-shared-resource.h
new file mode 100644
index 0000000000..4e4503004c
--- /dev/null
+++ b/include/qemu/co-shared-resource.h
@@ -0,0 +1,71 @@
+/*
+ * Helper functionality for distributing a fixed total amount of
+ * an abstract resource among multiple coroutines.
+ *
+ * Copyright (c) 2019 Virtuozzo International GmbH
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef QEMU_CO_SHARED_RESOURCE_H
+#define QEMU_CO_SHARED_RESOURCE_H
+
+
+typedef struct SharedResource SharedResource;
+
+/*
+ * Create SharedResource structure
+ *
+ * @total: total amount of some resource to be shared between clients
+ *
+ * Note: this API is not thread-safe.
+ */
+SharedResource *shres_create(uint64_t total);
+
+/*
+ * Release SharedResource structure
+ *
+ * This function may only be called once everything allocated by all
+ * clients has been deallocated.
+ */
+void shres_destroy(SharedResource *s);
+
+/*
+ * Try to allocate an amount of @n.  Return true on success, and false
+ * if there is too little left of the collective resource to fulfill
+ * the request.
+ */
+bool co_try_get_from_shres(SharedResource *s, uint64_t n);
+
+/*
+ * Allocate an amount of @n, and, if necessary, yield until
+ * that becomes possible.
+ */
+void coroutine_fn co_get_from_shres(SharedResource *s, uint64_t n);
+
+/*
+ * Deallocate an amount of @n.  The total amount allocated by a caller
+ * does not need to be deallocated/released with a single call, but may
+ * be split over several calls.  For example, get(4), get(3), and then
+ * put(5), put(2).
+ */
+void coroutine_fn co_put_to_shres(SharedResource *s, uint64_t n);
+
+
+#endif /* QEMU_CO_SHARED_RESOURCE_H */
diff --git a/util/Makefile.objs b/util/Makefile.objs
index 41bf59d127..df124af1c5 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -37,6 +37,7 @@ util-obj-y += rcu.o
 util-obj-$(CONFIG_MEMBARRIER) += sys_membarrier.o
 util-obj-y += qemu-coroutine.o qemu-coroutine-lock.o qemu-coroutine-io.o
 util-obj-y += qemu-coroutine-sleep.o
+util-obj-y += qemu-co-shared-resource.o
 util-obj-y += coroutine-$(CONFIG_COROUTINE_BACKEND).o
 util-obj-y += buffer.o
 util-obj-y += timed-average.o
diff --git a/util/qemu-co-shared-resource.c b/util/qemu-co-shared-resource.c
new file mode 100644
index 0000000000..1c83cd9d29
--- /dev/null
+++ b/util/qemu-co-shared-resource.c
@@ -0,0 +1,76 @@
+/*
+ * Helper functionality for distributing a fixed total amount of
+ * an abstract resource among multiple coroutines.
+ *
+ * Copyright (c) 2019 Virtuozzo International GmbH
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/coroutine.h"
+#include "qemu/co-shared-resource.h"
+
+struct SharedResource {
+    uint64_t total;
+    uint64_t available;
+
+    CoQueue queue;
+};
+
+SharedResource *shres_create(uint64_t total)
+{
+    SharedResource *s = g_new0(SharedResource, 1);
+
+    s->total = s->available = total;
+    qemu_co_queue_init(&s->queue);
+
+    return s;
+}
+
+void shres_destroy(SharedResource *s)
+{
+    assert(s->available == s->total);
+    g_free(s);
+}
+
+bool co_try_get_from_shres(SharedResource *s, uint64_t n)
+{
+    if (s->available >= n) {
+        s->available -= n;
+        return true;
+    }
+
+    return false;
+}
+
+void coroutine_fn co_get_from_shres(SharedResource *s, uint64_t n)
+{
+    assert(n <= s->total);
+    while (!co_try_get_from_shres(s, n)) {
+        qemu_co_queue_wait(&s->queue, NULL);
+    }
+}
+
+void coroutine_fn co_put_to_shres(SharedResource *s, uint64_t n)
+{
+    assert(s->total - s->available >= n);
+    s->available += n;
+    qemu_co_queue_restart_all(&s->queue);
+}

From 7f739d0e5375a934f0eade3cd38e6d33673beec0 Mon Sep 17 00:00:00 2001
From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Date: Tue, 22 Oct 2019 14:18:04 +0300
Subject: [PATCH 40/69] block/block-copy: add memory limit

Currently total allocation for parallel requests to block-copy instance
is unlimited. Let's limit it to 128 MiB.

For now block-copy is used only in backup, so actually we limit total
allocation for backup job.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 20191022111805.3432-6-vsementsov@virtuozzo.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/block-copy.c         | 5 +++++
 include/block/block-copy.h | 3 +++
 2 files changed, 8 insertions(+)

diff --git a/block/block-copy.c b/block/block-copy.c
index 845b2423dc..7f0ebb58f8 100644
--- a/block/block-copy.c
+++ b/block/block-copy.c
@@ -21,6 +21,7 @@
 #include "qemu/units.h"
 
 #define BLOCK_COPY_MAX_COPY_RANGE (16 * MiB)
+#define BLOCK_COPY_MAX_MEM (128 * MiB)
 
 static void coroutine_fn block_copy_wait_inflight_reqs(BlockCopyState *s,
                                                        int64_t start,
@@ -64,6 +65,7 @@ void block_copy_state_free(BlockCopyState *s)
     }
 
     bdrv_release_dirty_bitmap(s->copy_bitmap);
+    shres_destroy(s->mem);
     g_free(s);
 }
 
@@ -95,6 +97,7 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
         .cluster_size = cluster_size,
         .len = bdrv_dirty_bitmap_size(copy_bitmap),
         .write_flags = write_flags,
+        .mem = shres_create(BLOCK_COPY_MAX_MEM),
     };
 
     s->copy_range_size = QEMU_ALIGN_DOWN(max_transfer, cluster_size),
@@ -313,7 +316,9 @@ int coroutine_fn block_copy(BlockCopyState *s,
 
         bdrv_reset_dirty_bitmap(s->copy_bitmap, start, chunk_end - start);
 
+        co_get_from_shres(s->mem, chunk_end - start);
         ret = block_copy_do_copy(s, start, chunk_end, error_is_read);
+        co_put_to_shres(s->mem, chunk_end - start);
         if (ret < 0) {
             bdrv_set_dirty_bitmap(s->copy_bitmap, start, chunk_end - start);
             break;
diff --git a/include/block/block-copy.h b/include/block/block-copy.h
index e2e135ff1b..edcdf0072d 100644
--- a/include/block/block-copy.h
+++ b/include/block/block-copy.h
@@ -16,6 +16,7 @@
 #define BLOCK_COPY_H
 
 #include "block/block.h"
+#include "qemu/co-shared-resource.h"
 
 typedef struct BlockCopyInFlightReq {
     int64_t start_byte;
@@ -69,6 +70,8 @@ typedef struct BlockCopyState {
      */
     ProgressResetCallbackFunc progress_reset_callback;
     void *progress_opaque;
+
+    SharedResource *mem;
 } BlockCopyState;
 
 BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,

From 0e2402452f1f2042923a5206b9ff3e9d70c77811 Mon Sep 17 00:00:00 2001
From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Date: Tue, 22 Oct 2019 14:18:05 +0300
Subject: [PATCH 41/69] block/block-copy: increase buffered copy request

No reason to limit buffered copy to one cluster. Let's allow up to 1
MiB.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 20191022111805.3432-7-vsementsov@virtuozzo.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/block-copy.c         | 48 +++++++++++++++++++++++++-------------
 include/block/block-copy.h |  2 +-
 2 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/block/block-copy.c b/block/block-copy.c
index 7f0ebb58f8..c39cc9cffe 100644
--- a/block/block-copy.c
+++ b/block/block-copy.c
@@ -21,6 +21,7 @@
 #include "qemu/units.h"
 
 #define BLOCK_COPY_MAX_COPY_RANGE (16 * MiB)
+#define BLOCK_COPY_MAX_BUFFER (1 * MiB)
 #define BLOCK_COPY_MAX_MEM (128 * MiB)
 
 static void coroutine_fn block_copy_wait_inflight_reqs(BlockCopyState *s,
@@ -75,10 +76,8 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
 {
     BlockCopyState *s;
     BdrvDirtyBitmap *copy_bitmap;
-
-    /* Ignore BLOCK_COPY_MAX_COPY_RANGE if requested cluster_size is larger */
     uint32_t max_transfer =
-            MIN_NON_ZERO(MAX(cluster_size, BLOCK_COPY_MAX_COPY_RANGE),
+            MIN_NON_ZERO(INT_MAX,
                          MIN_NON_ZERO(source->bs->bl.max_transfer,
                                       target->bs->bl.max_transfer));
 
@@ -100,17 +99,28 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
         .mem = shres_create(BLOCK_COPY_MAX_MEM),
     };
 
-    s->copy_range_size = QEMU_ALIGN_DOWN(max_transfer, cluster_size),
-    /*
-     * Set use_copy_range, consider the following:
-     * 1. Compression is not supported for copy_range.
-     * 2. copy_range does not respect max_transfer (it's a TODO), so we factor
-     *    that in here. If max_transfer is smaller than the job->cluster_size,
-     *    we do not use copy_range (in that case it's zero after aligning down
-     *    above).
-     */
-    s->use_copy_range =
-        !(write_flags & BDRV_REQ_WRITE_COMPRESSED) && s->copy_range_size > 0;
+    if (max_transfer < cluster_size) {
+        /*
+         * copy_range does not respect max_transfer. We don't want to bother
+         * with requests smaller than block-copy cluster size, so fallback to
+         * buffered copying (read and write respect max_transfer on their
+         * behalf).
+         */
+        s->use_copy_range = false;
+        s->copy_size = cluster_size;
+    } else if (write_flags & BDRV_REQ_WRITE_COMPRESSED) {
+        /* Compression is not supported for copy_range */
+        s->use_copy_range = false;
+        s->copy_size = MAX(cluster_size, BLOCK_COPY_MAX_BUFFER);
+    } else {
+        /*
+         * copy_range does not respect max_transfer (it's a TODO), so we factor
+         * that in here.
+         */
+        s->use_copy_range = true;
+        s->copy_size = MIN(MAX(cluster_size, BLOCK_COPY_MAX_COPY_RANGE),
+                           QEMU_ALIGN_DOWN(max_transfer, cluster_size));
+    }
 
     QLIST_INIT(&s->inflight_reqs);
 
@@ -156,12 +166,19 @@ static int coroutine_fn block_copy_do_copy(BlockCopyState *s,
         if (ret < 0) {
             trace_block_copy_copy_range_fail(s, start, ret);
             s->use_copy_range = false;
+            s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
             /* Fallback to read+write with allocated buffer */
         } else {
             goto out;
         }
     }
 
+    /*
+     * In case of failed copy_range request above, we may proceed with buffered
+     * request larger than BLOCK_COPY_MAX_BUFFER. Still, further requests will
+     * be properly limited, so don't care too much.
+     */
+
     bounce_buffer = qemu_blockalign(s->source->bs, nbytes);
 
     ret = bdrv_co_pread(s->source, start, nbytes, bounce_buffer, 0);
@@ -290,8 +307,7 @@ int coroutine_fn block_copy(BlockCopyState *s,
             continue; /* already copied */
         }
 
-        chunk_end = MIN(end, start + (s->use_copy_range ?
-                                      s->copy_range_size : s->cluster_size));
+        chunk_end = MIN(end, start + s->copy_size);
 
         next_zero = bdrv_dirty_bitmap_next_zero(s->copy_bitmap, start,
                                                 chunk_end - start);
diff --git a/include/block/block-copy.h b/include/block/block-copy.h
index edcdf0072d..0a161724d7 100644
--- a/include/block/block-copy.h
+++ b/include/block/block-copy.h
@@ -38,7 +38,7 @@ typedef struct BlockCopyState {
     BdrvDirtyBitmap *copy_bitmap;
     int64_t cluster_size;
     bool use_copy_range;
-    int64_t copy_range_size;
+    int64_t copy_size;
     uint64_t len;
     QLIST_HEAD(, BlockCopyInFlightReq) inflight_reqs;
 

From e0dd95e3738c5868ddea932aa0f1998900eeaf58 Mon Sep 17 00:00:00 2001
From: Maxim Levitsky <mlevitsk@redhat.com>
Date: Fri, 13 Sep 2019 16:36:26 +0300
Subject: [PATCH 42/69] block/nvme: add support for write zeros

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
Message-id: 20190913133627.28450-2-mlevitsk@redhat.com
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/nvme.c         | 72 +++++++++++++++++++++++++++++++++++++++++++-
 block/trace-events   |  1 +
 include/block/nvme.h | 19 +++++++++++-
 3 files changed, 90 insertions(+), 2 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index 910872ec59..0ac38c24e6 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -112,6 +112,8 @@ typedef struct {
     uint64_t max_transfer;
     bool plugged;
 
+    bool supports_write_zeroes;
+
     CoMutex dma_map_lock;
     CoQueue dma_flush_queue;
 
@@ -423,6 +425,7 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
     NvmeIdNs *idns;
     NvmeLBAF *lbaf;
     uint8_t *resp;
+    uint16_t oncs;
     int r;
     uint64_t iova;
     NvmeCmd cmd = {
@@ -460,6 +463,9 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
     s->max_transfer = MIN_NON_ZERO(s->max_transfer,
                           s->page_size / sizeof(uint64_t) * s->page_size);
 
+    oncs = le16_to_cpu(idctrl->oncs);
+    s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROS);
+
     memset(resp, 0, 4096);
 
     cmd.cdw10 = 0;
@@ -472,6 +478,12 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
     s->nsze = le64_to_cpu(idns->nsze);
     lbaf = &idns->lbaf[NVME_ID_NS_FLBAS_INDEX(idns->flbas)];
 
+    if (NVME_ID_NS_DLFEAT_WRITE_ZEROES(idns->dlfeat) &&
+            NVME_ID_NS_DLFEAT_READ_BEHAVIOR(idns->dlfeat) ==
+                    NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROES) {
+        bs->supported_write_flags |= BDRV_REQ_MAY_UNMAP;
+    }
+
     if (lbaf->ms) {
         error_setg(errp, "Namespaces with metadata are not yet supported");
         goto out;
@@ -766,6 +778,8 @@ static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags,
     int ret;
     BDRVNVMeState *s = bs->opaque;
 
+    bs->supported_write_flags = BDRV_REQ_FUA;
+
     opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
     qemu_opts_absorb_qdict(opts, options, &error_abort);
     device = qemu_opt_get(opts, NVME_BLOCK_OPT_DEVICE);
@@ -794,7 +808,6 @@ static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags,
             goto fail;
         }
     }
-    bs->supported_write_flags = BDRV_REQ_FUA;
     return 0;
 fail:
     nvme_close(bs);
@@ -1088,6 +1101,60 @@ static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
 }
 
 
+static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
+                                              int64_t offset,
+                                              int bytes,
+                                              BdrvRequestFlags flags)
+{
+    BDRVNVMeState *s = bs->opaque;
+    NVMeQueuePair *ioq = s->queues[1];
+    NVMeRequest *req;
+
+    uint32_t cdw12 = ((bytes >> s->blkshift) - 1) & 0xFFFF;
+
+    if (!s->supports_write_zeroes) {
+        return -ENOTSUP;
+    }
+
+    NvmeCmd cmd = {
+        .opcode = NVME_CMD_WRITE_ZEROS,
+        .nsid = cpu_to_le32(s->nsid),
+        .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF),
+        .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF),
+    };
+
+    NVMeCoData data = {
+        .ctx = bdrv_get_aio_context(bs),
+        .ret = -EINPROGRESS,
+    };
+
+    if (flags & BDRV_REQ_MAY_UNMAP) {
+        cdw12 |= (1 << 25);
+    }
+
+    if (flags & BDRV_REQ_FUA) {
+        cdw12 |= (1 << 30);
+    }
+
+    cmd.cdw12 = cpu_to_le32(cdw12);
+
+    trace_nvme_write_zeroes(s, offset, bytes, flags);
+    assert(s->nr_queues > 1);
+    req = nvme_get_free_req(ioq);
+    assert(req);
+
+    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
+
+    data.co = qemu_coroutine_self();
+    while (data.ret == -EINPROGRESS) {
+        qemu_coroutine_yield();
+    }
+
+    trace_nvme_rw_done(s, true, offset, bytes, data.ret);
+    return data.ret;
+}
+
+
 static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
                                BlockReopenQueue *queue, Error **errp)
 {
@@ -1192,6 +1259,9 @@ static BlockDriver bdrv_nvme = {
 
     .bdrv_co_preadv           = nvme_co_preadv,
     .bdrv_co_pwritev          = nvme_co_pwritev,
+
+    .bdrv_co_pwrite_zeroes    = nvme_co_pwrite_zeroes,
+
     .bdrv_co_flush_to_disk    = nvme_co_flush,
     .bdrv_reopen_prepare      = nvme_reopen_prepare,
 
diff --git a/block/trace-events b/block/trace-events
index ccde15a14c..8706eab170 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -152,6 +152,7 @@ nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6,
 nvme_handle_event(void *s) "s %p"
 nvme_poll_cb(void *s) "s %p"
 nvme_prw_aligned(void *s, int is_write, uint64_t offset, uint64_t bytes, int flags, int niov) "s %p is_write %d offset %"PRId64" bytes %"PRId64" flags %d niov %d"
+nvme_write_zeroes(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offset %"PRId64" bytes %"PRId64" flags %d"
 nvme_qiov_unaligned(const void *qiov, int n, void *base, size_t size, int align) "qiov %p n %d base %p size 0x%zx align 0x%x"
 nvme_prw_buffered(void *s, uint64_t offset, uint64_t bytes, int niov, int is_write) "s %p offset %"PRId64" bytes %"PRId64" niov %d is_write %d"
 nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s %p is_write %d offset %"PRId64" bytes %"PRId64" ret %d"
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 3ec8efcc43..ab5943b90a 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -653,12 +653,29 @@ typedef struct NvmeIdNs {
     uint8_t     mc;
     uint8_t     dpc;
     uint8_t     dps;
-    uint8_t     res30[98];
+
+    uint8_t     nmic;
+    uint8_t     rescap;
+    uint8_t     fpi;
+    uint8_t     dlfeat;
+
+    uint8_t     res34[94];
     NvmeLBAF    lbaf[16];
     uint8_t     res192[192];
     uint8_t     vs[3712];
 } NvmeIdNs;
 
+
+/*Deallocate Logical Block Features*/
+#define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat)       ((dlfeat) & 0x10)
+#define NVME_ID_NS_DLFEAT_WRITE_ZEROES(dlfeat)    ((dlfeat) & 0x08)
+
+#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR(dlfeat)     ((dlfeat) & 0x7)
+#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR_UNDEFINED   0
+#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROES      1
+#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ONES        2
+
+
 #define NVME_ID_NS_NSFEAT_THIN(nsfeat)      ((nsfeat & 0x1))
 #define NVME_ID_NS_FLBAS_EXTENDED(flbas)    ((flbas >> 4) & 0x1)
 #define NVME_ID_NS_FLBAS_INDEX(flbas)       ((flbas & 0xf))

From e87a09d6251b28d1494a3728441d8fdb93a8d57d Mon Sep 17 00:00:00 2001
From: Maxim Levitsky <mlevitsk@redhat.com>
Date: Fri, 13 Sep 2019 16:36:27 +0300
Subject: [PATCH 43/69] block/nvme: add support for discard

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
Message-id: 20190913133627.28450-3-mlevitsk@redhat.com
Reviewed-by: John Snow <jsnow@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/nvme.c       | 83 ++++++++++++++++++++++++++++++++++++++++++++++
 block/trace-events |  2 ++
 2 files changed, 85 insertions(+)

diff --git a/block/nvme.c b/block/nvme.c
index 0ac38c24e6..d41c4bda6e 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -113,6 +113,7 @@ typedef struct {
     bool plugged;
 
     bool supports_write_zeroes;
+    bool supports_discard;
 
     CoMutex dma_map_lock;
     CoQueue dma_flush_queue;
@@ -465,6 +466,7 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
 
     oncs = le16_to_cpu(idctrl->oncs);
     s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROS);
+    s->supports_discard = !!(oncs & NVME_ONCS_DSM);
 
     memset(resp, 0, 4096);
 
@@ -1155,6 +1157,86 @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
 }
 
 
+static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
+                                         int64_t offset,
+                                         int bytes)
+{
+    BDRVNVMeState *s = bs->opaque;
+    NVMeQueuePair *ioq = s->queues[1];
+    NVMeRequest *req;
+    NvmeDsmRange *buf;
+    QEMUIOVector local_qiov;
+    int ret;
+
+    NvmeCmd cmd = {
+        .opcode = NVME_CMD_DSM,
+        .nsid = cpu_to_le32(s->nsid),
+        .cdw10 = cpu_to_le32(0), /*number of ranges - 0 based*/
+        .cdw11 = cpu_to_le32(1 << 2), /*deallocate bit*/
+    };
+
+    NVMeCoData data = {
+        .ctx = bdrv_get_aio_context(bs),
+        .ret = -EINPROGRESS,
+    };
+
+    if (!s->supports_discard) {
+        return -ENOTSUP;
+    }
+
+    assert(s->nr_queues > 1);
+
+    buf = qemu_try_blockalign0(bs, s->page_size);
+    if (!buf) {
+        return -ENOMEM;
+    }
+
+    buf->nlb = cpu_to_le32(bytes >> s->blkshift);
+    buf->slba = cpu_to_le64(offset >> s->blkshift);
+    buf->cattr = 0;
+
+    qemu_iovec_init(&local_qiov, 1);
+    qemu_iovec_add(&local_qiov, buf, 4096);
+
+    req = nvme_get_free_req(ioq);
+    assert(req);
+
+    qemu_co_mutex_lock(&s->dma_map_lock);
+    ret = nvme_cmd_map_qiov(bs, &cmd, req, &local_qiov);
+    qemu_co_mutex_unlock(&s->dma_map_lock);
+
+    if (ret) {
+        req->busy = false;
+        goto out;
+    }
+
+    trace_nvme_dsm(s, offset, bytes);
+
+    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
+
+    data.co = qemu_coroutine_self();
+    while (data.ret == -EINPROGRESS) {
+        qemu_coroutine_yield();
+    }
+
+    qemu_co_mutex_lock(&s->dma_map_lock);
+    ret = nvme_cmd_unmap_qiov(bs, &local_qiov);
+    qemu_co_mutex_unlock(&s->dma_map_lock);
+
+    if (ret) {
+        goto out;
+    }
+
+    ret = data.ret;
+    trace_nvme_dsm_done(s, offset, bytes, ret);
+out:
+    qemu_iovec_destroy(&local_qiov);
+    qemu_vfree(buf);
+    return ret;
+
+}
+
+
 static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
                                BlockReopenQueue *queue, Error **errp)
 {
@@ -1261,6 +1343,7 @@ static BlockDriver bdrv_nvme = {
     .bdrv_co_pwritev          = nvme_co_pwritev,
 
     .bdrv_co_pwrite_zeroes    = nvme_co_pwrite_zeroes,
+    .bdrv_co_pdiscard         = nvme_co_pdiscard,
 
     .bdrv_co_flush_to_disk    = nvme_co_flush,
     .bdrv_reopen_prepare      = nvme_reopen_prepare,
diff --git a/block/trace-events b/block/trace-events
index 8706eab170..6ba86decca 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -156,6 +156,8 @@ nvme_write_zeroes(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p off
 nvme_qiov_unaligned(const void *qiov, int n, void *base, size_t size, int align) "qiov %p n %d base %p size 0x%zx align 0x%x"
 nvme_prw_buffered(void *s, uint64_t offset, uint64_t bytes, int niov, int is_write) "s %p offset %"PRId64" bytes %"PRId64" niov %d is_write %d"
 nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s %p is_write %d offset %"PRId64" bytes %"PRId64" ret %d"
+nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset %"PRId64" bytes %"PRId64""
+nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset %"PRId64" bytes %"PRId64" ret %d"
 nvme_dma_map_flush(void *s) "s %p"
 nvme_free_req_queue_wait(void *q) "q %p"
 nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d"

From f93c3add3a773e0e3f6277e5517583c4ad3a43c2 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Mon, 14 Oct 2019 17:39:28 +0200
Subject: [PATCH 44/69] mirror: Do not dereference invalid pointers

mirror_exit_common() may be called twice (if it is called from
mirror_prepare() and fails, it will be called from mirror_abort()
again).

In such a case, many of the pointers in the MirrorBlockJob object will
already be freed.  This can be seen most reliably for s->target, which
is set to NULL (and then dereferenced by blk_bs()).

Cc: qemu-stable@nongnu.org
Fixes: 737efc1eda23b904fbe0e66b37715fb0e5c3e58b
Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: John Snow <jsnow@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-id: 20191014153931.20699-2-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/mirror.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index 454365ce00..bb17cfce31 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -620,11 +620,11 @@ static int mirror_exit_common(Job *job)
 {
     MirrorBlockJob *s = container_of(job, MirrorBlockJob, common.job);
     BlockJob *bjob = &s->common;
-    MirrorBDSOpaque *bs_opaque = s->mirror_top_bs->opaque;
+    MirrorBDSOpaque *bs_opaque;
     AioContext *replace_aio_context = NULL;
-    BlockDriverState *src = s->mirror_top_bs->backing->bs;
-    BlockDriverState *target_bs = blk_bs(s->target);
-    BlockDriverState *mirror_top_bs = s->mirror_top_bs;
+    BlockDriverState *src;
+    BlockDriverState *target_bs;
+    BlockDriverState *mirror_top_bs;
     Error *local_err = NULL;
     bool abort = job->ret < 0;
     int ret = 0;
@@ -634,6 +634,11 @@ static int mirror_exit_common(Job *job)
     }
     s->prepared = true;
 
+    mirror_top_bs = s->mirror_top_bs;
+    bs_opaque = mirror_top_bs->opaque;
+    src = mirror_top_bs->backing->bs;
+    target_bs = blk_bs(s->target);
+
     if (bdrv_chain_contains(src, target_bs)) {
         bdrv_unfreeze_backing_chain(mirror_top_bs, target_bs);
     }

From 5d5b33c08031cfe0e9872bde1f9dcb2215f9b30a Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Fri, 11 Oct 2019 17:27:59 +0200
Subject: [PATCH 45/69] include: Move endof() up from hw/virtio/virtio.h

endof() is a useful macro, we can make use of it outside of virtio.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20191011152814.14791-2-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 hw/block/virtio-blk.c      |  4 ++--
 hw/net/virtio-net.c        | 10 +++++-----
 include/hw/virtio/virtio.h |  7 -------
 include/qemu/compiler.h    |  7 +++++++
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 14e9f85b8b..156416dfd5 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -42,9 +42,9 @@
  */
 static VirtIOFeature feature_sizes[] = {
     {.flags = 1ULL << VIRTIO_BLK_F_DISCARD,
-     .end = virtio_endof(struct virtio_blk_config, discard_sector_alignment)},
+     .end = endof(struct virtio_blk_config, discard_sector_alignment)},
     {.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES,
-     .end = virtio_endof(struct virtio_blk_config, write_zeroes_may_unmap)},
+     .end = endof(struct virtio_blk_config, write_zeroes_may_unmap)},
     {}
 };
 
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 9f11422337..2c4909c5f9 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -90,15 +90,15 @@ static inline __virtio16 *virtio_net_rsc_ext_num_dupacks(
 
 static VirtIOFeature feature_sizes[] = {
     {.flags = 1ULL << VIRTIO_NET_F_MAC,
-     .end = virtio_endof(struct virtio_net_config, mac)},
+     .end = endof(struct virtio_net_config, mac)},
     {.flags = 1ULL << VIRTIO_NET_F_STATUS,
-     .end = virtio_endof(struct virtio_net_config, status)},
+     .end = endof(struct virtio_net_config, status)},
     {.flags = 1ULL << VIRTIO_NET_F_MQ,
-     .end = virtio_endof(struct virtio_net_config, max_virtqueue_pairs)},
+     .end = endof(struct virtio_net_config, max_virtqueue_pairs)},
     {.flags = 1ULL << VIRTIO_NET_F_MTU,
-     .end = virtio_endof(struct virtio_net_config, mtu)},
+     .end = endof(struct virtio_net_config, mtu)},
     {.flags = 1ULL << VIRTIO_NET_F_SPEED_DUPLEX,
-     .end = virtio_endof(struct virtio_net_config, duplex)},
+     .end = endof(struct virtio_net_config, duplex)},
     {}
 };
 
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index 48e8d04ff6..ef083af550 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -37,13 +37,6 @@ static inline hwaddr vring_align(hwaddr addr,
     return QEMU_ALIGN_UP(addr, align);
 }
 
-/*
- * Calculate the number of bytes up to and including the given 'field' of
- * 'container'.
- */
-#define virtio_endof(container, field) \
-    (offsetof(container, field) + sizeof_field(container, field))
-
 typedef struct VirtIOFeature {
     uint64_t flags;
     size_t end;
diff --git a/include/qemu/compiler.h b/include/qemu/compiler.h
index 7b93c73340..85c02c16d3 100644
--- a/include/qemu/compiler.h
+++ b/include/qemu/compiler.h
@@ -60,6 +60,13 @@
 
 #define sizeof_field(type, field) sizeof(((type *)0)->field)
 
+/*
+ * Calculate the number of bytes up to and including the given 'field' of
+ * 'container'.
+ */
+#define endof(container, field) \
+    (offsetof(container, field) + sizeof_field(container, field))
+
 /* Convert from a base type to a parent type, with compile time checking.  */
 #ifdef __GNUC__
 #define DO_UPCAST(type, field, dev) ( __extension__ ( { \

From d8fa8442ad4bdc869afd847ebb8e0fcedff6968c Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Fri, 11 Oct 2019 17:28:00 +0200
Subject: [PATCH 46/69] qcow2: Use endof()

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20191011152814.14791-3-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-snapshot.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c
index d0e7fa9311..752883e5c3 100644
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -92,11 +92,12 @@ int qcow2_read_snapshots(BlockDriverState *bs)
         }
         offset += extra_data_size;
 
-        if (extra_data_size >= 8) {
+        if (extra_data_size >= endof(QCowSnapshotExtraData,
+                                     vm_state_size_large)) {
             sn->vm_state_size = be64_to_cpu(extra.vm_state_size_large);
         }
 
-        if (extra_data_size >= 16) {
+        if (extra_data_size >= endof(QCowSnapshotExtraData, disk_size)) {
             sn->disk_size = be64_to_cpu(extra.disk_size);
         } else {
             sn->disk_size = bs->total_sectors * BDRV_SECTOR_SIZE;
@@ -251,7 +252,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
     }
 
     QEMU_BUILD_BUG_ON(offsetof(QCowHeader, snapshots_offset) !=
-        offsetof(QCowHeader, nb_snapshots) + sizeof(header_data.nb_snapshots));
+                      endof(QCowHeader, nb_snapshots));
 
     header_data.nb_snapshots        = cpu_to_be32(s->nb_snapshots);
     header_data.snapshots_offset    = cpu_to_be64(snapshots_offset);

From ecf6c7c0c1e5e6883f0e245a7b47101b69dc8235 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Fri, 11 Oct 2019 17:28:01 +0200
Subject: [PATCH 47/69] qcow2: Add Error ** to qcow2_read_snapshots()

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20191011152814.14791-4-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-snapshot.c | 7 ++++++-
 block/qcow2.c          | 3 +--
 block/qcow2.h          | 2 +-
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c
index 752883e5c3..80d32e4504 100644
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -43,7 +43,7 @@ void qcow2_free_snapshots(BlockDriverState *bs)
     s->nb_snapshots = 0;
 }
 
-int qcow2_read_snapshots(BlockDriverState *bs)
+int qcow2_read_snapshots(BlockDriverState *bs, Error **errp)
 {
     BDRVQcow2State *s = bs->opaque;
     QCowSnapshotHeader h;
@@ -68,6 +68,7 @@ int qcow2_read_snapshots(BlockDriverState *bs)
         offset = ROUND_UP(offset, 8);
         ret = bdrv_pread(bs->file, offset, &h, sizeof(h));
         if (ret < 0) {
+            error_setg_errno(errp, -ret, "Failed to read snapshot table");
             goto fail;
         }
 
@@ -88,6 +89,7 @@ int qcow2_read_snapshots(BlockDriverState *bs)
         ret = bdrv_pread(bs->file, offset, &extra,
                          MIN(sizeof(extra), extra_data_size));
         if (ret < 0) {
+            error_setg_errno(errp, -ret, "Failed to read snapshot table");
             goto fail;
         }
         offset += extra_data_size;
@@ -107,6 +109,7 @@ int qcow2_read_snapshots(BlockDriverState *bs)
         sn->id_str = g_malloc(id_str_size + 1);
         ret = bdrv_pread(bs->file, offset, sn->id_str, id_str_size);
         if (ret < 0) {
+            error_setg_errno(errp, -ret, "Failed to read snapshot table");
             goto fail;
         }
         offset += id_str_size;
@@ -116,6 +119,7 @@ int qcow2_read_snapshots(BlockDriverState *bs)
         sn->name = g_malloc(name_size + 1);
         ret = bdrv_pread(bs->file, offset, sn->name, name_size);
         if (ret < 0) {
+            error_setg_errno(errp, -ret, "Failed to read snapshot table");
             goto fail;
         }
         offset += name_size;
@@ -123,6 +127,7 @@ int qcow2_read_snapshots(BlockDriverState *bs)
 
         if (offset - s->snapshots_offset > QCOW_MAX_SNAPSHOTS_SIZE) {
             ret = -EFBIG;
+            error_setg(errp, "Snapshot table is too big");
             goto fail;
         }
     }
diff --git a/block/qcow2.c b/block/qcow2.c
index 0bc69e6996..60ed084e5c 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1584,9 +1584,8 @@ static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options,
     s->snapshots_offset = header.snapshots_offset;
     s->nb_snapshots = header.nb_snapshots;
 
-    ret = qcow2_read_snapshots(bs);
+    ret = qcow2_read_snapshots(bs, errp);
     if (ret < 0) {
-        error_setg_errno(errp, -ret, "Could not read snapshots");
         goto fail;
     }
 
diff --git a/block/qcow2.h b/block/qcow2.h
index 5cccd87162..cc0d5eadd0 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -708,7 +708,7 @@ int qcow2_snapshot_load_tmp(BlockDriverState *bs,
                             Error **errp);
 
 void qcow2_free_snapshots(BlockDriverState *bs);
-int qcow2_read_snapshots(BlockDriverState *bs);
+int qcow2_read_snapshots(BlockDriverState *bs, Error **errp);
 
 /* qcow2-cache.c functions */
 Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables,

From fcf9a6b7288154677f6ee9c5f345c4275170e97d Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Fri, 11 Oct 2019 17:28:02 +0200
Subject: [PATCH 48/69] qcow2: Keep unknown extra snapshot data

The qcow2 specification says to ignore unknown extra data fields in
snapshot table entries.  Currently, we discard it whenever we update the
image, which is a bit different from "ignore".

This patch makes the qcow2 driver keep all unknown extra data fields
when updating an image's snapshot table.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20191011152814.14791-5-mreitz@redhat.com
[mreitz: Adjusted comments as proposed by Eric]
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-snapshot.c | 61 +++++++++++++++++++++++++++++++++++-------
 block/qcow2.h          |  7 +++++
 2 files changed, 58 insertions(+), 10 deletions(-)

diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c
index 80d32e4504..120cb7fa09 100644
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -37,6 +37,7 @@ void qcow2_free_snapshots(BlockDriverState *bs)
     for(i = 0; i < s->nb_snapshots; i++) {
         g_free(s->snapshots[i].name);
         g_free(s->snapshots[i].id_str);
+        g_free(s->snapshots[i].unknown_extra_data);
     }
     g_free(s->snapshots);
     s->snapshots = NULL;
@@ -51,7 +52,6 @@ int qcow2_read_snapshots(BlockDriverState *bs, Error **errp)
     QCowSnapshot *sn;
     int i, id_str_size, name_size;
     int64_t offset;
-    uint32_t extra_data_size;
     int ret;
 
     if (!s->nb_snapshots) {
@@ -80,31 +80,53 @@ int qcow2_read_snapshots(BlockDriverState *bs, Error **errp)
         sn->date_sec = be32_to_cpu(h.date_sec);
         sn->date_nsec = be32_to_cpu(h.date_nsec);
         sn->vm_clock_nsec = be64_to_cpu(h.vm_clock_nsec);
-        extra_data_size = be32_to_cpu(h.extra_data_size);
+        sn->extra_data_size = be32_to_cpu(h.extra_data_size);
 
         id_str_size = be16_to_cpu(h.id_str_size);
         name_size = be16_to_cpu(h.name_size);
 
-        /* Read extra data */
+        if (sn->extra_data_size > QCOW_MAX_SNAPSHOT_EXTRA_DATA) {
+            ret = -EFBIG;
+            error_setg(errp, "Too much extra metadata in snapshot table "
+                       "entry %i", i);
+            goto fail;
+        }
+
+        /* Read known extra data */
         ret = bdrv_pread(bs->file, offset, &extra,
-                         MIN(sizeof(extra), extra_data_size));
+                         MIN(sizeof(extra), sn->extra_data_size));
         if (ret < 0) {
             error_setg_errno(errp, -ret, "Failed to read snapshot table");
             goto fail;
         }
-        offset += extra_data_size;
+        offset += MIN(sizeof(extra), sn->extra_data_size);
 
-        if (extra_data_size >= endof(QCowSnapshotExtraData,
-                                     vm_state_size_large)) {
+        if (sn->extra_data_size >= endof(QCowSnapshotExtraData,
+                                         vm_state_size_large)) {
             sn->vm_state_size = be64_to_cpu(extra.vm_state_size_large);
         }
 
-        if (extra_data_size >= endof(QCowSnapshotExtraData, disk_size)) {
+        if (sn->extra_data_size >= endof(QCowSnapshotExtraData, disk_size)) {
             sn->disk_size = be64_to_cpu(extra.disk_size);
         } else {
             sn->disk_size = bs->total_sectors * BDRV_SECTOR_SIZE;
         }
 
+        if (sn->extra_data_size > sizeof(extra)) {
+            /* Store unknown extra data */
+            size_t unknown_extra_data_size =
+                sn->extra_data_size - sizeof(extra);
+
+            sn->unknown_extra_data = g_malloc(unknown_extra_data_size);
+            ret = bdrv_pread(bs->file, offset, sn->unknown_extra_data,
+                             unknown_extra_data_size);
+            if (ret < 0) {
+                error_setg_errno(errp, -ret, "Failed to read snapshot table");
+                goto fail;
+            }
+            offset += unknown_extra_data_size;
+        }
+
         /* Read snapshot ID */
         sn->id_str = g_malloc(id_str_size + 1);
         ret = bdrv_pread(bs->file, offset, sn->id_str, id_str_size);
@@ -162,7 +184,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
         sn = s->snapshots + i;
         offset = ROUND_UP(offset, 8);
         offset += sizeof(h);
-        offset += sizeof(extra);
+        offset += MAX(sizeof(extra), sn->extra_data_size);
         offset += strlen(sn->id_str);
         offset += strlen(sn->name);
 
@@ -209,7 +231,8 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
         h.date_sec = cpu_to_be32(sn->date_sec);
         h.date_nsec = cpu_to_be32(sn->date_nsec);
         h.vm_clock_nsec = cpu_to_be64(sn->vm_clock_nsec);
-        h.extra_data_size = cpu_to_be32(sizeof(extra));
+        h.extra_data_size = cpu_to_be32(MAX(sizeof(extra),
+                                            sn->extra_data_size));
 
         memset(&extra, 0, sizeof(extra));
         extra.vm_state_size_large = cpu_to_be64(sn->vm_state_size);
@@ -234,6 +257,22 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
         }
         offset += sizeof(extra);
 
+        if (sn->extra_data_size > sizeof(extra)) {
+            size_t unknown_extra_data_size =
+                sn->extra_data_size - sizeof(extra);
+
+            /* qcow2_read_snapshots() ensures no unbounded allocation */
+            assert(unknown_extra_data_size <= BDRV_REQUEST_MAX_BYTES);
+            assert(sn->unknown_extra_data);
+
+            ret = bdrv_pwrite(bs->file, offset, sn->unknown_extra_data,
+                              unknown_extra_data_size);
+            if (ret < 0) {
+                goto fail;
+            }
+            offset += unknown_extra_data_size;
+        }
+
         ret = bdrv_pwrite(bs->file, offset, sn->id_str, id_str_size);
         if (ret < 0) {
             goto fail;
@@ -376,6 +415,7 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
     sn->date_sec = sn_info->date_sec;
     sn->date_nsec = sn_info->date_nsec;
     sn->vm_clock_nsec = sn_info->vm_clock_nsec;
+    sn->extra_data_size = sizeof(QCowSnapshotExtraData);
 
     /* Allocate the L1 table of the snapshot and copy the current one there. */
     l1_table_offset = qcow2_alloc_clusters(bs, s->l1_size * sizeof(uint64_t));
@@ -647,6 +687,7 @@ int qcow2_snapshot_delete(BlockDriverState *bs,
      * The snapshot is now unused, clean up. If we fail after this point, we
      * won't recover but just leak clusters.
      */
+    g_free(sn.unknown_extra_data);
     g_free(sn.id_str);
     g_free(sn.name);
 
diff --git a/block/qcow2.h b/block/qcow2.h
index cc0d5eadd0..363681cfdc 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -61,6 +61,9 @@
  * space for snapshot names and IDs */
 #define QCOW_MAX_SNAPSHOTS_SIZE (1024 * QCOW_MAX_SNAPSHOTS)
 
+/* Maximum amount of extra data per snapshot table entry to accept */
+#define QCOW_MAX_SNAPSHOT_EXTRA_DATA 1024
+
 /* Bitmap header extension constraints */
 #define QCOW2_MAX_BITMAPS 65535
 #define QCOW2_MAX_BITMAP_DIRECTORY_SIZE (1024 * QCOW2_MAX_BITMAPS)
@@ -181,6 +184,10 @@ typedef struct QCowSnapshot {
     uint32_t date_sec;
     uint32_t date_nsec;
     uint64_t vm_clock_nsec;
+    /* Size of all extra data, including QCowSnapshotExtraData if available */
+    uint32_t extra_data_size;
+    /* Data beyond QCowSnapshotExtraData, if any */
+    void *unknown_extra_data;
 } QCowSnapshot;
 
 struct Qcow2Cache;

From e0314b56b238ac4f0b74498b7afb016ee85023a6 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Fri, 11 Oct 2019 17:28:03 +0200
Subject: [PATCH 49/69] qcow2: Make qcow2_write_snapshots() public

Updating the snapshot list will be useful when upgrading a v2 image to
v3, so we will need to call this function in qcow2.c.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20191011152814.14791-6-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-snapshot.c | 2 +-
 block/qcow2.h          | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c
index 120cb7fa09..e3bf4c9776 100644
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -164,7 +164,7 @@ fail:
 }
 
 /* add at the end of the file a new list of snapshots */
-static int qcow2_write_snapshots(BlockDriverState *bs)
+int qcow2_write_snapshots(BlockDriverState *bs)
 {
     BDRVQcow2State *s = bs->opaque;
     QCowSnapshot *sn;
diff --git a/block/qcow2.h b/block/qcow2.h
index 363681cfdc..6c7a23fdf6 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -716,6 +716,7 @@ int qcow2_snapshot_load_tmp(BlockDriverState *bs,
 
 void qcow2_free_snapshots(BlockDriverState *bs);
 int qcow2_read_snapshots(BlockDriverState *bs, Error **errp);
+int qcow2_write_snapshots(BlockDriverState *bs);
 
 /* qcow2-cache.c functions */
 Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables,

From 722efb0c7c6ae62a9a34d7b256cb9cc8612bda5b Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Fri, 11 Oct 2019 17:28:04 +0200
Subject: [PATCH 50/69] qcow2: Put qcow2_upgrade() into its own function

This does not make sense right now, but it will make sense once we need
to do more than to just update s->qcow_version.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20191011152814.14791-7-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2.c | 43 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 38 insertions(+), 5 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index 60ed084e5c..af08757055 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -4912,12 +4912,46 @@ static int qcow2_downgrade(BlockDriverState *bs, int target_version,
     return 0;
 }
 
+/*
+ * Upgrades an image's version.  While newer versions encompass all
+ * features of older versions, some things may have to be presented
+ * differently.
+ */
+static int qcow2_upgrade(BlockDriverState *bs, int target_version,
+                         BlockDriverAmendStatusCB *status_cb, void *cb_opaque,
+                         Error **errp)
+{
+    BDRVQcow2State *s = bs->opaque;
+    int current_version = s->qcow_version;
+    int ret;
+
+    /* This is qcow2_upgrade(), not qcow2_downgrade() */
+    assert(target_version > current_version);
+
+    /* There are no other versions (yet) that you can upgrade to */
+    assert(target_version == 3);
+
+    status_cb(bs, 0, 1, cb_opaque);
+
+    s->qcow_version = target_version;
+    ret = qcow2_update_header(bs);
+    if (ret < 0) {
+        s->qcow_version = current_version;
+        error_setg_errno(errp, -ret, "Failed to update the image header");
+        return ret;
+    }
+    status_cb(bs, 1, 1, cb_opaque);
+
+    return 0;
+}
+
 typedef enum Qcow2AmendOperation {
     /* This is the value Qcow2AmendHelperCBInfo::last_operation will be
      * statically initialized to so that the helper CB can discern the first
      * invocation from an operation change */
     QCOW2_NO_OPERATION = 0,
 
+    QCOW2_UPGRADING,
     QCOW2_CHANGING_REFCOUNT_ORDER,
     QCOW2_DOWNGRADING,
 } Qcow2AmendOperation;
@@ -5100,17 +5134,16 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
     helper_cb_info = (Qcow2AmendHelperCBInfo){
         .original_status_cb = status_cb,
         .original_cb_opaque = cb_opaque,
-        .total_operations = (new_version < old_version)
+        .total_operations = (new_version != old_version)
                           + (s->refcount_bits != refcount_bits)
     };
 
     /* Upgrade first (some features may require compat=1.1) */
     if (new_version > old_version) {
-        s->qcow_version = new_version;
-        ret = qcow2_update_header(bs);
+        helper_cb_info.current_operation = QCOW2_UPGRADING;
+        ret = qcow2_upgrade(bs, new_version, &qcow2_amend_helper_cb,
+                            &helper_cb_info, errp);
         if (ret < 0) {
-            s->qcow_version = old_version;
-            error_setg_errno(errp, -ret, "Failed to update the image header");
             return ret;
         }
     }

From 0a85af351d90e1ae9e06b3bb5290ddf2d5d8b950 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Fri, 11 Oct 2019 17:28:05 +0200
Subject: [PATCH 51/69] qcow2: Write v3-compliant snapshot list on upgrade

qcow2 v3 requires every snapshot table entry to have two extra data
fields: The 64-bit VM state size, and the virtual disk size.  Both are
optional for v2 images, so they may not be present.

qcow2_upgrade() therefore should update the snapshot table to ensure all
entries have these extra data fields.

Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1727347
Reported-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20191011152814.14791-8-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2.c | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index af08757055..e3a4c44fb7 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -4922,7 +4922,9 @@ static int qcow2_upgrade(BlockDriverState *bs, int target_version,
                          Error **errp)
 {
     BDRVQcow2State *s = bs->opaque;
+    bool need_snapshot_update;
     int current_version = s->qcow_version;
+    int i;
     int ret;
 
     /* This is qcow2_upgrade(), not qcow2_downgrade() */
@@ -4931,7 +4933,33 @@ static int qcow2_upgrade(BlockDriverState *bs, int target_version,
     /* There are no other versions (yet) that you can upgrade to */
     assert(target_version == 3);
 
-    status_cb(bs, 0, 1, cb_opaque);
+    status_cb(bs, 0, 2, cb_opaque);
+
+    /*
+     * In v2, snapshots do not need to have extra data.  v3 requires
+     * the 64-bit VM state size and the virtual disk size to be
+     * present.
+     * qcow2_write_snapshots() will always write the list in the
+     * v3-compliant format.
+     */
+    need_snapshot_update = false;
+    for (i = 0; i < s->nb_snapshots; i++) {
+        if (s->snapshots[i].extra_data_size <
+            sizeof_field(QCowSnapshotExtraData, vm_state_size_large) +
+            sizeof_field(QCowSnapshotExtraData, disk_size))
+        {
+            need_snapshot_update = true;
+            break;
+        }
+    }
+    if (need_snapshot_update) {
+        ret = qcow2_write_snapshots(bs);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "Failed to update the snapshot table");
+            return ret;
+        }
+    }
+    status_cb(bs, 1, 2, cb_opaque);
 
     s->qcow_version = target_version;
     ret = qcow2_update_header(bs);
@@ -4940,7 +4968,7 @@ static int qcow2_upgrade(BlockDriverState *bs, int target_version,
         error_setg_errno(errp, -ret, "Failed to update the image header");
         return ret;
     }
-    status_cb(bs, 1, 1, cb_opaque);
+    status_cb(bs, 2, 2, cb_opaque);
 
     return 0;
 }

From 8bc584fe035d98b8aca4fcc818617c91df0ed925 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Fri, 11 Oct 2019 17:28:06 +0200
Subject: [PATCH 52/69] qcow2: Separate qcow2_check_read_snapshot_table()

Reading the snapshot table can fail.  That is a problem when we want to
repair the image.

Therefore, stop reading the snapshot table in qcow2_do_open() in check
mode.  Instead, add a new function qcow2_check_read_snapshot_table()
that reads the snapshot table at a later point.  In the future, we want
to handle errors here and fix them.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20191011152814.14791-9-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-snapshot.c | 58 ++++++++++++++++++++++++++++++++
 block/qcow2.c          | 76 ++++++++++++++++++++++++++++++++----------
 block/qcow2.h          |  4 +++
 3 files changed, 120 insertions(+), 18 deletions(-)

diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c
index e3bf4c9776..d667bfd935 100644
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -322,6 +322,64 @@ fail:
     return ret;
 }
 
+int coroutine_fn qcow2_check_read_snapshot_table(BlockDriverState *bs,
+                                                 BdrvCheckResult *result,
+                                                 BdrvCheckMode fix)
+{
+    BDRVQcow2State *s = bs->opaque;
+    Error *local_err = NULL;
+    int ret;
+    struct {
+        uint32_t nb_snapshots;
+        uint64_t snapshots_offset;
+    } QEMU_PACKED snapshot_table_pointer;
+
+    /* qcow2_do_open() discards this information in check mode */
+    ret = bdrv_pread(bs->file, offsetof(QCowHeader, nb_snapshots),
+                     &snapshot_table_pointer, sizeof(snapshot_table_pointer));
+    if (ret < 0) {
+        result->check_errors++;
+        fprintf(stderr, "ERROR failed to read the snapshot table pointer from "
+                "the image header: %s\n", strerror(-ret));
+        return ret;
+    }
+
+    s->snapshots_offset = be64_to_cpu(snapshot_table_pointer.snapshots_offset);
+    s->nb_snapshots = be32_to_cpu(snapshot_table_pointer.nb_snapshots);
+
+    ret = qcow2_validate_table(bs, s->snapshots_offset, s->nb_snapshots,
+                               sizeof(QCowSnapshotHeader),
+                               sizeof(QCowSnapshotHeader) * QCOW_MAX_SNAPSHOTS,
+                               "snapshot table", &local_err);
+    if (ret < 0) {
+        result->check_errors++;
+        error_reportf_err(local_err, "ERROR ");
+
+        /* We did not read the snapshot table, so invalidate this information */
+        s->snapshots_offset = 0;
+        s->nb_snapshots = 0;
+
+        return ret;
+    }
+
+    qemu_co_mutex_unlock(&s->lock);
+    ret = qcow2_read_snapshots(bs, &local_err);
+    qemu_co_mutex_lock(&s->lock);
+    if (ret < 0) {
+        result->check_errors++;
+        error_reportf_err(local_err,
+                          "ERROR failed to read the snapshot table: ");
+
+        /* We did not read the snapshot table, so invalidate this information */
+        s->snapshots_offset = 0;
+        s->nb_snapshots = 0;
+
+        return ret;
+    }
+
+    return 0;
+}
+
 static void find_new_snapshot_id(BlockDriverState *bs,
                                  char *id_str, int id_str_size)
 {
diff --git a/block/qcow2.c b/block/qcow2.c
index e3a4c44fb7..b0026e8a9a 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -570,11 +570,40 @@ int qcow2_mark_consistent(BlockDriverState *bs)
     return 0;
 }
 
+static void qcow2_add_check_result(BdrvCheckResult *out,
+                                   const BdrvCheckResult *src,
+                                   bool set_allocation_info)
+{
+    out->corruptions += src->corruptions;
+    out->leaks += src->leaks;
+    out->check_errors += src->check_errors;
+    out->corruptions_fixed += src->corruptions_fixed;
+    out->leaks_fixed += src->leaks_fixed;
+
+    if (set_allocation_info) {
+        out->image_end_offset = src->image_end_offset;
+        out->bfi = src->bfi;
+    }
+}
+
 static int coroutine_fn qcow2_co_check_locked(BlockDriverState *bs,
                                               BdrvCheckResult *result,
                                               BdrvCheckMode fix)
 {
-    int ret = qcow2_check_refcounts(bs, result, fix);
+    BdrvCheckResult snapshot_res = {};
+    BdrvCheckResult refcount_res = {};
+    int ret;
+
+    memset(result, 0, sizeof(*result));
+
+    ret = qcow2_check_read_snapshot_table(bs, &snapshot_res, fix);
+    qcow2_add_check_result(result, &snapshot_res, false);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = qcow2_check_refcounts(bs, &refcount_res, fix);
+    qcow2_add_check_result(result, &refcount_res, true);
     if (ret < 0) {
         return ret;
     }
@@ -1410,17 +1439,22 @@ static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options,
         goto fail;
     }
 
-    /* The total size in bytes of the snapshot table is checked in
-     * qcow2_read_snapshots() because the size of each snapshot is
-     * variable and we don't know it yet.
-     * Here we only check the offset and number of snapshots. */
-    ret = qcow2_validate_table(bs, header.snapshots_offset,
-                               header.nb_snapshots,
-                               sizeof(QCowSnapshotHeader),
-                               sizeof(QCowSnapshotHeader) * QCOW_MAX_SNAPSHOTS,
-                               "Snapshot table", errp);
-    if (ret < 0) {
-        goto fail;
+    if (!(flags & BDRV_O_CHECK)) {
+        /*
+         * The total size in bytes of the snapshot table is checked in
+         * qcow2_read_snapshots() because the size of each snapshot is
+         * variable and we don't know it yet.
+         * Here we only check the offset and number of snapshots.
+         */
+        ret = qcow2_validate_table(bs, header.snapshots_offset,
+                                   header.nb_snapshots,
+                                   sizeof(QCowSnapshotHeader),
+                                   sizeof(QCowSnapshotHeader) *
+                                       QCOW_MAX_SNAPSHOTS,
+                                   "Snapshot table", errp);
+        if (ret < 0) {
+            goto fail;
+        }
     }
 
     /* read the level 1 table */
@@ -1580,13 +1614,19 @@ static int coroutine_fn qcow2_do_open(BlockDriverState *bs, QDict *options,
         s->image_backing_file = g_strdup(bs->auto_backing_file);
     }
 
-    /* Internal snapshots */
-    s->snapshots_offset = header.snapshots_offset;
-    s->nb_snapshots = header.nb_snapshots;
+    /*
+     * Internal snapshots; skip reading them in check mode, because
+     * we do not need them then, and we do not want to abort because
+     * of a broken table.
+     */
+    if (!(flags & BDRV_O_CHECK)) {
+        s->snapshots_offset = header.snapshots_offset;
+        s->nb_snapshots = header.nb_snapshots;
 
-    ret = qcow2_read_snapshots(bs, errp);
-    if (ret < 0) {
-        goto fail;
+        ret = qcow2_read_snapshots(bs, errp);
+        if (ret < 0) {
+            goto fail;
+        }
     }
 
     /* Clear unknown autoclear feature bits */
diff --git a/block/qcow2.h b/block/qcow2.h
index 6c7a23fdf6..adb5c3bc28 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -718,6 +718,10 @@ void qcow2_free_snapshots(BlockDriverState *bs);
 int qcow2_read_snapshots(BlockDriverState *bs, Error **errp);
 int qcow2_write_snapshots(BlockDriverState *bs);
 
+int coroutine_fn qcow2_check_read_snapshot_table(BlockDriverState *bs,
+                                                 BdrvCheckResult *result,
+                                                 BdrvCheckMode fix);
+
 /* qcow2-cache.c functions */
 Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables,
                                unsigned table_size);

From fe446b5da225b551fc6493890d437fe4a8ba7552 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Fri, 11 Oct 2019 17:28:07 +0200
Subject: [PATCH 53/69] qcow2: Add qcow2_check_fix_snapshot_table()

qcow2_check_read_snapshot_table() can perform consistency checks, but it
cannot fix everything.  Specifically, it cannot allocate new clusters,
because that should wait until the refcount structures are known to be
consistent (i.e., after qcow2_check_refcounts()).  Thus, it cannot call
qcow2_write_snapshots().

Do that in qcow2_check_fix_snapshot_table(), which is called after
qcow2_check_refcounts().

Currently, there is nothing that would set result->corruptions, so this
is a no-op.  A follow-up patch will change that.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20191011152814.14791-10-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-snapshot.c | 25 +++++++++++++++++++++++++
 block/qcow2.c          |  9 ++++++++-
 block/qcow2.h          |  3 +++
 3 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c
index d667bfd935..b526a8f819 100644
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -380,6 +380,31 @@ int coroutine_fn qcow2_check_read_snapshot_table(BlockDriverState *bs,
     return 0;
 }
 
+int coroutine_fn qcow2_check_fix_snapshot_table(BlockDriverState *bs,
+                                                BdrvCheckResult *result,
+                                                BdrvCheckMode fix)
+{
+    BDRVQcow2State *s = bs->opaque;
+    int ret;
+
+    if (result->corruptions && (fix & BDRV_FIX_ERRORS)) {
+        qemu_co_mutex_unlock(&s->lock);
+        ret = qcow2_write_snapshots(bs);
+        qemu_co_mutex_lock(&s->lock);
+        if (ret < 0) {
+            result->check_errors++;
+            fprintf(stderr, "ERROR failed to update snapshot table: %s\n",
+                    strerror(-ret));
+            return ret;
+        }
+
+        result->corruptions_fixed += result->corruptions;
+        result->corruptions = 0;
+    }
+
+    return 0;
+}
+
 static void find_new_snapshot_id(BlockDriverState *bs,
                                  char *id_str, int id_str_size)
 {
diff --git a/block/qcow2.c b/block/qcow2.c
index b0026e8a9a..8d4f38ae74 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -597,13 +597,20 @@ static int coroutine_fn qcow2_co_check_locked(BlockDriverState *bs,
     memset(result, 0, sizeof(*result));
 
     ret = qcow2_check_read_snapshot_table(bs, &snapshot_res, fix);
-    qcow2_add_check_result(result, &snapshot_res, false);
     if (ret < 0) {
+        qcow2_add_check_result(result, &snapshot_res, false);
         return ret;
     }
 
     ret = qcow2_check_refcounts(bs, &refcount_res, fix);
     qcow2_add_check_result(result, &refcount_res, true);
+    if (ret < 0) {
+        qcow2_add_check_result(result, &snapshot_res, false);
+        return ret;
+    }
+
+    ret = qcow2_check_fix_snapshot_table(bs, &snapshot_res, fix);
+    qcow2_add_check_result(result, &snapshot_res, false);
     if (ret < 0) {
         return ret;
     }
diff --git a/block/qcow2.h b/block/qcow2.h
index adb5c3bc28..601c2e4c82 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -721,6 +721,9 @@ int qcow2_write_snapshots(BlockDriverState *bs);
 int coroutine_fn qcow2_check_read_snapshot_table(BlockDriverState *bs,
                                                  BdrvCheckResult *result,
                                                  BdrvCheckMode fix);
+int coroutine_fn qcow2_check_fix_snapshot_table(BlockDriverState *bs,
+                                                BdrvCheckResult *result,
+                                                BdrvCheckMode fix);
 
 /* qcow2-cache.c functions */
 Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables,

From f91f1f159bcaa81a70e860079a73ae7d1220dc0c Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Fri, 11 Oct 2019 17:28:08 +0200
Subject: [PATCH 54/69] qcow2: Fix broken snapshot table entries

The only case where we currently reject snapshot table entries is when
they have too much extra data.  Fix them with qemu-img check -r all by
counting it as a corruption, reducing their extra_data_size, and then
letting qcow2_check_fix_snapshot_table() do the rest.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20191011152814.14791-11-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-snapshot.c | 67 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 56 insertions(+), 11 deletions(-)

diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c
index b526a8f819..53dc1635ec 100644
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -44,7 +44,23 @@ void qcow2_free_snapshots(BlockDriverState *bs)
     s->nb_snapshots = 0;
 }
 
-int qcow2_read_snapshots(BlockDriverState *bs, Error **errp)
+/*
+ * If @repair is true, try to repair a broken snapshot table instead
+ * of just returning an error:
+ *
+ * - If there were snapshots with too much extra metadata, increment
+ *   *extra_data_dropped for each.
+ *   This requires the caller to eventually rewrite the whole snapshot
+ *   table, which requires cluster allocation.  Therefore, this should
+ *   be done only after qcow2_check_refcounts() made sure the refcount
+ *   structures are valid.
+ *   (In the meantime, the image is still valid because
+ *   qcow2_check_refcounts() does not do anything with snapshots'
+ *   extra data.)
+ */
+static int qcow2_do_read_snapshots(BlockDriverState *bs, bool repair,
+                                   int *extra_data_dropped,
+                                   Error **errp)
 {
     BDRVQcow2State *s = bs->opaque;
     QCowSnapshotHeader h;
@@ -64,6 +80,8 @@ int qcow2_read_snapshots(BlockDriverState *bs, Error **errp)
     s->snapshots = g_new0(QCowSnapshot, s->nb_snapshots);
 
     for(i = 0; i < s->nb_snapshots; i++) {
+        bool truncate_unknown_extra_data = false;
+
         /* Read statically sized part of the snapshot header */
         offset = ROUND_UP(offset, 8);
         ret = bdrv_pread(bs->file, offset, &h, sizeof(h));
@@ -86,10 +104,21 @@ int qcow2_read_snapshots(BlockDriverState *bs, Error **errp)
         name_size = be16_to_cpu(h.name_size);
 
         if (sn->extra_data_size > QCOW_MAX_SNAPSHOT_EXTRA_DATA) {
-            ret = -EFBIG;
-            error_setg(errp, "Too much extra metadata in snapshot table "
-                       "entry %i", i);
-            goto fail;
+            if (!repair) {
+                ret = -EFBIG;
+                error_setg(errp, "Too much extra metadata in snapshot table "
+                           "entry %i", i);
+                error_append_hint(errp, "You can force-remove this extra "
+                                  "metadata with qemu-img check -r all\n");
+                goto fail;
+            }
+
+            fprintf(stderr, "Discarding too much extra metadata in snapshot "
+                    "table entry %i (%" PRIu32 " > %u)\n",
+                    i, sn->extra_data_size, QCOW_MAX_SNAPSHOT_EXTRA_DATA);
+
+            (*extra_data_dropped)++;
+            truncate_unknown_extra_data = true;
         }
 
         /* Read known extra data */
@@ -113,18 +142,26 @@ int qcow2_read_snapshots(BlockDriverState *bs, Error **errp)
         }
 
         if (sn->extra_data_size > sizeof(extra)) {
-            /* Store unknown extra data */
-            size_t unknown_extra_data_size =
-                sn->extra_data_size - sizeof(extra);
+            uint64_t extra_data_end;
+            size_t unknown_extra_data_size;
 
+            extra_data_end = offset + sn->extra_data_size - sizeof(extra);
+
+            if (truncate_unknown_extra_data) {
+                sn->extra_data_size = QCOW_MAX_SNAPSHOT_EXTRA_DATA;
+            }
+
+            /* Store unknown extra data */
+            unknown_extra_data_size = sn->extra_data_size - sizeof(extra);
             sn->unknown_extra_data = g_malloc(unknown_extra_data_size);
             ret = bdrv_pread(bs->file, offset, sn->unknown_extra_data,
                              unknown_extra_data_size);
             if (ret < 0) {
-                error_setg_errno(errp, -ret, "Failed to read snapshot table");
+                error_setg_errno(errp, -ret,
+                                 "Failed to read snapshot table");
                 goto fail;
             }
-            offset += unknown_extra_data_size;
+            offset = extra_data_end;
         }
 
         /* Read snapshot ID */
@@ -163,6 +200,11 @@ fail:
     return ret;
 }
 
+int qcow2_read_snapshots(BlockDriverState *bs, Error **errp)
+{
+    return qcow2_do_read_snapshots(bs, false, NULL, errp);
+}
+
 /* add at the end of the file a new list of snapshots */
 int qcow2_write_snapshots(BlockDriverState *bs)
 {
@@ -328,6 +370,7 @@ int coroutine_fn qcow2_check_read_snapshot_table(BlockDriverState *bs,
 {
     BDRVQcow2State *s = bs->opaque;
     Error *local_err = NULL;
+    int extra_data_dropped = 0;
     int ret;
     struct {
         uint32_t nb_snapshots;
@@ -363,7 +406,8 @@ int coroutine_fn qcow2_check_read_snapshot_table(BlockDriverState *bs,
     }
 
     qemu_co_mutex_unlock(&s->lock);
-    ret = qcow2_read_snapshots(bs, &local_err);
+    ret = qcow2_do_read_snapshots(bs, fix & BDRV_FIX_ERRORS,
+                                  &extra_data_dropped, &local_err);
     qemu_co_mutex_lock(&s->lock);
     if (ret < 0) {
         result->check_errors++;
@@ -376,6 +420,7 @@ int coroutine_fn qcow2_check_read_snapshot_table(BlockDriverState *bs,
 
         return ret;
     }
+    result->corruptions += extra_data_dropped;
 
     return 0;
 }

From 624143355cb6e4149ec27b9b00088aeb958da31d Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Fri, 11 Oct 2019 17:28:09 +0200
Subject: [PATCH 55/69] qcow2: Keep track of the snapshot table length

When repairing the snapshot table, we truncate entries that have too
much extra data.  This frees up space that we do not have to count
towards the snapshot table size.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20191011152814.14791-12-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-snapshot.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c
index 53dc1635ec..582eb3386a 100644
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -68,6 +68,7 @@ static int qcow2_do_read_snapshots(BlockDriverState *bs, bool repair,
     QCowSnapshot *sn;
     int i, id_str_size, name_size;
     int64_t offset;
+    uint64_t table_length = 0;
     int ret;
 
     if (!s->nb_snapshots) {
@@ -82,6 +83,8 @@ static int qcow2_do_read_snapshots(BlockDriverState *bs, bool repair,
     for(i = 0; i < s->nb_snapshots; i++) {
         bool truncate_unknown_extra_data = false;
 
+        table_length = ROUND_UP(table_length, 8);
+
         /* Read statically sized part of the snapshot header */
         offset = ROUND_UP(offset, 8);
         ret = bdrv_pread(bs->file, offset, &h, sizeof(h));
@@ -184,7 +187,16 @@ static int qcow2_do_read_snapshots(BlockDriverState *bs, bool repair,
         offset += name_size;
         sn->name[name_size] = '\0';
 
-        if (offset - s->snapshots_offset > QCOW_MAX_SNAPSHOTS_SIZE) {
+        /* Note that the extra data may have been truncated */
+        table_length += sizeof(h) + sn->extra_data_size + id_str_size +
+                        name_size;
+        if (!repair) {
+            assert(table_length == offset - s->snapshots_offset);
+        }
+
+        if (table_length > QCOW_MAX_SNAPSHOTS_SIZE ||
+            offset - s->snapshots_offset > INT_MAX)
+        {
             ret = -EFBIG;
             error_setg(errp, "Snapshot table is too big");
             goto fail;

From 099febf3ac37e8d615e90066e515dd9b1d9bba52 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Fri, 11 Oct 2019 17:28:10 +0200
Subject: [PATCH 56/69] qcow2: Fix overly long snapshot tables

We currently refuse to open qcow2 images with overly long snapshot
tables.  This patch makes qemu-img check -r all drop all offending
entries past what we deem acceptable.

The user cannot choose which snapshots are removed.  This is fine
because we have chosen the maximum snapshot table size to be so large
(64 MB) that it cannot be reasonably reached.  If the snapshot table
exceeds this size, the image has probably been corrupted in some way; in
this case, it is most important to just make the image usable such that
the user can copy off at least the active layer.
(Also note that the snapshots will be removed only with "-r all", so a
plain "check" or "check -r leaks" will not delete any data.)

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20191011152814.14791-13-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-snapshot.c | 88 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 78 insertions(+), 10 deletions(-)

diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c
index 582eb3386a..366d9f574c 100644
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -29,15 +29,24 @@
 #include "qemu/error-report.h"
 #include "qemu/cutils.h"
 
+static void qcow2_free_single_snapshot(BlockDriverState *bs, int i)
+{
+    BDRVQcow2State *s = bs->opaque;
+
+    assert(i >= 0 && i < s->nb_snapshots);
+    g_free(s->snapshots[i].name);
+    g_free(s->snapshots[i].id_str);
+    g_free(s->snapshots[i].unknown_extra_data);
+    memset(&s->snapshots[i], 0, sizeof(s->snapshots[i]));
+}
+
 void qcow2_free_snapshots(BlockDriverState *bs)
 {
     BDRVQcow2State *s = bs->opaque;
     int i;
 
     for(i = 0; i < s->nb_snapshots; i++) {
-        g_free(s->snapshots[i].name);
-        g_free(s->snapshots[i].id_str);
-        g_free(s->snapshots[i].unknown_extra_data);
+        qcow2_free_single_snapshot(bs, i);
     }
     g_free(s->snapshots);
     s->snapshots = NULL;
@@ -48,6 +57,14 @@ void qcow2_free_snapshots(BlockDriverState *bs)
  * If @repair is true, try to repair a broken snapshot table instead
  * of just returning an error:
  *
+ * - If the snapshot table was too long, set *nb_clusters_reduced to
+ *   the number of snapshots removed off the end.
+ *   The caller will update the on-disk nb_snapshots accordingly;
+ *   this leaks clusters, but is safe.
+ *   (The on-disk information must be updated before
+ *   qcow2_check_refcounts(), because that function relies on
+ *   s->nb_snapshots to reflect the on-disk value.)
+ *
  * - If there were snapshots with too much extra metadata, increment
  *   *extra_data_dropped for each.
  *   This requires the caller to eventually rewrite the whole snapshot
@@ -59,6 +76,7 @@ void qcow2_free_snapshots(BlockDriverState *bs)
  *   extra data.)
  */
 static int qcow2_do_read_snapshots(BlockDriverState *bs, bool repair,
+                                   int *nb_clusters_reduced,
                                    int *extra_data_dropped,
                                    Error **errp)
 {
@@ -67,7 +85,7 @@ static int qcow2_do_read_snapshots(BlockDriverState *bs, bool repair,
     QCowSnapshotExtraData extra;
     QCowSnapshot *sn;
     int i, id_str_size, name_size;
-    int64_t offset;
+    int64_t offset, pre_sn_offset;
     uint64_t table_length = 0;
     int ret;
 
@@ -83,6 +101,7 @@ static int qcow2_do_read_snapshots(BlockDriverState *bs, bool repair,
     for(i = 0; i < s->nb_snapshots; i++) {
         bool truncate_unknown_extra_data = false;
 
+        pre_sn_offset = offset;
         table_length = ROUND_UP(table_length, 8);
 
         /* Read statically sized part of the snapshot header */
@@ -197,9 +216,31 @@ static int qcow2_do_read_snapshots(BlockDriverState *bs, bool repair,
         if (table_length > QCOW_MAX_SNAPSHOTS_SIZE ||
             offset - s->snapshots_offset > INT_MAX)
         {
-            ret = -EFBIG;
-            error_setg(errp, "Snapshot table is too big");
-            goto fail;
+            if (!repair) {
+                ret = -EFBIG;
+                error_setg(errp, "Snapshot table is too big");
+                error_append_hint(errp, "You can force-remove all %u "
+                                  "overhanging snapshots with qemu-img check "
+                                  "-r all\n", s->nb_snapshots - i);
+                goto fail;
+            }
+
+            fprintf(stderr, "Discarding %u overhanging snapshots (snapshot "
+                    "table is too big)\n", s->nb_snapshots - i);
+
+            *nb_clusters_reduced += (s->nb_snapshots - i);
+
+            /* Discard current snapshot also */
+            qcow2_free_single_snapshot(bs, i);
+
+            /*
+             * This leaks all the rest of the snapshot table and the
+             * snapshots' clusters, but we run in check -r all mode,
+             * so qcow2_check_refcounts() will take care of it.
+             */
+            s->nb_snapshots = i;
+            offset = pre_sn_offset;
+            break;
         }
     }
 
@@ -214,7 +255,7 @@ fail:
 
 int qcow2_read_snapshots(BlockDriverState *bs, Error **errp)
 {
-    return qcow2_do_read_snapshots(bs, false, NULL, errp);
+    return qcow2_do_read_snapshots(bs, false, NULL, NULL, errp);
 }
 
 /* add at the end of the file a new list of snapshots */
@@ -382,6 +423,7 @@ int coroutine_fn qcow2_check_read_snapshot_table(BlockDriverState *bs,
 {
     BDRVQcow2State *s = bs->opaque;
     Error *local_err = NULL;
+    int nb_clusters_reduced = 0;
     int extra_data_dropped = 0;
     int ret;
     struct {
@@ -419,7 +461,8 @@ int coroutine_fn qcow2_check_read_snapshot_table(BlockDriverState *bs,
 
     qemu_co_mutex_unlock(&s->lock);
     ret = qcow2_do_read_snapshots(bs, fix & BDRV_FIX_ERRORS,
-                                  &extra_data_dropped, &local_err);
+                                  &nb_clusters_reduced, &extra_data_dropped,
+                                  &local_err);
     qemu_co_mutex_lock(&s->lock);
     if (ret < 0) {
         result->check_errors++;
@@ -432,7 +475,32 @@ int coroutine_fn qcow2_check_read_snapshot_table(BlockDriverState *bs,
 
         return ret;
     }
-    result->corruptions += extra_data_dropped;
+    result->corruptions += nb_clusters_reduced + extra_data_dropped;
+
+    if (nb_clusters_reduced) {
+        /*
+         * Update image header now, because:
+         * (1) qcow2_check_refcounts() relies on s->nb_snapshots to be
+         *     the same as what the image header says,
+         * (2) this leaks clusters, but qcow2_check_refcounts() will
+         *     fix that.
+         */
+        assert(fix & BDRV_FIX_ERRORS);
+
+        snapshot_table_pointer.nb_snapshots = cpu_to_be32(s->nb_snapshots);
+        ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, nb_snapshots),
+                               &snapshot_table_pointer.nb_snapshots,
+                               sizeof(snapshot_table_pointer.nb_snapshots));
+        if (ret < 0) {
+            result->check_errors++;
+            fprintf(stderr, "ERROR failed to update the snapshot count in the "
+                    "image header: %s\n", strerror(-ret));
+            return ret;
+        }
+
+        result->corruptions_fixed += nb_clusters_reduced;
+        result->corruptions -= nb_clusters_reduced;
+    }
 
     return 0;
 }

From d2b1d1ec734a847ba5d4fba4341a851ec1741d0a Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Fri, 11 Oct 2019 17:28:11 +0200
Subject: [PATCH 57/69] qcow2: Repair snapshot table with too many entries

The user cannot choose which snapshots are removed.  This is fine
because we have chosen the maximum snapshot table size to be so large
(65536 entries) that it cannot be reasonably reached.  If the snapshot
table exceeds this size, the image has probably been corrupted in some
way; in this case, it is most important to just make the image usable
such that the user can copy off at least the active layer.
(Also note that the snapshots will be removed only with "-r all", so a
plain "check" or "check -r leaks" will not delete any data.)

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20191011152814.14791-14-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-snapshot.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c
index 366d9f574c..dac8a778e4 100644
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -444,6 +444,14 @@ int coroutine_fn qcow2_check_read_snapshot_table(BlockDriverState *bs,
     s->snapshots_offset = be64_to_cpu(snapshot_table_pointer.snapshots_offset);
     s->nb_snapshots = be32_to_cpu(snapshot_table_pointer.nb_snapshots);
 
+    if (s->nb_snapshots > QCOW_MAX_SNAPSHOTS && (fix & BDRV_FIX_ERRORS)) {
+        fprintf(stderr, "Discarding %u overhanging snapshots\n",
+                s->nb_snapshots - QCOW_MAX_SNAPSHOTS);
+
+        nb_clusters_reduced += s->nb_snapshots - QCOW_MAX_SNAPSHOTS;
+        s->nb_snapshots = QCOW_MAX_SNAPSHOTS;
+    }
+
     ret = qcow2_validate_table(bs, s->snapshots_offset, s->nb_snapshots,
                                sizeof(QCowSnapshotHeader),
                                sizeof(QCowSnapshotHeader) * QCOW_MAX_SNAPSHOTS,
@@ -452,6 +460,12 @@ int coroutine_fn qcow2_check_read_snapshot_table(BlockDriverState *bs,
         result->check_errors++;
         error_reportf_err(local_err, "ERROR ");
 
+        if (s->nb_snapshots > QCOW_MAX_SNAPSHOTS) {
+            fprintf(stderr, "You can force-remove all %u overhanging snapshots "
+                    "with qemu-img check -r all\n",
+                    s->nb_snapshots - QCOW_MAX_SNAPSHOTS);
+        }
+
         /* We did not read the snapshot table, so invalidate this information */
         s->snapshots_offset = 0;
         s->nb_snapshots = 0;

From e40e6e88f6c0ac73ba2bf680d4f0e46378c6d0de Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Fri, 11 Oct 2019 17:28:12 +0200
Subject: [PATCH 58/69] qcow2: Fix v3 snapshot table entry compliancy

qcow2 v3 images require every snapshot table entry to have at least 16
bytes of extra data.  If they do not, let qemu-img check -r all fix it.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20191011152814.14791-15-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-snapshot.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c
index dac8a778e4..5ab64da1ec 100644
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -516,6 +516,24 @@ int coroutine_fn qcow2_check_read_snapshot_table(BlockDriverState *bs,
         result->corruptions -= nb_clusters_reduced;
     }
 
+    /*
+     * All of v3 images' snapshot table entries need to have at least
+     * 16 bytes of extra data.
+     */
+    if (s->qcow_version >= 3) {
+        int i;
+        for (i = 0; i < s->nb_snapshots; i++) {
+            if (s->snapshots[i].extra_data_size <
+                sizeof_field(QCowSnapshotExtraData, vm_state_size_large) +
+                sizeof_field(QCowSnapshotExtraData, disk_size))
+            {
+                result->corruptions++;
+                fprintf(stderr, "%s snapshot table entry %i is incomplete\n",
+                        fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
+            }
+        }
+    }
+
     return 0;
 }
 

From fc8ba423ca6b37bee56ec9dc339b44043c39553d Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Fri, 11 Oct 2019 17:28:13 +0200
Subject: [PATCH 59/69] iotests: Add peek_file* functions

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20191011152814.14791-16-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/common.rc | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc
index 12b4751848..fa7bae2422 100644
--- a/tests/qemu-iotests/common.rc
+++ b/tests/qemu-iotests/common.rc
@@ -53,6 +53,26 @@ poke_file()
     printf "$3" | dd "of=$1" bs=1 "seek=$2" conv=notrunc &>/dev/null
 }
 
+# peek_file_le 'test.img' 512 2 => 65534
+peek_file_le()
+{
+    # Wrap in echo $() to strip spaces
+    echo $(od -j"$2" -N"$3" --endian=little -An -vtu"$3" "$1")
+}
+
+# peek_file_be 'test.img' 512 2 => 65279
+peek_file_be()
+{
+    # Wrap in echo $() to strip spaces
+    echo $(od -j"$2" -N"$3" --endian=big -An -vtu"$3" "$1")
+}
+
+# peek_file_raw 'test.img' 512 2 => '\xff\xfe'
+peek_file_raw()
+{
+    dd if="$1" bs=1 skip="$2" count="$3" status=none
+}
+
 
 if ! . ./common.config
     then

From f53b25dfd57ef5a65ac82194478b432ba88c9de0 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Fri, 11 Oct 2019 17:28:14 +0200
Subject: [PATCH 60/69] iotests: Test qcow2's snapshot table handling

Add a test how our qcow2 driver handles extra data in snapshot table
entries, and how it repairs overly long snapshot tables.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-id: 20191011152814.14791-17-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/261     | 523 +++++++++++++++++++++++++++++++++++++
 tests/qemu-iotests/261.out | 346 ++++++++++++++++++++++++
 tests/qemu-iotests/group   |   1 +
 3 files changed, 870 insertions(+)
 create mode 100755 tests/qemu-iotests/261
 create mode 100644 tests/qemu-iotests/261.out

diff --git a/tests/qemu-iotests/261 b/tests/qemu-iotests/261
new file mode 100755
index 0000000000..fb96bcfbe2
--- /dev/null
+++ b/tests/qemu-iotests/261
@@ -0,0 +1,523 @@
+#!/usr/bin/env bash
+#
+# Test case for qcow2's handling of extra data in snapshot table entries
+# (and more generally, how certain cases of broken snapshot tables are
+# handled)
+#
+# Copyright (C) 2019 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+# creator
+owner=mreitz@redhat.com
+
+seq=$(basename $0)
+echo "QA output created by $seq"
+
+status=1	# failure is the default!
+
+_cleanup()
+{
+    _cleanup_test_img
+    rm -f "$TEST_IMG".v{2,3}.orig
+    rm -f "$TEST_DIR"/sn{0,1,2}{,-pre,-extra,-post}
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+
+# This tests qocw2-specific low-level functionality
+_supported_fmt qcow2
+_supported_proto file
+_supported_os Linux
+# (1) We create a v2 image that supports nothing but refcount_bits=16
+# (2) We do some refcount management on our own which expects
+#     refcount_bits=16
+_unsupported_imgopts 'refcount_bits=\([^1]\|.\([^6]\|$\)\)'
+
+# Parameters:
+#   $1: image filename
+#   $2: snapshot table entry offset in the image
+snapshot_table_entry_size()
+{
+    id_len=$(peek_file_be "$1" $(($2 + 12)) 2)
+    name_len=$(peek_file_be "$1" $(($2 + 14)) 2)
+    extra_len=$(peek_file_be "$1" $(($2 + 36)) 4)
+
+    full_len=$((40 + extra_len + id_len + name_len))
+    echo $(((full_len + 7) / 8 * 8))
+}
+
+# Parameter:
+#   $1: image filename
+print_snapshot_table()
+{
+    nb_entries=$(peek_file_be "$1" 60 4)
+    offset=$(peek_file_be "$1" 64 8)
+
+    echo "Snapshots in $1:" | _filter_testdir | _filter_imgfmt
+
+    for ((i = 0; i < nb_entries; i++)); do
+        id_len=$(peek_file_be "$1" $((offset + 12)) 2)
+        name_len=$(peek_file_be "$1" $((offset + 14)) 2)
+        extra_len=$(peek_file_be "$1" $((offset + 36)) 4)
+
+        extra_ofs=$((offset + 40))
+        id_ofs=$((extra_ofs + extra_len))
+        name_ofs=$((id_ofs + id_len))
+
+        echo "  [$i]"
+        echo "    ID: $(peek_file_raw "$1" $id_ofs $id_len)"
+        echo "    Name: $(peek_file_raw "$1" $name_ofs $name_len)"
+        echo "    Extra data size: $extra_len"
+        if [ $extra_len -ge 8 ]; then
+            echo "    VM state size: $(peek_file_be "$1" $extra_ofs 8)"
+        fi
+        if [ $extra_len -ge 16 ]; then
+            echo "    Disk size: $(peek_file_be "$1" $((extra_ofs + 8)) 8)"
+        fi
+        if [ $extra_len -gt 16 ]; then
+            echo '    Unknown extra data:' \
+                "$(peek_file_raw "$1" $((extra_ofs + 16)) $((extra_len - 16)) \
+                   | tr -d '\0')"
+        fi
+
+        offset=$((offset + $(snapshot_table_entry_size "$1" $offset)))
+    done
+}
+
+# Mark clusters as allocated; works only in refblock 0 (i.e. before
+# cluster #32768).
+# Parameters:
+#   $1: Start offset of what to allocate
+#   $2: End offset (exclusive)
+refblock0_allocate()
+{
+    reftable_ofs=$(peek_file_be "$TEST_IMG" 48 8)
+    refblock_ofs=$(peek_file_be "$TEST_IMG" $reftable_ofs 8)
+
+    cluster=$(($1 / 65536))
+    ecluster=$((($2 + 65535) / 65536))
+
+    while [ $cluster -lt $ecluster ]; do
+        if [ $cluster -ge 32768 ]; then
+            echo "*** Abort: Cluster $cluster exceeds refblock 0 ***"
+            exit 1
+        fi
+        poke_file "$TEST_IMG" $((refblock_ofs + cluster * 2)) '\x00\x01'
+        cluster=$((cluster + 1))
+    done
+}
+
+
+echo
+echo '=== Create v2 template ==='
+echo
+
+# Create v2 image with a snapshot table with three entries:
+# [0]: No extra data (valid with v2, not valid with v3)
+# [1]: Has extra data unknown to qemu
+# [2]: Has the 64-bit VM state size, but not the disk size (again,
+#      valid with v2, not valid with v3)
+
+TEST_IMG="$TEST_IMG.v2.orig" IMGOPTS='compat=0.10' _make_test_img 64M
+$QEMU_IMG snapshot -c sn0 "$TEST_IMG.v2.orig"
+$QEMU_IMG snapshot -c sn1 "$TEST_IMG.v2.orig"
+$QEMU_IMG snapshot -c sn2 "$TEST_IMG.v2.orig"
+
+# Copy out all existing snapshot table entries
+sn_table_ofs=$(peek_file_be "$TEST_IMG.v2.orig" 64 8)
+
+# ofs: Snapshot table entry offset
+# eds: Extra data size
+# ids: Name + ID size
+# len: Total entry length
+sn0_ofs=$sn_table_ofs
+sn0_eds=$(peek_file_be "$TEST_IMG.v2.orig" $((sn0_ofs + 36)) 4)
+sn0_ids=$(($(peek_file_be "$TEST_IMG.v2.orig" $((sn0_ofs + 12)) 2) +
+           $(peek_file_be "$TEST_IMG.v2.orig" $((sn0_ofs + 14)) 2)))
+sn0_len=$(snapshot_table_entry_size "$TEST_IMG.v2.orig" $sn0_ofs)
+sn1_ofs=$((sn0_ofs + sn0_len))
+sn1_eds=$(peek_file_be "$TEST_IMG.v2.orig" $((sn1_ofs + 36)) 4)
+sn1_ids=$(($(peek_file_be "$TEST_IMG.v2.orig" $((sn1_ofs + 12)) 2) +
+           $(peek_file_be "$TEST_IMG.v2.orig" $((sn1_ofs + 14)) 2)))
+sn1_len=$(snapshot_table_entry_size "$TEST_IMG.v2.orig" $sn1_ofs)
+sn2_ofs=$((sn1_ofs + sn1_len))
+sn2_eds=$(peek_file_be "$TEST_IMG.v2.orig" $((sn2_ofs + 36)) 4)
+sn2_ids=$(($(peek_file_be "$TEST_IMG.v2.orig" $((sn2_ofs + 12)) 2) +
+           $(peek_file_be "$TEST_IMG.v2.orig" $((sn2_ofs + 14)) 2)))
+sn2_len=$(snapshot_table_entry_size "$TEST_IMG.v2.orig" $sn2_ofs)
+
+# Data before extra data
+dd if="$TEST_IMG.v2.orig" of="$TEST_DIR/sn0-pre" bs=1 skip=$sn0_ofs count=40 \
+    &> /dev/null
+dd if="$TEST_IMG.v2.orig" of="$TEST_DIR/sn1-pre" bs=1 skip=$sn1_ofs count=40 \
+    &> /dev/null
+dd if="$TEST_IMG.v2.orig" of="$TEST_DIR/sn2-pre" bs=1 skip=$sn2_ofs count=40 \
+    &> /dev/null
+
+# Extra data
+dd if="$TEST_IMG.v2.orig" of="$TEST_DIR/sn0-extra" bs=1 \
+    skip=$((sn0_ofs + 40)) count=$sn0_eds &> /dev/null
+dd if="$TEST_IMG.v2.orig" of="$TEST_DIR/sn1-extra" bs=1 \
+    skip=$((sn1_ofs + 40)) count=$sn1_eds &> /dev/null
+dd if="$TEST_IMG.v2.orig" of="$TEST_DIR/sn2-extra" bs=1 \
+    skip=$((sn2_ofs + 40)) count=$sn2_eds &> /dev/null
+
+# Data after extra data
+dd if="$TEST_IMG.v2.orig" of="$TEST_DIR/sn0-post" bs=1 \
+    skip=$((sn0_ofs + 40 + sn0_eds)) count=$sn0_ids \
+    &> /dev/null
+dd if="$TEST_IMG.v2.orig" of="$TEST_DIR/sn1-post" bs=1 \
+    skip=$((sn1_ofs + 40 + sn1_eds)) count=$sn1_ids \
+    &> /dev/null
+dd if="$TEST_IMG.v2.orig" of="$TEST_DIR/sn2-post" bs=1 \
+    skip=$((sn2_ofs + 40 + sn2_eds)) count=$sn2_ids \
+    &> /dev/null
+
+# Amend them, one by one
+# Set sn0's extra data size to 0
+poke_file "$TEST_DIR/sn0-pre" 36 '\x00\x00\x00\x00'
+truncate -s 0 "$TEST_DIR/sn0-extra"
+# Grow sn0-post to pad
+truncate -s $(($(snapshot_table_entry_size "$TEST_DIR/sn0-pre") - 40)) \
+    "$TEST_DIR/sn0-post"
+
+# Set sn1's extra data size to 42
+poke_file "$TEST_DIR/sn1-pre" 36 '\x00\x00\x00\x2a'
+truncate -s 42 "$TEST_DIR/sn1-extra"
+poke_file "$TEST_DIR/sn1-extra" 16 'very important data'
+# Grow sn1-post to pad
+truncate -s $(($(snapshot_table_entry_size "$TEST_DIR/sn1-pre") - 82)) \
+    "$TEST_DIR/sn1-post"
+
+# Set sn2's extra data size to 8
+poke_file "$TEST_DIR/sn2-pre" 36 '\x00\x00\x00\x08'
+truncate -s 8 "$TEST_DIR/sn2-extra"
+# Grow sn2-post to pad
+truncate -s $(($(snapshot_table_entry_size "$TEST_DIR/sn2-pre") - 48)) \
+    "$TEST_DIR/sn2-post"
+
+# Construct snapshot table
+cat "$TEST_DIR"/sn0-{pre,extra,post} \
+    "$TEST_DIR"/sn1-{pre,extra,post} \
+    "$TEST_DIR"/sn2-{pre,extra,post} \
+    | dd of="$TEST_IMG.v2.orig" bs=1 seek=$sn_table_ofs conv=notrunc \
+          &> /dev/null
+
+# Done!
+TEST_IMG="$TEST_IMG.v2.orig" _check_test_img
+print_snapshot_table "$TEST_IMG.v2.orig"
+
+echo
+echo '=== Upgrade to v3 ==='
+echo
+
+cp "$TEST_IMG.v2.orig" "$TEST_IMG.v3.orig"
+$QEMU_IMG amend -o compat=1.1 "$TEST_IMG.v3.orig"
+TEST_IMG="$TEST_IMG.v3.orig" _check_test_img
+print_snapshot_table "$TEST_IMG.v3.orig"
+
+echo
+echo '=== Repair botched v3 ==='
+echo
+
+# Force the v2 file to be v3.  v3 requires each snapshot table entry
+# to have at least 16 bytes of extra data, so it will not comply to
+# the qcow2 v3 specification; but we can fix that.
+cp "$TEST_IMG.v2.orig" "$TEST_IMG"
+
+# Set version
+poke_file "$TEST_IMG" 4 '\x00\x00\x00\x03'
+# Increase header length (necessary for v3)
+poke_file "$TEST_IMG" 100 '\x00\x00\x00\x68'
+# Set refcount order (necessary for v3)
+poke_file "$TEST_IMG" 96 '\x00\x00\x00\x04'
+
+_check_test_img -r all
+print_snapshot_table "$TEST_IMG"
+
+
+# From now on, just test the qcow2 version we are supposed to test.
+# (v3 by default, v2 by choice through $IMGOPTS.)
+# That works because we always write all known extra data when
+# updating the snapshot table, independent of the version.
+
+if echo "$IMGOPTS" | grep -q 'compat=\(0\.10\|v2\)' 2> /dev/null; then
+    subver=v2
+else
+    subver=v3
+fi
+
+echo
+echo '=== Add new snapshot ==='
+echo
+
+cp "$TEST_IMG.$subver.orig" "$TEST_IMG"
+$QEMU_IMG snapshot -c sn3 "$TEST_IMG"
+_check_test_img
+print_snapshot_table "$TEST_IMG"
+
+echo
+echo '=== Remove different snapshots ==='
+
+for sn in sn0 sn1 sn2; do
+    echo
+    echo "--- $sn ---"
+
+    cp "$TEST_IMG.$subver.orig" "$TEST_IMG"
+    $QEMU_IMG snapshot -d $sn "$TEST_IMG"
+    _check_test_img
+    print_snapshot_table "$TEST_IMG"
+done
+
+echo
+echo '=== Reject too much unknown extra data ==='
+echo
+
+cp "$TEST_IMG.$subver.orig" "$TEST_IMG"
+$QEMU_IMG snapshot -c sn3 "$TEST_IMG"
+
+sn_table_ofs=$(peek_file_be "$TEST_IMG" 64 8)
+sn0_ofs=$sn_table_ofs
+sn1_ofs=$((sn0_ofs + $(snapshot_table_entry_size "$TEST_IMG" $sn0_ofs)))
+sn2_ofs=$((sn1_ofs + $(snapshot_table_entry_size "$TEST_IMG" $sn1_ofs)))
+sn3_ofs=$((sn2_ofs + $(snapshot_table_entry_size "$TEST_IMG" $sn2_ofs)))
+
+# 64 kB of extra data should be rejected
+# (Note that this also induces a refcount error, because it spills
+# over to the next cluster.  That's a good way to test that we can
+# handle simultaneous snapshot table and refcount errors.)
+poke_file "$TEST_IMG" $((sn3_ofs + 36)) '\x00\x01\x00\x00'
+
+# Print error
+_img_info
+echo
+_check_test_img
+echo
+
+# Should be repairable
+_check_test_img -r all
+
+echo
+echo '=== Snapshot table too big ==='
+echo
+
+sn_table_ofs=$(peek_file_be "$TEST_IMG.v3.orig" 64 8)
+
+# Fill a snapshot with 1 kB of extra data, a 65535-char ID, and a
+# 65535-char name, and repeat it as many times as necessary to fill
+# 64 MB (the maximum supported by qemu)
+
+touch "$TEST_DIR/sn0"
+
+# Full size (fixed + extra + ID + name + padding)
+sn_size=$((40 + 1024 + 65535 + 65535 + 2))
+
+# We only need the fixed part, though.
+truncate -s 40 "$TEST_DIR/sn0"
+
+# 65535-char ID string
+poke_file "$TEST_DIR/sn0" 12 '\xff\xff'
+# 65535-char name
+poke_file "$TEST_DIR/sn0" 14 '\xff\xff'
+# 1 kB of extra data
+poke_file "$TEST_DIR/sn0" 36 '\x00\x00\x04\x00'
+
+# Create test image
+_make_test_img 64M
+
+# Hook up snapshot table somewhere safe (at 1 MB)
+poke_file "$TEST_IMG" 64 '\x00\x00\x00\x00\x00\x10\x00\x00'
+
+offset=1048576
+size_written=0
+sn_count=0
+while [ $size_written -le $((64 * 1048576)) ]; do
+    dd if="$TEST_DIR/sn0" of="$TEST_IMG" bs=1 seek=$offset conv=notrunc \
+        &> /dev/null
+    offset=$((offset + sn_size))
+    size_written=$((size_written + sn_size))
+    sn_count=$((sn_count + 1))
+done
+truncate -s "$offset" "$TEST_IMG"
+
+# Give the last snapshot (the one to be removed) an L1 table so we can
+# see how that is handled when repairing the image
+# (Put it two clusters before 1 MB, and one L2 table one cluster
+# before 1 MB)
+poke_file "$TEST_IMG" $((offset - sn_size + 0)) \
+    '\x00\x00\x00\x00\x00\x0e\x00\x00'
+poke_file "$TEST_IMG" $((offset - sn_size + 8)) \
+    '\x00\x00\x00\x01'
+
+# Hook up the L2 table
+poke_file "$TEST_IMG" $((1048576 - 2 * 65536)) \
+    '\x80\x00\x00\x00\x00\x0f\x00\x00'
+
+# Make sure all of the clusters we just hooked up are allocated:
+# - The snapshot table
+# - The last snapshot's L1 and L2 table
+refblock0_allocate $((1048576 - 2 * 65536)) $offset
+
+poke_file "$TEST_IMG" 60 \
+    "$(printf '%08x' $sn_count | sed -e 's/\(..\)/\\x\1/g')"
+
+# Print error
+_img_info
+echo
+_check_test_img
+echo
+
+# Should be repairable
+_check_test_img -r all
+
+echo
+echo "$((sn_count - 1)) snapshots should remain:"
+echo "  qemu-img info reports $(_img_info | grep -c '^ \{34\}') snapshots"
+echo "  Image header reports $(peek_file_be "$TEST_IMG" 60 4) snapshots"
+
+echo
+echo '=== Snapshot table too big with one entry with too much extra data ==='
+echo
+
+# For this test, we reuse the image from the previous case, which has
+# a snapshot table that is right at the limit.
+# Our layout looks like this:
+# - (a number of snapshot table entries)
+# - One snapshot with $extra_data_size extra data
+# - One normal snapshot that breaks the 64 MB boundary
+# - One normal snapshot beyond the 64 MB boundary
+#
+# $extra_data_size is calculated so that simply by virtue of it
+# decreasing to 1 kB, the penultimate snapshot will fit into 64 MB
+# limit again.  The final snapshot will always be beyond the limit, so
+# that we can see that the repair algorithm does still determine the
+# limit to be somewhere, even when truncating one snapshot's extra
+# data.
+
+# The last case has removed the last snapshot, so calculate
+# $old_offset to get the current image's real length
+old_offset=$((offset - sn_size))
+
+# The layout from the previous test had one snapshot beyond the 64 MB
+# limit; we want the same (after the oversized extra data has been
+# truncated to 1 kB), so we drop the last three snapshots and
+# construct them from scratch.
+offset=$((offset - 3 * sn_size))
+sn_count=$((sn_count - 3))
+
+# Assuming we had already written one of the three snapshots
+# (necessary so we can calculate $extra_data_size next).
+size_written=$((size_written - 2 * sn_size))
+
+# Increase the extra data size so we go past the limit
+# (The -1024 comes from the 1 kB of extra data we already have)
+extra_data_size=$((64 * 1048576 + 8 - sn_size - (size_written - 1024)))
+
+poke_file "$TEST_IMG" $((offset + 36)) \
+    "$(printf '%08x' $extra_data_size | sed -e 's/\(..\)/\\x\1/g')"
+
+offset=$((offset + sn_size - 1024 + extra_data_size))
+size_written=$((size_written - 1024 + extra_data_size))
+sn_count=$((sn_count + 1))
+
+# Write the two normal snapshots
+for ((i = 0; i < 2; i++)); do
+    dd if="$TEST_DIR/sn0" of="$TEST_IMG" bs=1 seek=$offset conv=notrunc \
+        &> /dev/null
+    offset=$((offset + sn_size))
+    size_written=$((size_written + sn_size))
+    sn_count=$((sn_count + 1))
+
+    if [ $i = 0 ]; then
+        # Check that the penultimate snapshot is beyond the 64 MB limit
+        echo "Snapshot table size should equal $((64 * 1048576 + 8)):" \
+            $size_written
+        echo
+    fi
+done
+
+truncate -s $offset "$TEST_IMG"
+refblock0_allocate $old_offset $offset
+
+poke_file "$TEST_IMG" 60 \
+    "$(printf '%08x' $sn_count | sed -e 's/\(..\)/\\x\1/g')"
+
+# Print error
+_img_info
+echo
+_check_test_img
+echo
+
+# Just truncating the extra data should be sufficient to shorten the
+# snapshot table so only one snapshot exceeds the extra size
+_check_test_img -r all
+
+echo
+echo '=== Too many snapshots ==='
+echo
+
+# Create a v2 image, for speeds' sake: All-zero snapshot table entries
+# are only valid in v2.
+IMGOPTS='compat=0.10' _make_test_img 64M
+
+# Hook up snapshot table somewhere safe (at 1 MB)
+poke_file "$TEST_IMG" 64 '\x00\x00\x00\x00\x00\x10\x00\x00'
+# "Create" more than 65536 snapshots (twice that many here)
+poke_file "$TEST_IMG" 60 '\x00\x02\x00\x00'
+
+# 40-byte all-zero snapshot table entries are valid snapshots, but
+# only in v2 (v3 needs 16 bytes of extra data, so we would have to
+# write 131072x '\x10').
+truncate -s $((1048576 + 40 * 131072)) "$TEST_IMG"
+
+# But let us give one of the snapshots to be removed an L1 table so
+# we can see how that is handled when repairing the image.
+# (Put it two clusters before 1 MB, and one L2 table one cluster
+# before 1 MB)
+poke_file "$TEST_IMG" $((1048576 + 40 * 65536 + 0)) \
+    '\x00\x00\x00\x00\x00\x0e\x00\x00'
+poke_file "$TEST_IMG" $((1048576 + 40 * 65536 + 8)) \
+    '\x00\x00\x00\x01'
+
+# Hook up the L2 table
+poke_file "$TEST_IMG" $((1048576 - 2 * 65536)) \
+    '\x80\x00\x00\x00\x00\x0f\x00\x00'
+
+# Make sure all of the clusters we just hooked up are allocated:
+# - The snapshot table
+# - The last snapshot's L1 and L2 table
+refblock0_allocate $((1048576 - 2 * 65536)) $((1048576 + 40 * 131072))
+
+# Print error
+_img_info
+echo
+_check_test_img
+echo
+
+# Should be repairable
+_check_test_img -r all
+
+echo
+echo '65536 snapshots should remain:'
+echo "  qemu-img info reports $(_img_info | grep -c '^ \{34\}') snapshots"
+echo "  Image header reports $(peek_file_be "$TEST_IMG" 60 4) snapshots"
+
+# success, all done
+echo "*** done"
+status=0
diff --git a/tests/qemu-iotests/261.out b/tests/qemu-iotests/261.out
new file mode 100644
index 0000000000..2600354566
--- /dev/null
+++ b/tests/qemu-iotests/261.out
@@ -0,0 +1,346 @@
+QA output created by 261
+
+=== Create v2 template ===
+
+Formatting 'TEST_DIR/t.IMGFMT.v2.orig', fmt=IMGFMT size=67108864
+No errors were found on the image.
+Snapshots in TEST_DIR/t.IMGFMT.v2.orig:
+  [0]
+    ID: 1
+    Name: sn0
+    Extra data size: 0
+  [1]
+    ID: 2
+    Name: sn1
+    Extra data size: 42
+    VM state size: 0
+    Disk size: 67108864
+    Unknown extra data: very important data
+  [2]
+    ID: 3
+    Name: sn2
+    Extra data size: 8
+    VM state size: 0
+
+=== Upgrade to v3 ===
+
+No errors were found on the image.
+Snapshots in TEST_DIR/t.IMGFMT.v3.orig:
+  [0]
+    ID: 1
+    Name: sn0
+    Extra data size: 16
+    VM state size: 0
+    Disk size: 67108864
+  [1]
+    ID: 2
+    Name: sn1
+    Extra data size: 42
+    VM state size: 0
+    Disk size: 67108864
+    Unknown extra data: very important data
+  [2]
+    ID: 3
+    Name: sn2
+    Extra data size: 16
+    VM state size: 0
+    Disk size: 67108864
+
+=== Repair botched v3 ===
+
+Repairing snapshot table entry 0 is incomplete
+Repairing snapshot table entry 2 is incomplete
+The following inconsistencies were found and repaired:
+
+    0 leaked clusters
+    2 corruptions
+
+Double checking the fixed image now...
+No errors were found on the image.
+Snapshots in TEST_DIR/t.IMGFMT:
+  [0]
+    ID: 1
+    Name: sn0
+    Extra data size: 16
+    VM state size: 0
+    Disk size: 67108864
+  [1]
+    ID: 2
+    Name: sn1
+    Extra data size: 42
+    VM state size: 0
+    Disk size: 67108864
+    Unknown extra data: very important data
+  [2]
+    ID: 3
+    Name: sn2
+    Extra data size: 16
+    VM state size: 0
+    Disk size: 67108864
+
+=== Add new snapshot ===
+
+No errors were found on the image.
+Snapshots in TEST_DIR/t.IMGFMT:
+  [0]
+    ID: 1
+    Name: sn0
+    Extra data size: 16
+    VM state size: 0
+    Disk size: 67108864
+  [1]
+    ID: 2
+    Name: sn1
+    Extra data size: 42
+    VM state size: 0
+    Disk size: 67108864
+    Unknown extra data: very important data
+  [2]
+    ID: 3
+    Name: sn2
+    Extra data size: 16
+    VM state size: 0
+    Disk size: 67108864
+  [3]
+    ID: 4
+    Name: sn3
+    Extra data size: 16
+    VM state size: 0
+    Disk size: 67108864
+
+=== Remove different snapshots ===
+
+--- sn0 ---
+No errors were found on the image.
+Snapshots in TEST_DIR/t.IMGFMT:
+  [0]
+    ID: 2
+    Name: sn1
+    Extra data size: 42
+    VM state size: 0
+    Disk size: 67108864
+    Unknown extra data: very important data
+  [1]
+    ID: 3
+    Name: sn2
+    Extra data size: 16
+    VM state size: 0
+    Disk size: 67108864
+
+--- sn1 ---
+No errors were found on the image.
+Snapshots in TEST_DIR/t.IMGFMT:
+  [0]
+    ID: 1
+    Name: sn0
+    Extra data size: 16
+    VM state size: 0
+    Disk size: 67108864
+  [1]
+    ID: 3
+    Name: sn2
+    Extra data size: 16
+    VM state size: 0
+    Disk size: 67108864
+
+--- sn2 ---
+No errors were found on the image.
+Snapshots in TEST_DIR/t.IMGFMT:
+  [0]
+    ID: 1
+    Name: sn0
+    Extra data size: 16
+    VM state size: 0
+    Disk size: 67108864
+  [1]
+    ID: 2
+    Name: sn1
+    Extra data size: 42
+    VM state size: 0
+    Disk size: 67108864
+    Unknown extra data: very important data
+
+=== Reject too much unknown extra data ===
+
+qemu-img: Could not open 'TEST_DIR/t.IMGFMT': Too much extra metadata in snapshot table entry 3
+You can force-remove this extra metadata with qemu-img check -r all
+
+qemu-img: ERROR failed to read the snapshot table: Too much extra metadata in snapshot table entry 3
+You can force-remove this extra metadata with qemu-img check -r all
+qemu-img: Check failed: File too large
+
+Discarding too much extra metadata in snapshot table entry 3 (65536 > 1024)
+ERROR cluster 10 refcount=0 reference=1
+Rebuilding refcount structure
+Repairing cluster 1 refcount=1 reference=0
+Repairing cluster 2 refcount=1 reference=0
+The following inconsistencies were found and repaired:
+
+    0 leaked clusters
+    2 corruptions
+
+Double checking the fixed image now...
+No errors were found on the image.
+
+=== Snapshot table too big ===
+
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864
+qemu-img: Could not open 'TEST_DIR/t.IMGFMT': Snapshot table is too big
+You can force-remove all 1 overhanging snapshots with qemu-img check -r all
+
+qemu-img: ERROR failed to read the snapshot table: Snapshot table is too big
+You can force-remove all 1 overhanging snapshots with qemu-img check -r all
+qemu-img: Check failed: File too large
+
+Discarding 1 overhanging snapshots (snapshot table is too big)
+Leaked cluster 14 refcount=1 reference=0
+Leaked cluster 15 refcount=1 reference=0
+Leaked cluster 1039 refcount=1 reference=0
+Leaked cluster 1040 refcount=1 reference=0
+Repairing cluster 14 refcount=1 reference=0
+Repairing cluster 15 refcount=1 reference=0
+Repairing cluster 1039 refcount=1 reference=0
+Repairing cluster 1040 refcount=1 reference=0
+The following inconsistencies were found and repaired:
+
+    4 leaked clusters
+    1 corruptions
+
+Double checking the fixed image now...
+No errors were found on the image.
+
+507 snapshots should remain:
+  qemu-img info reports 507 snapshots
+  Image header reports 507 snapshots
+
+=== Snapshot table too big with one entry with too much extra data ===
+
+Snapshot table size should equal 67108872: 67108872
+
+qemu-img: Could not open 'TEST_DIR/t.IMGFMT': Too much extra metadata in snapshot table entry 505
+You can force-remove this extra metadata with qemu-img check -r all
+
+qemu-img: ERROR failed to read the snapshot table: Too much extra metadata in snapshot table entry 505
+You can force-remove this extra metadata with qemu-img check -r all
+qemu-img: Check failed: File too large
+
+Discarding too much extra metadata in snapshot table entry 505 (116944 > 1024)
+Discarding 1 overhanging snapshots (snapshot table is too big)
+Leaked cluster 1041 refcount=1 reference=0
+Leaked cluster 1042 refcount=1 reference=0
+Repairing cluster 1041 refcount=1 reference=0
+Repairing cluster 1042 refcount=1 reference=0
+The following inconsistencies were found and repaired:
+
+    2 leaked clusters
+    2 corruptions
+
+Double checking the fixed image now...
+No errors were found on the image.
+
+=== Too many snapshots ===
+
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864
+qemu-img: Could not open 'TEST_DIR/t.IMGFMT': Snapshot table too large
+
+qemu-img: ERROR snapshot table too large
+You can force-remove all 65536 overhanging snapshots with qemu-img check -r all
+qemu-img: Check failed: File too large
+
+Discarding 65536 overhanging snapshots
+Leaked cluster 14 refcount=1 reference=0
+Leaked cluster 15 refcount=1 reference=0
+Leaked cluster 56 refcount=1 reference=0
+Leaked cluster 57 refcount=1 reference=0
+Leaked cluster 58 refcount=1 reference=0
+Leaked cluster 59 refcount=1 reference=0
+Leaked cluster 60 refcount=1 reference=0
+Leaked cluster 61 refcount=1 reference=0
+Leaked cluster 62 refcount=1 reference=0
+Leaked cluster 63 refcount=1 reference=0
+Leaked cluster 64 refcount=1 reference=0
+Leaked cluster 65 refcount=1 reference=0
+Leaked cluster 66 refcount=1 reference=0
+Leaked cluster 67 refcount=1 reference=0
+Leaked cluster 68 refcount=1 reference=0
+Leaked cluster 69 refcount=1 reference=0
+Leaked cluster 70 refcount=1 reference=0
+Leaked cluster 71 refcount=1 reference=0
+Leaked cluster 72 refcount=1 reference=0
+Leaked cluster 73 refcount=1 reference=0
+Leaked cluster 74 refcount=1 reference=0
+Leaked cluster 75 refcount=1 reference=0
+Leaked cluster 76 refcount=1 reference=0
+Leaked cluster 77 refcount=1 reference=0
+Leaked cluster 78 refcount=1 reference=0
+Leaked cluster 79 refcount=1 reference=0
+Leaked cluster 80 refcount=1 reference=0
+Leaked cluster 81 refcount=1 reference=0
+Leaked cluster 82 refcount=1 reference=0
+Leaked cluster 83 refcount=1 reference=0
+Leaked cluster 84 refcount=1 reference=0
+Leaked cluster 85 refcount=1 reference=0
+Leaked cluster 86 refcount=1 reference=0
+Leaked cluster 87 refcount=1 reference=0
+Leaked cluster 88 refcount=1 reference=0
+Leaked cluster 89 refcount=1 reference=0
+Leaked cluster 90 refcount=1 reference=0
+Leaked cluster 91 refcount=1 reference=0
+Leaked cluster 92 refcount=1 reference=0
+Leaked cluster 93 refcount=1 reference=0
+Leaked cluster 94 refcount=1 reference=0
+Leaked cluster 95 refcount=1 reference=0
+Repairing cluster 14 refcount=1 reference=0
+Repairing cluster 15 refcount=1 reference=0
+Repairing cluster 56 refcount=1 reference=0
+Repairing cluster 57 refcount=1 reference=0
+Repairing cluster 58 refcount=1 reference=0
+Repairing cluster 59 refcount=1 reference=0
+Repairing cluster 60 refcount=1 reference=0
+Repairing cluster 61 refcount=1 reference=0
+Repairing cluster 62 refcount=1 reference=0
+Repairing cluster 63 refcount=1 reference=0
+Repairing cluster 64 refcount=1 reference=0
+Repairing cluster 65 refcount=1 reference=0
+Repairing cluster 66 refcount=1 reference=0
+Repairing cluster 67 refcount=1 reference=0
+Repairing cluster 68 refcount=1 reference=0
+Repairing cluster 69 refcount=1 reference=0
+Repairing cluster 70 refcount=1 reference=0
+Repairing cluster 71 refcount=1 reference=0
+Repairing cluster 72 refcount=1 reference=0
+Repairing cluster 73 refcount=1 reference=0
+Repairing cluster 74 refcount=1 reference=0
+Repairing cluster 75 refcount=1 reference=0
+Repairing cluster 76 refcount=1 reference=0
+Repairing cluster 77 refcount=1 reference=0
+Repairing cluster 78 refcount=1 reference=0
+Repairing cluster 79 refcount=1 reference=0
+Repairing cluster 80 refcount=1 reference=0
+Repairing cluster 81 refcount=1 reference=0
+Repairing cluster 82 refcount=1 reference=0
+Repairing cluster 83 refcount=1 reference=0
+Repairing cluster 84 refcount=1 reference=0
+Repairing cluster 85 refcount=1 reference=0
+Repairing cluster 86 refcount=1 reference=0
+Repairing cluster 87 refcount=1 reference=0
+Repairing cluster 88 refcount=1 reference=0
+Repairing cluster 89 refcount=1 reference=0
+Repairing cluster 90 refcount=1 reference=0
+Repairing cluster 91 refcount=1 reference=0
+Repairing cluster 92 refcount=1 reference=0
+Repairing cluster 93 refcount=1 reference=0
+Repairing cluster 94 refcount=1 reference=0
+Repairing cluster 95 refcount=1 reference=0
+The following inconsistencies were found and repaired:
+
+    42 leaked clusters
+    65536 corruptions
+
+Double checking the fixed image now...
+No errors were found on the image.
+
+65536 snapshots should remain:
+  qemu-img info reports 65536 snapshots
+  Image header reports 65536 snapshots
+*** done
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index af322af756..28871604cd 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -274,6 +274,7 @@
 257 rw
 258 rw quick
 260 rw quick
+261 rw
 262 rw quick migration
 263 rw quick
 264 rw

From 6b7e8f8b1ce7b41527a7c408491e93f90a442bfc Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Wed, 18 Sep 2019 11:51:37 +0200
Subject: [PATCH 61/69] block: Handle filter truncation like native impl.

Make the filter truncation (passing it through to bs->file) a
first-class citizen and handle it exactly as if it was the filter
driver's native implementation of .bdrv_co_truncate().

I do not see a reason not to, it makes the code a bit shorter, and may
be even more correct because this gets us to finish the write_req that
we prepared before (may be important to e.g. bring dirty bitmaps to the
correct size).

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190918095144.955-2-mreitz@redhat.com
Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/io.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/block/io.c b/block/io.c
index f0b86c1d19..8ff3b47fb4 100644
--- a/block/io.c
+++ b/block/io.c
@@ -3347,20 +3347,19 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset,
         goto out;
     }
 
-    if (!drv->bdrv_co_truncate) {
-        if (bs->file && drv->is_filter) {
-            ret = bdrv_co_truncate(bs->file, offset, prealloc, errp);
-            goto out;
-        }
+    if (drv->bdrv_co_truncate) {
+        ret = drv->bdrv_co_truncate(bs, offset, prealloc, errp);
+    } else if (bs->file && drv->is_filter) {
+        ret = bdrv_co_truncate(bs->file, offset, prealloc, errp);
+    } else {
         error_setg(errp, "Image format driver does not support resize");
         ret = -ENOTSUP;
         goto out;
     }
-
-    ret = drv->bdrv_co_truncate(bs, offset, prealloc, errp);
     if (ret < 0) {
         goto out;
     }
+
     ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
     if (ret < 0) {
         error_setg_errno(errp, -ret, "Could not refresh total sector count");

From bb8160eb78e948b0ef6f5ebbd38ff2a2c3f1dcf3 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Wed, 18 Sep 2019 11:51:38 +0200
Subject: [PATCH 62/69] block/cor: Drop cor_co_truncate()

No other filter driver has a .bdrv_co_truncate() implementation, and
there is no need to because the general block layer code can handle it
just as well.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190918095144.955-3-mreitz@redhat.com
Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/copy-on-read.c | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/block/copy-on-read.c b/block/copy-on-read.c
index 6631f30205..e95223d3cb 100644
--- a/block/copy-on-read.c
+++ b/block/copy-on-read.c
@@ -73,13 +73,6 @@ static int64_t cor_getlength(BlockDriverState *bs)
 }
 
 
-static int coroutine_fn cor_co_truncate(BlockDriverState *bs, int64_t offset,
-                                        PreallocMode prealloc, Error **errp)
-{
-    return bdrv_co_truncate(bs->file, offset, prealloc, errp);
-}
-
-
 static int coroutine_fn cor_co_preadv(BlockDriverState *bs,
                                       uint64_t offset, uint64_t bytes,
                                       QEMUIOVector *qiov, int flags)
@@ -139,7 +132,6 @@ static BlockDriver bdrv_copy_on_read = {
     .bdrv_child_perm                    = cor_child_perm,
 
     .bdrv_getlength                     = cor_getlength,
-    .bdrv_co_truncate                   = cor_co_truncate,
 
     .bdrv_co_preadv                     = cor_co_preadv,
     .bdrv_co_pwritev                    = cor_co_pwritev,

From 26536c7fc25917d1bd13781f81fe3ab039643bff Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Wed, 18 Sep 2019 11:51:39 +0200
Subject: [PATCH 63/69] block: Do not truncate file node when formatting

There is no reason why the format drivers need to truncate the protocol
node when formatting it.  When using the old .bdrv_co_create_ops()
interface, the file will be created with no size option anyway, which
generally gives it a size of 0.  (Exceptions are block devices, which
cannot be truncated anyway.)

When using blockdev-create, the user must have given the file node some
size anyway, so there is no reason why we should override that.

qed is an exception, it needs the file to start completely empty (as
explained by c743849bee7333c7ef256b7e12e34ed6f907064f).

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190918095144.955-4-mreitz@redhat.com
Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/parallels.c | 5 -----
 block/qcow.c      | 5 -----
 block/qcow2.c     | 6 ------
 3 files changed, 16 deletions(-)

diff --git a/block/parallels.c b/block/parallels.c
index 7cd2714b69..905cac35c6 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -563,11 +563,6 @@ static int coroutine_fn parallels_co_create(BlockdevCreateOptions* opts,
     blk_set_allow_write_beyond_eof(blk, true);
 
     /* Create image format */
-    ret = blk_truncate(blk, 0, PREALLOC_MODE_OFF, errp);
-    if (ret < 0) {
-        goto out;
-    }
-
     bat_entries = DIV_ROUND_UP(total_size, cl_size);
     bat_sectors = DIV_ROUND_UP(bat_entry_off(bat_entries), cl_size);
     bat_sectors = (bat_sectors *  cl_size) >> BDRV_SECTOR_BITS;
diff --git a/block/qcow.c b/block/qcow.c
index 5bdf72ba33..17705015ca 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -858,11 +858,6 @@ static int coroutine_fn qcow_co_create(BlockdevCreateOptions *opts,
     blk_set_allow_write_beyond_eof(qcow_blk, true);
 
     /* Create image format */
-    ret = blk_truncate(qcow_blk, 0, PREALLOC_MODE_OFF, errp);
-    if (ret < 0) {
-        goto exit;
-    }
-
     memset(&header, 0, sizeof(header));
     header.magic = cpu_to_be32(QCOW_MAGIC);
     header.version = cpu_to_be32(QCOW_VERSION);
diff --git a/block/qcow2.c b/block/qcow2.c
index 8d4f38ae74..bf29d1c460 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -3391,12 +3391,6 @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp)
     }
     blk_set_allow_write_beyond_eof(blk, true);
 
-    /* Clear the protocol layer and preallocate it if necessary */
-    ret = blk_truncate(blk, 0, PREALLOC_MODE_OFF, errp);
-    if (ret < 0) {
-        goto out;
-    }
-
     /* Write the header */
     QEMU_BUILD_BUG_ON((1 << MIN_CLUSTER_BITS) < sizeof(*header));
     header = g_malloc0(cluster_size);

From c80d8b06cfa59f5d8229379c85dcb2c3bb9c881b Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Wed, 18 Sep 2019 11:51:40 +0200
Subject: [PATCH 64/69] block: Add @exact parameter to bdrv_co_truncate()

We have two drivers (iscsi and file-posix) that (in some cases) return
success from their .bdrv_co_truncate() implementation if the block
device is larger than the requested offset, but cannot be shrunk.  Some
callers do not want that behavior, so this patch adds a new parameter
that they can use to turn off that behavior.

This patch just adds the parameter and lets the block/io.c and
block/block-backend.c functions pass it around.  All other callers
always pass false and none of the implementations evaluate it, so that
this patch does not change existing behavior.  Future patches take care
of that.

Suggested-by: Maxim Levitsky <mlevitsk@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190918095144.955-5-mreitz@redhat.com
Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/block-backend.c          |  6 +++---
 block/commit.c                 |  5 +++--
 block/crypto.c                 |  8 ++++----
 block/file-posix.c             |  3 ++-
 block/file-win32.c             |  3 ++-
 block/gluster.c                |  1 +
 block/io.c                     | 20 +++++++++++++-------
 block/iscsi.c                  |  3 ++-
 block/mirror.c                 |  4 ++--
 block/nfs.c                    |  2 +-
 block/parallels.c              |  6 +++---
 block/qcow.c                   |  4 ++--
 block/qcow2-refcount.c         |  2 +-
 block/qcow2.c                  | 22 ++++++++++++----------
 block/qed.c                    |  3 ++-
 block/raw-format.c             |  5 +++--
 block/rbd.c                    |  1 +
 block/sheepdog.c               |  5 +++--
 block/ssh.c                    |  3 ++-
 block/vdi.c                    |  2 +-
 block/vhdx-log.c               |  4 ++--
 block/vhdx.c                   |  7 ++++---
 block/vmdk.c                   |  8 ++++----
 block/vpc.c                    |  2 +-
 blockdev.c                     |  2 +-
 include/block/block.h          |  6 +++---
 include/block/block_int.h      | 17 ++++++++++++++++-
 include/sysemu/block-backend.h |  4 ++--
 qemu-img.c                     |  2 +-
 qemu-io-cmds.c                 |  2 +-
 tests/test-block-iothread.c    |  8 ++++----
 31 files changed, 102 insertions(+), 68 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index 912c50678d..8b8f2a80a0 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -2072,15 +2072,15 @@ int blk_pwrite_compressed(BlockBackend *blk, int64_t offset, const void *buf,
                    BDRV_REQ_WRITE_COMPRESSED);
 }
 
-int blk_truncate(BlockBackend *blk, int64_t offset, PreallocMode prealloc,
-                 Error **errp)
+int blk_truncate(BlockBackend *blk, int64_t offset, bool exact,
+                 PreallocMode prealloc, Error **errp)
 {
     if (!blk_is_available(blk)) {
         error_setg(errp, "No medium inserted");
         return -ENOMEDIUM;
     }
 
-    return bdrv_truncate(blk->root, offset, prealloc, errp);
+    return bdrv_truncate(blk->root, offset, exact, prealloc, errp);
 }
 
 static void blk_pdiscard_entry(void *opaque)
diff --git a/block/commit.c b/block/commit.c
index bc8454463d..23c90b3b91 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -155,7 +155,7 @@ static int coroutine_fn commit_run(Job *job, Error **errp)
     }
 
     if (base_len < len) {
-        ret = blk_truncate(s->base, len, PREALLOC_MODE_OFF, NULL);
+        ret = blk_truncate(s->base, len, false, PREALLOC_MODE_OFF, NULL);
         if (ret) {
             goto out;
         }
@@ -471,7 +471,8 @@ int bdrv_commit(BlockDriverState *bs)
      * grow the backing file image if possible.  If not possible,
      * we must return an error */
     if (length > backing_length) {
-        ret = blk_truncate(backing, length, PREALLOC_MODE_OFF, &local_err);
+        ret = blk_truncate(backing, length, false, PREALLOC_MODE_OFF,
+                           &local_err);
         if (ret < 0) {
             error_report_err(local_err);
             goto ro_cleanup;
diff --git a/block/crypto.c b/block/crypto.c
index 7eb698774e..e5a1a2cdf3 100644
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -113,8 +113,8 @@ static ssize_t block_crypto_init_func(QCryptoBlock *block,
      * available to the guest, so we must take account of that
      * which will be used by the crypto header
      */
-    return blk_truncate(data->blk, data->size + headerlen, data->prealloc,
-                        errp);
+    return blk_truncate(data->blk, data->size + headerlen, false,
+                        data->prealloc, errp);
 }
 
 
@@ -297,7 +297,7 @@ static int block_crypto_co_create_generic(BlockDriverState *bs,
 }
 
 static int coroutine_fn
-block_crypto_co_truncate(BlockDriverState *bs, int64_t offset,
+block_crypto_co_truncate(BlockDriverState *bs, int64_t offset, bool exact,
                          PreallocMode prealloc, Error **errp)
 {
     BlockCrypto *crypto = bs->opaque;
@@ -311,7 +311,7 @@ block_crypto_co_truncate(BlockDriverState *bs, int64_t offset,
 
     offset += payload_offset;
 
-    return bdrv_co_truncate(bs->file, offset, prealloc, errp);
+    return bdrv_co_truncate(bs->file, offset, false, prealloc, errp);
 }
 
 static void block_crypto_close(BlockDriverState *bs)
diff --git a/block/file-posix.c b/block/file-posix.c
index 695fcf740d..a3e8a8aa70 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -2019,7 +2019,8 @@ raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset,
 }
 
 static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
-                                        PreallocMode prealloc, Error **errp)
+                                        bool exact, PreallocMode prealloc,
+                                        Error **errp)
 {
     BDRVRawState *s = bs->opaque;
     struct stat st;
diff --git a/block/file-win32.c b/block/file-win32.c
index 41f55dfece..77e8ff7b68 100644
--- a/block/file-win32.c
+++ b/block/file-win32.c
@@ -468,7 +468,8 @@ static void raw_close(BlockDriverState *bs)
 }
 
 static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
-                                        PreallocMode prealloc, Error **errp)
+                                        bool exact, PreallocMode prealloc,
+                                        Error **errp)
 {
     BDRVRawState *s = bs->opaque;
     LONG low, high;
diff --git a/block/gluster.c b/block/gluster.c
index 64028b2cba..4fa4a77a47 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -1225,6 +1225,7 @@ static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs,
 
 static coroutine_fn int qemu_gluster_co_truncate(BlockDriverState *bs,
                                                  int64_t offset,
+                                                 bool exact,
                                                  PreallocMode prealloc,
                                                  Error **errp)
 {
diff --git a/block/io.c b/block/io.c
index 8ff3b47fb4..6e2b59802e 100644
--- a/block/io.c
+++ b/block/io.c
@@ -3291,8 +3291,12 @@ static void bdrv_parent_cb_resize(BlockDriverState *bs)
 
 /**
  * Truncate file to 'offset' bytes (needed only for file protocols)
+ *
+ * If 'exact' is true, the file must be resized to exactly the given
+ * 'offset'.  Otherwise, it is sufficient for the node to be at least
+ * 'offset' bytes in length.
  */
-int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset,
+int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
                                   PreallocMode prealloc, Error **errp)
 {
     BlockDriverState *bs = child->bs;
@@ -3348,9 +3352,9 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset,
     }
 
     if (drv->bdrv_co_truncate) {
-        ret = drv->bdrv_co_truncate(bs, offset, prealloc, errp);
+        ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, errp);
     } else if (bs->file && drv->is_filter) {
-        ret = bdrv_co_truncate(bs->file, offset, prealloc, errp);
+        ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, errp);
     } else {
         error_setg(errp, "Image format driver does not support resize");
         ret = -ENOTSUP;
@@ -3381,6 +3385,7 @@ out:
 typedef struct TruncateCo {
     BdrvChild *child;
     int64_t offset;
+    bool exact;
     PreallocMode prealloc;
     Error **errp;
     int ret;
@@ -3389,18 +3394,19 @@ typedef struct TruncateCo {
 static void coroutine_fn bdrv_truncate_co_entry(void *opaque)
 {
     TruncateCo *tco = opaque;
-    tco->ret = bdrv_co_truncate(tco->child, tco->offset, tco->prealloc,
-                                tco->errp);
+    tco->ret = bdrv_co_truncate(tco->child, tco->offset, tco->exact,
+                                tco->prealloc, tco->errp);
     aio_wait_kick();
 }
 
-int bdrv_truncate(BdrvChild *child, int64_t offset, PreallocMode prealloc,
-                  Error **errp)
+int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact,
+                  PreallocMode prealloc, Error **errp)
 {
     Coroutine *co;
     TruncateCo tco = {
         .child      = child,
         .offset     = offset,
+        .exact      = exact,
         .prealloc   = prealloc,
         .errp       = errp,
         .ret        = NOT_DONE,
diff --git a/block/iscsi.c b/block/iscsi.c
index 2ced15066a..677946cf09 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -2123,7 +2123,8 @@ static void iscsi_reopen_commit(BDRVReopenState *reopen_state)
 }
 
 static int coroutine_fn iscsi_co_truncate(BlockDriverState *bs, int64_t offset,
-                                          PreallocMode prealloc, Error **errp)
+                                          bool exact, PreallocMode prealloc,
+                                          Error **errp)
 {
     IscsiLun *iscsilun = bs->opaque;
     Error *local_err = NULL;
diff --git a/block/mirror.c b/block/mirror.c
index bb17cfce31..f0f2d9dff1 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -878,8 +878,8 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
         }
 
         if (s->bdev_length > base_length) {
-            ret = blk_truncate(s->target, s->bdev_length, PREALLOC_MODE_OFF,
-                               NULL);
+            ret = blk_truncate(s->target, s->bdev_length, false,
+                               PREALLOC_MODE_OFF, NULL);
             if (ret < 0) {
                 goto immediate_exit;
             }
diff --git a/block/nfs.c b/block/nfs.c
index 40f23495a0..9a6311e270 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -752,7 +752,7 @@ static int64_t nfs_get_allocated_file_size(BlockDriverState *bs)
 }
 
 static int coroutine_fn
-nfs_file_co_truncate(BlockDriverState *bs, int64_t offset,
+nfs_file_co_truncate(BlockDriverState *bs, int64_t offset, bool exact,
                      PreallocMode prealloc, Error **errp)
 {
     NFSClient *client = bs->opaque;
diff --git a/block/parallels.c b/block/parallels.c
index 905cac35c6..a1a92c97a4 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -203,7 +203,7 @@ static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num,
         } else {
             ret = bdrv_truncate(bs->file,
                                 (s->data_end + space) << BDRV_SECTOR_BITS,
-                                PREALLOC_MODE_OFF, NULL);
+                                false, PREALLOC_MODE_OFF, NULL);
         }
         if (ret < 0) {
             return ret;
@@ -487,7 +487,7 @@ static int coroutine_fn parallels_co_check(BlockDriverState *bs,
         res->leaks += count;
         if (fix & BDRV_FIX_LEAKS) {
             Error *local_err = NULL;
-            ret = bdrv_truncate(bs->file, res->image_end_offset,
+            ret = bdrv_truncate(bs->file, res->image_end_offset, false,
                                 PREALLOC_MODE_OFF, &local_err);
             if (ret < 0) {
                 error_report_err(local_err);
@@ -880,7 +880,7 @@ static void parallels_close(BlockDriverState *bs)
     if ((bs->open_flags & BDRV_O_RDWR) && !(bs->open_flags & BDRV_O_INACTIVE)) {
         s->header->inuse = 0;
         parallels_update_header(bs);
-        bdrv_truncate(bs->file, s->data_end << BDRV_SECTOR_BITS,
+        bdrv_truncate(bs->file, s->data_end << BDRV_SECTOR_BITS, false,
                       PREALLOC_MODE_OFF, NULL);
     }
 
diff --git a/block/qcow.c b/block/qcow.c
index 17705015ca..fce8989868 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -480,7 +480,7 @@ static int get_cluster_offset(BlockDriverState *bs,
                     return -E2BIG;
                 }
                 ret = bdrv_truncate(bs->file, cluster_offset + s->cluster_size,
-                                    PREALLOC_MODE_OFF, NULL);
+                                    false, PREALLOC_MODE_OFF, NULL);
                 if (ret < 0) {
                     return ret;
                 }
@@ -1033,7 +1033,7 @@ static int qcow_make_empty(BlockDriverState *bs)
     if (bdrv_pwrite_sync(bs->file, s->l1_table_offset, s->l1_table,
             l1_length) < 0)
         return -1;
-    ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length,
+    ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length, false,
                         PREALLOC_MODE_OFF, NULL);
     if (ret < 0)
         return ret;
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 0d64bf5a5e..f67ac6b2d8 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -2016,7 +2016,7 @@ static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
                     goto resize_fail;
                 }
 
-                ret = bdrv_truncate(bs->file, offset + s->cluster_size,
+                ret = bdrv_truncate(bs->file, offset + s->cluster_size, false,
                                     PREALLOC_MODE_OFF, &local_err);
                 if (ret < 0) {
                     error_report_err(local_err);
diff --git a/block/qcow2.c b/block/qcow2.c
index bf29d1c460..de284ad803 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -3074,8 +3074,8 @@ static int coroutine_fn preallocate_co(BlockDriverState *bs, uint64_t offset,
         if (mode == PREALLOC_MODE_METADATA) {
             mode = PREALLOC_MODE_OFF;
         }
-        ret = bdrv_co_truncate(s->data_file, host_offset + cur_bytes, mode,
-                               errp);
+        ret = bdrv_co_truncate(s->data_file, host_offset + cur_bytes, false,
+                               mode, errp);
         if (ret < 0) {
             return ret;
         }
@@ -3489,7 +3489,8 @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp)
     }
 
     /* Okay, now that we have a valid image, let's give it the right size */
-    ret = blk_truncate(blk, qcow2_opts->size, qcow2_opts->preallocation, errp);
+    ret = blk_truncate(blk, qcow2_opts->size, false, qcow2_opts->preallocation,
+                       errp);
     if (ret < 0) {
         error_prepend(errp, "Could not resize image: ");
         goto out;
@@ -3937,7 +3938,8 @@ fail:
 }
 
 static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
-                                          PreallocMode prealloc, Error **errp)
+                                          bool exact, PreallocMode prealloc,
+                                          Error **errp)
 {
     BDRVQcow2State *s = bs->opaque;
     uint64_t old_length;
@@ -4026,7 +4028,7 @@ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
             Error *local_err = NULL;
 
             bdrv_co_truncate(bs->file, (last_cluster + 1) * s->cluster_size,
-                             PREALLOC_MODE_OFF, &local_err);
+                             false, PREALLOC_MODE_OFF, &local_err);
             if (local_err) {
                 warn_reportf_err(local_err,
                                  "Failed to truncate the tail of the image: ");
@@ -4043,7 +4045,7 @@ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
     switch (prealloc) {
     case PREALLOC_MODE_OFF:
         if (has_data_file(bs)) {
-            ret = bdrv_co_truncate(s->data_file, offset, prealloc, errp);
+            ret = bdrv_co_truncate(s->data_file, offset, false, prealloc, errp);
             if (ret < 0) {
                 goto fail;
             }
@@ -4128,7 +4130,7 @@ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
         /* Allocate the data area */
         new_file_size = allocation_start +
                         nb_new_data_clusters * s->cluster_size;
-        ret = bdrv_co_truncate(bs->file, new_file_size, prealloc, errp);
+        ret = bdrv_co_truncate(bs->file, new_file_size, false, prealloc, errp);
         if (ret < 0) {
             error_prepend(errp, "Failed to resize underlying file: ");
             qcow2_free_clusters(bs, allocation_start,
@@ -4231,7 +4233,7 @@ qcow2_co_pwritev_compressed_part(BlockDriverState *bs,
         if (len < 0) {
             return len;
         }
-        return bdrv_co_truncate(bs->file, len, PREALLOC_MODE_OFF, NULL);
+        return bdrv_co_truncate(bs->file, len, false, PREALLOC_MODE_OFF, NULL);
     }
 
     if (offset_into_cluster(s, offset)) {
@@ -4468,7 +4470,7 @@ static int make_completely_empty(BlockDriverState *bs)
         goto fail;
     }
 
-    ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size,
+    ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size, false,
                         PREALLOC_MODE_OFF, &local_err);
     if (ret < 0) {
         error_report_err(local_err);
@@ -5308,7 +5310,7 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
             return ret;
         }
 
-        ret = blk_truncate(blk, new_size, PREALLOC_MODE_OFF, errp);
+        ret = blk_truncate(blk, new_size, false, PREALLOC_MODE_OFF, errp);
         blk_unref(blk);
         if (ret < 0) {
             return ret;
diff --git a/block/qed.c b/block/qed.c
index 0d8fd507aa..7c2a65af40 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -674,7 +674,7 @@ static int coroutine_fn bdrv_qed_co_create(BlockdevCreateOptions *opts,
     l1_size = header.cluster_size * header.table_size;
 
     /* File must start empty and grow, check truncate is supported */
-    ret = blk_truncate(blk, 0, PREALLOC_MODE_OFF, errp);
+    ret = blk_truncate(blk, 0, false, PREALLOC_MODE_OFF, errp);
     if (ret < 0) {
         goto out;
     }
@@ -1461,6 +1461,7 @@ static int coroutine_fn bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs,
 
 static int coroutine_fn bdrv_qed_co_truncate(BlockDriverState *bs,
                                              int64_t offset,
+                                             bool exact,
                                              PreallocMode prealloc,
                                              Error **errp)
 {
diff --git a/block/raw-format.c b/block/raw-format.c
index 42c28cc29a..57d84d0bae 100644
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -370,7 +370,8 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 }
 
 static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
-                                        PreallocMode prealloc, Error **errp)
+                                        bool exact, PreallocMode prealloc,
+                                        Error **errp)
 {
     BDRVRawState *s = bs->opaque;
 
@@ -386,7 +387,7 @@ static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
 
     s->size = offset;
     offset += s->offset;
-    return bdrv_co_truncate(bs->file, offset, prealloc, errp);
+    return bdrv_co_truncate(bs->file, offset, false, prealloc, errp);
 }
 
 static void raw_eject(BlockDriverState *bs, bool eject_flag)
diff --git a/block/rbd.c b/block/rbd.c
index c71e45d7c3..027cbcc695 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -1087,6 +1087,7 @@ static int64_t qemu_rbd_getlength(BlockDriverState *bs)
 
 static int coroutine_fn qemu_rbd_co_truncate(BlockDriverState *bs,
                                              int64_t offset,
+                                             bool exact,
                                              PreallocMode prealloc,
                                              Error **errp)
 {
diff --git a/block/sheepdog.c b/block/sheepdog.c
index 773dfc6ab1..cfa84338a2 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -2285,7 +2285,8 @@ static int64_t sd_getlength(BlockDriverState *bs)
 }
 
 static int coroutine_fn sd_co_truncate(BlockDriverState *bs, int64_t offset,
-                                       PreallocMode prealloc, Error **errp)
+                                       bool exact, PreallocMode prealloc,
+                                       Error **errp)
 {
     BDRVSheepdogState *s = bs->opaque;
     int ret, fd;
@@ -2601,7 +2602,7 @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
 
     assert(!flags);
     if (offset > s->inode.vdi_size) {
-        ret = sd_co_truncate(bs, offset, PREALLOC_MODE_OFF, NULL);
+        ret = sd_co_truncate(bs, offset, false, PREALLOC_MODE_OFF, NULL);
         if (ret < 0) {
             return ret;
         }
diff --git a/block/ssh.c b/block/ssh.c
index 84d01e892b..b4375cf7d2 100644
--- a/block/ssh.c
+++ b/block/ssh.c
@@ -1295,7 +1295,8 @@ static int64_t ssh_getlength(BlockDriverState *bs)
 }
 
 static int coroutine_fn ssh_co_truncate(BlockDriverState *bs, int64_t offset,
-                                        PreallocMode prealloc, Error **errp)
+                                        bool exact, PreallocMode prealloc,
+                                        Error **errp)
 {
     BDRVSSHState *s = bs->opaque;
 
diff --git a/block/vdi.c b/block/vdi.c
index 806ba7f53c..0142da7233 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -874,7 +874,7 @@ static int coroutine_fn vdi_co_do_create(BlockdevCreateOptions *create_options,
     }
 
     if (image_type == VDI_TYPE_STATIC) {
-        ret = blk_truncate(blk, offset + blocks * block_size,
+        ret = blk_truncate(blk, offset + blocks * block_size, false,
                            PREALLOC_MODE_OFF, errp);
         if (ret < 0) {
             error_prepend(errp, "Failed to statically allocate file");
diff --git a/block/vhdx-log.c b/block/vhdx-log.c
index fdd3a7adc3..13a49c2a33 100644
--- a/block/vhdx-log.c
+++ b/block/vhdx-log.c
@@ -557,8 +557,8 @@ static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s,
                     ret = -EINVAL;
                     goto exit;
                 }
-                ret = bdrv_truncate(bs->file, new_file_size, PREALLOC_MODE_OFF,
-                                    NULL);
+                ret = bdrv_truncate(bs->file, new_file_size, false,
+                                    PREALLOC_MODE_OFF, NULL);
                 if (ret < 0) {
                     goto exit;
                 }
diff --git a/block/vhdx.c b/block/vhdx.c
index 371f226286..f02d2611be 100644
--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -1263,7 +1263,7 @@ static int vhdx_allocate_block(BlockDriverState *bs, BDRVVHDXState *s,
         return -EINVAL;
     }
 
-    return bdrv_truncate(bs->file, *new_offset + s->block_size,
+    return bdrv_truncate(bs->file, *new_offset + s->block_size, false,
                          PREALLOC_MODE_OFF, NULL);
 }
 
@@ -1702,12 +1702,13 @@ static int vhdx_create_bat(BlockBackend *blk, BDRVVHDXState *s,
     if (type == VHDX_TYPE_DYNAMIC) {
         /* All zeroes, so we can just extend the file - the end of the BAT
          * is the furthest thing we have written yet */
-        ret = blk_truncate(blk, data_file_offset, PREALLOC_MODE_OFF, errp);
+        ret = blk_truncate(blk, data_file_offset, false, PREALLOC_MODE_OFF,
+                           errp);
         if (ret < 0) {
             goto exit;
         }
     } else if (type == VHDX_TYPE_FIXED) {
-        ret = blk_truncate(blk, data_file_offset + image_size,
+        ret = blk_truncate(blk, data_file_offset + image_size, false,
                            PREALLOC_MODE_OFF, errp);
         if (ret < 0) {
             goto exit;
diff --git a/block/vmdk.c b/block/vmdk.c
index fed3b50c8a..20e909d997 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -2076,7 +2076,7 @@ vmdk_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
                 return length;
             }
             length = QEMU_ALIGN_UP(length, BDRV_SECTOR_SIZE);
-            ret = bdrv_truncate(s->extents[i].file, length,
+            ret = bdrv_truncate(s->extents[i].file, length, false,
                                 PREALLOC_MODE_OFF, NULL);
             if (ret < 0) {
                 return ret;
@@ -2118,7 +2118,7 @@ static int vmdk_init_extent(BlockBackend *blk,
     int gd_buf_size;
 
     if (flat) {
-        ret = blk_truncate(blk, filesize, PREALLOC_MODE_OFF, errp);
+        ret = blk_truncate(blk, filesize, false, PREALLOC_MODE_OFF, errp);
         goto exit;
     }
     magic = cpu_to_be32(VMDK4_MAGIC);
@@ -2181,7 +2181,7 @@ static int vmdk_init_extent(BlockBackend *blk,
         goto exit;
     }
 
-    ret = blk_truncate(blk, le64_to_cpu(header.grain_offset) << 9,
+    ret = blk_truncate(blk, le64_to_cpu(header.grain_offset) << 9, false,
                        PREALLOC_MODE_OFF, errp);
     if (ret < 0) {
         goto exit;
@@ -2523,7 +2523,7 @@ static int coroutine_fn vmdk_co_do_create(int64_t size,
     /* bdrv_pwrite write padding zeros to align to sector, we don't need that
      * for description file */
     if (desc_offset == 0) {
-        ret = blk_truncate(blk, desc_len, PREALLOC_MODE_OFF, errp);
+        ret = blk_truncate(blk, desc_len, false, PREALLOC_MODE_OFF, errp);
         if (ret < 0) {
             goto exit;
         }
diff --git a/block/vpc.c b/block/vpc.c
index 5cd3890780..a65550298e 100644
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -898,7 +898,7 @@ static int create_fixed_disk(BlockBackend *blk, uint8_t *buf,
     /* Add footer to total size */
     total_size += HEADER_SIZE;
 
-    ret = blk_truncate(blk, total_size, PREALLOC_MODE_OFF, errp);
+    ret = blk_truncate(blk, total_size, false, PREALLOC_MODE_OFF, errp);
     if (ret < 0) {
         return ret;
     }
diff --git a/blockdev.c b/blockdev.c
index ba491e3ef5..8e029e9c01 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -3204,7 +3204,7 @@ void qmp_block_resize(bool has_device, const char *device,
     }
 
     bdrv_drained_begin(bs);
-    ret = blk_truncate(blk, size, PREALLOC_MODE_OFF, errp);
+    ret = blk_truncate(blk, size, false, PREALLOC_MODE_OFF, errp);
     bdrv_drained_end(bs);
 
 out:
diff --git a/include/block/block.h b/include/block/block.h
index 89606bd9f8..1df9848e74 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -346,10 +346,10 @@ BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
     const char *backing_file);
 void bdrv_refresh_filename(BlockDriverState *bs);
 
-int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset,
+int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
                                   PreallocMode prealloc, Error **errp);
-int bdrv_truncate(BdrvChild *child, int64_t offset, PreallocMode prealloc,
-                  Error **errp);
+int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact,
+                  PreallocMode prealloc, Error **errp);
 
 int64_t bdrv_nb_sectors(BlockDriverState *bs);
 int64_t bdrv_getlength(BlockDriverState *bs);
diff --git a/include/block/block_int.h b/include/block/block_int.h
index ca4ccac4c1..02dc0034a2 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -334,8 +334,23 @@ struct BlockDriver {
      * bdrv_parse_filename.
      */
     const char *protocol_name;
+
+    /*
+     * Truncate @bs to @offset bytes using the given @prealloc mode
+     * when growing.  Modes other than PREALLOC_MODE_OFF should be
+     * rejected when shrinking @bs.
+     *
+     * If @exact is true, @bs must be resized to exactly @offset.
+     * Otherwise, it is sufficient for @bs (if it is a host block
+     * device and thus there is no way to resize it) to be at least
+     * @offset bytes in length.
+     *
+     * If @exact is true and this function fails but would succeed
+     * with @exact = false, it should return -ENOTSUP.
+     */
     int coroutine_fn (*bdrv_co_truncate)(BlockDriverState *bs, int64_t offset,
-                                         PreallocMode prealloc, Error **errp);
+                                         bool exact, PreallocMode prealloc,
+                                         Error **errp);
 
     int64_t (*bdrv_getlength)(BlockDriverState *bs);
     bool has_variable_length;
diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
index 73f2cef7fe..b198deca0b 100644
--- a/include/sysemu/block-backend.h
+++ b/include/sysemu/block-backend.h
@@ -237,8 +237,8 @@ int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
                                       int bytes, BdrvRequestFlags flags);
 int blk_pwrite_compressed(BlockBackend *blk, int64_t offset, const void *buf,
                           int bytes);
-int blk_truncate(BlockBackend *blk, int64_t offset, PreallocMode prealloc,
-                 Error **errp);
+int blk_truncate(BlockBackend *blk, int64_t offset, bool exact,
+                 PreallocMode prealloc, Error **errp);
 int blk_pdiscard(BlockBackend *blk, int64_t offset, int bytes);
 int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
                      int64_t pos, int size);
diff --git a/qemu-img.c b/qemu-img.c
index 8b03ef8171..c738297b04 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -3831,7 +3831,7 @@ static int img_resize(int argc, char **argv)
         }
     }
 
-    ret = blk_truncate(blk, total_size, prealloc, &err);
+    ret = blk_truncate(blk, total_size, false, prealloc, &err);
     if (ret < 0) {
         error_report_err(err);
         goto out;
diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index 349256a5fe..5e9017c979 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -1710,7 +1710,7 @@ static int truncate_f(BlockBackend *blk, int argc, char **argv)
         return offset;
     }
 
-    ret = blk_truncate(blk, offset, PREALLOC_MODE_OFF, &local_err);
+    ret = blk_truncate(blk, offset, false, PREALLOC_MODE_OFF, &local_err);
     if (ret < 0) {
         error_report_err(local_err);
         return ret;
diff --git a/tests/test-block-iothread.c b/tests/test-block-iothread.c
index cfe30bab21..0c861809f0 100644
--- a/tests/test-block-iothread.c
+++ b/tests/test-block-iothread.c
@@ -45,7 +45,7 @@ static int coroutine_fn bdrv_test_co_pdiscard(BlockDriverState *bs,
 }
 
 static int coroutine_fn
-bdrv_test_co_truncate(BlockDriverState *bs, int64_t offset,
+bdrv_test_co_truncate(BlockDriverState *bs, int64_t offset, bool exact,
                       PreallocMode prealloc, Error **errp)
 {
     return 0;
@@ -185,18 +185,18 @@ static void test_sync_op_truncate(BdrvChild *c)
     int ret;
 
     /* Normal success path */
-    ret = bdrv_truncate(c, 65536, PREALLOC_MODE_OFF, NULL);
+    ret = bdrv_truncate(c, 65536, false, PREALLOC_MODE_OFF, NULL);
     g_assert_cmpint(ret, ==, 0);
 
     /* Early error: Negative offset */
-    ret = bdrv_truncate(c, -2, PREALLOC_MODE_OFF, NULL);
+    ret = bdrv_truncate(c, -2, false, PREALLOC_MODE_OFF, NULL);
     g_assert_cmpint(ret, ==, -EINVAL);
 
     /* Error: Read-only image */
     c->bs->read_only = true;
     c->bs->open_flags &= ~BDRV_O_RDWR;
 
-    ret = bdrv_truncate(c, 65536, PREALLOC_MODE_OFF, NULL);
+    ret = bdrv_truncate(c, 65536, false, PREALLOC_MODE_OFF, NULL);
     g_assert_cmpint(ret, ==, -EACCES);
 
     c->bs->read_only = false;

From 82325ae5f2f86ad696db3d66563a078daabc9769 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Wed, 18 Sep 2019 11:51:41 +0200
Subject: [PATCH 65/69] block: Evaluate @exact in protocol drivers

We have two protocol drivers that return success when trying to shrink a
block device even though they cannot shrink it.  This behavior is now
only allowed with exact=false, so they should return an error with
exact=true.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190918095144.955-6-mreitz@redhat.com
Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/file-posix.c | 8 +++++++-
 block/iscsi.c      | 7 ++++++-
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index a3e8a8aa70..e0ea1a7446 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -2033,6 +2033,7 @@ static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
     }
 
     if (S_ISREG(st.st_mode)) {
+        /* Always resizes to the exact @offset */
         return raw_regular_truncate(bs, s->fd, offset, prealloc, errp);
     }
 
@@ -2043,7 +2044,12 @@ static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
     }
 
     if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
-        if (offset > raw_getlength(bs)) {
+        int64_t cur_length = raw_getlength(bs);
+
+        if (offset != cur_length && exact) {
+            error_setg(errp, "Cannot resize device files");
+            return -ENOTSUP;
+        } else if (offset > cur_length) {
             error_setg(errp, "Cannot grow device files");
             return -EINVAL;
         }
diff --git a/block/iscsi.c b/block/iscsi.c
index 677946cf09..2aea7e3f13 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -2127,6 +2127,7 @@ static int coroutine_fn iscsi_co_truncate(BlockDriverState *bs, int64_t offset,
                                           Error **errp)
 {
     IscsiLun *iscsilun = bs->opaque;
+    int64_t cur_length;
     Error *local_err = NULL;
 
     if (prealloc != PREALLOC_MODE_OFF) {
@@ -2146,7 +2147,11 @@ static int coroutine_fn iscsi_co_truncate(BlockDriverState *bs, int64_t offset,
         return -EIO;
     }
 
-    if (offset > iscsi_getlength(bs)) {
+    cur_length = iscsi_getlength(bs);
+    if (offset != cur_length && exact) {
+        error_setg(errp, "Cannot resize iSCSI devices");
+        return -ENOTSUP;
+    } else if (offset > cur_length) {
         error_setg(errp, "Cannot grow iSCSI devices");
         return -EINVAL;
     }

From e61a28a9b6b43da6a7a48f6d325fceadf9769388 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Wed, 18 Sep 2019 11:51:42 +0200
Subject: [PATCH 66/69] block: Let format drivers pass @exact

When truncating a format node, the @exact parameter is generally handled
simply by virtue of the format storing the new size in the image
metadata.  Such formats do not need to pass on the parameter to their
file nodes.

There are exceptions, though:
- raw and crypto cannot store the image size, and thus must pass on
  @exact.

- When using qcow2 with an external data file, it just makes sense to
  keep its size in sync with the qcow2 virtual disk (because the
  external data file is the virtual disk).  Therefore, we should pass
  @exact when truncating it.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190918095144.955-7-mreitz@redhat.com
Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/crypto.c     |  2 +-
 block/qcow2.c      | 15 ++++++++++++++-
 block/raw-format.c |  2 +-
 3 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/block/crypto.c b/block/crypto.c
index e5a1a2cdf3..24823835c1 100644
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -311,7 +311,7 @@ block_crypto_co_truncate(BlockDriverState *bs, int64_t offset, bool exact,
 
     offset += payload_offset;
 
-    return bdrv_co_truncate(bs->file, offset, false, prealloc, errp);
+    return bdrv_co_truncate(bs->file, offset, exact, prealloc, errp);
 }
 
 static void block_crypto_close(BlockDriverState *bs)
diff --git a/block/qcow2.c b/block/qcow2.c
index de284ad803..9f32dae41f 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -4027,6 +4027,13 @@ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
         if ((last_cluster + 1) * s->cluster_size < old_file_size) {
             Error *local_err = NULL;
 
+            /*
+             * Do not pass @exact here: It will not help the user if
+             * we get an error here just because they wanted to shrink
+             * their qcow2 image (on a block device) with qemu-img.
+             * (And on the qcow2 layer, the @exact requirement is
+             * always fulfilled, so there is no need to pass it on.)
+             */
             bdrv_co_truncate(bs->file, (last_cluster + 1) * s->cluster_size,
                              false, PREALLOC_MODE_OFF, &local_err);
             if (local_err) {
@@ -4045,7 +4052,12 @@ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
     switch (prealloc) {
     case PREALLOC_MODE_OFF:
         if (has_data_file(bs)) {
-            ret = bdrv_co_truncate(s->data_file, offset, false, prealloc, errp);
+            /*
+             * If the caller wants an exact resize, the external data
+             * file should be resized to the exact target size, too,
+             * so we pass @exact here.
+             */
+            ret = bdrv_co_truncate(s->data_file, offset, exact, prealloc, errp);
             if (ret < 0) {
                 goto fail;
             }
@@ -4130,6 +4142,7 @@ static int coroutine_fn qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
         /* Allocate the data area */
         new_file_size = allocation_start +
                         nb_new_data_clusters * s->cluster_size;
+        /* Image file grows, so @exact does not matter */
         ret = bdrv_co_truncate(bs->file, new_file_size, false, prealloc, errp);
         if (ret < 0) {
             error_prepend(errp, "Failed to resize underlying file: ");
diff --git a/block/raw-format.c b/block/raw-format.c
index 57d84d0bae..3a76ec7dd2 100644
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -387,7 +387,7 @@ static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
 
     s->size = offset;
     offset += s->offset;
-    return bdrv_co_truncate(bs->file, offset, false, prealloc, errp);
+    return bdrv_co_truncate(bs->file, offset, exact, prealloc, errp);
 }
 
 static void raw_eject(BlockDriverState *bs, bool eject_flag)

From e8d04f92378c2de7b464e04469a657fd37eb29ea Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Wed, 18 Sep 2019 11:51:43 +0200
Subject: [PATCH 67/69] block: Pass truncate exact=true where reasonable

This is a change in behavior, so all instances need a good
justification.  The comments added here should explain my reasoning.

qed already had a comment that suggests it always expected
bdrv_truncate()/blk_truncate() to behave as if exact=true were passed
(c743849bee7 came eight months before 55b949c8476), so it was simply
broken until now.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190918095144.955-8-mreitz@redhat.com
Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
[mreitz: Changed comment in qed.c to explain why a new QED file must be
         empty, as requested and suggested by Maxim]
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/parallels.c | 11 +++++++++--
 block/qcow2.c     |  6 +++++-
 block/qed.c       |  7 +++++--
 qemu-img.c        |  7 ++++++-
 qemu-io-cmds.c    |  7 ++++++-
 5 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/block/parallels.c b/block/parallels.c
index a1a92c97a4..603211f83c 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -487,7 +487,12 @@ static int coroutine_fn parallels_co_check(BlockDriverState *bs,
         res->leaks += count;
         if (fix & BDRV_FIX_LEAKS) {
             Error *local_err = NULL;
-            ret = bdrv_truncate(bs->file, res->image_end_offset, false,
+
+            /*
+             * In order to really repair the image, we must shrink it.
+             * That means we have to pass exact=true.
+             */
+            ret = bdrv_truncate(bs->file, res->image_end_offset, true,
                                 PREALLOC_MODE_OFF, &local_err);
             if (ret < 0) {
                 error_report_err(local_err);
@@ -880,7 +885,9 @@ static void parallels_close(BlockDriverState *bs)
     if ((bs->open_flags & BDRV_O_RDWR) && !(bs->open_flags & BDRV_O_INACTIVE)) {
         s->header->inuse = 0;
         parallels_update_header(bs);
-        bdrv_truncate(bs->file, s->data_end << BDRV_SECTOR_BITS, false,
+
+        /* errors are ignored, so we might as well pass exact=true */
+        bdrv_truncate(bs->file, s->data_end << BDRV_SECTOR_BITS, true,
                       PREALLOC_MODE_OFF, NULL);
     }
 
diff --git a/block/qcow2.c b/block/qcow2.c
index 9f32dae41f..7c18721741 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -5323,7 +5323,11 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
             return ret;
         }
 
-        ret = blk_truncate(blk, new_size, false, PREALLOC_MODE_OFF, errp);
+        /*
+         * Amending image options should ensure that the image has
+         * exactly the given new values, so pass exact=true here.
+         */
+        ret = blk_truncate(blk, new_size, true, PREALLOC_MODE_OFF, errp);
         blk_unref(blk);
         if (ret < 0) {
             return ret;
diff --git a/block/qed.c b/block/qed.c
index 7c2a65af40..d8c4e5fb1e 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -673,8 +673,11 @@ static int coroutine_fn bdrv_qed_co_create(BlockdevCreateOptions *opts,
 
     l1_size = header.cluster_size * header.table_size;
 
-    /* File must start empty and grow, check truncate is supported */
-    ret = blk_truncate(blk, 0, false, PREALLOC_MODE_OFF, errp);
+    /*
+     * The QED format associates file length with allocation status,
+     * so a new file (which is empty) must have a length of 0.
+     */
+    ret = blk_truncate(blk, 0, true, PREALLOC_MODE_OFF, errp);
     if (ret < 0) {
         goto out;
     }
diff --git a/qemu-img.c b/qemu-img.c
index c738297b04..b288c967b5 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -3831,7 +3831,12 @@ static int img_resize(int argc, char **argv)
         }
     }
 
-    ret = blk_truncate(blk, total_size, false, prealloc, &err);
+    /*
+     * The user expects the image to have the desired size after
+     * resizing, so pass @exact=true.  It is of no use to report
+     * success when the image has not actually been resized.
+     */
+    ret = blk_truncate(blk, total_size, true, prealloc, &err);
     if (ret < 0) {
         error_report_err(err);
         goto out;
diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index 5e9017c979..1b7e700020 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -1710,7 +1710,12 @@ static int truncate_f(BlockBackend *blk, int argc, char **argv)
         return offset;
     }
 
-    ret = blk_truncate(blk, offset, false, PREALLOC_MODE_OFF, &local_err);
+    /*
+     * qemu-io is a debugging tool, so let us be strict here and pass
+     * exact=true.  It is better to err on the "emit more errors" side
+     * than to be overly permissive.
+     */
+    ret = blk_truncate(blk, offset, true, PREALLOC_MODE_OFF, &local_err);
     if (ret < 0) {
         error_report_err(local_err);
         return ret;

From 09c5c6de41f27bb128a020d64983adf07b6a256c Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Wed, 18 Sep 2019 11:51:44 +0200
Subject: [PATCH 68/69] Revert "qemu-img: Check post-truncation size"

This reverts commit 5279b30392da7a3248b320c75f20c61e3a95863c.

We no longer need this check because exact=true forces the block driver
to give the image the exact size requested by the user.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190918095144.955-9-mreitz@redhat.com
Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 qemu-img.c | 39 ++++-----------------------------------
 1 file changed, 4 insertions(+), 35 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index b288c967b5..95a24b9762 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -3656,7 +3656,7 @@ static int img_resize(int argc, char **argv)
     Error *err = NULL;
     int c, ret, relative;
     const char *filename, *fmt, *size;
-    int64_t n, total_size, current_size, new_size;
+    int64_t n, total_size, current_size;
     bool quiet = false;
     BlockBackend *blk = NULL;
     PreallocMode prealloc = PREALLOC_MODE_OFF;
@@ -3837,42 +3837,11 @@ static int img_resize(int argc, char **argv)
      * success when the image has not actually been resized.
      */
     ret = blk_truncate(blk, total_size, true, prealloc, &err);
-    if (ret < 0) {
+    if (!ret) {
+        qprintf(quiet, "Image resized.\n");
+    } else {
         error_report_err(err);
-        goto out;
     }
-
-    new_size = blk_getlength(blk);
-    if (new_size < 0) {
-        error_report("Failed to verify truncated image length: %s",
-                     strerror(-new_size));
-        ret = -1;
-        goto out;
-    }
-
-    /* Some block drivers implement a truncation method, but only so
-     * the user can cause qemu to refresh the image's size from disk.
-     * The idea is that the user resizes the image outside of qemu and
-     * then invokes block_resize to inform qemu about it.
-     * (This includes iscsi and file-posix for device files.)
-     * Of course, that is not the behavior someone invoking
-     * qemu-img resize would find useful, so we catch that behavior
-     * here and tell the user. */
-    if (new_size != total_size && new_size == current_size) {
-        error_report("Image was not resized; resizing may not be supported "
-                     "for this image");
-        ret = -1;
-        goto out;
-    }
-
-    if (new_size != total_size) {
-        warn_report("Image should have been resized to %" PRIi64
-                    " bytes, but was resized to %" PRIi64 " bytes",
-                    total_size, new_size);
-    }
-
-    qprintf(quiet, "Image resized.\n");
-
 out:
     blk_unref(blk);
     if (ret) {

From ba9c45139e2938b8d20ce407db83a31bc9e5066c Mon Sep 17 00:00:00 2001
From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Date: Fri, 25 Oct 2019 17:50:22 +0300
Subject: [PATCH 69/69] qemu-iotests: restrict 264 to qcow2 only

264 is unprepared to run with different formats, for example luks needs
handling keys, cloop doesn't support image creation, vpc creates image
larger than requested (which breaks "Backup completed: 5242880" in test
output).

The test is here to check nbd-reconnect feature and we actually don't
need it for all formats. Let's restrict it to qcow2 only.

Reported-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-id: 20191025145023.6182-1-vsementsov@virtuozzo.com
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/264 | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/qemu-iotests/264 b/tests/qemu-iotests/264
index c8cd97ae2b..131366422b 100755
--- a/tests/qemu-iotests/264
+++ b/tests/qemu-iotests/264
@@ -24,6 +24,8 @@ import iotests
 from iotests import qemu_img_create, qemu_io_silent_check, file_path, \
         qemu_nbd_popen, log
 
+iotests.verify_image_format(supported_fmts=['qcow2'])
+
 disk_a, disk_b, nbd_sock = file_path('disk_a', 'disk_b', 'nbd-sock')
 nbd_uri = 'nbd+unix:///?socket=' + nbd_sock
 size = 5 * 1024 * 1024