block: posix: Always allocate the first block
When creating an image with preallocation "off" or "falloc", the first
block of the image is typically not allocated. When using Gluster
storage backed by an XFS filesystem, reading this block using direct I/O
succeeds regardless of request length, fooling alignment detection.

In this case we fall back to a safe value (4096) instead of the optimal
value (512), which may lead to unneeded data copying when aligning
requests. Allocating the first block avoids the fallback.

Since we allocate the first block even with preallocation=off, we no
longer create images with zero disk size:

    $ ./qemu-img create -f raw test.raw 1g
    Formatting 'test.raw', fmt=raw size=1073741824

    $ ls -lhs test.raw
    4.0K -rw-r--r--. 1 nsoffer nsoffer 1.0G Aug 16 23:48 test.raw

And converting the image requires an additional cluster:

    $ ./qemu-img measure -f raw -O qcow2 test.raw
    required size: 458752
    fully allocated size: 1074135040

When using a format like vmdk with multiple files per image, we allocate
one block per file:

    $ ./qemu-img create -f vmdk -o subformat=twoGbMaxExtentFlat test.vmdk 4g
    Formatting 'test.vmdk', fmt=vmdk size=4294967296 compat6=off hwversion=undefined subformat=twoGbMaxExtentFlat

    $ ls -lhs test*.vmdk
    4.0K -rw-r--r--. 1 nsoffer nsoffer 2.0G Aug 27 03:23 test-f001.vmdk
    4.0K -rw-r--r--. 1 nsoffer nsoffer 2.0G Aug 27 03:23 test-f002.vmdk
    4.0K -rw-r--r--. 1 nsoffer nsoffer  353 Aug 27 03:23 test.vmdk

I did a quick performance test for copying disks with qemu-img convert to
a new raw target image on Gluster storage with a sector size of 512 bytes:

    for i in $(seq 10); do
        rm -f dst.raw
        sleep 10
        time ./qemu-img convert -f raw -O raw -t none -T none src.raw dst.raw
    done

Here is a table comparing the total time spent:

    Type    Before(s)   After(s)    Diff(%)
    ---------------------------------------
    real      530.028    469.123      -11.4
    user       17.204     10.768      -37.4
    sys        17.881      7.011      -60.7

We can see a very clear improvement in CPU usage.
Signed-off-by: Nir Soffer <nsoffer@redhat.com>
Message-id: 20190827010528.8818-2-nsoffer@redhat.com
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
This commit is contained in:
parent
b503de619e
commit
3a20013fbb
@ -1749,6 +1749,43 @@ static int handle_aiocb_discard(void *opaque)
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Help alignment probing by allocating the first block.
|
||||||
|
*
|
||||||
|
* When reading with direct I/O from unallocated area on Gluster backed by XFS,
|
||||||
|
* reading succeeds regardless of request length. In this case we fallback to
|
||||||
|
* safe alignment which is not optimal. Allocating the first block avoids this
|
||||||
|
* fallback.
|
||||||
|
*
|
||||||
|
* fd may be opened with O_DIRECT, but we don't know the buffer alignment or
|
||||||
|
* request alignment, so we use safe values.
|
||||||
|
*
|
||||||
|
* Returns: 0 on success, -errno on failure. Since this is an optimization,
|
||||||
|
* caller may ignore failures.
|
||||||
|
*/
|
||||||
|
static int allocate_first_block(int fd, size_t max_size)
|
||||||
|
{
|
||||||
|
size_t write_size = (max_size < MAX_BLOCKSIZE)
|
||||||
|
? BDRV_SECTOR_SIZE
|
||||||
|
: MAX_BLOCKSIZE;
|
||||||
|
size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize());
|
||||||
|
void *buf;
|
||||||
|
ssize_t n;
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
buf = qemu_memalign(max_align, write_size);
|
||||||
|
memset(buf, 0, write_size);
|
||||||
|
|
||||||
|
do {
|
||||||
|
n = pwrite(fd, buf, write_size, 0);
|
||||||
|
} while (n == -1 && errno == EINTR);
|
||||||
|
|
||||||
|
ret = (n == -1) ? -errno : 0;
|
||||||
|
|
||||||
|
qemu_vfree(buf);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
static int handle_aiocb_truncate(void *opaque)
|
static int handle_aiocb_truncate(void *opaque)
|
||||||
{
|
{
|
||||||
RawPosixAIOData *aiocb = opaque;
|
RawPosixAIOData *aiocb = opaque;
|
||||||
@ -1788,6 +1825,17 @@ static int handle_aiocb_truncate(void *opaque)
|
|||||||
/* posix_fallocate() doesn't set errno. */
|
/* posix_fallocate() doesn't set errno. */
|
||||||
error_setg_errno(errp, -result,
|
error_setg_errno(errp, -result,
|
||||||
"Could not preallocate new data");
|
"Could not preallocate new data");
|
||||||
|
} else if (current_length == 0) {
|
||||||
|
/*
|
||||||
|
* posix_fallocate() uses fallocate() if the filesystem
|
||||||
|
* supports it, or fallback to manually writing zeroes. If
|
||||||
|
* fallocate() was used, unaligned reads from the fallocated
|
||||||
|
* area in raw_probe_alignment() will succeed, hence we need to
|
||||||
|
* allocate the first block.
|
||||||
|
*
|
||||||
|
* Optimize future alignment probing; ignore failures.
|
||||||
|
*/
|
||||||
|
allocate_first_block(fd, offset);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
result = 0;
|
result = 0;
|
||||||
@ -1849,6 +1897,9 @@ static int handle_aiocb_truncate(void *opaque)
|
|||||||
if (ftruncate(fd, offset) != 0) {
|
if (ftruncate(fd, offset) != 0) {
|
||||||
result = -errno;
|
result = -errno;
|
||||||
error_setg_errno(errp, -result, "Could not resize file");
|
error_setg_errno(errp, -result, "Could not resize file");
|
||||||
|
} else if (current_length == 0 && offset > current_length) {
|
||||||
|
/* Optimize future alignment probing; ignore failures. */
|
||||||
|
allocate_first_block(fd, offset);
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
default:
|
default:
|
||||||
|
@ -27,7 +27,7 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824000 subformat=twoGbMax
|
|||||||
image: TEST_DIR/t.vmdk
|
image: TEST_DIR/t.vmdk
|
||||||
file format: vmdk
|
file format: vmdk
|
||||||
virtual size: 0.977 TiB (1073741824000 bytes)
|
virtual size: 0.977 TiB (1073741824000 bytes)
|
||||||
disk size: 16 KiB
|
disk size: 1.97 MiB
|
||||||
Format specific information:
|
Format specific information:
|
||||||
cid: XXXXXXXX
|
cid: XXXXXXXX
|
||||||
parent cid: XXXXXXXX
|
parent cid: XXXXXXXX
|
||||||
|
12
tests/qemu-iotests/150.out.raw
Normal file
12
tests/qemu-iotests/150.out.raw
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
QA output created by 150
|
||||||
|
|
||||||
|
=== Mapping sparse conversion ===
|
||||||
|
|
||||||
|
Offset Length File
|
||||||
|
0 0x1000 TEST_DIR/t.IMGFMT
|
||||||
|
|
||||||
|
=== Mapping non-sparse conversion ===
|
||||||
|
|
||||||
|
Offset Length File
|
||||||
|
0 0x100000 TEST_DIR/t.IMGFMT
|
||||||
|
*** done
|
@ -37,14 +37,16 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
|
|||||||
# the file size. This function hides the resulting difference in the
|
# the file size. This function hides the resulting difference in the
|
||||||
# stat -c '%b' output.
|
# stat -c '%b' output.
|
||||||
# Parameter 1: Number of blocks an empty file occupies
|
# Parameter 1: Number of blocks an empty file occupies
|
||||||
# Parameter 2: Image size in bytes
|
# Parameter 2: Minimal number of blocks in an image
|
||||||
|
# Parameter 3: Image size in bytes
|
||||||
_filter_blocks()
|
_filter_blocks()
|
||||||
{
|
{
|
||||||
extra_blocks=$1
|
extra_blocks=$1
|
||||||
img_size=$2
|
min_blocks=$2
|
||||||
|
img_size=$3
|
||||||
|
|
||||||
sed -e "s/blocks=$extra_blocks\\(\$\\|[^0-9]\\)/nothing allocated/" \
|
sed -e "s/blocks=$min_blocks\\(\$\\|[^0-9]\\)/min allocation/" \
|
||||||
-e "s/blocks=$((extra_blocks + img_size / 512))\\(\$\\|[^0-9]\\)/everything allocated/"
|
-e "s/blocks=$((extra_blocks + img_size / 512))\\(\$\\|[^0-9]\\)/max allocation/"
|
||||||
}
|
}
|
||||||
|
|
||||||
# get standard environment, filters and checks
|
# get standard environment, filters and checks
|
||||||
@ -60,16 +62,21 @@ size=$((1 * 1024 * 1024))
|
|||||||
touch "$TEST_DIR/empty"
|
touch "$TEST_DIR/empty"
|
||||||
extra_blocks=$(stat -c '%b' "$TEST_DIR/empty")
|
extra_blocks=$(stat -c '%b' "$TEST_DIR/empty")
|
||||||
|
|
||||||
|
# We always write the first byte; check how many blocks this filesystem
|
||||||
|
# allocates to match empty image allocation.
|
||||||
|
printf "\0" > "$TEST_DIR/empty"
|
||||||
|
min_blocks=$(stat -c '%b' "$TEST_DIR/empty")
|
||||||
|
|
||||||
echo
|
echo
|
||||||
echo "== creating image with default preallocation =="
|
echo "== creating image with default preallocation =="
|
||||||
_make_test_img $size | _filter_imgfmt
|
_make_test_img $size | _filter_imgfmt
|
||||||
stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $size
|
stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $min_blocks $size
|
||||||
|
|
||||||
for mode in off full falloc; do
|
for mode in off full falloc; do
|
||||||
echo
|
echo
|
||||||
echo "== creating image with preallocation $mode =="
|
echo "== creating image with preallocation $mode =="
|
||||||
IMGOPTS=preallocation=$mode _make_test_img $size | _filter_imgfmt
|
IMGOPTS=preallocation=$mode _make_test_img $size | _filter_imgfmt
|
||||||
stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $size
|
stat -c "size=%s, blocks=%b" $TEST_IMG | _filter_blocks $extra_blocks $min_blocks $size
|
||||||
done
|
done
|
||||||
|
|
||||||
# success, all done
|
# success, all done
|
||||||
|
@ -2,17 +2,17 @@ QA output created by 175
|
|||||||
|
|
||||||
== creating image with default preallocation ==
|
== creating image with default preallocation ==
|
||||||
Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576
|
Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576
|
||||||
size=1048576, nothing allocated
|
size=1048576, min allocation
|
||||||
|
|
||||||
== creating image with preallocation off ==
|
== creating image with preallocation off ==
|
||||||
Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 preallocation=off
|
Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 preallocation=off
|
||||||
size=1048576, nothing allocated
|
size=1048576, min allocation
|
||||||
|
|
||||||
== creating image with preallocation full ==
|
== creating image with preallocation full ==
|
||||||
Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 preallocation=full
|
Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 preallocation=full
|
||||||
size=1048576, everything allocated
|
size=1048576, max allocation
|
||||||
|
|
||||||
== creating image with preallocation falloc ==
|
== creating image with preallocation falloc ==
|
||||||
Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 preallocation=falloc
|
Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 preallocation=falloc
|
||||||
size=1048576, everything allocated
|
size=1048576, max allocation
|
||||||
*** done
|
*** done
|
||||||
|
@ -101,7 +101,7 @@ converted image file size in bytes: 196608
|
|||||||
== raw input image with data (human) ==
|
== raw input image with data (human) ==
|
||||||
|
|
||||||
Formatting 'TEST_DIR/t.qcow2', fmt=IMGFMT size=1073741824
|
Formatting 'TEST_DIR/t.qcow2', fmt=IMGFMT size=1073741824
|
||||||
required size: 393216
|
required size: 458752
|
||||||
fully allocated size: 1074135040
|
fully allocated size: 1074135040
|
||||||
wrote 512/512 bytes at offset 512
|
wrote 512/512 bytes at offset 512
|
||||||
512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
|
512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
|
||||||
@ -257,7 +257,7 @@ converted image file size in bytes: 196608
|
|||||||
|
|
||||||
Formatting 'TEST_DIR/t.qcow2', fmt=IMGFMT size=1073741824
|
Formatting 'TEST_DIR/t.qcow2', fmt=IMGFMT size=1073741824
|
||||||
{
|
{
|
||||||
"required": 393216,
|
"required": 458752,
|
||||||
"fully-allocated": 1074135040
|
"fully-allocated": 1074135040
|
||||||
}
|
}
|
||||||
wrote 512/512 bytes at offset 512
|
wrote 512/512 bytes at offset 512
|
||||||
|
@ -3,14 +3,18 @@ QA output created by 221
|
|||||||
=== Check mapping of unaligned raw image ===
|
=== Check mapping of unaligned raw image ===
|
||||||
|
|
||||||
Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=65537
|
Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=65537
|
||||||
[{ "start": 0, "length": 66048, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
|
[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
|
||||||
[{ "start": 0, "length": 66048, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
|
{ "start": 4096, "length": 61952, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
|
||||||
|
[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
|
||||||
|
{ "start": 4096, "length": 61952, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
|
||||||
wrote 1/1 bytes at offset 65536
|
wrote 1/1 bytes at offset 65536
|
||||||
1 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
|
1 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
|
||||||
[{ "start": 0, "length": 65536, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
|
[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
|
||||||
|
{ "start": 4096, "length": 61440, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
|
||||||
{ "start": 65536, "length": 1, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
|
{ "start": 65536, "length": 1, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
|
||||||
{ "start": 65537, "length": 511, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
|
{ "start": 65537, "length": 511, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
|
||||||
[{ "start": 0, "length": 65536, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
|
[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
|
||||||
|
{ "start": 4096, "length": 61440, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
|
||||||
{ "start": 65536, "length": 1, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
|
{ "start": 65536, "length": 1, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
|
||||||
{ "start": 65537, "length": 511, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
|
{ "start": 65537, "length": 511, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
|
||||||
*** done
|
*** done
|
||||||
|
@ -3,12 +3,16 @@ QA output created by 253
|
|||||||
=== Check mapping of unaligned raw image ===
|
=== Check mapping of unaligned raw image ===
|
||||||
|
|
||||||
Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048575
|
Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048575
|
||||||
[{ "start": 0, "length": 1048576, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
|
[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
|
||||||
[{ "start": 0, "length": 1048576, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
|
{ "start": 4096, "length": 1044480, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
|
||||||
|
[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
|
||||||
|
{ "start": 4096, "length": 1044480, "depth": 0, "zero": true, "data": false, "offset": OFFSET}]
|
||||||
wrote 65535/65535 bytes at offset 983040
|
wrote 65535/65535 bytes at offset 983040
|
||||||
63.999 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
|
63.999 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
|
||||||
[{ "start": 0, "length": 983040, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
|
[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
|
||||||
|
{ "start": 4096, "length": 978944, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
|
||||||
{ "start": 983040, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": OFFSET}]
|
{ "start": 983040, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": OFFSET}]
|
||||||
[{ "start": 0, "length": 983040, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
|
[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET},
|
||||||
|
{ "start": 4096, "length": 978944, "depth": 0, "zero": true, "data": false, "offset": OFFSET},
|
||||||
{ "start": 983040, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": OFFSET}]
|
{ "start": 983040, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": OFFSET}]
|
||||||
*** done
|
*** done
|
||||||
|
Loading…
Reference in New Issue
Block a user