Merge remote-tracking branch 'kwolf/for-anthony' into staging

# By Paolo Bonzini (14) and others
# Via Kevin Wolf
* kwolf/for-anthony: (24 commits)
  ide: Add fall through annotations
  block: Create proper size file for disk mirror
  ahci: Add migration support
  ahci: Change data types in preparation for migration
  ahci: Remove unused AHCIDevice fields
  hbitmap: add assertion on hbitmap_iter_init
  mirror: do nothing on zero-sized disk
  block/vdi: Check for bad signature
  block/vdi: Improved return values from vdi_open
  block/vdi: Improve debug output for signature
  block: Use error code EMEDIUMTYPE for wrong format in some block drivers
  block: Add special error code for wrong format
  mirror: support arbitrarily-sized iterations
  mirror: support more than one in-flight AIO operation
  mirror: add buf-size argument to drive-mirror
  mirror: switch mirror_iteration to AIO
  mirror: allow customizing the granularity
  block: allow customizing the granularity of the dirty bitmap
  block: return count of dirty sectors, not chunks
  mirror: perform COW if the cluster size is bigger than the granularity
  ...
This commit is contained in:
Anthony Liguori 2013-01-28 14:46:45 -06:00
commit 503cb22e05
30 changed files with 1729 additions and 231 deletions

View File

@ -23,7 +23,8 @@
#include "sysemu/blockdev.h"
#include <assert.h>
#define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)
#define BLOCK_SIZE (1 << 20)
#define BDRV_SECTORS_PER_DIRTY_CHUNK (BLOCK_SIZE >> BDRV_SECTOR_BITS)
#define BLK_MIG_FLAG_DEVICE_BLOCK 0x01
#define BLK_MIG_FLAG_EOS 0x02
@ -254,7 +255,7 @@ static void set_dirty_tracking(int enable)
BlkMigDevState *bmds;
QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
bdrv_set_dirty_tracking(bmds->bs, enable);
bdrv_set_dirty_tracking(bmds->bs, enable ? BLOCK_SIZE : 0);
}
}
@ -478,7 +479,7 @@ static int64_t get_remaining_dirty(void)
dirty += bdrv_get_dirty_count(bmds->bs);
}
return dirty * BLOCK_SIZE;
return dirty << BDRV_SECTOR_BITS;
}
static void blk_mig_cleanup(void)

124
block.c
View File

@ -1286,7 +1286,6 @@ static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
bs_dest->iostatus = bs_src->iostatus;
/* dirty bitmap */
bs_dest->dirty_count = bs_src->dirty_count;
bs_dest->dirty_bitmap = bs_src->dirty_bitmap;
/* job */
@ -1674,10 +1673,10 @@ static void tracked_request_begin(BdrvTrackedRequest *req,
/**
* Round a region to cluster boundaries
*/
static void round_to_clusters(BlockDriverState *bs,
int64_t sector_num, int nb_sectors,
int64_t *cluster_sector_num,
int *cluster_nb_sectors)
void bdrv_round_to_clusters(BlockDriverState *bs,
int64_t sector_num, int nb_sectors,
int64_t *cluster_sector_num,
int *cluster_nb_sectors)
{
BlockDriverInfo bdi;
@ -1719,8 +1718,8 @@ static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
* CoR read and write operations are atomic and guest writes cannot
* interleave between them.
*/
round_to_clusters(bs, sector_num, nb_sectors,
&cluster_sector_num, &cluster_nb_sectors);
bdrv_round_to_clusters(bs, sector_num, nb_sectors,
&cluster_sector_num, &cluster_nb_sectors);
do {
retry = false;
@ -2035,36 +2034,6 @@ int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num,
return ret;
}
#define BITS_PER_LONG (sizeof(unsigned long) * 8)
static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
int nb_sectors, int dirty)
{
int64_t start, end;
unsigned long val, idx, bit;
start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
for (; start <= end; start++) {
idx = start / BITS_PER_LONG;
bit = start % BITS_PER_LONG;
val = bs->dirty_bitmap[idx];
if (dirty) {
if (!(val & (1UL << bit))) {
bs->dirty_count++;
val |= 1UL << bit;
}
} else {
if (val & (1UL << bit)) {
bs->dirty_count--;
val &= ~(1UL << bit);
}
}
bs->dirty_bitmap[idx] = val;
}
}
/* Return < 0 if error. Important errors are:
-EIO generic I/O error (may happen for all errors)
-ENOMEDIUM No media inserted.
@ -2216,8 +2185,8 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
/* Cover entire cluster so no additional backing file I/O is required when
* allocating cluster in the image file.
*/
round_to_clusters(bs, sector_num, nb_sectors,
&cluster_sector_num, &cluster_nb_sectors);
bdrv_round_to_clusters(bs, sector_num, nb_sectors,
&cluster_sector_num, &cluster_nb_sectors);
trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
cluster_sector_num, cluster_nb_sectors);
@ -2863,8 +2832,9 @@ BlockInfo *bdrv_query_info(BlockDriverState *bs)
if (bs->dirty_bitmap) {
info->has_dirty = true;
info->dirty = g_malloc0(sizeof(*info->dirty));
info->dirty->count = bdrv_get_dirty_count(bs) *
BDRV_SECTORS_PER_DIRTY_CHUNK * BDRV_SECTOR_SIZE;
info->dirty->count = bdrv_get_dirty_count(bs) * BDRV_SECTOR_SIZE;
info->dirty->granularity =
((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bs->dirty_bitmap));
}
if (bs->drv) {
@ -4173,7 +4143,7 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
}
if (bs->dirty_bitmap) {
set_dirty_bitmap(bs, sector_num, nb_sectors, 0);
bdrv_reset_dirty(bs, sector_num, nb_sectors);
}
if (bs->drv->bdrv_co_discard) {
@ -4331,22 +4301,20 @@ bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
return true;
}
void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
void bdrv_set_dirty_tracking(BlockDriverState *bs, int granularity)
{
int64_t bitmap_size;
bs->dirty_count = 0;
if (enable) {
if (!bs->dirty_bitmap) {
bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG - 1;
bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG;
assert((granularity & (granularity - 1)) == 0);
bs->dirty_bitmap = g_new0(unsigned long, bitmap_size);
}
if (granularity) {
granularity >>= BDRV_SECTOR_BITS;
assert(!bs->dirty_bitmap);
bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS);
bs->dirty_bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1);
} else {
if (bs->dirty_bitmap) {
g_free(bs->dirty_bitmap);
hbitmap_free(bs->dirty_bitmap);
bs->dirty_bitmap = NULL;
}
}
@ -4354,67 +4322,37 @@ void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
int bdrv_get_dirty(BlockDriverState *bs, int64_t sector)
{
int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
if (bs->dirty_bitmap &&
(sector << BDRV_SECTOR_BITS) < bdrv_getlength(bs)) {
return !!(bs->dirty_bitmap[chunk / BITS_PER_LONG] &
(1UL << (chunk % BITS_PER_LONG)));
if (bs->dirty_bitmap) {
return hbitmap_get(bs->dirty_bitmap, sector);
} else {
return 0;
}
}
int64_t bdrv_get_next_dirty(BlockDriverState *bs, int64_t sector)
void bdrv_dirty_iter_init(BlockDriverState *bs, HBitmapIter *hbi)
{
int64_t chunk;
int bit, elem;
/* Avoid an infinite loop. */
assert(bs->dirty_count > 0);
sector = (sector | (BDRV_SECTORS_PER_DIRTY_CHUNK - 1)) + 1;
chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
QEMU_BUILD_BUG_ON(sizeof(bs->dirty_bitmap[0]) * 8 != BITS_PER_LONG);
elem = chunk / BITS_PER_LONG;
bit = chunk % BITS_PER_LONG;
for (;;) {
if (sector >= bs->total_sectors) {
sector = 0;
bit = elem = 0;
}
if (bit == 0 && bs->dirty_bitmap[elem] == 0) {
sector += BDRV_SECTORS_PER_DIRTY_CHUNK * BITS_PER_LONG;
elem++;
} else {
if (bs->dirty_bitmap[elem] & (1UL << bit)) {
return sector;
}
sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
if (++bit == BITS_PER_LONG) {
bit = 0;
elem++;
}
}
}
hbitmap_iter_init(hbi, bs->dirty_bitmap, 0);
}
void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
int nr_sectors)
{
set_dirty_bitmap(bs, cur_sector, nr_sectors, 1);
hbitmap_set(bs->dirty_bitmap, cur_sector, nr_sectors);
}
void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
int nr_sectors)
{
set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
hbitmap_reset(bs->dirty_bitmap, cur_sector, nr_sectors);
}
int64_t bdrv_get_dirty_count(BlockDriverState *bs)
{
return bs->dirty_count;
if (bs->dirty_bitmap) {
return hbitmap_count(bs->dirty_bitmap);
} else {
return 0;
}
}
void bdrv_set_in_use(BlockDriverState *bs, int in_use)

View File

@ -126,7 +126,7 @@ static int bochs_open(BlockDriverState *bs, int flags)
strcmp(bochs.subtype, GROWING_TYPE) ||
((le32_to_cpu(bochs.version) != HEADER_VERSION) &&
(le32_to_cpu(bochs.version) != HEADER_V1))) {
goto fail;
return -EMEDIUMTYPE;
}
if (le32_to_cpu(bochs.version) == HEADER_V1) {

View File

@ -73,7 +73,7 @@ static int cow_open(BlockDriverState *bs, int flags)
}
if (be32_to_cpu(cow_header.magic) != COW_MAGIC) {
ret = -EINVAL;
ret = -EMEDIUMTYPE;
goto fail;
}

View File

@ -15,17 +15,17 @@
#include "block/blockjob.h"
#include "block/block_int.h"
#include "qemu/ratelimit.h"
#include "qemu/bitmap.h"
enum {
/*
* Size of data buffer for populating the image file. This should be large
* enough to process multiple clusters in a single call, so that populating
* contiguous regions of the image is efficient.
*/
BLOCK_SIZE = 512 * BDRV_SECTORS_PER_DIRTY_CHUNK, /* in bytes */
};
#define SLICE_TIME 100000000ULL /* ns */
#define MAX_IN_FLIGHT 16
#define SLICE_TIME 100000000ULL /* ns */
/* The mirroring buffer is a list of granularity-sized chunks.
* Free chunks are organized in a list.
*/
typedef struct MirrorBuffer {
QSIMPLEQ_ENTRY(MirrorBuffer) next;
} MirrorBuffer;
typedef struct MirrorBlockJob {
BlockJob common;
@ -36,9 +36,26 @@ typedef struct MirrorBlockJob {
bool synced;
bool should_complete;
int64_t sector_num;
int64_t granularity;
size_t buf_size;
unsigned long *cow_bitmap;
HBitmapIter hbi;
uint8_t *buf;
QSIMPLEQ_HEAD(, MirrorBuffer) buf_free;
int buf_free_count;
unsigned long *in_flight_bitmap;
int in_flight;
int ret;
} MirrorBlockJob;
typedef struct MirrorOp {
MirrorBlockJob *s;
QEMUIOVector qiov;
int64_t sector_num;
int nb_sectors;
} MirrorOp;
static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
int error)
{
@ -52,51 +69,234 @@ static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
}
}
static int coroutine_fn mirror_iteration(MirrorBlockJob *s,
BlockErrorAction *p_action)
static void mirror_iteration_done(MirrorOp *op, int ret)
{
MirrorBlockJob *s = op->s;
struct iovec *iov;
int64_t chunk_num;
int i, nb_chunks, sectors_per_chunk;
trace_mirror_iteration_done(s, op->sector_num, op->nb_sectors, ret);
s->in_flight--;
iov = op->qiov.iov;
for (i = 0; i < op->qiov.niov; i++) {
MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base;
QSIMPLEQ_INSERT_TAIL(&s->buf_free, buf, next);
s->buf_free_count++;
}
sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
chunk_num = op->sector_num / sectors_per_chunk;
nb_chunks = op->nb_sectors / sectors_per_chunk;
bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks);
if (s->cow_bitmap && ret >= 0) {
bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
}
g_slice_free(MirrorOp, op);
qemu_coroutine_enter(s->common.co, NULL);
}
static void mirror_write_complete(void *opaque, int ret)
{
MirrorOp *op = opaque;
MirrorBlockJob *s = op->s;
if (ret < 0) {
BlockDriverState *source = s->common.bs;
BlockErrorAction action;
bdrv_set_dirty(source, op->sector_num, op->nb_sectors);
action = mirror_error_action(s, false, -ret);
if (action == BDRV_ACTION_REPORT && s->ret >= 0) {
s->ret = ret;
}
}
mirror_iteration_done(op, ret);
}
static void mirror_read_complete(void *opaque, int ret)
{
MirrorOp *op = opaque;
MirrorBlockJob *s = op->s;
if (ret < 0) {
BlockDriverState *source = s->common.bs;
BlockErrorAction action;
bdrv_set_dirty(source, op->sector_num, op->nb_sectors);
action = mirror_error_action(s, true, -ret);
if (action == BDRV_ACTION_REPORT && s->ret >= 0) {
s->ret = ret;
}
mirror_iteration_done(op, ret);
return;
}
bdrv_aio_writev(s->target, op->sector_num, &op->qiov, op->nb_sectors,
mirror_write_complete, op);
}
static void coroutine_fn mirror_iteration(MirrorBlockJob *s)
{
BlockDriverState *source = s->common.bs;
BlockDriverState *target = s->target;
QEMUIOVector qiov;
int ret, nb_sectors;
int64_t end;
struct iovec iov;
int nb_sectors, sectors_per_chunk, nb_chunks;
int64_t end, sector_num, next_chunk, next_sector, hbitmap_next_sector;
MirrorOp *op;
s->sector_num = hbitmap_iter_next(&s->hbi);
if (s->sector_num < 0) {
bdrv_dirty_iter_init(source, &s->hbi);
s->sector_num = hbitmap_iter_next(&s->hbi);
trace_mirror_restart_iter(s, bdrv_get_dirty_count(source));
assert(s->sector_num >= 0);
}
hbitmap_next_sector = s->sector_num;
sector_num = s->sector_num;
sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
end = s->common.len >> BDRV_SECTOR_BITS;
s->sector_num = bdrv_get_next_dirty(source, s->sector_num);
nb_sectors = MIN(BDRV_SECTORS_PER_DIRTY_CHUNK, end - s->sector_num);
bdrv_reset_dirty(source, s->sector_num, nb_sectors);
/* Extend the QEMUIOVector to include all adjacent blocks that will
* be copied in this operation.
*
* We have to do this if we have no backing file yet in the destination,
* and the cluster size is very large. Then we need to do COW ourselves.
* The first time a cluster is copied, copy it entirely. Note that,
* because both the granularity and the cluster size are powers of two,
* the number of sectors to copy cannot exceed one cluster.
*
* We also want to extend the QEMUIOVector to include more adjacent
* dirty blocks if possible, to limit the number of I/O operations and
* run efficiently even with a small granularity.
*/
nb_chunks = 0;
nb_sectors = 0;
next_sector = sector_num;
next_chunk = sector_num / sectors_per_chunk;
/* Wait for I/O to this cluster (from a previous iteration) to be done. */
while (test_bit(next_chunk, s->in_flight_bitmap)) {
trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
qemu_coroutine_yield();
}
do {
int added_sectors, added_chunks;
if (!bdrv_get_dirty(source, next_sector) ||
test_bit(next_chunk, s->in_flight_bitmap)) {
assert(nb_sectors > 0);
break;
}
added_sectors = sectors_per_chunk;
if (s->cow_bitmap && !test_bit(next_chunk, s->cow_bitmap)) {
bdrv_round_to_clusters(s->target,
next_sector, added_sectors,
&next_sector, &added_sectors);
/* On the first iteration, the rounding may make us copy
* sectors before the first dirty one.
*/
if (next_sector < sector_num) {
assert(nb_sectors == 0);
sector_num = next_sector;
next_chunk = next_sector / sectors_per_chunk;
}
}
added_sectors = MIN(added_sectors, end - (sector_num + nb_sectors));
added_chunks = (added_sectors + sectors_per_chunk - 1) / sectors_per_chunk;
/* When doing COW, it may happen that there is not enough space for
* a full cluster. Wait if that is the case.
*/
while (nb_chunks == 0 && s->buf_free_count < added_chunks) {
trace_mirror_yield_buf_busy(s, nb_chunks, s->in_flight);
qemu_coroutine_yield();
}
if (s->buf_free_count < nb_chunks + added_chunks) {
trace_mirror_break_buf_busy(s, nb_chunks, s->in_flight);
break;
}
/* We have enough free space to copy these sectors. */
bitmap_set(s->in_flight_bitmap, next_chunk, added_chunks);
nb_sectors += added_sectors;
nb_chunks += added_chunks;
next_sector += added_sectors;
next_chunk += added_chunks;
} while (next_sector < end);
/* Allocate a MirrorOp that is used as an AIO callback. */
op = g_slice_new(MirrorOp);
op->s = s;
op->sector_num = sector_num;
op->nb_sectors = nb_sectors;
/* Now make a QEMUIOVector taking enough granularity-sized chunks
* from s->buf_free.
*/
qemu_iovec_init(&op->qiov, nb_chunks);
next_sector = sector_num;
while (nb_chunks-- > 0) {
MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free);
QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next);
s->buf_free_count--;
qemu_iovec_add(&op->qiov, buf, s->granularity);
/* Advance the HBitmapIter in parallel, so that we do not examine
* the same sector twice.
*/
if (next_sector > hbitmap_next_sector && bdrv_get_dirty(source, next_sector)) {
hbitmap_next_sector = hbitmap_iter_next(&s->hbi);
}
next_sector += sectors_per_chunk;
}
bdrv_reset_dirty(source, sector_num, nb_sectors);
/* Copy the dirty cluster. */
iov.iov_base = s->buf;
iov.iov_len = nb_sectors * 512;
qemu_iovec_init_external(&qiov, &iov, 1);
s->in_flight++;
trace_mirror_one_iteration(s, sector_num, nb_sectors);
bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
mirror_read_complete, op);
}
trace_mirror_one_iteration(s, s->sector_num, nb_sectors);
ret = bdrv_co_readv(source, s->sector_num, nb_sectors, &qiov);
if (ret < 0) {
*p_action = mirror_error_action(s, true, -ret);
goto fail;
}
ret = bdrv_co_writev(target, s->sector_num, nb_sectors, &qiov);
if (ret < 0) {
*p_action = mirror_error_action(s, false, -ret);
s->synced = false;
goto fail;
}
return 0;
static void mirror_free_init(MirrorBlockJob *s)
{
int granularity = s->granularity;
size_t buf_size = s->buf_size;
uint8_t *buf = s->buf;
fail:
/* Try again later. */
bdrv_set_dirty(source, s->sector_num, nb_sectors);
return ret;
assert(s->buf_free_count == 0);
QSIMPLEQ_INIT(&s->buf_free);
while (buf_size != 0) {
MirrorBuffer *cur = (MirrorBuffer *)buf;
QSIMPLEQ_INSERT_TAIL(&s->buf_free, cur, next);
s->buf_free_count++;
buf_size -= granularity;
buf += granularity;
}
}
static void mirror_drain(MirrorBlockJob *s)
{
while (s->in_flight > 0) {
qemu_coroutine_yield();
}
}
static void coroutine_fn mirror_run(void *opaque)
{
MirrorBlockJob *s = opaque;
BlockDriverState *bs = s->common.bs;
int64_t sector_num, end;
int64_t sector_num, end, sectors_per_chunk, length;
uint64_t last_pause_ns;
BlockDriverInfo bdi;
char backing_filename[1024];
int ret = 0;
int n;
@ -105,20 +305,39 @@ static void coroutine_fn mirror_run(void *opaque)
}
s->common.len = bdrv_getlength(bs);
if (s->common.len < 0) {
if (s->common.len <= 0) {
block_job_completed(&s->common, s->common.len);
return;
}
length = (bdrv_getlength(bs) + s->granularity - 1) / s->granularity;
s->in_flight_bitmap = bitmap_new(length);
/* If we have no backing file yet in the destination, we cannot let
* the destination do COW. Instead, we copy sectors around the
* dirty data if needed. We need a bitmap to do that.
*/
bdrv_get_backing_filename(s->target, backing_filename,
sizeof(backing_filename));
if (backing_filename[0] && !s->target->backing_hd) {
bdrv_get_info(s->target, &bdi);
if (s->granularity < bdi.cluster_size) {
s->buf_size = MAX(s->buf_size, bdi.cluster_size);
s->cow_bitmap = bitmap_new(length);
}
}
end = s->common.len >> BDRV_SECTOR_BITS;
s->buf = qemu_blockalign(bs, BLOCK_SIZE);
s->buf = qemu_blockalign(bs, s->buf_size);
sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
mirror_free_init(s);
if (s->mode != MIRROR_SYNC_MODE_NONE) {
/* First part, loop on the sectors and initialize the dirty bitmap. */
BlockDriverState *base;
base = s->mode == MIRROR_SYNC_MODE_FULL ? NULL : bs->backing_hd;
for (sector_num = 0; sector_num < end; ) {
int64_t next = (sector_num | (BDRV_SECTORS_PER_DIRTY_CHUNK - 1)) + 1;
int64_t next = (sector_num | (sectors_per_chunk - 1)) + 1;
ret = bdrv_co_is_allocated_above(bs, base,
sector_num, next - sector_num, &n);
@ -136,24 +355,40 @@ static void coroutine_fn mirror_run(void *opaque)
}
}
s->sector_num = -1;
bdrv_dirty_iter_init(bs, &s->hbi);
last_pause_ns = qemu_get_clock_ns(rt_clock);
for (;;) {
uint64_t delay_ns;
int64_t cnt;
bool should_complete;
if (s->ret < 0) {
ret = s->ret;
goto immediate_exit;
}
cnt = bdrv_get_dirty_count(bs);
if (cnt != 0) {
BlockErrorAction action = BDRV_ACTION_REPORT;
ret = mirror_iteration(s, &action);
if (ret < 0 && action == BDRV_ACTION_REPORT) {
goto immediate_exit;
/* Note that even when no rate limit is applied we need to yield
* periodically with no pending I/O so that qemu_aio_flush() returns.
* We do so every SLICE_TIME nanoseconds, or when there is an error,
* or when the source is clean, whichever comes first.
*/
if (qemu_get_clock_ns(rt_clock) - last_pause_ns < SLICE_TIME &&
s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 ||
(cnt == 0 && s->in_flight > 0)) {
trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt);
qemu_coroutine_yield();
continue;
} else if (cnt != 0) {
mirror_iteration(s);
continue;
}
cnt = bdrv_get_dirty_count(bs);
}
should_complete = false;
if (cnt == 0) {
if (s->in_flight == 0 && cnt == 0) {
trace_mirror_before_flush(s);
ret = bdrv_flush(s->target);
if (ret < 0) {
@ -196,23 +431,20 @@ static void coroutine_fn mirror_run(void *opaque)
trace_mirror_before_sleep(s, cnt, s->synced);
if (!s->synced) {
/* Publish progress */
s->common.offset = end * BDRV_SECTOR_SIZE - cnt * BLOCK_SIZE;
s->common.offset = (end - cnt) * BDRV_SECTOR_SIZE;
if (s->common.speed) {
delay_ns = ratelimit_calculate_delay(&s->limit, BDRV_SECTORS_PER_DIRTY_CHUNK);
delay_ns = ratelimit_calculate_delay(&s->limit, sectors_per_chunk);
} else {
delay_ns = 0;
}
/* Note that even when no rate limit is applied we need to yield
* with no pending I/O here so that bdrv_drain_all() returns.
*/
block_job_sleep_ns(&s->common, rt_clock, delay_ns);
if (block_job_is_cancelled(&s->common)) {
break;
}
} else if (!should_complete) {
delay_ns = (cnt == 0 ? SLICE_TIME : 0);
delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
block_job_sleep_ns(&s->common, rt_clock, delay_ns);
} else if (cnt == 0) {
/* The two disks are in sync. Exit and report successful
@ -222,11 +454,24 @@ static void coroutine_fn mirror_run(void *opaque)
s->common.cancelled = false;
break;
}
last_pause_ns = qemu_get_clock_ns(rt_clock);
}
immediate_exit:
if (s->in_flight > 0) {
/* We get here only if something went wrong. Either the job failed,
* or it was cancelled prematurely so that we do not guarantee that
* the target is a copy of the source.
*/
assert(ret < 0 || (!s->synced && block_job_is_cancelled(&s->common)));
mirror_drain(s);
}
assert(s->in_flight == 0);
qemu_vfree(s->buf);
bdrv_set_dirty_tracking(bs, false);
g_free(s->cow_bitmap);
g_free(s->in_flight_bitmap);
bdrv_set_dirty_tracking(bs, 0);
bdrv_iostatus_disable(s->target);
if (s->should_complete && ret == 0) {
if (bdrv_get_flags(s->target) != bdrv_get_flags(s->common.bs)) {
@ -288,14 +533,28 @@ static BlockJobType mirror_job_type = {
};
void mirror_start(BlockDriverState *bs, BlockDriverState *target,
int64_t speed, MirrorSyncMode mode,
BlockdevOnError on_source_error,
int64_t speed, int64_t granularity, int64_t buf_size,
MirrorSyncMode mode, BlockdevOnError on_source_error,
BlockdevOnError on_target_error,
BlockDriverCompletionFunc *cb,
void *opaque, Error **errp)
{
MirrorBlockJob *s;
if (granularity == 0) {
/* Choose the default granularity based on the target file's cluster
* size, clamped between 4k and 64k. */
BlockDriverInfo bdi;
if (bdrv_get_info(target, &bdi) >= 0 && bdi.cluster_size != 0) {
granularity = MAX(4096, bdi.cluster_size);
granularity = MIN(65536, granularity);
} else {
granularity = 65536;
}
}
assert ((granularity & (granularity - 1)) == 0);
if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
!bdrv_iostatus_is_enabled(bs)) {
@ -312,7 +571,10 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target,
s->on_target_error = on_target_error;
s->target = target;
s->mode = mode;
bdrv_set_dirty_tracking(bs, true);
s->granularity = granularity;
s->buf_size = MAX(buf_size, granularity);
bdrv_set_dirty_tracking(bs, granularity);
bdrv_set_enable_write_cache(s->target, true);
bdrv_set_on_error(s->target, on_target_error, on_target_error);
bdrv_iostatus_enable(s->target);

View File

@ -112,7 +112,7 @@ static int qcow_open(BlockDriverState *bs, int flags)
be64_to_cpus(&header.l1_table_offset);
if (header.magic != QCOW_MAGIC) {
ret = -EINVAL;
ret = -EMEDIUMTYPE;
goto fail;
}
if (header.version != QCOW_VERSION) {

View File

@ -311,7 +311,7 @@ static int qcow2_open(BlockDriverState *bs, int flags)
be32_to_cpus(&header.nb_snapshots);
if (header.magic != QCOW_MAGIC) {
ret = -EINVAL;
ret = -EMEDIUMTYPE;
goto fail;
}
if (header.version < 2 || header.version > 3) {

View File

@ -390,7 +390,7 @@ static int bdrv_qed_open(BlockDriverState *bs, int flags)
qed_header_le_to_cpu(&le_header, &s->header);
if (s->header.magic != QED_MAGIC) {
return -EINVAL;
return -EMEDIUMTYPE;
}
if (s->header.features & ~QED_FEATURE_MASK) {
/* image uses unsupported feature bits */

View File

@ -246,7 +246,7 @@ static void vdi_header_print(VdiHeader *header)
{
char uuid[37];
logout("text %s", header->text);
logout("signature 0x%04x\n", header->signature);
logout("signature 0x%08x\n", header->signature);
logout("header size 0x%04x\n", header->header_size);
logout("image type 0x%04x\n", header->image_type);
logout("image flags 0x%04x\n", header->image_flags);
@ -369,10 +369,12 @@ static int vdi_open(BlockDriverState *bs, int flags)
BDRVVdiState *s = bs->opaque;
VdiHeader header;
size_t bmap_size;
int ret;
logout("\n");
if (bdrv_read(bs->file, 0, (uint8_t *)&header, 1) < 0) {
ret = bdrv_read(bs->file, 0, (uint8_t *)&header, 1);
if (ret < 0) {
goto fail;
}
@ -390,33 +392,45 @@ static int vdi_open(BlockDriverState *bs, int flags)
header.disk_size &= ~(SECTOR_SIZE - 1);
}
if (header.version != VDI_VERSION_1_1) {
if (header.signature != VDI_SIGNATURE) {
logout("bad vdi signature %08x\n", header.signature);
ret = -EMEDIUMTYPE;
goto fail;
} else if (header.version != VDI_VERSION_1_1) {
logout("unsupported version %u.%u\n",
header.version >> 16, header.version & 0xffff);
ret = -ENOTSUP;
goto fail;
} else if (header.offset_bmap % SECTOR_SIZE != 0) {
/* We only support block maps which start on a sector boundary. */
logout("unsupported block map offset 0x%x B\n", header.offset_bmap);
ret = -ENOTSUP;
goto fail;
} else if (header.offset_data % SECTOR_SIZE != 0) {
/* We only support data blocks which start on a sector boundary. */
logout("unsupported data offset 0x%x B\n", header.offset_data);
ret = -ENOTSUP;
goto fail;
} else if (header.sector_size != SECTOR_SIZE) {
logout("unsupported sector size %u B\n", header.sector_size);
ret = -ENOTSUP;
goto fail;
} else if (header.block_size != 1 * MiB) {
logout("unsupported block size %u B\n", header.block_size);
ret = -ENOTSUP;
goto fail;
} else if (header.disk_size >
(uint64_t)header.blocks_in_image * header.block_size) {
logout("unsupported disk size %" PRIu64 " B\n", header.disk_size);
ret = -ENOTSUP;
goto fail;
} else if (!uuid_is_null(header.uuid_link)) {
logout("link uuid != 0, unsupported\n");
ret = -ENOTSUP;
goto fail;
} else if (!uuid_is_null(header.uuid_parent)) {
logout("parent uuid != 0, unsupported\n");
ret = -ENOTSUP;
goto fail;
}
@ -432,7 +446,8 @@ static int vdi_open(BlockDriverState *bs, int flags)
if (bmap_size > 0) {
s->bmap = g_malloc(bmap_size * SECTOR_SIZE);
}
if (bdrv_read(bs->file, s->bmap_sector, (uint8_t *)s->bmap, bmap_size) < 0) {
ret = bdrv_read(bs->file, s->bmap_sector, (uint8_t *)s->bmap, bmap_size);
if (ret < 0) {
goto fail_free_bmap;
}
@ -448,7 +463,7 @@ static int vdi_open(BlockDriverState *bs, int flags)
g_free(s->bmap);
fail:
return -1;
return ret;
}
static int vdi_reopen_prepare(BDRVReopenState *state,

View File

@ -616,7 +616,7 @@ static int vmdk_open_sparse(BlockDriverState *bs,
return vmdk_open_vmdk4(bs, file, flags);
break;
default:
return -EINVAL;
return -EMEDIUMTYPE;
break;
}
}
@ -718,7 +718,7 @@ static int vmdk_open_desc_file(BlockDriverState *bs, int flags,
}
buf[2047] = '\0';
if (vmdk_parse_description(buf, "createType", ct, sizeof(ct))) {
return -EINVAL;
return -EMEDIUMTYPE;
}
if (strcmp(ct, "monolithicFlat") &&
strcmp(ct, "twoGbMaxExtentSparse") &&

View File

@ -617,8 +617,13 @@ DriveInfo *drive_init(QemuOpts *opts, BlockInterfaceType block_default_type)
ret = bdrv_open(dinfo->bdrv, file, bdrv_flags, drv);
if (ret < 0) {
error_report("could not open disk image %s: %s",
file, strerror(-ret));
if (ret == -EMEDIUMTYPE) {
error_report("could not open disk image %s: not in %s format",
file, drv->format_name);
} else {
error_report("could not open disk image %s: %s",
file, strerror(-ret));
}
goto err;
}
@ -1184,16 +1189,19 @@ void qmp_block_commit(const char *device,
drive_get_ref(drive_get_by_blockdev(bs));
}
#define DEFAULT_MIRROR_BUF_SIZE (10 << 20)
void qmp_drive_mirror(const char *device, const char *target,
bool has_format, const char *format,
enum MirrorSyncMode sync,
bool has_mode, enum NewImageMode mode,
bool has_speed, int64_t speed,
bool has_granularity, uint32_t granularity,
bool has_buf_size, int64_t buf_size,
bool has_on_source_error, BlockdevOnError on_source_error,
bool has_on_target_error, BlockdevOnError on_target_error,
Error **errp)
{
BlockDriverInfo bdi;
BlockDriverState *bs;
BlockDriverState *source, *target_bs;
BlockDriver *proto_drv;
@ -1215,6 +1223,21 @@ void qmp_drive_mirror(const char *device, const char *target,
if (!has_mode) {
mode = NEW_IMAGE_MODE_ABSOLUTE_PATHS;
}
if (!has_granularity) {
granularity = 0;
}
if (!has_buf_size) {
buf_size = DEFAULT_MIRROR_BUF_SIZE;
}
if (granularity != 0 && (granularity < 512 || granularity > 1048576 * 64)) {
error_set(errp, QERR_INVALID_PARAMETER, device);
return;
}
if (granularity & (granularity - 1)) {
error_set(errp, QERR_INVALID_PARAMETER, device);
return;
}
bs = bdrv_find(device);
if (!bs) {
@ -1255,11 +1278,11 @@ void qmp_drive_mirror(const char *device, const char *target,
return;
}
bdrv_get_geometry(bs, &size);
size *= 512;
if (sync == MIRROR_SYNC_MODE_FULL && mode != NEW_IMAGE_MODE_EXISTING) {
/* create new image w/o backing file */
assert(format && drv);
bdrv_get_geometry(bs, &size);
size *= 512;
bdrv_img_create(target, format,
NULL, NULL, NULL, size, flags, &local_err);
} else {
@ -1272,7 +1295,7 @@ void qmp_drive_mirror(const char *device, const char *target,
bdrv_img_create(target, format,
source->filename,
source->drv->format_name,
NULL, -1, flags, &local_err);
NULL, size, flags, &local_err);
break;
default:
abort();
@ -1284,6 +1307,9 @@ void qmp_drive_mirror(const char *device, const char *target,
return;
}
/* Mirroring takes care of copy-on-write using the source's backing
* file.
*/
target_bs = bdrv_new("");
ret = bdrv_open(target_bs, target, flags | BDRV_O_NO_BACKING, drv);
@ -1293,18 +1319,8 @@ void qmp_drive_mirror(const char *device, const char *target,
return;
}
/* We need a backing file if we will copy parts of a cluster. */
if (bdrv_get_info(target_bs, &bdi) >= 0 && bdi.cluster_size != 0 &&
bdi.cluster_size >= BDRV_SECTORS_PER_DIRTY_CHUNK * 512) {
ret = bdrv_open_backing_file(target_bs);
if (ret < 0) {
bdrv_delete(target_bs);
error_set(errp, QERR_OPEN_FILE_FAILED, target);
return;
}
}
mirror_start(bs, target_bs, speed, sync, on_source_error, on_target_error,
mirror_start(bs, target_bs, speed, granularity, buf_size, sync,
on_source_error, on_target_error,
block_job_cb, bs, &local_err);
if (local_err != NULL) {
bdrv_delete(target_bs);

2
hmp.c
View File

@ -808,7 +808,7 @@ void hmp_drive_mirror(Monitor *mon, const QDict *qdict)
qmp_drive_mirror(device, filename, !!format, format,
full ? MIRROR_SYNC_MODE_FULL : MIRROR_SYNC_MODE_TOP,
true, mode, false, 0,
true, mode, false, 0, false, 0, false, 0,
false, 0, false, 0, &errp);
hmp_handle_error(mon, &errp);
}

View File

@ -241,7 +241,7 @@ static void ahci_port_write(AHCIState *s, int port, int offset, uint32_t val)
if ((pr->cmd & PORT_CMD_FIS_ON) &&
!s->dev[port].init_d2h_sent) {
ahci_init_d2h(&s->dev[port]);
s->dev[port].init_d2h_sent = 1;
s->dev[port].init_d2h_sent = true;
}
check_cmd(s, port);
@ -494,7 +494,7 @@ static void ahci_reset_port(AHCIState *s, int port)
pr->scr_err = 0;
pr->scr_act = 0;
d->busy_slot = -1;
d->init_d2h_sent = 0;
d->init_d2h_sent = false;
ide_state = &s->dev[port].port.ifs[0];
if (!ide_state->bs) {
@ -946,7 +946,7 @@ static int handle_cmd(AHCIState *s, int port, int slot)
ide_state->hcyl = 0xeb;
debug_print_fis(ide_state->io_buffer, 0x10);
ide_state->feature = IDE_FEATURE_DMA;
s->dev[port].done_atapi_packet = 0;
s->dev[port].done_atapi_packet = false;
/* XXX send PIO setup FIS */
}
@ -991,7 +991,7 @@ static int ahci_start_transfer(IDEDMA *dma)
if (is_atapi && !ad->done_atapi_packet) {
/* already prepopulated iobuffer */
ad->done_atapi_packet = 1;
ad->done_atapi_packet = true;
goto out;
}
@ -1035,11 +1035,10 @@ out:
static void ahci_start_dma(IDEDMA *dma, IDEState *s,
BlockDriverCompletionFunc *dma_cb)
{
#ifdef DEBUG_AHCI
AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma);
#endif
DPRINTF(ad->port_no, "\n");
ad->dma_cb = dma_cb;
ad->dma_status |= BM_STATUS_DMAING;
s->io_buffer_offset = 0;
dma_cb(s, 0);
}
@ -1095,7 +1094,6 @@ static int ahci_dma_set_unit(IDEDMA *dma, int unit)
static int ahci_dma_add_status(IDEDMA *dma, int status)
{
AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma);
ad->dma_status |= status;
DPRINTF(ad->port_no, "set status: %x\n", status);
if (status & BM_STATUS_INT) {
@ -1114,8 +1112,6 @@ static int ahci_dma_set_inactive(IDEDMA *dma)
/* update d2h status */
ahci_write_fis_d2h(ad, NULL);
ad->dma_cb = NULL;
if (!ad->check_bh) {
/* maybe we still have something to process, check later */
ad->check_bh = qemu_bh_new(ahci_check_cmd_bh, ad);
@ -1203,6 +1199,82 @@ void ahci_reset(AHCIState *s)
}
}
static const VMStateDescription vmstate_ahci_device = {
.name = "ahci port",
.version_id = 1,
.fields = (VMStateField []) {
VMSTATE_IDE_BUS(port, AHCIDevice),
VMSTATE_UINT32(port_state, AHCIDevice),
VMSTATE_UINT32(finished, AHCIDevice),
VMSTATE_UINT32(port_regs.lst_addr, AHCIDevice),
VMSTATE_UINT32(port_regs.lst_addr_hi, AHCIDevice),
VMSTATE_UINT32(port_regs.fis_addr, AHCIDevice),
VMSTATE_UINT32(port_regs.fis_addr_hi, AHCIDevice),
VMSTATE_UINT32(port_regs.irq_stat, AHCIDevice),
VMSTATE_UINT32(port_regs.irq_mask, AHCIDevice),
VMSTATE_UINT32(port_regs.cmd, AHCIDevice),
VMSTATE_UINT32(port_regs.tfdata, AHCIDevice),
VMSTATE_UINT32(port_regs.sig, AHCIDevice),
VMSTATE_UINT32(port_regs.scr_stat, AHCIDevice),
VMSTATE_UINT32(port_regs.scr_ctl, AHCIDevice),
VMSTATE_UINT32(port_regs.scr_err, AHCIDevice),
VMSTATE_UINT32(port_regs.scr_act, AHCIDevice),
VMSTATE_UINT32(port_regs.cmd_issue, AHCIDevice),
VMSTATE_BOOL(done_atapi_packet, AHCIDevice),
VMSTATE_INT32(busy_slot, AHCIDevice),
VMSTATE_BOOL(init_d2h_sent, AHCIDevice),
VMSTATE_END_OF_LIST()
},
};
static int ahci_state_post_load(void *opaque, int version_id)
{
int i;
struct AHCIDevice *ad;
AHCIState *s = opaque;
for (i = 0; i < s->ports; i++) {
ad = &s->dev[i];
AHCIPortRegs *pr = &ad->port_regs;
map_page(&ad->lst,
((uint64_t)pr->lst_addr_hi << 32) | pr->lst_addr, 1024);
map_page(&ad->res_fis,
((uint64_t)pr->fis_addr_hi << 32) | pr->fis_addr, 256);
/*
* All pending i/o should be flushed out on a migrate. However,
* we might not have cleared the busy_slot since this is done
* in a bh. Also, issue i/o against any slots that are pending.
*/
if ((ad->busy_slot != -1) &&
!(ad->port.ifs[0].status & (BUSY_STAT|DRQ_STAT))) {
pr->cmd_issue &= ~(1 << ad->busy_slot);
ad->busy_slot = -1;
}
check_cmd(s, i);
}
return 0;
}
const VMStateDescription vmstate_ahci = {
.name = "ahci",
.version_id = 1,
.post_load = ahci_state_post_load,
.fields = (VMStateField []) {
VMSTATE_STRUCT_VARRAY_POINTER_INT32(dev, AHCIState, ports,
vmstate_ahci_device, AHCIDevice),
VMSTATE_UINT32(control_regs.cap, AHCIState),
VMSTATE_UINT32(control_regs.ghc, AHCIState),
VMSTATE_UINT32(control_regs.irqstatus, AHCIState),
VMSTATE_UINT32(control_regs.impl, AHCIState),
VMSTATE_UINT32(control_regs.version, AHCIState),
VMSTATE_UINT32(idp_index, AHCIState),
VMSTATE_INT32(ports, AHCIState),
VMSTATE_END_OF_LIST()
},
};
typedef struct SysbusAHCIState {
SysBusDevice busdev;
AHCIState ahci;
@ -1211,7 +1283,11 @@ typedef struct SysbusAHCIState {
static const VMStateDescription vmstate_sysbus_ahci = {
.name = "sysbus-ahci",
.unmigratable = 1,
.unmigratable = 1, /* Still buggy under I/O load */
.fields = (VMStateField []) {
VMSTATE_AHCI(ahci, AHCIPCIState),
VMSTATE_END_OF_LIST()
},
};
static void sysbus_ahci_reset(DeviceState *dev)

View File

@ -281,11 +281,9 @@ struct AHCIDevice {
QEMUBH *check_bh;
uint8_t *lst;
uint8_t *res_fis;
int dma_status;
int done_atapi_packet;
int busy_slot;
int init_d2h_sent;
BlockDriverCompletionFunc *dma_cb;
bool done_atapi_packet;
int32_t busy_slot;
bool init_d2h_sent;
AHCICmdHdr *cur_cmd;
NCQTransferState ncq_tfs[AHCI_MAX_CMDS];
};
@ -297,7 +295,7 @@ typedef struct AHCIState {
MemoryRegion idp; /* Index-Data Pair I/O port space */
unsigned idp_offset; /* Offset of index in I/O port space */
uint32_t idp_index; /* Current IDP index */
int ports;
int32_t ports;
qemu_irq irq;
DMAContext *dma;
} AHCIState;
@ -307,6 +305,16 @@ typedef struct AHCIPCIState {
AHCIState ahci;
} AHCIPCIState;
extern const VMStateDescription vmstate_ahci;
#define VMSTATE_AHCI(_field, _state) { \
.name = (stringify(_field)), \
.size = sizeof(AHCIState), \
.vmsd = &vmstate_ahci, \
.flags = VMS_STRUCT, \
.offset = vmstate_offset_value(_state, _field, AHCIState), \
}
typedef struct NCQFrame {
uint8_t fis_type;
uint8_t c;

View File

@ -1149,8 +1149,10 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val)
}
ide_set_irq(s->bus);
break;
case WIN_VERIFY_EXT:
lba48 = 1;
lba48 = 1;
/* fall through */
case WIN_VERIFY:
case WIN_VERIFY_ONCE:
/* do sector number check ? */
@ -1158,8 +1160,10 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val)
s->status = READY_STAT | SEEK_STAT;
ide_set_irq(s->bus);
break;
case WIN_READ_EXT:
lba48 = 1;
lba48 = 1;
/* fall through */
case WIN_READ:
case WIN_READ_ONCE:
if (s->drive_kind == IDE_CD) {
@ -1173,8 +1177,10 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val)
s->req_nb_sectors = 1;
ide_sector_read(s);
break;
case WIN_WRITE_EXT:
lba48 = 1;
lba48 = 1;
/* fall through */
case WIN_WRITE:
case WIN_WRITE_ONCE:
case CFA_WRITE_SECT_WO_ERASE:
@ -1189,8 +1195,10 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val)
ide_transfer_start(s, s->io_buffer, 512, ide_sector_write);
s->media_changed = 1;
break;
case WIN_MULTREAD_EXT:
lba48 = 1;
lba48 = 1;
/* fall through */
case WIN_MULTREAD:
if (!s->bs) {
goto abort_cmd;
@ -1202,8 +1210,10 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val)
s->req_nb_sectors = s->mult_sectors;
ide_sector_read(s);
break;
case WIN_MULTWRITE_EXT:
lba48 = 1;
lba48 = 1;
/* fall through */
case WIN_MULTWRITE:
case CFA_WRITE_MULTI_WO_ERASE:
if (!s->bs) {
@ -1222,8 +1232,10 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val)
ide_transfer_start(s, s->io_buffer, 512 * n, ide_sector_write);
s->media_changed = 1;
break;
case WIN_READDMA_EXT:
lba48 = 1;
lba48 = 1;
/* fall through */
case WIN_READDMA:
case WIN_READDMA_ONCE:
if (!s->bs) {
@ -1232,8 +1244,10 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val)
ide_cmd_lba48_transform(s, lba48);
ide_sector_start_dma(s, IDE_DMA_READ);
break;
case WIN_WRITEDMA_EXT:
lba48 = 1;
lba48 = 1;
/* fall through */
case WIN_WRITEDMA:
case WIN_WRITEDMA_ONCE:
if (!s->bs) {
@ -1243,14 +1257,17 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val)
ide_sector_start_dma(s, IDE_DMA_WRITE);
s->media_changed = 1;
break;
case WIN_READ_NATIVE_MAX_EXT:
lba48 = 1;
lba48 = 1;
/* fall through */
case WIN_READ_NATIVE_MAX:
ide_cmd_lba48_transform(s, lba48);
ide_set_sector(s, s->nb_sectors - 1);
s->status = READY_STAT | SEEK_STAT;
ide_set_irq(s->bus);
break;
case WIN_CHECKPOWERMODE1:
case WIN_CHECKPOWERMODE2:
s->error = 0;

View File

@ -79,9 +79,15 @@
#define ICH9_IDP_INDEX 0x10
#define ICH9_IDP_INDEX_LOG2 0x04
static const VMStateDescription vmstate_ahci = {
.name = "ahci",
.unmigratable = 1,
static const VMStateDescription vmstate_ich9_ahci = {
.name = "ich9_ahci",
.unmigratable = 1, /* Still buggy under I/O load */
.version_id = 1,
.fields = (VMStateField []) {
VMSTATE_PCI_DEVICE(card, AHCIPCIState),
VMSTATE_AHCI(ahci, AHCIPCIState),
VMSTATE_END_OF_LIST()
},
};
static void pci_ich9_reset(DeviceState *dev)
@ -152,7 +158,7 @@ static void ich_ahci_class_init(ObjectClass *klass, void *data)
k->device_id = PCI_DEVICE_ID_INTEL_82801IR;
k->revision = 0x02;
k->class_id = PCI_CLASS_STORAGE_SATA;
dc->vmsd = &vmstate_ahci;
dc->vmsd = &vmstate_ich9_ahci;
dc->reset = pci_ich9_reset;
}

View File

@ -309,6 +309,10 @@ int bdrv_get_flags(BlockDriverState *bs);
int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
const uint8_t *buf, int nb_sectors);
int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi);
void bdrv_round_to_clusters(BlockDriverState *bs,
int64_t sector_num, int nb_sectors,
int64_t *cluster_sector_num,
int *cluster_nb_sectors);
const char *bdrv_get_encrypted_filename(BlockDriverState *bs);
void bdrv_get_backing_filename(BlockDriverState *bs,
@ -351,13 +355,12 @@ void bdrv_set_buffer_alignment(BlockDriverState *bs, int align);
void *qemu_blockalign(BlockDriverState *bs, size_t size);
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov);
#define BDRV_SECTORS_PER_DIRTY_CHUNK 2048
void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable);
struct HBitmapIter;
void bdrv_set_dirty_tracking(BlockDriverState *bs, int granularity);
int bdrv_get_dirty(BlockDriverState *bs, int64_t sector);
void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors);
void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors);
int64_t bdrv_get_next_dirty(BlockDriverState *bs, int64_t sector);
void bdrv_dirty_iter_init(BlockDriverState *bs, struct HBitmapIter *hbi);
int64_t bdrv_get_dirty_count(BlockDriverState *bs);
void bdrv_enable_copy_on_read(BlockDriverState *bs);

View File

@ -32,6 +32,7 @@
#include "qapi-types.h"
#include "qapi/qmp/qerror.h"
#include "monitor/monitor.h"
#include "qemu/hbitmap.h"
#define BLOCK_FLAG_ENCRYPT 1
#define BLOCK_FLAG_COMPAT6 4
@ -275,8 +276,7 @@ struct BlockDriverState {
bool iostatus_enabled;
BlockDeviceIoStatus iostatus;
char device_name[32];
unsigned long *dirty_bitmap;
int64_t dirty_count;
HBitmap *dirty_bitmap;
int in_use; /* users other than guest access, eg. block migration */
QTAILQ_ENTRY(BlockDriverState) list;
@ -344,6 +344,8 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base,
* @bs: Block device to operate on.
* @target: Block device to write to.
* @speed: The maximum speed, in bytes per second, or 0 for unlimited.
* @granularity: The chosen granularity for the dirty bitmap.
* @buf_size: The amount of data that can be in flight at one time.
* @mode: Whether to collapse all images in the chain to the target.
* @on_source_error: The action to take upon error reading from the source.
* @on_target_error: The action to take upon error writing to the target.
@ -357,8 +359,8 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base,
* @bs will be switched to read from @target.
*/
void mirror_start(BlockDriverState *bs, BlockDriverState *target,
int64_t speed, MirrorSyncMode mode,
BlockdevOnError on_source_error,
int64_t speed, int64_t granularity, int64_t buf_size,
MirrorSyncMode mode, BlockdevOnError on_source_error,
BlockdevOnError on_target_error,
BlockDriverCompletionFunc *cb,
void *opaque, Error **errp);

View File

@ -68,6 +68,9 @@
#if !defined(ECANCELED)
#define ECANCELED 4097
#endif
#if !defined(EMEDIUMTYPE)
#define EMEDIUMTYPE 4098
#endif
#ifndef TIME_MAX
#define TIME_MAX LONG_MAX
#endif

208
include/qemu/hbitmap.h Normal file
View File

@ -0,0 +1,208 @@
/*
* Hierarchical Bitmap Data Type
*
* Copyright Red Hat, Inc., 2012
*
* Author: Paolo Bonzini <pbonzini@redhat.com>
*
* This work is licensed under the terms of the GNU GPL, version 2 or
* later. See the COPYING file in the top-level directory.
*/
#ifndef HBITMAP_H
#define HBITMAP_H 1
#include <limits.h>
#include <stdint.h>
#include <stdbool.h>
#include "bitops.h"
typedef struct HBitmap HBitmap;
typedef struct HBitmapIter HBitmapIter;
#define BITS_PER_LEVEL (BITS_PER_LONG == 32 ? 5 : 6)
/* For 32-bit, the largest that fits in a 4 GiB address space.
* For 64-bit, the number of sectors in 1 PiB. Good luck, in
* either case... :)
*/
#define HBITMAP_LOG_MAX_SIZE (BITS_PER_LONG == 32 ? 34 : 41)
/* We need to place a sentinel in level 0 to speed up iteration. Thus,
* we do this instead of HBITMAP_LOG_MAX_SIZE / BITS_PER_LEVEL. The
* difference is that it allocates an extra level when HBITMAP_LOG_MAX_SIZE
* is an exact multiple of BITS_PER_LEVEL.
*/
#define HBITMAP_LEVELS ((HBITMAP_LOG_MAX_SIZE / BITS_PER_LEVEL) + 1)
struct HBitmapIter {
const HBitmap *hb;
/* Copied from hb for access in the inline functions (hb is opaque). */
int granularity;
/* Entry offset into the last-level array of longs. */
size_t pos;
/* The currently-active path in the tree. Each item of cur[i] stores
* the bits (i.e. the subtrees) yet to be processed under that node.
*/
unsigned long cur[HBITMAP_LEVELS];
};
/**
* hbitmap_alloc:
* @size: Number of bits in the bitmap.
* @granularity: Granularity of the bitmap. Aligned groups of 2^@granularity
* bits will be represented by a single bit. Each operation on a
* range of bits first rounds the bits to determine which group they land
* in, and then affect the entire set; iteration will only visit the first
* bit of each group.
*
* Allocate a new HBitmap.
*/
HBitmap *hbitmap_alloc(uint64_t size, int granularity);
/**
* hbitmap_empty:
* @hb: HBitmap to operate on.
*
* Return whether the bitmap is empty.
*/
bool hbitmap_empty(const HBitmap *hb);
/**
* hbitmap_granularity:
* @hb: HBitmap to operate on.
*
* Return the granularity of the HBitmap.
*/
int hbitmap_granularity(const HBitmap *hb);
/**
* hbitmap_count:
* @hb: HBitmap to operate on.
*
* Return the number of bits set in the HBitmap.
*/
uint64_t hbitmap_count(const HBitmap *hb);
/**
* hbitmap_set:
* @hb: HBitmap to operate on.
* @start: First bit to set (0-based).
* @count: Number of bits to set.
*
* Set a consecutive range of bits in an HBitmap.
*/
void hbitmap_set(HBitmap *hb, uint64_t start, uint64_t count);
/**
* hbitmap_reset:
* @hb: HBitmap to operate on.
* @start: First bit to reset (0-based).
* @count: Number of bits to reset.
*
* Reset a consecutive range of bits in an HBitmap.
*/
void hbitmap_reset(HBitmap *hb, uint64_t start, uint64_t count);
/**
* hbitmap_get:
* @hb: HBitmap to operate on.
* @item: Bit to query (0-based).
*
* Return whether the @item-th bit in an HBitmap is set.
*/
bool hbitmap_get(const HBitmap *hb, uint64_t item);
/**
* hbitmap_free:
* @hb: HBitmap to operate on.
*
* Free an HBitmap and all of its associated memory.
*/
void hbitmap_free(HBitmap *hb);
/**
* hbitmap_iter_init:
* @hbi: HBitmapIter to initialize.
* @hb: HBitmap to iterate on.
* @first: First bit to visit (0-based, must be strictly less than the
* size of the bitmap).
*
* Set up @hbi to iterate on the HBitmap @hb. hbitmap_iter_next will return
* the lowest-numbered bit that is set in @hb, starting at @first.
*
* Concurrent setting of bits is acceptable, and will at worst cause the
* iteration to miss some of those bits. Resetting bits before the current
* position of the iterator is also okay. However, concurrent resetting of
* bits can lead to unexpected behavior if the iterator has not yet reached
* those bits.
*/
void hbitmap_iter_init(HBitmapIter *hbi, const HBitmap *hb, uint64_t first);
/* hbitmap_iter_skip_words:
* @hbi: HBitmapIter to operate on.
*
* Internal function used by hbitmap_iter_next and hbitmap_iter_next_word.
*/
unsigned long hbitmap_iter_skip_words(HBitmapIter *hbi);
/**
* hbitmap_iter_next:
* @hbi: HBitmapIter to operate on.
*
* Return the next bit that is set in @hbi's associated HBitmap,
* or -1 if all remaining bits are zero.
*/
static inline int64_t hbitmap_iter_next(HBitmapIter *hbi)
{
unsigned long cur = hbi->cur[HBITMAP_LEVELS - 1];
int64_t item;
if (cur == 0) {
cur = hbitmap_iter_skip_words(hbi);
if (cur == 0) {
return -1;
}
}
/* The next call will resume work from the next bit. */
hbi->cur[HBITMAP_LEVELS - 1] = cur & (cur - 1);
item = ((uint64_t)hbi->pos << BITS_PER_LEVEL) + ffsl(cur) - 1;
return item << hbi->granularity;
}
/**
* hbitmap_iter_next_word:
* @hbi: HBitmapIter to operate on.
* @p_cur: Location where to store the next non-zero word.
*
* Return the index of the next nonzero word that is set in @hbi's
* associated HBitmap, and set *p_cur to the content of that word
* (bits before the index that was passed to hbitmap_iter_init are
* trimmed on the first call). Return -1, and set *p_cur to zero,
* if all remaining words are zero.
*/
static inline size_t hbitmap_iter_next_word(HBitmapIter *hbi, unsigned long *p_cur)
{
unsigned long cur = hbi->cur[HBITMAP_LEVELS - 1];
if (cur == 0) {
cur = hbitmap_iter_skip_words(hbi);
if (cur == 0) {
*p_cur = 0;
return -1;
}
}
/* The next call will resume work from the next word. */
hbi->cur[HBITMAP_LEVELS - 1] = 0;
*p_cur = cur;
return hbi->pos;
}
#endif

View File

@ -26,6 +26,7 @@
#define HOST_UTILS_H 1
#include "qemu/compiler.h" /* QEMU_GNUC_PREREQ */
#include <string.h> /* ffsl */
#if defined(__x86_64__)
#define __HAVE_FAST_MULU64__
@ -237,4 +238,29 @@ static inline int ctpop64(uint64_t val)
#endif
}
/* glibc does not provide an inline version of ffsl, so always define
* ours. We need to give it a different name, however.
*/
#ifdef __GLIBC__
#define ffsl qemu_ffsl
#endif
static inline int ffsl(long val)
{
if (!val) {
return 0;
}
#if QEMU_GNUC_PREREQ(3, 4)
return __builtin_ctzl(val) + 1;
#else
if (sizeof(long) == 4) {
return ctz32(val) + 1;
} else if (sizeof(long) == 8) {
return ctz64(val) + 1;
} else {
abort();
}
#endif
}
#endif

View File

@ -741,10 +741,12 @@
#
# @count: number of dirty bytes according to the dirty bitmap
#
# @granularity: granularity of the dirty bitmap in bytes (since 1.4)
#
# Since: 1.3
##
{ 'type': 'BlockDirtyInfo',
'data': {'count': 'int'} }
'data': {'count': 'int', 'granularity': 'int'} }
##
# @BlockInfo:
@ -1690,6 +1692,14 @@
# (all the disk, only the sectors allocated in the topmost image, or
# only new I/O).
#
# @granularity: #optional granularity of the dirty bitmap, default is 64K
# if the image format doesn't have clusters, 4K if the clusters
# are smaller than that, else the cluster size. Must be a
# power of 2 between 512 and 64M (since 1.4).
#
# @buf-size: #optional maximum amount of data in flight from source to
# target (since 1.4).
#
# @on-source-error: #optional the action to take on an error on the source,
# default 'report'. 'stop' and 'enospc' can only be used
# if the block device supports io-status (see BlockInfo).
@ -1706,7 +1716,8 @@
{ 'command': 'drive-mirror',
'data': { 'device': 'str', 'target': 'str', '*format': 'str',
'sync': 'MirrorSyncMode', '*mode': 'NewImageMode',
'*speed': 'int', '*on-source-error': 'BlockdevOnError',
'*speed': 'int', '*granularity': 'uint32',
'*buf-size': 'int', '*on-source-error': 'BlockdevOnError',
'*on-target-error': 'BlockdevOnError' } }
##

View File

@ -1004,7 +1004,8 @@ EQMP
{
.name = "drive-mirror",
.args_type = "sync:s,device:B,target:s,speed:i?,mode:s?,format:s?,"
"on-source-error:s?,on-target-error:s?",
"on-source-error:s?,on-target-error:s?,"
"granularity:i?,buf-size:i?",
.mhandler.cmd_new = qmp_marshal_input_drive_mirror,
},
@ -1028,6 +1029,9 @@ Arguments:
file/device (NewImageMode, optional, default 'absolute-paths')
- "speed": maximum speed of the streaming job, in bytes per second
(json-int)
- "granularity": granularity of the dirty bitmap, in bytes (json-int, optional)
- "buf_size": maximum amount of data in flight from source to target, in bytes
(json-int, default 10M)
- "sync": what parts of the disk image should be copied to the destination;
possibilities include "full" for all the disk, "top" for only the sectors
allocated in the topmost image, or "none" to only replicate new I/O
@ -1037,6 +1041,10 @@ Arguments:
- "on-target-error": the action to take on an error on the target
(BlockdevOnError, default 'report')
The default value of the granularity is the image cluster size clamped
between 4096 and 65536, if the image format defines one. If the format
does not define a cluster size, the default value of the granularity
is 65536.
Example:

View File

@ -45,6 +45,8 @@ gcov-files-test-aio-$(CONFIG_WIN32) = aio-win32.c
gcov-files-test-aio-$(CONFIG_POSIX) = aio-posix.c
check-unit-y += tests/test-thread-pool$(EXESUF)
gcov-files-test-thread-pool-y = thread-pool.c
gcov-files-test-hbitmap-y = util/hbitmap.c
check-unit-y += tests/test-hbitmap$(EXESUF)
check-block-$(CONFIG_POSIX) += tests/qemu-iotests-quick.sh
@ -88,6 +90,7 @@ tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(block-obj-y) libqemuutil
tests/test-aio$(EXESUF): tests/test-aio.o $(block-obj-y) libqemuutil.a libqemustub.a
tests/test-thread-pool$(EXESUF): tests/test-thread-pool.o $(block-obj-y) libqemuutil.a libqemustub.a
tests/test-iov$(EXESUF): tests/test-iov.o libqemuutil.a
tests/test-hbitmap$(EXESUF): tests/test-hbitmap.o libqemuutil.a libqemustub.a
tests/test-qapi-types.c tests/test-qapi-types.h :\
$(SRC_PATH)/qapi-schema-test.json $(SRC_PATH)/scripts/qapi-types.py

View File

@ -207,6 +207,37 @@ class TestSingleDrive(ImageMirroringTestCase):
self.assertTrue(self.compare_images(test_img, target_img),
'target image does not match source after mirroring')
def test_small_buffer(self):
self.assert_no_active_mirrors()
# A small buffer is rounded up automatically
result = self.vm.qmp('drive-mirror', device='drive0', sync='full',
buf_size=4096, target=target_img)
self.assert_qmp(result, 'return', {})
self.complete_and_wait()
result = self.vm.qmp('query-block')
self.assert_qmp(result, 'return[0]/inserted/file', target_img)
self.vm.shutdown()
self.assertTrue(self.compare_images(test_img, target_img),
'target image does not match source after mirroring')
def test_small_buffer2(self):
self.assert_no_active_mirrors()
qemu_img('create', '-f', iotests.imgfmt, '-o', 'cluster_size=%d,size=%d'
% (TestSingleDrive.image_len, TestSingleDrive.image_len), target_img)
result = self.vm.qmp('drive-mirror', device='drive0', sync='full',
buf_size=65536, mode='existing', target=target_img)
self.assert_qmp(result, 'return', {})
self.complete_and_wait()
result = self.vm.qmp('query-block')
self.assert_qmp(result, 'return[0]/inserted/file', target_img)
self.vm.shutdown()
self.assertTrue(self.compare_images(test_img, target_img),
'target image does not match source after mirroring')
def test_large_cluster(self):
self.assert_no_active_mirrors()
@ -292,6 +323,27 @@ class TestMirrorNoBacking(ImageMirroringTestCase):
self.assertTrue(self.compare_images(test_img, target_img),
'target image does not match source after mirroring')
def test_large_cluster(self):
self.assert_no_active_mirrors()
# qemu-img create fails if the image is not there
qemu_img('create', '-f', iotests.imgfmt, '-o', 'size=%d'
%(TestMirrorNoBacking.image_len), target_backing_img)
qemu_img('create', '-f', iotests.imgfmt, '-o', 'cluster_size=%d,backing_file=%s'
% (TestMirrorNoBacking.image_len, target_backing_img), target_img)
os.remove(target_backing_img)
result = self.vm.qmp('drive-mirror', device='drive0', sync='full',
mode='existing', target=target_img)
self.assert_qmp(result, 'return', {})
self.complete_and_wait()
result = self.vm.qmp('query-block')
self.assert_qmp(result, 'return[0]/inserted/file', target_img)
self.vm.shutdown()
self.assertTrue(self.compare_images(test_img, target_img),
'target image does not match source after mirroring')
class TestReadErrors(ImageMirroringTestCase):
image_len = 2 * 1024 * 1024 # MB
@ -330,6 +382,9 @@ new_state = "1"
'-o', 'backing_file=blkdebug:%s:%s,backing_fmt=raw'
% (self.blkdebug_file, backing_img),
test_img)
# Write something for tests that use sync='top'
qemu_io('-c', 'write %d 512' % (self.MIRROR_GRANULARITY + 65536),
test_img)
self.vm = iotests.VM().add_drive(test_img)
self.vm.launch()
@ -383,6 +438,32 @@ new_state = "1"
self.complete_and_wait()
self.vm.shutdown()
def test_large_cluster(self):
self.assert_no_active_mirrors()
# Test COW into the target image. The first half of the
# cluster at MIRROR_GRANULARITY has to be copied from
# backing_img, even though sync='top'.
qemu_img('create', '-f', iotests.imgfmt, '-ocluster_size=131072,backing_file=%s' %(backing_img), target_img)
result = self.vm.qmp('drive-mirror', device='drive0', sync='top',
on_source_error='ignore',
mode='existing', target=target_img)
self.assert_qmp(result, 'return', {})
event = self.vm.get_qmp_event(wait=True)
self.assertEquals(event['event'], 'BLOCK_JOB_ERROR')
self.assert_qmp(event, 'data/device', 'drive0')
self.assert_qmp(event, 'data/operation', 'read')
result = self.vm.qmp('query-block-jobs')
self.assert_qmp(result, 'return[0]/paused', False)
self.complete_and_wait()
self.vm.shutdown()
# Detach blkdebug to compare images successfully
qemu_img('rebase', '-f', iotests.imgfmt, '-u', '-b', backing_img, test_img)
self.assertTrue(self.compare_images(test_img, target_img),
'target image does not match source after mirroring')
def test_stop_read(self):
self.assert_no_active_mirrors()

View File

@ -1,5 +1,5 @@
..................
......................
----------------------------------------------------------------------
Ran 18 tests
Ran 22 tests
OK

401
tests/test-hbitmap.c Normal file
View File

@ -0,0 +1,401 @@
/*
* Hierarchical bitmap unit-tests.
*
* Copyright (C) 2012 Red Hat Inc.
*
* Author: Paolo Bonzini <pbonzini@redhat.com>
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*/
#include <glib.h>
#include <stdarg.h>
#include "qemu/hbitmap.h"
#define LOG_BITS_PER_LONG (BITS_PER_LONG == 32 ? 5 : 6)
#define L1 BITS_PER_LONG
#define L2 (BITS_PER_LONG * L1)
#define L3 (BITS_PER_LONG * L2)
typedef struct TestHBitmapData {
HBitmap *hb;
unsigned long *bits;
size_t size;
int granularity;
} TestHBitmapData;
/* Check that the HBitmap and the shadow bitmap contain the same data,
* ignoring the same "first" bits.
*/
static void hbitmap_test_check(TestHBitmapData *data,
uint64_t first)
{
uint64_t count = 0;
size_t pos;
int bit;
HBitmapIter hbi;
int64_t i, next;
hbitmap_iter_init(&hbi, data->hb, first);
i = first;
for (;;) {
next = hbitmap_iter_next(&hbi);
if (next < 0) {
next = data->size;
}
while (i < next) {
pos = i >> LOG_BITS_PER_LONG;
bit = i & (BITS_PER_LONG - 1);
i++;
g_assert_cmpint(data->bits[pos] & (1UL << bit), ==, 0);
}
if (next == data->size) {
break;
}
pos = i >> LOG_BITS_PER_LONG;
bit = i & (BITS_PER_LONG - 1);
i++;
count++;
g_assert_cmpint(data->bits[pos] & (1UL << bit), !=, 0);
}
if (first == 0) {
g_assert_cmpint(count << data->granularity, ==, hbitmap_count(data->hb));
}
}
/* This is provided instead of a test setup function so that the sizes
are kept in the test functions (and not in main()) */
static void hbitmap_test_init(TestHBitmapData *data,
uint64_t size, int granularity)
{
size_t n;
data->hb = hbitmap_alloc(size, granularity);
n = (size + BITS_PER_LONG - 1) / BITS_PER_LONG;
if (n == 0) {
n = 1;
}
data->bits = g_new0(unsigned long, n);
data->size = size;
data->granularity = granularity;
if (size) {
hbitmap_test_check(data, 0);
}
}
static void hbitmap_test_teardown(TestHBitmapData *data,
const void *unused)
{
if (data->hb) {
hbitmap_free(data->hb);
data->hb = NULL;
}
if (data->bits) {
g_free(data->bits);
data->bits = NULL;
}
}
/* Set a range in the HBitmap and in the shadow "simple" bitmap.
* The two bitmaps are then tested against each other.
*/
static void hbitmap_test_set(TestHBitmapData *data,
uint64_t first, uint64_t count)
{
hbitmap_set(data->hb, first, count);
while (count-- != 0) {
size_t pos = first >> LOG_BITS_PER_LONG;
int bit = first & (BITS_PER_LONG - 1);
first++;
data->bits[pos] |= 1UL << bit;
}
if (data->granularity == 0) {
hbitmap_test_check(data, 0);
}
}
/* Reset a range in the HBitmap and in the shadow "simple" bitmap.
*/
static void hbitmap_test_reset(TestHBitmapData *data,
uint64_t first, uint64_t count)
{
hbitmap_reset(data->hb, first, count);
while (count-- != 0) {
size_t pos = first >> LOG_BITS_PER_LONG;
int bit = first & (BITS_PER_LONG - 1);
first++;
data->bits[pos] &= ~(1UL << bit);
}
if (data->granularity == 0) {
hbitmap_test_check(data, 0);
}
}
static void hbitmap_test_check_get(TestHBitmapData *data)
{
uint64_t count = 0;
uint64_t i;
for (i = 0; i < data->size; i++) {
size_t pos = i >> LOG_BITS_PER_LONG;
int bit = i & (BITS_PER_LONG - 1);
unsigned long val = data->bits[pos] & (1UL << bit);
count += hbitmap_get(data->hb, i);
g_assert_cmpint(hbitmap_get(data->hb, i), ==, val != 0);
}
g_assert_cmpint(count, ==, hbitmap_count(data->hb));
}
static void test_hbitmap_zero(TestHBitmapData *data,
const void *unused)
{
hbitmap_test_init(data, 0, 0);
}
static void test_hbitmap_unaligned(TestHBitmapData *data,
const void *unused)
{
hbitmap_test_init(data, L3 + 23, 0);
hbitmap_test_set(data, 0, 1);
hbitmap_test_set(data, L3 + 22, 1);
}
static void test_hbitmap_iter_empty(TestHBitmapData *data,
const void *unused)
{
hbitmap_test_init(data, L1, 0);
}
static void test_hbitmap_iter_partial(TestHBitmapData *data,
const void *unused)
{
hbitmap_test_init(data, L3, 0);
hbitmap_test_set(data, 0, L3);
hbitmap_test_check(data, 1);
hbitmap_test_check(data, L1 - 1);
hbitmap_test_check(data, L1);
hbitmap_test_check(data, L1 * 2 - 1);
hbitmap_test_check(data, L2 - 1);
hbitmap_test_check(data, L2);
hbitmap_test_check(data, L2 + 1);
hbitmap_test_check(data, L2 + L1);
hbitmap_test_check(data, L2 + L1 * 2 - 1);
hbitmap_test_check(data, L2 * 2 - 1);
hbitmap_test_check(data, L2 * 2);
hbitmap_test_check(data, L2 * 2 + 1);
hbitmap_test_check(data, L2 * 2 + L1);
hbitmap_test_check(data, L2 * 2 + L1 * 2 - 1);
hbitmap_test_check(data, L3 / 2);
}
static void test_hbitmap_set_all(TestHBitmapData *data,
const void *unused)
{
hbitmap_test_init(data, L3, 0);
hbitmap_test_set(data, 0, L3);
}
static void test_hbitmap_get_all(TestHBitmapData *data,
const void *unused)
{
hbitmap_test_init(data, L3, 0);
hbitmap_test_set(data, 0, L3);
hbitmap_test_check_get(data);
}
static void test_hbitmap_get_some(TestHBitmapData *data,
const void *unused)
{
hbitmap_test_init(data, 2 * L2, 0);
hbitmap_test_set(data, 10, 1);
hbitmap_test_check_get(data);
hbitmap_test_set(data, L1 - 1, 1);
hbitmap_test_check_get(data);
hbitmap_test_set(data, L1, 1);
hbitmap_test_check_get(data);
hbitmap_test_set(data, L2 - 1, 1);
hbitmap_test_check_get(data);
hbitmap_test_set(data, L2, 1);
hbitmap_test_check_get(data);
}
static void test_hbitmap_set_one(TestHBitmapData *data,
const void *unused)
{
hbitmap_test_init(data, 2 * L2, 0);
hbitmap_test_set(data, 10, 1);
hbitmap_test_set(data, L1 - 1, 1);
hbitmap_test_set(data, L1, 1);
hbitmap_test_set(data, L2 - 1, 1);
hbitmap_test_set(data, L2, 1);
}
static void test_hbitmap_set_two_elem(TestHBitmapData *data,
const void *unused)
{
hbitmap_test_init(data, 2 * L2, 0);
hbitmap_test_set(data, L1 - 1, 2);
hbitmap_test_set(data, L1 * 2 - 1, 4);
hbitmap_test_set(data, L1 * 4, L1 + 1);
hbitmap_test_set(data, L1 * 8 - 1, L1 + 1);
hbitmap_test_set(data, L2 - 1, 2);
hbitmap_test_set(data, L2 + L1 - 1, 8);
hbitmap_test_set(data, L2 + L1 * 4, L1 + 1);
hbitmap_test_set(data, L2 + L1 * 8 - 1, L1 + 1);
}
static void test_hbitmap_set(TestHBitmapData *data,
const void *unused)
{
hbitmap_test_init(data, L3 * 2, 0);
hbitmap_test_set(data, L1 - 1, L1 + 2);
hbitmap_test_set(data, L1 * 3 - 1, L1 + 2);
hbitmap_test_set(data, L1 * 5, L1 * 2 + 1);
hbitmap_test_set(data, L1 * 8 - 1, L1 * 2 + 1);
hbitmap_test_set(data, L2 - 1, L1 + 2);
hbitmap_test_set(data, L2 + L1 * 2 - 1, L1 + 2);
hbitmap_test_set(data, L2 + L1 * 4, L1 * 2 + 1);
hbitmap_test_set(data, L2 + L1 * 7 - 1, L1 * 2 + 1);
hbitmap_test_set(data, L2 * 2 - 1, L3 * 2 - L2 * 2);
}
static void test_hbitmap_set_twice(TestHBitmapData *data,
const void *unused)
{
hbitmap_test_init(data, L1 * 3, 0);
hbitmap_test_set(data, 0, L1 * 3);
hbitmap_test_set(data, L1, 1);
}
static void test_hbitmap_set_overlap(TestHBitmapData *data,
const void *unused)
{
hbitmap_test_init(data, L3 * 2, 0);
hbitmap_test_set(data, L1 - 1, L1 + 2);
hbitmap_test_set(data, L1 * 2 - 1, L1 * 2 + 2);
hbitmap_test_set(data, 0, L1 * 3);
hbitmap_test_set(data, L1 * 8 - 1, L2);
hbitmap_test_set(data, L2, L1);
hbitmap_test_set(data, L2 - L1 - 1, L1 * 8 + 2);
hbitmap_test_set(data, L2, L3 - L2 + 1);
hbitmap_test_set(data, L3 - L1, L1 * 3);
hbitmap_test_set(data, L3 - 1, 3);
hbitmap_test_set(data, L3 - 1, L2);
}
static void test_hbitmap_reset_empty(TestHBitmapData *data,
const void *unused)
{
hbitmap_test_init(data, L3, 0);
hbitmap_test_reset(data, 0, L3);
}
static void test_hbitmap_reset(TestHBitmapData *data,
const void *unused)
{
hbitmap_test_init(data, L3 * 2, 0);
hbitmap_test_set(data, L1 - 1, L1 + 2);
hbitmap_test_reset(data, L1 * 2 - 1, L1 * 2 + 2);
hbitmap_test_set(data, 0, L1 * 3);
hbitmap_test_reset(data, L1 * 8 - 1, L2);
hbitmap_test_set(data, L2, L1);
hbitmap_test_reset(data, L2 - L1 - 1, L1 * 8 + 2);
hbitmap_test_set(data, L2, L3 - L2 + 1);
hbitmap_test_reset(data, L3 - L1, L1 * 3);
hbitmap_test_set(data, L3 - 1, 3);
hbitmap_test_reset(data, L3 - 1, L2);
hbitmap_test_set(data, 0, L3 * 2);
hbitmap_test_reset(data, 0, L1);
hbitmap_test_reset(data, 0, L2);
hbitmap_test_reset(data, L3, L3);
hbitmap_test_set(data, L3 / 2, L3);
}
static void test_hbitmap_granularity(TestHBitmapData *data,
const void *unused)
{
/* Note that hbitmap_test_check has to be invoked manually in this test. */
hbitmap_test_init(data, L1, 1);
hbitmap_test_set(data, 0, 1);
g_assert_cmpint(hbitmap_count(data->hb), ==, 2);
hbitmap_test_check(data, 0);
hbitmap_test_set(data, 2, 1);
g_assert_cmpint(hbitmap_count(data->hb), ==, 4);
hbitmap_test_check(data, 0);
hbitmap_test_set(data, 0, 3);
g_assert_cmpint(hbitmap_count(data->hb), ==, 4);
hbitmap_test_reset(data, 0, 1);
g_assert_cmpint(hbitmap_count(data->hb), ==, 2);
}
static void test_hbitmap_iter_granularity(TestHBitmapData *data,
const void *unused)
{
HBitmapIter hbi;
/* Note that hbitmap_test_check has to be invoked manually in this test. */
hbitmap_test_init(data, 131072 << 7, 7);
hbitmap_iter_init(&hbi, data->hb, 0);
g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0);
hbitmap_test_set(data, ((L2 + L1 + 1) << 7) + 8, 8);
hbitmap_iter_init(&hbi, data->hb, 0);
g_assert_cmpint(hbitmap_iter_next(&hbi), ==, (L2 + L1 + 1) << 7);
g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0);
hbitmap_iter_init(&hbi, data->hb, (L2 + L1 + 2) << 7);
g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0);
hbitmap_test_set(data, (131072 << 7) - 8, 8);
hbitmap_iter_init(&hbi, data->hb, 0);
g_assert_cmpint(hbitmap_iter_next(&hbi), ==, (L2 + L1 + 1) << 7);
g_assert_cmpint(hbitmap_iter_next(&hbi), ==, 131071 << 7);
g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0);
hbitmap_iter_init(&hbi, data->hb, (L2 + L1 + 2) << 7);
g_assert_cmpint(hbitmap_iter_next(&hbi), ==, 131071 << 7);
g_assert_cmpint(hbitmap_iter_next(&hbi), <, 0);
}
static void hbitmap_test_add(const char *testpath,
void (*test_func)(TestHBitmapData *data, const void *user_data))
{
g_test_add(testpath, TestHBitmapData, NULL, NULL, test_func,
hbitmap_test_teardown);
}
int main(int argc, char **argv)
{
g_test_init(&argc, &argv, NULL);
hbitmap_test_add("/hbitmap/size/0", test_hbitmap_zero);
hbitmap_test_add("/hbitmap/size/unaligned", test_hbitmap_unaligned);
hbitmap_test_add("/hbitmap/iter/empty", test_hbitmap_iter_empty);
hbitmap_test_add("/hbitmap/iter/partial", test_hbitmap_iter_partial);
hbitmap_test_add("/hbitmap/iter/granularity", test_hbitmap_iter_granularity);
hbitmap_test_add("/hbitmap/get/all", test_hbitmap_get_all);
hbitmap_test_add("/hbitmap/get/some", test_hbitmap_get_some);
hbitmap_test_add("/hbitmap/set/all", test_hbitmap_set_all);
hbitmap_test_add("/hbitmap/set/one", test_hbitmap_set_one);
hbitmap_test_add("/hbitmap/set/two-elem", test_hbitmap_set_two_elem);
hbitmap_test_add("/hbitmap/set/general", test_hbitmap_set);
hbitmap_test_add("/hbitmap/set/twice", test_hbitmap_set_twice);
hbitmap_test_add("/hbitmap/set/overlap", test_hbitmap_set_overlap);
hbitmap_test_add("/hbitmap/reset/empty", test_hbitmap_reset_empty);
hbitmap_test_add("/hbitmap/reset/general", test_hbitmap_reset);
hbitmap_test_add("/hbitmap/granularity", test_hbitmap_granularity);
g_test_run();
return 0;
}

View File

@ -79,10 +79,17 @@ commit_start(void *bs, void *base, void *top, void *s, void *co, void *opaque) "
# block/mirror.c
mirror_start(void *bs, void *s, void *co, void *opaque) "bs %p s %p co %p opaque %p"
mirror_restart_iter(void *s, int64_t cnt) "s %p dirty count %"PRId64
mirror_before_flush(void *s) "s %p"
mirror_before_drain(void *s, int64_t cnt) "s %p dirty count %"PRId64
mirror_before_sleep(void *s, int64_t cnt, int synced) "s %p dirty count %"PRId64" synced %d"
mirror_one_iteration(void *s, int64_t sector_num, int nb_sectors) "s %p sector_num %"PRId64" nb_sectors %d"
mirror_cow(void *s, int64_t sector_num) "s %p sector_num %"PRId64
mirror_iteration_done(void *s, int64_t sector_num, int nb_sectors, int ret) "s %p sector_num %"PRId64" nb_sectors %d ret %d"
mirror_yield(void *s, int64_t cnt, int buf_free_count, int in_flight) "s %p dirty count %"PRId64" free buffers %d in_flight %d"
mirror_yield_in_flight(void *s, int64_t sector_num, int in_flight) "s %p sector_num %"PRId64" in_flight %d"
mirror_yield_buf_busy(void *s, int nb_chunks, int in_flight) "s %p requested chunks %d in_flight %d"
mirror_break_buf_busy(void *s, int nb_chunks, int in_flight) "s %p requested chunks %d in_flight %d"
# blockdev.c
qmp_block_job_cancel(void *job) "job %p"
@ -1060,3 +1067,8 @@ xics_set_irq_lsi(int srcno, int nr) "set_irq_lsi: srcno %d [irq %#x]"
xics_ics_write_xive(int nr, int srcno, int server, uint8_t priority) "ics_write_xive: irq %#x [src %d] server %#x prio %#x"
xics_ics_reject(int nr, int srcno) "reject irq %#x [src %d]"
xics_ics_eoi(int nr) "ics_eoi: irq %#x"
# hbitmap.c
hbitmap_iter_skip_words(const void *hb, void *hbi, uint64_t pos, unsigned long cur) "hb %p hbi %p pos %"PRId64" cur 0x%lx"
hbitmap_reset(void *hb, uint64_t start, uint64_t count, uint64_t sbit, uint64_t ebit) "hb %p items %"PRIu64",%"PRIu64" bits %"PRIu64"..%"PRIu64
hbitmap_set(void *hb, uint64_t start, uint64_t count, uint64_t sbit, uint64_t ebit) "hb %p items %"PRIu64",%"PRIu64" bits %"PRIu64"..%"PRIu64

View File

@ -2,7 +2,7 @@ util-obj-y = osdep.o cutils.o qemu-timer-common.o
util-obj-$(CONFIG_WIN32) += oslib-win32.o qemu-thread-win32.o event_notifier-win32.o
util-obj-$(CONFIG_POSIX) += oslib-posix.o qemu-thread-posix.o event_notifier-posix.o
util-obj-y += envlist.o path.o host-utils.o cache-utils.o module.o
util-obj-y += bitmap.o bitops.o
util-obj-y += bitmap.o bitops.o hbitmap.o
util-obj-y += acl.o
util-obj-y += error.o qemu-error.o
util-obj-$(CONFIG_POSIX) += compatfd.o

401
util/hbitmap.c Normal file
View File

@ -0,0 +1,401 @@
/*
* Hierarchical Bitmap Data Type
*
* Copyright Red Hat, Inc., 2012
*
* Author: Paolo Bonzini <pbonzini@redhat.com>
*
* This work is licensed under the terms of the GNU GPL, version 2 or
* later. See the COPYING file in the top-level directory.
*/
#include <string.h>
#include <glib.h>
#include <assert.h>
#include "qemu/osdep.h"
#include "qemu/hbitmap.h"
#include "qemu/host-utils.h"
#include "trace.h"
/* HBitmaps provides an array of bits. The bits are stored as usual in an
* array of unsigned longs, but HBitmap is also optimized to provide fast
* iteration over set bits; going from one bit to the next is O(logB n)
* worst case, with B = sizeof(long) * CHAR_BIT: the result is low enough
* that the number of levels is in fact fixed.
*
* In order to do this, it stacks multiple bitmaps with progressively coarser
* granularity; in all levels except the last, bit N is set iff the N-th
* unsigned long is nonzero in the immediately next level. When iteration
* completes on the last level it can examine the 2nd-last level to quickly
* skip entire words, and even do so recursively to skip blocks of 64 words or
* powers thereof (32 on 32-bit machines).
*
* Given an index in the bitmap, it can be split in group of bits like
* this (for the 64-bit case):
*
* bits 0-57 => word in the last bitmap | bits 58-63 => bit in the word
* bits 0-51 => word in the 2nd-last bitmap | bits 52-57 => bit in the word
* bits 0-45 => word in the 3rd-last bitmap | bits 46-51 => bit in the word
*
* So it is easy to move up simply by shifting the index right by
* log2(BITS_PER_LONG) bits. To move down, you shift the index left
* similarly, and add the word index within the group. Iteration uses
* ffs (find first set bit) to find the next word to examine; this
* operation can be done in constant time in most current architectures.
*
* Setting or clearing a range of m bits on all levels, the work to perform
* is O(m + m/W + m/W^2 + ...), which is O(m) like on a regular bitmap.
*
* When iterating on a bitmap, each bit (on any level) is only visited
* once. Hence, The total cost of visiting a bitmap with m bits in it is
* the number of bits that are set in all bitmaps. Unless the bitmap is
* extremely sparse, this is also O(m + m/W + m/W^2 + ...), so the amortized
* cost of advancing from one bit to the next is usually constant (worst case
* O(logB n) as in the non-amortized complexity).
*/
struct HBitmap {
/* Number of total bits in the bottom level. */
uint64_t size;
/* Number of set bits in the bottom level. */
uint64_t count;
/* A scaling factor. Given a granularity of G, each bit in the bitmap will
* will actually represent a group of 2^G elements. Each operation on a
* range of bits first rounds the bits to determine which group they land
* in, and then affect the entire page; iteration will only visit the first
* bit of each group. Here is an example of operations in a size-16,
* granularity-1 HBitmap:
*
* initial state 00000000
* set(start=0, count=9) 11111000 (iter: 0, 2, 4, 6, 8)
* reset(start=1, count=3) 00111000 (iter: 4, 6, 8)
* set(start=9, count=2) 00111100 (iter: 4, 6, 8, 10)
* reset(start=5, count=5) 00000000
*
* From an implementation point of view, when setting or resetting bits,
* the bitmap will scale bit numbers right by this amount of bits. When
* iterating, the bitmap will scale bit numbers left by this amount of
* bits.
*/
int granularity;
/* A number of progressively less coarse bitmaps (i.e. level 0 is the
* coarsest). Each bit in level N represents a word in level N+1 that
* has a set bit, except the last level where each bit represents the
* actual bitmap.
*
* Note that all bitmaps have the same number of levels. Even a 1-bit
* bitmap will still allocate HBITMAP_LEVELS arrays.
*/
unsigned long *levels[HBITMAP_LEVELS];
};
static inline int popcountl(unsigned long l)
{
return BITS_PER_LONG == 32 ? ctpop32(l) : ctpop64(l);
}
/* Advance hbi to the next nonzero word and return it. hbi->pos
* is updated. Returns zero if we reach the end of the bitmap.
*/
unsigned long hbitmap_iter_skip_words(HBitmapIter *hbi)
{
size_t pos = hbi->pos;
const HBitmap *hb = hbi->hb;
unsigned i = HBITMAP_LEVELS - 1;
unsigned long cur;
do {
cur = hbi->cur[--i];
pos >>= BITS_PER_LEVEL;
} while (cur == 0);
/* Check for end of iteration. We always use fewer than BITS_PER_LONG
* bits in the level 0 bitmap; thus we can repurpose the most significant
* bit as a sentinel. The sentinel is set in hbitmap_alloc and ensures
* that the above loop ends even without an explicit check on i.
*/
if (i == 0 && cur == (1UL << (BITS_PER_LONG - 1))) {
return 0;
}
for (; i < HBITMAP_LEVELS - 1; i++) {
/* Shift back pos to the left, matching the right shifts above.
* The index of this word's least significant set bit provides
* the low-order bits.
*/
pos = (pos << BITS_PER_LEVEL) + ffsl(cur) - 1;
hbi->cur[i] = cur & (cur - 1);
/* Set up next level for iteration. */
cur = hb->levels[i + 1][pos];
}
hbi->pos = pos;
trace_hbitmap_iter_skip_words(hbi->hb, hbi, pos, cur);
assert(cur);
return cur;
}
void hbitmap_iter_init(HBitmapIter *hbi, const HBitmap *hb, uint64_t first)
{
unsigned i, bit;
uint64_t pos;
hbi->hb = hb;
pos = first >> hb->granularity;
assert(pos < hb->size);
hbi->pos = pos >> BITS_PER_LEVEL;
hbi->granularity = hb->granularity;
for (i = HBITMAP_LEVELS; i-- > 0; ) {
bit = pos & (BITS_PER_LONG - 1);
pos >>= BITS_PER_LEVEL;
/* Drop bits representing items before first. */
hbi->cur[i] = hb->levels[i][pos] & ~((1UL << bit) - 1);
/* We have already added level i+1, so the lowest set bit has
* been processed. Clear it.
*/
if (i != HBITMAP_LEVELS - 1) {
hbi->cur[i] &= ~(1UL << bit);
}
}
}
bool hbitmap_empty(const HBitmap *hb)
{
return hb->count == 0;
}
int hbitmap_granularity(const HBitmap *hb)
{
return hb->granularity;
}
uint64_t hbitmap_count(const HBitmap *hb)
{
return hb->count << hb->granularity;
}
/* Count the number of set bits between start and end, not accounting for
* the granularity. Also an example of how to use hbitmap_iter_next_word.
*/
static uint64_t hb_count_between(HBitmap *hb, uint64_t start, uint64_t last)
{
HBitmapIter hbi;
uint64_t count = 0;
uint64_t end = last + 1;
unsigned long cur;
size_t pos;
hbitmap_iter_init(&hbi, hb, start << hb->granularity);
for (;;) {
pos = hbitmap_iter_next_word(&hbi, &cur);
if (pos >= (end >> BITS_PER_LEVEL)) {
break;
}
count += popcountl(cur);
}
if (pos == (end >> BITS_PER_LEVEL)) {
/* Drop bits representing the END-th and subsequent items. */
int bit = end & (BITS_PER_LONG - 1);
cur &= (1UL << bit) - 1;
count += popcountl(cur);
}
return count;
}
/* Setting starts at the last layer and propagates up if an element
* changes from zero to non-zero.
*/
static inline bool hb_set_elem(unsigned long *elem, uint64_t start, uint64_t last)
{
unsigned long mask;
bool changed;
assert((last >> BITS_PER_LEVEL) == (start >> BITS_PER_LEVEL));
assert(start <= last);
mask = 2UL << (last & (BITS_PER_LONG - 1));
mask -= 1UL << (start & (BITS_PER_LONG - 1));
changed = (*elem == 0);
*elem |= mask;
return changed;
}
/* The recursive workhorse (the depth is limited to HBITMAP_LEVELS)... */
static void hb_set_between(HBitmap *hb, int level, uint64_t start, uint64_t last)
{
size_t pos = start >> BITS_PER_LEVEL;
size_t lastpos = last >> BITS_PER_LEVEL;
bool changed = false;
size_t i;
i = pos;
if (i < lastpos) {
uint64_t next = (start | (BITS_PER_LONG - 1)) + 1;
changed |= hb_set_elem(&hb->levels[level][i], start, next - 1);
for (;;) {
start = next;
next += BITS_PER_LONG;
if (++i == lastpos) {
break;
}
changed |= (hb->levels[level][i] == 0);
hb->levels[level][i] = ~0UL;
}
}
changed |= hb_set_elem(&hb->levels[level][i], start, last);
/* If there was any change in this layer, we may have to update
* the one above.
*/
if (level > 0 && changed) {
hb_set_between(hb, level - 1, pos, lastpos);
}
}
void hbitmap_set(HBitmap *hb, uint64_t start, uint64_t count)
{
/* Compute range in the last layer. */
uint64_t last = start + count - 1;
trace_hbitmap_set(hb, start, count,
start >> hb->granularity, last >> hb->granularity);
start >>= hb->granularity;
last >>= hb->granularity;
count = last - start + 1;
hb->count += count - hb_count_between(hb, start, last);
hb_set_between(hb, HBITMAP_LEVELS - 1, start, last);
}
/* Resetting works the other way round: propagate up if the new
* value is zero.
*/
static inline bool hb_reset_elem(unsigned long *elem, uint64_t start, uint64_t last)
{
unsigned long mask;
bool blanked;
assert((last >> BITS_PER_LEVEL) == (start >> BITS_PER_LEVEL));
assert(start <= last);
mask = 2UL << (last & (BITS_PER_LONG - 1));
mask -= 1UL << (start & (BITS_PER_LONG - 1));
blanked = *elem != 0 && ((*elem & ~mask) == 0);
*elem &= ~mask;
return blanked;
}
/* The recursive workhorse (the depth is limited to HBITMAP_LEVELS)... */
static void hb_reset_between(HBitmap *hb, int level, uint64_t start, uint64_t last)
{
size_t pos = start >> BITS_PER_LEVEL;
size_t lastpos = last >> BITS_PER_LEVEL;
bool changed = false;
size_t i;
i = pos;
if (i < lastpos) {
uint64_t next = (start | (BITS_PER_LONG - 1)) + 1;
/* Here we need a more complex test than when setting bits. Even if
* something was changed, we must not blank bits in the upper level
* unless the lower-level word became entirely zero. So, remove pos
* from the upper-level range if bits remain set.
*/
if (hb_reset_elem(&hb->levels[level][i], start, next - 1)) {
changed = true;
} else {
pos++;
}
for (;;) {
start = next;
next += BITS_PER_LONG;
if (++i == lastpos) {
break;
}
changed |= (hb->levels[level][i] != 0);
hb->levels[level][i] = 0UL;
}
}
/* Same as above, this time for lastpos. */
if (hb_reset_elem(&hb->levels[level][i], start, last)) {
changed = true;
} else {
lastpos--;
}
if (level > 0 && changed) {
hb_reset_between(hb, level - 1, pos, lastpos);
}
}
void hbitmap_reset(HBitmap *hb, uint64_t start, uint64_t count)
{
/* Compute range in the last layer. */
uint64_t last = start + count - 1;
trace_hbitmap_reset(hb, start, count,
start >> hb->granularity, last >> hb->granularity);
start >>= hb->granularity;
last >>= hb->granularity;
hb->count -= hb_count_between(hb, start, last);
hb_reset_between(hb, HBITMAP_LEVELS - 1, start, last);
}
bool hbitmap_get(const HBitmap *hb, uint64_t item)
{
/* Compute position and bit in the last layer. */
uint64_t pos = item >> hb->granularity;
unsigned long bit = 1UL << (pos & (BITS_PER_LONG - 1));
return (hb->levels[HBITMAP_LEVELS - 1][pos >> BITS_PER_LEVEL] & bit) != 0;
}
void hbitmap_free(HBitmap *hb)
{
unsigned i;
for (i = HBITMAP_LEVELS; i-- > 0; ) {
g_free(hb->levels[i]);
}
g_free(hb);
}
HBitmap *hbitmap_alloc(uint64_t size, int granularity)
{
HBitmap *hb = g_malloc0(sizeof (struct HBitmap));
unsigned i;
assert(granularity >= 0 && granularity < 64);
size = (size + (1ULL << granularity) - 1) >> granularity;
assert(size <= ((uint64_t)1 << HBITMAP_LOG_MAX_SIZE));
hb->size = size;
hb->granularity = granularity;
for (i = HBITMAP_LEVELS; i-- > 0; ) {
size = MAX((size + BITS_PER_LONG - 1) >> BITS_PER_LEVEL, 1);
hb->levels[i] = g_malloc0(size * sizeof(unsigned long));
}
/* We necessarily have free bits in level 0 due to the definition
* of HBITMAP_LEVELS, so use one for a sentinel. This speeds up
* hbitmap_iter_skip_words.
*/
assert(size == 1);
hb->levels[0][0] |= 1UL << (BITS_PER_LONG - 1);
return hb;
}