Merge remote-tracking branch 'quintela/migration.next' into staging

# By Michael R. Hines (8) and others
# Via Juan Quintela
* quintela/migration.next:
  migration: add autoconvergence documentation
  Fix real mode guest segments dpl value in savevm
  Fix real mode guest migration
  rdma: account for the time spent in MIG_STATE_SETUP through QMP
  rdma: introduce MIG_STATE_NONE and change MIG_STATE_SETUP state transition
  rdma: allow state transitions between other states besides ACTIVE
  rdma: send pc.ram
  rdma: core logic
  rdma: introduce ram_handle_compressed()
  rdma: bugfix: ram_control_save_page()
  rdma: update documentation to reflect new unpin support

Message-id: 1374590725-14144-1-git-send-email-quintela@redhat.com
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
commit f03d07d468
Anthony Liguori <aliguori@us.ibm.com>, 2013-07-23 10:57:23 -05:00
11 changed files with 3467 additions and 47 deletions

Makefile.objs

@@ -51,6 +51,7 @@ common-obj-$(CONFIG_POSIX) += os-posix.o
common-obj-$(CONFIG_LINUX) += fsdev/
common-obj-y += migration.o migration-tcp.o
common-obj-$(CONFIG_RDMA) += migration-rdma.o
common-obj-y += qemu-char.o #aio.o
common-obj-y += block-migration.o
common-obj-y += page_cache.o xbzrle.o

arch_init.c

@@ -118,6 +118,7 @@ static void check_guest_throttling(void);
#define RAM_SAVE_FLAG_EOS 0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE 0x40
/* 0x80 is reserved in migration.h; start with 0x100 next */
static struct defconfig_file {
@@ -475,6 +476,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
ram_bulk_stage = false;
}
} else {
int ret;
uint8_t *p;
int cont = (block == last_sent_block) ?
RAM_SAVE_FLAG_CONTINUE : 0;
@@ -483,7 +485,18 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
            /* If in doubt, send the page as normal */
            bytes_sent = -1;
-           if (is_zero_page(p)) {
+           ret = ram_control_save_page(f, block->offset,
+                       offset, TARGET_PAGE_SIZE, &bytes_sent);
+
+           if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
+               if (ret != RAM_SAVE_CONTROL_DELAYED) {
+                   if (bytes_sent > 0) {
+                       acct_info.norm_pages++;
+                   } else if (bytes_sent == 0) {
+                       acct_info.dup_pages++;
+                   }
+               }
+           } else if (is_zero_page(p)) {
acct_info.dup_pages++;
bytes_sent = save_block_hdr(f, block, offset, cont,
RAM_SAVE_FLAG_COMPRESS);
@@ -635,6 +648,10 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
}
qemu_mutex_unlock_ramlist();
ram_control_before_iterate(f, RAM_CONTROL_SETUP);
ram_control_after_iterate(f, RAM_CONTROL_SETUP);
qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
return 0;
@@ -653,6 +670,8 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
reset_ram_globals();
}
ram_control_before_iterate(f, RAM_CONTROL_ROUND);
t0 = qemu_get_clock_ns(rt_clock);
i = 0;
while ((ret = qemu_file_rate_limit(f)) == 0) {
@@ -684,6 +703,12 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
qemu_mutex_unlock_ramlist();
/*
* Must occur before EOS (or any QEMUFile operation)
* because of RDMA protocol.
*/
ram_control_after_iterate(f, RAM_CONTROL_ROUND);
if (ret < 0) {
bytes_transferred += total_sent;
return ret;
@@ -701,6 +726,8 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
qemu_mutex_lock_ramlist();
migration_bitmap_sync();
ram_control_before_iterate(f, RAM_CONTROL_FINISH);
/* try transferring iterative blocks of memory */
/* flush all remaining blocks regardless of rate limiting */
@@ -714,6 +741,8 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
}
bytes_transferred += bytes_sent;
}
ram_control_after_iterate(f, RAM_CONTROL_FINISH);
migration_end();
qemu_mutex_unlock_ramlist();
@@ -808,6 +837,24 @@ static inline void *host_from_stream_offset(QEMUFile *f,
return NULL;
}
/*
* If a page (or a whole RDMA chunk) has been
* determined to be zero, then zap it.
*/
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
if (ch != 0 || !is_zero_page(host)) {
memset(host, ch, size);
#ifndef _WIN32
if (ch == 0 &&
(!kvm_enabled() || kvm_has_sync_mmu()) &&
getpagesize() <= TARGET_PAGE_SIZE) {
qemu_madvise(host, TARGET_PAGE_SIZE, QEMU_MADV_DONTNEED);
}
#endif
}
}
static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
ram_addr_t addr;
@@ -879,16 +926,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
            }
            ch = qemu_get_byte(f);
-           if (ch != 0 || !is_zero_page(host)) {
-               memset(host, ch, TARGET_PAGE_SIZE);
-#ifndef _WIN32
-               if (ch == 0 &&
-                   (!kvm_enabled() || kvm_has_sync_mmu()) &&
-                   getpagesize() <= TARGET_PAGE_SIZE) {
-                   qemu_madvise(host, TARGET_PAGE_SIZE, QEMU_MADV_DONTNEED);
-               }
-#endif
-           }
+           ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
} else if (flags & RAM_SAVE_FLAG_PAGE) {
void *host;
@@ -908,6 +946,8 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
ret = -EINVAL;
goto done;
}
} else if (flags & RAM_SAVE_FLAG_HOOK) {
ram_control_load_hook(f, flags);
}
error = qemu_file_get_error(f);
if (error) {

configure

@@ -180,6 +180,7 @@ xfs=""
vhost_net="no"
vhost_scsi="no"
kvm="no"
rdma=""
gprof="no"
debug_tcg="no"
debug="no"
@@ -937,6 +938,10 @@ for opt do
;;
--enable-gtk) gtk="yes"
;;
--enable-rdma) rdma="yes"
;;
--disable-rdma) rdma="no"
;;
--with-gtkabi=*) gtkabi="$optarg"
;;
--enable-tpm) tpm="yes"
@@ -1095,6 +1100,8 @@ echo " --enable-bluez enable bluez stack connectivity"
echo " --disable-slirp disable SLIRP userspace network connectivity"
echo " --disable-kvm disable KVM acceleration support"
echo " --enable-kvm enable KVM acceleration support"
echo " --disable-rdma disable RDMA-based migration support"
echo " --enable-rdma enable RDMA-based migration support"
echo " --enable-tcg-interpreter enable TCG with bytecode interpreter (TCI)"
echo " --disable-nptl disable usermode NPTL support"
echo " --enable-nptl enable usermode NPTL support"
@@ -1801,6 +1808,30 @@ EOF
libs_softmmu="$sdl_libs $libs_softmmu"
fi
##########################################
# RDMA needs OpenFabrics libraries
if test "$rdma" != "no" ; then
cat > $TMPC <<EOF
#include <rdma/rdma_cma.h>
int main(void) { return 0; }
EOF
rdma_libs="-lrdmacm -libverbs"
if compile_prog "" "$rdma_libs" ; then
rdma="yes"
libs_softmmu="$libs_softmmu $rdma_libs"
else
if test "$rdma" = "yes" ; then
error_exit \
" OpenFabrics librdmacm/libibverbs not present." \
" Your options:" \
" (1) Fast: Install infiniband packages from your distro." \
" (2) Cleanest: Install libraries from www.openfabrics.org" \
" (3) Also: Install softiwarp if you don't have RDMA hardware"
fi
rdma="no"
fi
fi
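
With the probe above in place, RDMA support can be requested explicitly at
build time with ./configure --enable-rdma (which fails hard if the OpenFabrics
libraries are absent), switched off with --disable-rdma, or left unset, in
which case the empty default rdma="" lets configure auto-detect the libraries.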
##########################################
# VNC TLS/WS detection
if test "$vnc" = "yes" -a \( "$vnc_tls" != "no" -o "$vnc_ws" != "no" \) ; then
@@ -3558,6 +3589,7 @@ echo "Linux AIO support $linux_aio"
echo "ATTR/XATTR support $attr"
echo "Install blobs $blobs"
echo "KVM support $kvm"
echo "RDMA support $rdma"
echo "TCG interpreter $tcg_interpreter"
echo "fdt support $fdt"
echo "preadv support $preadv"
@@ -4046,6 +4078,10 @@ if test "$trace_default" = "yes"; then
echo "CONFIG_TRACE_DEFAULT=y" >> $config_host_mak
fi
if test "$rdma" = "yes" ; then
echo "CONFIG_RDMA=y" >> $config_host_mak
fi
if test "$tcg_interpreter" = "yes"; then
QEMU_INCLUDES="-I\$(SRC_PATH)/tcg/tci $QEMU_INCLUDES"
elif test "$ARCH" = "sparc64" ; then
@@ -4485,6 +4521,10 @@ if [ "$pixman" = "internal" ]; then
echo "config-host.h: subdir-pixman" >> $config_host_mak
fi
if test "$rdma" = "yes" ; then
echo "CONFIG_RDMA=y" >> $config_host_mak
fi
if [ "$dtc_internal" = "yes" ]; then
echo "config-host.h: subdir-dtc" >> $config_host_mak
fi

docs/rdma.txt

@@ -35,7 +35,7 @@ memory tracked during each live migration iteration round cannot keep pace
with the rate of dirty memory produced by the workload.
RDMA currently comes in two flavors: both Ethernet based (RoCE, or RDMA
-over Convered Ethernet) as well as Infiniband-based. This implementation of
+over Converged Ethernet) as well as Infiniband-based. This implementation of
migration using RDMA is capable of using both technologies because of
the use of the OpenFabrics OFED software stack that abstracts out the
programming model irrespective of the underlying hardware.
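
For orientation: once this series is applied, an RDMA migration is started
through the experimental URI scheme wired up in migration.c later in this
merge, e.g. from the HMP monitor (address and port below are placeholders):

    migrate -d x-rdma:192.168.100.1:4444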
@@ -188,9 +188,9 @@ header portion and a data portion (but together are transmitted
as a single SEND message).
Header:
-* Length (of the data portion, uint32, network byte order)
-* Type (what command to perform, uint32, network byte order)
-* Repeat (Number of commands in data portion, same type only)
+    * Length (of the data portion, uint32, network byte order)
+    * Type (what command to perform, uint32, network byte order)
+    * Repeat (Number of commands in data portion, same type only)
The 'Repeat' field is here to support future multiple page registrations
in a single message without any need to change the protocol itself
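
Rendered as C, the header described above could look like the following
sketch; the struct and field names are illustrative, not necessarily those
used by migration-rdma.c:

    /* Sketch of the control-channel header; every field is converted
     * to network byte order before transmission. */
    typedef struct {
        uint32_t len;    /* length of the data portion */
        uint32_t type;   /* which command to perform */
        uint32_t repeat; /* number of same-type commands in the data portion */
    } RDMAControlHeader;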
@@ -202,17 +202,19 @@ The maximum number of repeats is hard-coded to 4096. This is a conservative
limit based on the maximum size of a SEND message along with empirical
observations on the maximum future benefit of simultaneous page registrations.
-The 'type' field has 10 different command values:
-1. Unused
-2. Error (sent to the source during bad things)
-3. Ready (control-channel is available)
-4. QEMU File (for sending non-live device state)
-5. RAM Blocks request (used right after connection setup)
-6. RAM Blocks result (used right after connection setup)
-7. Compress page (zap zero page and skip registration)
-8. Register request (dynamic chunk registration)
-9. Register result ('rkey' to be used by sender)
-10. Register finished (registration for current iteration finished)
+The 'type' field has 12 different command values:
+    1. Unused
+    2. Error (sent to the source during bad things)
+    3. Ready (control-channel is available)
+    4. QEMU File (for sending non-live device state)
+    5. RAM Blocks request (used right after connection setup)
+    6. RAM Blocks result (used right after connection setup)
+    7. Compress page (zap zero page and skip registration)
+    8. Register request (dynamic chunk registration)
+    9. Register result ('rkey' to be used by sender)
+   10. Register finished (registration for current iteration finished)
+   11. Unregister request (unpin previously registered memory)
+   12. Unregister finished (confirmation that unpin completed)
A single control message, as hinted above, can contain within the data
portion an array of many commands of the same type. If there is more than
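
The twelve command values map naturally onto a C enum along these lines
(the identifiers are illustrative):

    /* Illustrative numbering of the 12 control commands listed above. */
    enum {
        RDMA_CONTROL_NONE = 0,            /* 1. unused */
        RDMA_CONTROL_ERROR,               /* 2. sent to source during bad things */
        RDMA_CONTROL_READY,               /* 3. control-channel is available */
        RDMA_CONTROL_QEMU_FILE,           /* 4. non-live device state */
        RDMA_CONTROL_RAM_BLOCKS_REQUEST,  /* 5. right after connection setup */
        RDMA_CONTROL_RAM_BLOCKS_RESULT,   /* 6. right after connection setup */
        RDMA_CONTROL_COMPRESS,            /* 7. zap zero page, skip registration */
        RDMA_CONTROL_REGISTER_REQUEST,    /* 8. dynamic chunk registration */
        RDMA_CONTROL_REGISTER_RESULT,     /* 9. 'rkey' to be used by sender */
        RDMA_CONTROL_REGISTER_FINISHED,   /* 10. registration round complete */
        RDMA_CONTROL_UNREGISTER_REQUEST,  /* 11. unpin previously pinned memory */
        RDMA_CONTROL_UNREGISTER_FINISHED, /* 12. confirmation that unpin completed */
    };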
@@ -243,7 +245,7 @@ qemu_rdma_exchange_send(header, data, optional response header & data):
from the receiver to tell us that the receiver
is *ready* for us to transmit some new bytes.
2. Optionally: if we are expecting a response from the command
-   (that we have no yet transmitted), let's post an RQ
+   (that we have not yet transmitted), let's post an RQ
work request to receive that data a few moments later.
3. When the READY arrives, librdmacm will
unblock us and we immediately post a RQ work request
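
A minimal sketch of that send path; the helpers (wait_for_ready(),
post_recv(), post_send()) and the RECV_* selectors are hypothetical wrappers
around the underlying ibverbs work requests:

    /* Hypothetical outline of the qemu_rdma_exchange_send() steps above. */
    static int rdma_exchange_send_sketch(void *rdma, const void *header,
                                         const void *data, bool want_response)
    {
        wait_for_ready(rdma);               /* 1. block until receiver is READY */
        if (want_response) {
            post_recv(rdma, RECV_RESPONSE); /* 2. pre-post RQ for the response */
        }
        post_recv(rdma, RECV_READY);        /* 3. pre-post RQ for the next READY */
        post_send(rdma, header, data);      /*    ...then transmit header + data */
        return 0;
    }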
@@ -293,8 +295,10 @@ librdmacm provides the user with a 'private data' area to be exchanged
at connection-setup time before any infiniband traffic is generated.
Header:
-* Version (protocol version validated before send/recv occurs), uint32, network byte order
-* Flags (bitwise OR of each capability), uint32, network byte order
+    * Version (protocol version validated before send/recv occurs),
+          uint32, network byte order
+    * Flags (bitwise OR of each capability),
+          uint32, network byte order
There is no data portion of this header right now, so there is
no length field. The maximum size of the 'private data' section
@@ -313,7 +317,7 @@ If the version is invalid, we throw an error.
If the version is new, we only negotiate the capabilities that the
requested version is able to perform and ignore the rest.
-Currently there is only *one* capability in Version #1: dynamic page registration
+Currently there is only one capability in Version #1: dynamic page registration
Finally: Negotiation happens with the Flags field: If the primary-VM
sets a flag, but the destination does not support this capability, it
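
Sketched in C (the struct name, flag constant, and helper below are
assumptions; the text above only fixes the two uint32 fields and their
meaning):

    /* Illustrative private-data blob; both fields travel in network
     * byte order. */
    typedef struct {
        uint32_t version; /* validated before any send/recv occurs */
        uint32_t flags;   /* bitwise OR of each capability */
    } RDMACapabilities;

    #define RDMA_CAPABILITY_CHUNK_REGISTER 0x01 /* version 1's only capability */

    /* Negotiation: keep only the capabilities both sides understand;
     * any flag the destination does not support is simply dropped. */
    static uint32_t negotiate_flags(uint32_t requested, uint32_t supported)
    {
        return requested & supported;
    }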
@@ -326,8 +330,8 @@ QEMUFileRDMA Interface:
QEMUFileRDMA introduces a couple of new functions:
-1. qemu_rdma_get_buffer() (QEMUFileOps rdma_read_ops)
-2. qemu_rdma_put_buffer() (QEMUFileOps rdma_write_ops)
+    1. qemu_rdma_get_buffer()  (QEMUFileOps rdma_read_ops)
+    2. qemu_rdma_put_buffer()  (QEMUFileOps rdma_write_ops)
These two functions are very short and simply use the protocol
described above to deliver bytes without changing the upper-level
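
For context, those two callbacks slot into QEMUFile's ops tables roughly as
follows (a sketch: only the two function names come from the text above, the
initializers are assumed):

    /* Sketch: wiring the RDMA callbacks into QEMUFileOps. */
    static const QEMUFileOps rdma_read_ops = {
        .get_buffer = qemu_rdma_get_buffer, /* pull bytes off the RDMA channel */
    };

    static const QEMUFileOps rdma_write_ops = {
        .put_buffer = qemu_rdma_put_buffer, /* push bytes onto the RDMA channel */
    };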
@@ -413,3 +417,8 @@ TODO:
the use of KSM and ballooning while using RDMA.
4. Also, some form of balloon-device usage tracking would also
help alleviate some issues.
5. Move UNREGISTER requests to a separate thread.
6. Use LRU to provide more fine-grained direction of UNREGISTER
requests for unpinning memory in an overcommitted environment.
7. Expose UNREGISTER support to the user by way of workload-specific
hints about application behavior.

hmp.c

@@ -164,6 +164,10 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict)
monitor_printf(mon, "downtime: %" PRIu64 " milliseconds\n",
info->downtime);
}
if (info->has_setup_time) {
monitor_printf(mon, "setup: %" PRIu64 " milliseconds\n",
info->setup_time);
}
}
if (info->has_ram) {

include/migration/migration.h

@@ -49,6 +49,7 @@ struct MigrationState
int64_t dirty_bytes_rate;
bool enabled_capabilities[MIGRATION_CAPABILITY_MAX];
int64_t xbzrle_cache_size;
int64_t setup_time;
};
void process_incoming_migration(QEMUFile *f);
@@ -77,6 +78,10 @@ void fd_start_incoming_migration(const char *path, Error **errp);
void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error **errp);
void rdma_start_outgoing_migration(void *opaque, const char *host_port, Error **errp);
void rdma_start_incoming_migration(const char *host_port, Error **errp);
void migrate_fd_error(MigrationState *s);
void migrate_fd_connect(MigrationState *s);
@@ -109,6 +114,8 @@ uint64_t xbzrle_mig_pages_transferred(void);
uint64_t xbzrle_mig_pages_overflow(void);
uint64_t xbzrle_mig_pages_cache_miss(void);
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size);
/**
* @migrate_add_blocker - prevent migration from proceeding
*

migration-rdma.c (new file)

File diff suppressed because it is too large.

migration.c

@@ -36,7 +36,8 @@
#endif
enum {
-    MIG_STATE_ERROR,
+    MIG_STATE_ERROR = -1,
+    MIG_STATE_NONE,
MIG_STATE_SETUP,
MIG_STATE_CANCELLED,
MIG_STATE_ACTIVE,
@@ -63,7 +64,7 @@ static NotifierList migration_state_notifiers =
MigrationState *migrate_get_current(void)
{
    static MigrationState current_migration = {
-        .state = MIG_STATE_SETUP,
+        .state = MIG_STATE_NONE,
.bandwidth_limit = MAX_THROTTLE,
.xbzrle_cache_size = DEFAULT_MIGRATE_CACHE_SIZE,
.mbps = -1,
@@ -78,6 +79,10 @@ void qemu_start_incoming_migration(const char *uri, Error **errp)
if (strstart(uri, "tcp:", &p))
tcp_start_incoming_migration(p, errp);
#ifdef CONFIG_RDMA
else if (strstart(uri, "x-rdma:", &p))
rdma_start_incoming_migration(p, errp);
#endif
#if !defined(WIN32)
else if (strstart(uri, "exec:", &p))
exec_start_incoming_migration(p, errp);
@@ -180,9 +185,14 @@ MigrationInfo *qmp_query_migrate(Error **errp)
    MigrationState *s = migrate_get_current();
    switch (s->state) {
-    case MIG_STATE_SETUP:
+    case MIG_STATE_NONE:
        /* no migration has happened ever */
        break;
+    case MIG_STATE_SETUP:
+        info->has_status = true;
+        info->status = g_strdup("setup");
+        info->has_total_time = false;
+        break;
case MIG_STATE_ACTIVE:
info->has_status = true;
info->status = g_strdup("active");
@@ -191,6 +201,8 @@ MigrationInfo *qmp_query_migrate(Error **errp)
- s->total_time;
info->has_expected_downtime = true;
info->expected_downtime = s->expected_downtime;
info->has_setup_time = true;
info->setup_time = s->setup_time;
info->has_ram = true;
info->ram = g_malloc0(sizeof(*info->ram));
@@ -222,6 +234,8 @@
info->total_time = s->total_time;
info->has_downtime = true;
info->downtime = s->downtime;
info->has_setup_time = true;
info->setup_time = s->setup_time;
info->has_ram = true;
info->ram = g_malloc0(sizeof(*info->ram));
@@ -253,7 +267,7 @@ void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
    MigrationState *s = migrate_get_current();
    MigrationCapabilityStatusList *cap;
-    if (s->state == MIG_STATE_ACTIVE) {
+    if (s->state == MIG_STATE_ACTIVE || s->state == MIG_STATE_SETUP) {
error_set(errp, QERR_MIGRATION_ACTIVE);
return;
}
@@ -291,9 +305,9 @@ static void migrate_fd_cleanup(void *opaque)
    notifier_list_notify(&migration_state_notifiers, s);
}
-static void migrate_finish_set_state(MigrationState *s, int new_state)
+static void migrate_set_state(MigrationState *s, int old_state, int new_state)
{
-    if (atomic_cmpxchg(&s->state, MIG_STATE_ACTIVE, new_state) == new_state) {
+    if (atomic_cmpxchg(&s->state, old_state, new_state) == new_state) {
trace_migrate_set_state(new_state);
}
}
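
The compare-and-swap makes this a guarded transition: the store happens only
if the state is still what the caller last observed, so a concurrent cancel
cannot be silently overwritten. A caller-side sketch:

    /* No-op if migrate_fd_cancel() has already moved us to CANCELLED. */
    migrate_set_state(s, MIG_STATE_SETUP, MIG_STATE_ACTIVE);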
@@ -311,7 +325,7 @@ static void migrate_fd_cancel(MigrationState *s)
{
    DPRINTF("cancelling migration\n");
-    migrate_finish_set_state(s, MIG_STATE_CANCELLED);
+    migrate_set_state(s, s->state, MIG_STATE_CANCELLED);
}
void add_migration_state_change_notifier(Notifier *notify)
@@ -388,7 +402,7 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
    params.blk = blk;
    params.shared = inc;
-    if (s->state == MIG_STATE_ACTIVE) {
+    if (s->state == MIG_STATE_ACTIVE || s->state == MIG_STATE_SETUP) {
error_set(errp, QERR_MIGRATION_ACTIVE);
return;
}
@@ -406,6 +420,10 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
if (strstart(uri, "tcp:", &p)) {
tcp_start_outgoing_migration(s, p, &local_err);
#ifdef CONFIG_RDMA
} else if (strstart(uri, "x-rdma:", &p)) {
rdma_start_outgoing_migration(s, p, &local_err);
#endif
#if !defined(WIN32)
} else if (strstart(uri, "exec:", &p)) {
exec_start_outgoing_migration(s, p, &local_err);
@@ -526,6 +544,7 @@ static void *migration_thread(void *opaque)
{
MigrationState *s = opaque;
int64_t initial_time = qemu_get_clock_ms(rt_clock);
int64_t setup_start = qemu_get_clock_ms(host_clock);
int64_t initial_bytes = 0;
int64_t max_size = 0;
int64_t start_time = initial_time;
@@ -534,6 +553,11 @@ static void *migration_thread(void *opaque)
DPRINTF("beginning savevm\n");
qemu_savevm_state_begin(s->file, &s->params);
s->setup_time = qemu_get_clock_ms(host_clock) - setup_start;
migrate_set_state(s, MIG_STATE_SETUP, MIG_STATE_ACTIVE);
DPRINTF("setup complete\n");
while (s->state == MIG_STATE_ACTIVE) {
int64_t current_time;
uint64_t pending_size;
@@ -561,19 +585,19 @@ static void *migration_thread(void *opaque)
                qemu_mutex_unlock_iothread();
                if (ret < 0) {
-                    migrate_finish_set_state(s, MIG_STATE_ERROR);
+                    migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_ERROR);
                    break;
                }
                if (!qemu_file_get_error(s->file)) {
-                    migrate_finish_set_state(s, MIG_STATE_COMPLETED);
+                    migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_COMPLETED);
                    break;
                }
            }
        }
        if (qemu_file_get_error(s->file)) {
-            migrate_finish_set_state(s, MIG_STATE_ERROR);
+            migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_ERROR);
            break;
        }
        current_time = qemu_get_clock_ms(rt_clock);
@@ -624,8 +648,8 @@ static void *migration_thread(void *opaque)
void migrate_fd_connect(MigrationState *s)
{
-    s->state = MIG_STATE_ACTIVE;
-    trace_migrate_set_state(MIG_STATE_ACTIVE);
+    s->state = MIG_STATE_SETUP;
+    trace_migrate_set_state(MIG_STATE_SETUP);
/* This is a best 1st approximation. ns to ms */
s->expected_downtime = max_downtime/1000000;

qapi-schema.json

@@ -578,6 +578,12 @@
# expected downtime in milliseconds for the guest in last walk
# of the dirty bitmap. (since 1.3)
#
# @setup-time: #optional amount of setup time in milliseconds _before_ the
# iterations begin but _after_ the QMP command is issued. This is designed
# to provide an accounting of any activities (such as RDMA pinning) which
# may be expensive, but do not actually occur during the iterative
# migration rounds themselves. (since 1.6)
#
# Since: 0.14.0
##
{ 'type': 'MigrationInfo',
@@ -586,7 +592,8 @@
           '*xbzrle-cache': 'XBZRLECacheStats',
           '*total-time': 'int',
           '*expected-downtime': 'int',
-           '*downtime': 'int'} }
+           '*downtime': 'int',
+           '*setup-time': 'int'} }
##
# @query-migrate
@@ -619,6 +626,9 @@
# to enable the capability on the source VM. The feature is disabled by
# default. (since 1.6)
#
# @auto-converge: If enabled, QEMU will automatically throttle down the guest
# to speed up convergence of RAM migration. (since 1.6)
#
# Since: 1.2
##
{ 'enum': 'MigrationCapability',

savevm.c

@@ -662,7 +662,7 @@ size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset,
                             offset, size, bytes_sent);
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
-            if (*bytes_sent > 0) {
+            if (bytes_sent && *bytes_sent > 0) {
qemu_update_position(f, *bytes_sent);
} else if (ret < 0) {
qemu_file_set_error(f, ret);

target-i386/machine.c

@@ -252,6 +252,24 @@ static void cpu_pre_save(void *opaque)
}
env->fpregs_format_vmstate = 0;
/*
 * Real mode guest segment registers' DPL should be zero.
 * Older KVM versions were setting it wrongly.
 * Fixing it allows live migration to hosts with unrestricted guest
 * support (otherwise the migration would fail with an invalid guest state
 * error).
 */
if (!(env->cr[0] & CR0_PE_MASK) &&
(env->segs[R_CS].flags >> DESC_DPL_SHIFT & 3) != 0) {
env->segs[R_CS].flags &= ~(env->segs[R_CS].flags & DESC_DPL_MASK);
env->segs[R_DS].flags &= ~(env->segs[R_DS].flags & DESC_DPL_MASK);
env->segs[R_ES].flags &= ~(env->segs[R_ES].flags & DESC_DPL_MASK);
env->segs[R_FS].flags &= ~(env->segs[R_FS].flags & DESC_DPL_MASK);
env->segs[R_GS].flags &= ~(env->segs[R_GS].flags & DESC_DPL_MASK);
env->segs[R_SS].flags &= ~(env->segs[R_SS].flags & DESC_DPL_MASK);
}
}
static int cpu_post_load(void *opaque, int version_id)
@@ -260,6 +278,24 @@ static int cpu_post_load(void *opaque, int version_id)
CPUX86State *env = &cpu->env;
int i;
/*
 * Real mode guest segment registers' DPL should be zero.
 * Older KVM versions were setting it wrongly.
 * Fixing it allows live migration from hosts that lack unrestricted
 * guest support to hosts with unrestricted guest support
 * (otherwise the migration would fail with an invalid guest state
 * error).
 */
if (!(env->cr[0] & CR0_PE_MASK) &&
(env->segs[R_CS].flags >> DESC_DPL_SHIFT & 3) != 0) {
env->segs[R_CS].flags &= ~(env->segs[R_CS].flags & DESC_DPL_MASK);
env->segs[R_DS].flags &= ~(env->segs[R_DS].flags & DESC_DPL_MASK);
env->segs[R_ES].flags &= ~(env->segs[R_ES].flags & DESC_DPL_MASK);
env->segs[R_FS].flags &= ~(env->segs[R_FS].flags & DESC_DPL_MASK);
env->segs[R_GS].flags &= ~(env->segs[R_GS].flags & DESC_DPL_MASK);
env->segs[R_SS].flags &= ~(env->segs[R_SS].flags & DESC_DPL_MASK);
}
/* XXX: restore FPU round state */
env->fpstt = (env->fpus_vmstate >> 11) & 7;
env->fpus = env->fpus_vmstate & ~0x3800;