migration/next for 20180515
-----BEGIN PGP SIGNATURE----- iQIcBAABCAAGBQJa+1ceAAoJEPSH7xhYctcjdHQP/2To/iZXf7lp2mgxnN0YeDRV ivOooHXsCfYx5MtZCFo11wlZjGlnE6b5yEUzU7MapRLUgO3kd+gbDld/90gVdbLg umDL2nxTSjvIAlpg6O8aiQLaQLPvMDf1WSQQdyBHcqauy3KLaFmlm0ldK1besatQ KFniKXc8fL0R9QWA4aGDe22Sq9m7M9AKfmJU3NO9qNvxDImyMU2ix4w6cTZ0BtXk ltLju5zegBkKUxLFCgip7k6iCHh//tnBymQaLx6O7DdUJWIdKS+fHuCbA8WR24v2 pKsTcTICjI4FLWJMyIiH012paHoYGQIGdIx3Dv0JoQA6fT8x3arpztX3YVYFqrwF hsC6CAnHufDioipEhJk7JcEAGJ+82krqvTG1nRQe/ULvDSHGDjpwF1y0BxzGMVw6 hU+Ds4ohxyWJYJ0cx9msuKyIlCj1EooIuO6l1unmjBdRAptC8/bZXITVTixEHG+e X5vnBqloCL7PUATlaXipqVJfLA8T1MMNfwb11bI4OhTfteueFAAdyeJ2zCBqGBLa SK1sxYvMYY3/pzBhAO3YcTJIQofDQ2IciUvylr2jzCd2hB5Qj5In9789LVbl5I9B 55cICuVcsiXS7krJysSxtdCbjOB44Iv6M09vmA4ap8dmi9AWL/4LQIAJHVc1+koK 6y76K1BBFdB6aMkDzBeg =1a+b -----END PGP SIGNATURE----- Merge remote-tracking branch 'remotes/juanquintela/tags/migration/20180515' into staging migration/next for 20180515 # gpg: Signature made Tue 15 May 2018 22:54:38 BST # gpg: using RSA key F487EF185872D723 # gpg: Good signature from "Juan Quintela <quintela@redhat.com>" # gpg: aka "Juan Quintela <quintela@trasno.org>" # Primary key fingerprint: 1899 FF8E DEBF 58CC EE03 4B82 F487 EF18 5872 D723 * remotes/juanquintela/tags/migration/20180515: (40 commits) Migration+TLS: Fix crash due to double cleanup migration: Textual fixups for blocktime migration: update index field when delete or qsort RDMALocalBlock migration: update docs migration/hmp: add migrate_pause command migration/qmp: add command migrate-pause migration: introduce lock for to_dst_file hmp/migration: add migrate_recover command qmp/migration: new command migrate-recover migration: init dst in migration_object_init too migration: final handshake for the resume migration: setup ramstate for resume migration: synchronize dirty bitmap for resume migration: introduce SaveVMHandlers.resume_prepare migration: new message MIG_RP_MSG_RESUME_ACK migration: new cmd MIG_CMD_POSTCOPY_RESUME migration: new message 
MIG_RP_MSG_RECV_BITMAP migration: new cmd MIG_CMD_RECV_BITMAP migration: wakeup dst ram-load-thread for recover migration: new state "postcopy-recover" ... Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This commit is contained in:
commit
eb7514ae10
@ -28,11 +28,11 @@ the guest to be stopped. Typically the time that the guest is
|
||||
unresponsive during live migration is in the low hundreds of milliseconds
|
||||
(notice that this depends on a lot of things).
|
||||
|
||||
Types of migration
|
||||
==================
|
||||
Transports
|
||||
==========
|
||||
|
||||
Now that we have talked about live migration, there are several ways
|
||||
to do migration:
|
||||
The migration stream is normally just a byte stream that can be passed
|
||||
over any transport.
|
||||
|
||||
- tcp migration: do the migration using tcp sockets
|
||||
- unix migration: do the migration using unix sockets
|
||||
@ -40,16 +40,16 @@ to do migration:
|
||||
- fd migration: do the migration using an file descriptor that is
|
||||
passed to QEMU. QEMU doesn't care how this file descriptor is opened.
|
||||
|
||||
All these four migration protocols use the same infrastructure to
|
||||
In addition, support is included for migration using RDMA, which
|
||||
transports the page data using ``RDMA``, where the hardware takes care of
|
||||
transporting the pages, and the load on the CPU is much lower. While the
|
||||
internals of RDMA migration are a bit different, this isn't really visible
|
||||
outside the RAM migration code.
|
||||
|
||||
All these migration protocols use the same infrastructure to
|
||||
save/restore state devices. This infrastructure is shared with the
|
||||
savevm/loadvm functionality.
|
||||
|
||||
State Live Migration
|
||||
====================
|
||||
|
||||
This is used for RAM and block devices. It is not yet ported to vmstate.
|
||||
<Fill more information here>
|
||||
|
||||
Common infrastructure
|
||||
=====================
|
||||
|
||||
@ -57,60 +57,75 @@ The files, sockets or fd's that carry the migration stream are abstracted by
|
||||
the ``QEMUFile`` type (see `migration/qemu-file.h`). In most cases this
|
||||
is connected to a subtype of ``QIOChannel`` (see `io/`).
|
||||
|
||||
|
||||
Saving the state of one device
|
||||
==============================
|
||||
|
||||
The state of a device is saved using intermediate buffers. There are
|
||||
some helper functions to assist this saving.
|
||||
For most devices, the state is saved in a single call to the migration
|
||||
infrastructure; these are *non-iterative* devices. The data for these
|
||||
devices is sent at the end of precopy migration, when the CPUs are paused.
|
||||
There are also *iterative* devices, which contain a very large amount of
|
||||
data (e.g. RAM or large tables). See the iterative device section below.
|
||||
|
||||
There is a new concept that we have to explain here: device state
|
||||
version. When we migrate a device, we save/load the state as a series
|
||||
of fields. Sometimes, due to bugs or new functionality, we need to
|
||||
change the state to store more/different information. We use the
|
||||
version to identify each time that we do a change. Each version is
|
||||
associated with a series of fields saved. The `save_state` always saves
|
||||
the state as the newer version. But `load_state` sometimes is able to
|
||||
load state from an older version.
|
||||
General advice for device developers
|
||||
------------------------------------
|
||||
|
||||
Legacy way
|
||||
----------
|
||||
- The migration state saved should reflect the device being modelled rather
|
||||
than the way your implementation works. That way if you change the implementation
|
||||
later the migration stream will stay compatible. That model may include
|
||||
internal state that's not directly visible in a register.
|
||||
|
||||
This way is going to disappear as soon as all current users are ported to VMSTATE.
|
||||
- When saving a migration stream the device code may walk and check
|
||||
the state of the device. These checks might fail in various ways (e.g.
|
||||
discovering internal state is corrupt or that the guest has done something bad).
|
||||
Consider carefully before asserting/aborting at this point, since the
|
||||
normal response from users is that *migration broke their VM* since it had
|
||||
apparently been running fine until then. In these error cases, the device
|
||||
should log a message indicating the cause of error, and should consider
|
||||
putting the device into an error state, allowing the rest of the VM to
|
||||
continue execution.
|
||||
|
||||
Each device has to register two functions, one to save the state and
|
||||
another to load the state back.
|
||||
- The migration might happen at an inconvenient point,
|
||||
e.g. right in the middle of the guest reprogramming the device, during
|
||||
guest reboot or shutdown or while the device is waiting for external IO.
|
||||
It's strongly preferred that migrations do not fail in this situation,
|
||||
since in the cloud environment migrations might happen automatically to
|
||||
VMs that the administrator doesn't directly control.
|
||||
|
||||
.. code:: c
|
||||
- If you do need to fail a migration, ensure that sufficient information
|
||||
is logged to identify what went wrong.
|
||||
|
||||
int register_savevm(DeviceState *dev,
|
||||
const char *idstr,
|
||||
int instance_id,
|
||||
int version_id,
|
||||
SaveStateHandler *save_state,
|
||||
LoadStateHandler *load_state,
|
||||
void *opaque);
|
||||
- The destination should treat an incoming migration stream as hostile
|
||||
(which we do to varying degrees in the existing code). Check that offsets
|
||||
into buffers and the like can't cause overruns. Fail the incoming migration
|
||||
in the case of a corrupted stream like this.
|
||||
|
||||
typedef void SaveStateHandler(QEMUFile *f, void *opaque);
|
||||
typedef int LoadStateHandler(QEMUFile *f, void *opaque, int version_id);
|
||||
- Take care with internal device state or behaviour that might become
|
||||
migration version dependent. For example, the order of PCI capabilities
|
||||
is required to stay constant across migration. Another example would
|
||||
be that a special case handled by subsections (see below) might become
|
||||
much more common if a default behaviour is changed.
|
||||
|
||||
The important functions for the device state format are the `save_state`
|
||||
and `load_state`. Notice that `load_state` receives a version_id
|
||||
parameter to know what state format it is receiving. `save_state` doesn't
|
||||
have a version_id parameter because it always uses the latest version.
|
||||
- The state of the source should not be changed or destroyed by the
|
||||
outgoing migration. Migrations timing out or being failed by
|
||||
higher levels of management, or failures of the destination host are
|
||||
not unusual, and in that case the VM is restarted on the source.
|
||||
Note that the management layer can validly revert the migration
|
||||
even though the QEMU level of migration has succeeded as long as it
|
||||
does it before starting execution on the destination.
|
||||
|
||||
- Buses and devices should be able to explicitly specify addresses when
|
||||
instantiated, and management tools should use those. For example,
|
||||
when hot adding USB devices it's important to specify the ports
|
||||
and addresses, since implicit ordering based on the command line order
|
||||
may be different on the destination. This can result in the
|
||||
device state being loaded into the wrong device.
|
||||
|
||||
VMState
|
||||
-------
|
||||
|
||||
The legacy way of saving/loading state of the device had the problem
|
||||
that we have to maintain two functions in sync. If we did one change
|
||||
in one of them and not in the other, we would get a failed migration.
|
||||
|
||||
VMState changed the way that state is saved/loaded. Instead of using
|
||||
a function to save the state and another to load it, it was changed to
|
||||
a declarative way of what the state consisted of. Now VMState is able
|
||||
to interpret that definition to be able to load/save the state. As
|
||||
the state is declared only once, it can't go out of sync in the
|
||||
save/load functions.
|
||||
Most device data can be described using the ``VMSTATE`` macros (mostly defined
|
||||
in ``include/migration/vmstate.h``).
|
||||
|
||||
An example (from hw/input/pckbd.c)
|
||||
|
||||
@ -137,103 +152,99 @@ We registered this with:
|
||||
|
||||
vmstate_register(NULL, 0, &vmstate_kbd, s);
|
||||
|
||||
Note: talk about how vmstate <-> qdev interact, and what the instance ids mean.
|
||||
For devices that are `qdev` based, we can register the device in the class
|
||||
init function:
|
||||
|
||||
You can search for ``VMSTATE_*`` macros for lots of types used in QEMU in
|
||||
include/hw/hw.h.
|
||||
.. code:: c
|
||||
|
||||
More about versions
|
||||
-------------------
|
||||
dc->vmsd = &vmstate_kbd_isa;
|
||||
|
||||
Version numbers are intended for major incompatible changes to the
|
||||
migration of a device, and using them breaks backwards-migration
|
||||
compatibility; in general most changes can be made by adding Subsections
|
||||
(see below) or _TEST macros (see below) which won't break compatibility.
|
||||
The VMState macros take care of ensuring that the device data section
|
||||
is formatted portably (normally big endian) and make some compile time checks
|
||||
against the types of the fields in the structures.
|
||||
|
||||
You can see that there are several version fields:
|
||||
VMState macros can include other VMStateDescriptions to store substructures
|
||||
(see ``VMSTATE_STRUCT_``), arrays (``VMSTATE_ARRAY_``) and variable length
|
||||
arrays (``VMSTATE_VARRAY_``). Various other macros exist for special
|
||||
cases.
|
||||
|
||||
- `version_id`: the maximum version_id supported by VMState for that device.
|
||||
- `minimum_version_id`: the minimum version_id that VMState is able to understand
|
||||
for that device.
|
||||
- `minimum_version_id_old`: For devices that were not able to port to vmstate, we can
|
||||
assign a function that knows how to read this old state. This field is
|
||||
ignored if there is no `load_state_old` handler.
|
||||
Note that the format on the wire is still very raw; i.e. a VMSTATE_UINT32
|
||||
ends up with a 4 byte bigendian representation on the wire; in the future
|
||||
it might be possible to use a more structured format.
|
||||
|
||||
So, VMState is able to read versions from minimum_version_id to
|
||||
version_id. And the function ``load_state_old()`` (if present) is able to
|
||||
load state from minimum_version_id_old to minimum_version_id. This
|
||||
function is deprecated and will be removed when no more users are left.
|
||||
Legacy way
|
||||
----------
|
||||
|
||||
Saving state will always create a section with the 'version_id' value
|
||||
and thus can't be loaded by any older QEMU.
|
||||
This way is going to disappear as soon as all current users are ported to VMSTATE;
|
||||
although converting existing code can be tricky, and thus 'soon' is relative.
|
||||
|
||||
Massaging functions
|
||||
-------------------
|
||||
Each device has to register two functions, one to save the state and
|
||||
another to load the state back.
|
||||
|
||||
Sometimes, it is not enough to be able to save the state directly
|
||||
from one structure, we need to fill the correct values there. One
|
||||
example is when we are using kvm. Before saving the cpu state, we
|
||||
need to ask kvm to copy to QEMU the state that it is using. And the
|
||||
opposite when we are loading the state, we need a way to tell kvm to
|
||||
load the state for the cpu that we have just loaded from the QEMUFile.
|
||||
.. code:: c
|
||||
|
||||
The functions to do that are inside a vmstate definition, and are called:
|
||||
int register_savevm_live(DeviceState *dev,
|
||||
const char *idstr,
|
||||
int instance_id,
|
||||
int version_id,
|
||||
SaveVMHandlers *ops,
|
||||
void *opaque);
|
||||
|
||||
- ``int (*pre_load)(void *opaque);``
|
||||
Two functions in the ``ops`` structure are the `save_state`
|
||||
and `load_state` functions. Notice that `load_state` receives a version_id
|
||||
parameter to know what state format it is receiving. `save_state` doesn't
|
||||
have a version_id parameter because it always uses the latest version.
|
||||
|
||||
This function is called before we load the state of one device.
|
||||
Note that because the VMState macros still save the data in a raw
|
||||
format, in many cases it's possible to replace legacy code
|
||||
with a carefully constructed VMState description that matches the
|
||||
byte layout of the existing code.
|
||||
|
||||
- ``int (*post_load)(void *opaque, int version_id);``
|
||||
Changing migration data structures
|
||||
----------------------------------
|
||||
|
||||
This function is called after we load the state of one device.
|
||||
|
||||
- ``int (*pre_save)(void *opaque);``
|
||||
|
||||
This function is called before we save the state of one device.
|
||||
|
||||
Example: You can look at hpet.c, which uses the three functions to
|
||||
massage the state that is transferred.
|
||||
|
||||
If you use memory API functions that update memory layout outside
|
||||
initialization (i.e., in response to a guest action), this is a strong
|
||||
indication that you need to call these functions in a `post_load` callback.
|
||||
Examples of such memory API functions are:
|
||||
|
||||
- memory_region_add_subregion()
|
||||
- memory_region_del_subregion()
|
||||
- memory_region_set_readonly()
|
||||
- memory_region_set_enabled()
|
||||
- memory_region_set_address()
|
||||
- memory_region_set_alias_offset()
|
||||
When we migrate a device, we save/load the state as a series
|
||||
of fields. Sometimes, due to bugs or new functionality, we need to
|
||||
change the state to store more/different information. Changing the migration
|
||||
state saved for a device can break migration compatibility unless
|
||||
care is taken to use the appropriate techniques. In general QEMU tries
|
||||
to maintain forward migration compatibility (i.e. migrating from
|
||||
QEMU n->n+1) and there are users who benefit from backward compatibility
|
||||
as well.
|
||||
|
||||
Subsections
|
||||
-----------
|
||||
|
||||
The use of version_id makes it possible to migrate from older versions
|
||||
to newer versions of a device. But not the other way around. This
|
||||
makes it very complicated to fix bugs in stable branches. If we need to
|
||||
add anything to the state to fix a bug, we have to disable migration
|
||||
to older versions that don't have that bug-fix (i.e. a new field).
|
||||
The most common structure change is adding new data, e.g. when adding
|
||||
a newer form of device, or adding that state that you previously
|
||||
forgot to migrate. This is best solved using a subsection.
|
||||
|
||||
But sometimes, that bug-fix is only needed sometimes, not always. For
|
||||
instance, if the device is in the middle of a DMA operation, it is
|
||||
using a specific functionality, ....
|
||||
|
||||
It is impossible to create a way to make migration from any version to
|
||||
any other version to work. But we can do better than only allowing
|
||||
migration from older versions to newer ones. For those fields that are
|
||||
only needed sometimes, we add the idea of subsections. A subsection
|
||||
is "like" a device vmstate, but with a particularity, it has a Boolean
|
||||
function that tells whether those values need to be sent or not. If
|
||||
this function returns false, the subsection is not sent.
|
||||
A subsection is "like" a device vmstate, but with a particularity, it
|
||||
has a Boolean function that tells whether those values need to be sent
|
||||
or not. If this function returns false, the subsection is not sent.
|
||||
Subsections have a unique name, that is looked for on the receiving
|
||||
side.
|
||||
|
||||
On the receiving side, if we find a subsection for a device that we
|
||||
don't understand, we just fail the migration. If we understand all
|
||||
the subsections, then we load the state with success.
|
||||
the subsections, then we load the state with success. There's no check
|
||||
that a subsection is loaded, so a newer QEMU that knows about a subsection
|
||||
can (with care) load a stream from an older QEMU that didn't send
|
||||
the subsection.
|
||||
|
||||
If the new data is only needed in a rare case, then the subsection
|
||||
can be made conditional on that case and the migration will still
|
||||
succeed to older QEMUs in most cases. This is OK for data that's
|
||||
critical, but in some use cases it's preferred that the migration
|
||||
should succeed even with the data missing. To support this the
|
||||
subsection can be connected to a device property and from there
|
||||
to a versioned machine type.
|
||||
|
||||
One important note is that the post_load() function is called "after"
|
||||
loading all subsections, because a newer subsection could change the same
|
||||
value that it uses.
|
||||
value that it uses. A flag, and the combination of pre_load and post_load
|
||||
can be used to detect whether a subsection was loaded, and to
|
||||
fall back on default behaviour when the subsection isn't present.
|
||||
|
||||
Example:
|
||||
|
||||
@ -288,9 +299,13 @@ save/send this state when we are in the middle of a pio operation
|
||||
not enabled, the values on that fields are garbage and don't need to
|
||||
be sent.
|
||||
|
||||
Connecting subsections to properties
|
||||
------------------------------------
|
||||
|
||||
Using a condition function that checks a 'property' to determine whether
|
||||
to send a subsection allows backwards migration compatibility when
|
||||
new subsections are added.
|
||||
to send a subsection allows backward migration compatibility when
|
||||
new subsections are added, especially when combined with versioned
|
||||
machine types.
|
||||
|
||||
For example:
|
||||
|
||||
@ -305,21 +320,7 @@ For example:
|
||||
|
||||
Now that subsection will not be generated when using an older
|
||||
machine type and the migration stream will be accepted by older
|
||||
QEMU versions. pre-load functions can be used to initialise state
|
||||
on the newer version so that they default to suitable values
|
||||
when loading streams created by older QEMU versions that do not
|
||||
generate the subsection.
|
||||
|
||||
In some cases subsections are added for data that had been accidentally
|
||||
omitted by earlier versions; if the missing data causes the migration
|
||||
process to succeed but the guest to behave badly then it may be better
|
||||
to send the subsection and cause the migration to explicitly fail
|
||||
with the unknown subsection error. If the bad behaviour only happens
|
||||
with certain data values, making the subsection conditional on
|
||||
the data value (rather than the machine type) allows migrations to succeed
|
||||
in most cases. In general the preference is to tie the subsection to
|
||||
the machine type, and allow reliable migrations, unless the behaviour
|
||||
from omission of the subsection is really bad.
|
||||
QEMU versions.
|
||||
|
||||
Not sending existing elements
|
||||
-----------------------------
|
||||
@ -328,9 +329,13 @@ Sometimes members of the VMState are no longer needed:
|
||||
|
||||
- removing them will break migration compatibility
|
||||
|
||||
- making them version dependent and bumping the version will break backwards migration compatibility.
|
||||
- making them version dependent and bumping the version will break backward migration
|
||||
compatibility.
|
||||
|
||||
The best way is to:
|
||||
Adding a dummy field into the migration stream is normally the best way to preserve
|
||||
compatibility.
|
||||
|
||||
If the field really does need to be removed then:
|
||||
|
||||
a) Add a new property/compatibility/function in the same way for subsections above.
|
||||
b) replace the VMSTATE macro with the _TEST version of the macro, e.g.:
|
||||
@ -342,18 +347,208 @@ The best way is to:
|
||||
``VMSTATE_UINT32_TEST(foo, barstruct, pre_version_baz)``
|
||||
|
||||
Sometime in the future when we no longer care about the ancient versions these can be killed off.
|
||||
Note that for backward compatibility it's important to fill in the structure with
|
||||
data that the destination will understand.
|
||||
|
||||
Any difference in the predicates on the source and destination will end up
|
||||
with different fields being enabled and data being loaded into the wrong
|
||||
fields; for this reason conditional fields like this are very fragile.
|
||||
|
||||
Versions
|
||||
--------
|
||||
|
||||
Version numbers are intended for major incompatible changes to the
|
||||
migration of a device, and using them breaks backward-migration
|
||||
compatibility; in general most changes can be made by adding Subsections
|
||||
(see above) or _TEST macros (see above) which won't break compatibility.
|
||||
|
||||
Each version is associated with a series of fields saved. The `save_state` always saves
|
||||
the state as the newer version. But `load_state` sometimes is able to
|
||||
load state from an older version.
|
||||
|
||||
You can see that there are several version fields:
|
||||
|
||||
- `version_id`: the maximum version_id supported by VMState for that device.
|
||||
- `minimum_version_id`: the minimum version_id that VMState is able to understand
|
||||
for that device.
|
||||
- `minimum_version_id_old`: For devices that were not able to port to vmstate, we can
|
||||
assign a function that knows how to read this old state. This field is
|
||||
ignored if there is no `load_state_old` handler.
|
||||
|
||||
VMState is able to read versions from minimum_version_id to
|
||||
version_id. And the function ``load_state_old()`` (if present) is able to
|
||||
load state from minimum_version_id_old to minimum_version_id. This
|
||||
function is deprecated and will be removed when no more users are left.
|
||||
|
||||
There are *_V* forms of many ``VMSTATE_`` macros to load fields for version dependent fields,
|
||||
e.g.
|
||||
|
||||
.. code:: c
|
||||
|
||||
VMSTATE_UINT16_V(ip_id, Slirp, 2),
|
||||
|
||||
only loads that field for versions 2 and newer.
|
||||
|
||||
Saving state will always create a section with the 'version_id' value
|
||||
and thus can't be loaded by any older QEMU.
|
||||
|
||||
Massaging functions
|
||||
-------------------
|
||||
|
||||
Sometimes, it is not enough to be able to save the state directly
|
||||
from one structure, we need to fill the correct values there. One
|
||||
example is when we are using kvm. Before saving the cpu state, we
|
||||
need to ask kvm to copy to QEMU the state that it is using. And the
|
||||
opposite when we are loading the state, we need a way to tell kvm to
|
||||
load the state for the cpu that we have just loaded from the QEMUFile.
|
||||
|
||||
The functions to do that are inside a vmstate definition, and are called:
|
||||
|
||||
- ``int (*pre_load)(void *opaque);``
|
||||
|
||||
This function is called before we load the state of one device.
|
||||
|
||||
- ``int (*post_load)(void *opaque, int version_id);``
|
||||
|
||||
This function is called after we load the state of one device.
|
||||
|
||||
- ``int (*pre_save)(void *opaque);``
|
||||
|
||||
This function is called before we save the state of one device.
|
||||
|
||||
Example: You can look at hpet.c, which uses the three functions to
|
||||
massage the state that is transferred.
|
||||
|
||||
The ``VMSTATE_WITH_TMP`` macro may be useful when the migration
|
||||
data doesn't match the stored device data well; it allows an
|
||||
intermediate temporary structure to be populated with migration
|
||||
data and then transferred to the main structure.
|
||||
|
||||
If you use memory API functions that update memory layout outside
|
||||
initialization (i.e., in response to a guest action), this is a strong
|
||||
indication that you need to call these functions in a `post_load` callback.
|
||||
Examples of such memory API functions are:
|
||||
|
||||
- memory_region_add_subregion()
|
||||
- memory_region_del_subregion()
|
||||
- memory_region_set_readonly()
|
||||
- memory_region_set_enabled()
|
||||
- memory_region_set_address()
|
||||
- memory_region_set_alias_offset()
|
||||
|
||||
Iterative device migration
|
||||
--------------------------
|
||||
|
||||
Some devices, such as RAM, Block storage or certain platform devices,
|
||||
have large amounts of data that would mean that the CPUs would be
|
||||
paused for too long if they were sent in one section. For these
|
||||
devices an *iterative* approach is taken.
|
||||
|
||||
The iterative devices generally don't use VMState macros
|
||||
(although it may be possible in some cases) and instead use
|
||||
qemu_put_*/qemu_get_* macros to read/write data to the stream. Specialist
|
||||
versions exist for high bandwidth IO.
|
||||
|
||||
|
||||
An iterative device must provide:
|
||||
|
||||
- A ``save_setup`` function that initialises the data structures and
|
||||
transmits a first section containing information on the device. In the
|
||||
case of RAM this transmits a list of RAMBlocks and sizes.
|
||||
|
||||
- A ``load_setup`` function that initialises the data structures on the
|
||||
destination.
|
||||
|
||||
- A ``save_live_pending`` function that is called repeatedly and must
|
||||
indicate how much more data the iterative data must save. The core
|
||||
migration code will use this to determine when to pause the CPUs
|
||||
and complete the migration.
|
||||
|
||||
- A ``save_live_iterate`` function (called after ``save_live_pending``
|
||||
when there is significant data still to be sent). It should send
|
||||
a chunk of data until the point that stream bandwidth limits tell it
|
||||
to stop. Each call generates one section.
|
||||
|
||||
- A ``save_live_complete_precopy`` function that must transmit the
|
||||
last section for the device containing any remaining data.
|
||||
|
||||
- A ``load_state`` function used to load sections generated by
|
||||
any of the save functions that generate sections.
|
||||
|
||||
- ``cleanup`` functions for both save and load that are called
|
||||
at the end of migration.
|
||||
|
||||
Note that the contents of the sections for iterative migration tend
|
||||
to be open-coded by the devices; care should be taken in parsing
|
||||
the results and structuring the stream to make them easy to validate.
|
||||
|
||||
Device ordering
|
||||
---------------
|
||||
|
||||
There are cases in which the ordering of device loading matters; for
|
||||
example in some systems where a device may assert an interrupt during loading,
|
||||
if the interrupt controller is loaded later then it might lose the state.
|
||||
|
||||
Some ordering is implicitly provided by the order in which the machine
|
||||
definition creates devices, however this is somewhat fragile.
|
||||
|
||||
The ``MigrationPriority`` enum provides a means of explicitly enforcing
|
||||
ordering. Numerically higher priorities are loaded earlier.
|
||||
The priority is set by setting the ``priority`` field of the top level
|
||||
``VMStateDescription`` for the device.
|
||||
|
||||
Stream structure
|
||||
================
|
||||
|
||||
The stream tries to be word and endian agnostic, allowing migration between hosts
|
||||
of different characteristics running the same VM.
|
||||
|
||||
- Header
|
||||
|
||||
- Magic
|
||||
- Version
|
||||
- VM configuration section
|
||||
|
||||
- Machine type
|
||||
- Target page bits
|
||||
- List of sections
|
||||
Each section contains a device, or one iteration of a device save.
|
||||
|
||||
- section type
|
||||
- section id
|
||||
- ID string (First section of each device)
|
||||
- instance id (First section of each device)
|
||||
- version id (First section of each device)
|
||||
- <device data>
|
||||
- Footer mark
|
||||
- EOF mark
|
||||
- VM Description structure
|
||||
Consisting of a JSON description of the contents for analysis only
|
||||
|
||||
The ``device data`` in each section consists of the data produced
|
||||
by the code described above. For non-iterative devices they have a single
|
||||
section; iterative devices have an initial and last section and a set
|
||||
of parts in between.
|
||||
Note that there is very little checking by the common code of the integrity
|
||||
of the ``device data`` contents, that's up to the devices themselves.
|
||||
The ``footer mark`` provides a little bit of protection for the case where
|
||||
the receiving side reads more or less data than expected.
|
||||
|
||||
The ``ID string`` is normally unique, having been formed from a bus name
|
||||
and device address, PCI devices and storage devices hung off PCI controllers
|
||||
fit this pattern well. Some devices are fixed single instances (e.g. "pc-ram").
|
||||
Others (especially either older devices or system devices which for
|
||||
some reason don't have a bus concept) make use of the ``instance id``
|
||||
for otherwise identically named devices.
|
||||
|
||||
Return path
|
||||
-----------
|
||||
|
||||
In most migration scenarios there is only a single data path that runs
|
||||
from the source VM to the destination, typically along a single fd (although
|
||||
possibly with another fd or similar for some fast way of throwing pages across).
|
||||
Only a unidirectional stream is required for normal migration, however a
|
||||
``return path`` can be created when bidirectional communication is desired.
|
||||
This is primarily used by postcopy, but is also used to return a success
|
||||
flag to the source at the end of migration.
|
||||
|
||||
However, some uses need two way communication; in particular the Postcopy
|
||||
destination needs to be able to request pages on demand from the source.
|
||||
|
||||
For these scenarios there is a 'return path' from the destination to the source;
|
||||
``qemu_file_get_return_path(QEMUFile* fwdpath)`` gives the QEMUFile* for the return
|
||||
path.
|
||||
|
||||
@ -632,3 +827,28 @@ Retro-fitting postcopy to existing clients is possible:
|
||||
identified and the implication understood; for example if the
|
||||
guest memory access is made while holding a lock then all other
|
||||
threads waiting for that lock will also be blocked.
|
||||
|
||||
Firmware
|
||||
========
|
||||
|
||||
Migration migrates the copies of RAM and ROM, and thus when running
|
||||
on the destination it includes the firmware from the source. Even after
|
||||
resetting a VM, the old firmware is used. Only once QEMU has been restarted
|
||||
is the new firmware in use.
|
||||
|
||||
- Changes in firmware size can cause changes in the required RAMBlock size
|
||||
to hold the firmware and thus migration can fail. In practice it's best
|
||||
to pad firmware images to convenient powers of 2 with plenty of space
|
||||
for growth.
|
||||
|
||||
- Care should be taken with device emulation code so that newer
|
||||
emulation code can work with older firmware to allow forward migration.
|
||||
|
||||
- Care should be taken with newer firmware so that backward migration
|
||||
to older systems with older device emulation code will work.
|
||||
|
||||
In some cases it may be best to tie specific firmware versions to specific
|
||||
versioned machine types to cut down on the combinations that will need
|
||||
support. This is also useful when newer versions of firmware outgrow
|
||||
the padding.
|
||||
|
||||
|
@ -897,13 +897,14 @@ ETEXI
|
||||
|
||||
{
|
||||
.name = "migrate",
|
||||
.args_type = "detach:-d,blk:-b,inc:-i,uri:s",
|
||||
.params = "[-d] [-b] [-i] uri",
|
||||
.args_type = "detach:-d,blk:-b,inc:-i,resume:-r,uri:s",
|
||||
.params = "[-d] [-b] [-i] [-r] uri",
|
||||
.help = "migrate to URI (using -d to not wait for completion)"
|
||||
"\n\t\t\t -b for migration without shared storage with"
|
||||
" full copy of disk\n\t\t\t -i for migration without "
|
||||
"shared storage with incremental copy of disk "
|
||||
"(base image shared between src and destination)",
|
||||
"(base image shared between src and destination)"
|
||||
"\n\t\t\t -r to resume a paused migration",
|
||||
.cmd = hmp_migrate,
|
||||
},
|
||||
|
||||
@ -956,7 +957,34 @@ STEXI
|
||||
@findex migrate_incoming
|
||||
Continue an incoming migration using the @var{uri} (that has the same syntax
|
||||
as the -incoming option).
|
||||
ETEXI
|
||||
|
||||
{
|
||||
.name = "migrate_recover",
|
||||
.args_type = "uri:s",
|
||||
.params = "uri",
|
||||
.help = "Continue a paused incoming postcopy migration",
|
||||
.cmd = hmp_migrate_recover,
|
||||
},
|
||||
|
||||
STEXI
|
||||
@item migrate_recover @var{uri}
|
||||
@findex migrate_recover
|
||||
Continue a paused incoming postcopy migration using the @var{uri}.
|
||||
ETEXI
|
||||
|
||||
{
|
||||
.name = "migrate_pause",
|
||||
.args_type = "",
|
||||
.params = "",
|
||||
.help = "Pause an ongoing migration (postcopy-only)",
|
||||
.cmd = hmp_migrate_pause,
|
||||
},
|
||||
|
||||
STEXI
|
||||
@item migrate_pause
|
||||
@findex migrate_pause
|
||||
Pause an ongoing migration. Currently it only supports postcopy.
|
||||
ETEXI
|
||||
|
||||
{
|
||||
|
23
hmp.c
23
hmp.c
@ -1517,6 +1517,25 @@ void hmp_migrate_incoming(Monitor *mon, const QDict *qdict)
|
||||
hmp_handle_error(mon, &err);
|
||||
}
|
||||
|
||||
void hmp_migrate_recover(Monitor *mon, const QDict *qdict)
|
||||
{
|
||||
Error *err = NULL;
|
||||
const char *uri = qdict_get_str(qdict, "uri");
|
||||
|
||||
qmp_migrate_recover(uri, &err);
|
||||
|
||||
hmp_handle_error(mon, &err);
|
||||
}
|
||||
|
||||
void hmp_migrate_pause(Monitor *mon, const QDict *qdict)
|
||||
{
|
||||
Error *err = NULL;
|
||||
|
||||
qmp_migrate_pause(&err);
|
||||
|
||||
hmp_handle_error(mon, &err);
|
||||
}
|
||||
|
||||
/* Kept for backwards compatibility */
|
||||
void hmp_migrate_set_downtime(Monitor *mon, const QDict *qdict)
|
||||
{
|
||||
@ -1929,10 +1948,12 @@ void hmp_migrate(Monitor *mon, const QDict *qdict)
|
||||
bool detach = qdict_get_try_bool(qdict, "detach", false);
|
||||
bool blk = qdict_get_try_bool(qdict, "blk", false);
|
||||
bool inc = qdict_get_try_bool(qdict, "inc", false);
|
||||
bool resume = qdict_get_try_bool(qdict, "resume", false);
|
||||
const char *uri = qdict_get_str(qdict, "uri");
|
||||
Error *err = NULL;
|
||||
|
||||
qmp_migrate(uri, !!blk, blk, !!inc, inc, false, false, &err);
|
||||
qmp_migrate(uri, !!blk, blk, !!inc, inc,
|
||||
false, false, true, resume, &err);
|
||||
if (err) {
|
||||
hmp_handle_error(mon, &err);
|
||||
return;
|
||||
|
2
hmp.h
2
hmp.h
@ -68,6 +68,8 @@ void hmp_info_snapshots(Monitor *mon, const QDict *qdict);
|
||||
void hmp_migrate_cancel(Monitor *mon, const QDict *qdict);
|
||||
void hmp_migrate_continue(Monitor *mon, const QDict *qdict);
|
||||
void hmp_migrate_incoming(Monitor *mon, const QDict *qdict);
|
||||
void hmp_migrate_recover(Monitor *mon, const QDict *qdict);
|
||||
void hmp_migrate_pause(Monitor *mon, const QDict *qdict);
|
||||
void hmp_migrate_set_downtime(Monitor *mon, const QDict *qdict);
|
||||
void hmp_migrate_set_speed(Monitor *mon, const QDict *qdict);
|
||||
void hmp_migrate_set_capability(Monitor *mon, const QDict *qdict);
|
||||
|
@ -64,6 +64,8 @@ typedef struct SaveVMHandlers {
|
||||
LoadStateHandler *load_state;
|
||||
int (*load_setup)(QEMUFile *f, void *opaque);
|
||||
int (*load_cleanup)(void *opaque);
|
||||
/* Called when postcopy migration wants to resume from failure */
|
||||
int (*resume_prepare)(MigrationState *s, void *opaque);
|
||||
} SaveVMHandlers;
|
||||
|
||||
int register_savevm_live(DeviceState *dev,
|
||||
|
@ -71,11 +71,21 @@ void migration_channel_connect(MigrationState *s,
|
||||
!object_dynamic_cast(OBJECT(ioc),
|
||||
TYPE_QIO_CHANNEL_TLS)) {
|
||||
migration_tls_channel_connect(s, ioc, hostname, &error);
|
||||
|
||||
if (!error) {
|
||||
/* tls_channel_connect will call back to this
|
||||
* function after the TLS handshake,
|
||||
* so we mustn't call migrate_fd_connect until then
|
||||
*/
|
||||
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
QEMUFile *f = qemu_fopen_channel_output(ioc);
|
||||
|
||||
qemu_mutex_lock(&s->qemu_file_lock);
|
||||
s->to_dst_file = f;
|
||||
|
||||
qemu_mutex_unlock(&s->qemu_file_lock);
|
||||
}
|
||||
}
|
||||
migrate_fd_connect(s, error);
|
||||
|
@ -65,9 +65,8 @@ void exec_start_incoming_migration(const char *command, Error **errp)
|
||||
}
|
||||
|
||||
qio_channel_set_name(ioc, "migration-exec-incoming");
|
||||
qio_channel_add_watch(ioc,
|
||||
G_IO_IN,
|
||||
exec_accept_incoming_migration,
|
||||
NULL,
|
||||
NULL);
|
||||
qio_channel_add_watch_full(ioc, G_IO_IN,
|
||||
exec_accept_incoming_migration,
|
||||
NULL, NULL,
|
||||
g_main_context_get_thread_default());
|
||||
}
|
||||
|
@ -66,9 +66,8 @@ void fd_start_incoming_migration(const char *infd, Error **errp)
|
||||
}
|
||||
|
||||
qio_channel_set_name(QIO_CHANNEL(ioc), "migration-fd-incoming");
|
||||
qio_channel_add_watch(ioc,
|
||||
G_IO_IN,
|
||||
fd_accept_incoming_migration,
|
||||
NULL,
|
||||
NULL);
|
||||
qio_channel_add_watch_full(ioc, G_IO_IN,
|
||||
fd_accept_incoming_migration,
|
||||
NULL, NULL,
|
||||
g_main_context_get_thread_default());
|
||||
}
|
||||
|
@ -95,6 +95,8 @@ enum mig_rp_message_type {
|
||||
|
||||
MIG_RP_MSG_REQ_PAGES_ID, /* data (start: be64, len: be32, id: string) */
|
||||
MIG_RP_MSG_REQ_PAGES, /* data (start: be64, len: be32) */
|
||||
MIG_RP_MSG_RECV_BITMAP, /* send recved_bitmap back to source */
|
||||
MIG_RP_MSG_RESUME_ACK, /* tell source that we are ready to resume */
|
||||
|
||||
MIG_RP_MSG_MAX
|
||||
};
|
||||
@ -104,6 +106,7 @@ enum mig_rp_message_type {
|
||||
dynamic creation of migration */
|
||||
|
||||
static MigrationState *current_migration;
|
||||
static MigrationIncomingState *current_incoming;
|
||||
|
||||
static bool migration_object_check(MigrationState *ms, Error **errp);
|
||||
static int migration_maybe_pause(MigrationState *s,
|
||||
@ -119,6 +122,22 @@ void migration_object_init(void)
|
||||
assert(!current_migration);
|
||||
current_migration = MIGRATION_OBJ(object_new(TYPE_MIGRATION));
|
||||
|
||||
/*
|
||||
* Init the migrate incoming object as well no matter whether
|
||||
* we'll use it or not.
|
||||
*/
|
||||
assert(!current_incoming);
|
||||
current_incoming = g_new0(MigrationIncomingState, 1);
|
||||
current_incoming->state = MIGRATION_STATUS_NONE;
|
||||
current_incoming->postcopy_remote_fds =
|
||||
g_array_new(FALSE, TRUE, sizeof(struct PostCopyFD));
|
||||
qemu_mutex_init(¤t_incoming->rp_mutex);
|
||||
qemu_event_init(¤t_incoming->main_thread_load_event, false);
|
||||
qemu_sem_init(¤t_incoming->postcopy_pause_sem_dst, 0);
|
||||
qemu_sem_init(¤t_incoming->postcopy_pause_sem_fault, 0);
|
||||
|
||||
init_dirty_bitmap_incoming_migration();
|
||||
|
||||
if (!migration_object_check(current_migration, &err)) {
|
||||
error_report_err(err);
|
||||
exit(1);
|
||||
@ -149,22 +168,8 @@ MigrationState *migrate_get_current(void)
|
||||
|
||||
MigrationIncomingState *migration_incoming_get_current(void)
|
||||
{
|
||||
static bool once;
|
||||
static MigrationIncomingState mis_current;
|
||||
|
||||
if (!once) {
|
||||
mis_current.state = MIGRATION_STATUS_NONE;
|
||||
memset(&mis_current, 0, sizeof(MigrationIncomingState));
|
||||
mis_current.postcopy_remote_fds = g_array_new(FALSE, TRUE,
|
||||
sizeof(struct PostCopyFD));
|
||||
qemu_mutex_init(&mis_current.rp_mutex);
|
||||
qemu_event_init(&mis_current.main_thread_load_event, false);
|
||||
|
||||
init_dirty_bitmap_incoming_migration();
|
||||
|
||||
once = true;
|
||||
}
|
||||
return &mis_current;
|
||||
assert(current_incoming);
|
||||
return current_incoming;
|
||||
}
|
||||
|
||||
void migration_incoming_state_destroy(void)
|
||||
@ -430,7 +435,7 @@ static void migration_incoming_setup(QEMUFile *f)
|
||||
qemu_file_set_blocking(f, false);
|
||||
}
|
||||
|
||||
static void migration_incoming_process(void)
|
||||
void migration_incoming_process(void)
|
||||
{
|
||||
Coroutine *co = qemu_coroutine_create(process_incoming_migration_co, NULL);
|
||||
qemu_coroutine_enter(co);
|
||||
@ -438,8 +443,34 @@ static void migration_incoming_process(void)
|
||||
|
||||
void migration_fd_process_incoming(QEMUFile *f)
|
||||
{
|
||||
migration_incoming_setup(f);
|
||||
migration_incoming_process();
|
||||
MigrationIncomingState *mis = migration_incoming_get_current();
|
||||
|
||||
if (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
|
||||
/* Resumed from a paused postcopy migration */
|
||||
|
||||
mis->from_src_file = f;
|
||||
/* Postcopy has standalone thread to do vm load */
|
||||
qemu_file_set_blocking(f, true);
|
||||
|
||||
/* Re-configure the return path */
|
||||
mis->to_src_file = qemu_file_get_return_path(f);
|
||||
|
||||
migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
|
||||
MIGRATION_STATUS_POSTCOPY_RECOVER);
|
||||
|
||||
/*
|
||||
* Here, we only wake up the main loading thread (while the
|
||||
* fault thread will still be waiting), so that we can receive
|
||||
* commands from source now, and answer it if needed. The
|
||||
* fault thread will be woken up afterwards until we are sure
|
||||
* that source is ready to reply to page requests.
|
||||
*/
|
||||
qemu_sem_post(&mis->postcopy_pause_sem_dst);
|
||||
} else {
|
||||
/* New incoming migration */
|
||||
migration_incoming_setup(f);
|
||||
migration_incoming_process();
|
||||
}
|
||||
}
|
||||
|
||||
void migration_ioc_process_incoming(QIOChannel *ioc)
|
||||
@ -448,9 +479,10 @@ void migration_ioc_process_incoming(QIOChannel *ioc)
|
||||
|
||||
if (!mis->from_src_file) {
|
||||
QEMUFile *f = qemu_fopen_channel_input(ioc);
|
||||
migration_fd_process_incoming(f);
|
||||
migration_incoming_setup(f);
|
||||
return;
|
||||
}
|
||||
/* We still only have a single channel. Nothing to do here yet */
|
||||
multifd_recv_new_channel(ioc);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -461,7 +493,11 @@ void migration_ioc_process_incoming(QIOChannel *ioc)
|
||||
*/
|
||||
bool migration_has_all_channels(void)
|
||||
{
|
||||
return true;
|
||||
bool all_channels;
|
||||
|
||||
all_channels = multifd_recv_all_channels_created();
|
||||
|
||||
return all_channels;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -491,6 +527,53 @@ void migrate_send_rp_pong(MigrationIncomingState *mis,
|
||||
migrate_send_rp_message(mis, MIG_RP_MSG_PONG, sizeof(buf), &buf);
|
||||
}
|
||||
|
||||
void migrate_send_rp_recv_bitmap(MigrationIncomingState *mis,
|
||||
char *block_name)
|
||||
{
|
||||
char buf[512];
|
||||
int len;
|
||||
int64_t res;
|
||||
|
||||
/*
|
||||
* First, we send the header part. It contains only the len of
|
||||
* idstr, and the idstr itself.
|
||||
*/
|
||||
len = strlen(block_name);
|
||||
buf[0] = len;
|
||||
memcpy(buf + 1, block_name, len);
|
||||
|
||||
if (mis->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
|
||||
error_report("%s: MSG_RP_RECV_BITMAP only used for recovery",
|
||||
__func__);
|
||||
return;
|
||||
}
|
||||
|
||||
migrate_send_rp_message(mis, MIG_RP_MSG_RECV_BITMAP, len + 1, buf);
|
||||
|
||||
/*
|
||||
* Next, we dump the received bitmap to the stream.
|
||||
*
|
||||
* TODO: currently we are safe since we are the only one that is
|
||||
* using the to_src_file handle (fault thread is still paused),
|
||||
* and it's ok even not taking the mutex. However the best way is
|
||||
* to take the lock before sending the message header, and release
|
||||
* the lock after sending the bitmap.
|
||||
*/
|
||||
qemu_mutex_lock(&mis->rp_mutex);
|
||||
res = ramblock_recv_bitmap_send(mis->to_src_file, block_name);
|
||||
qemu_mutex_unlock(&mis->rp_mutex);
|
||||
|
||||
trace_migrate_send_rp_recv_bitmap(block_name, res);
|
||||
}
|
||||
|
||||
void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value)
|
||||
{
|
||||
uint32_t buf;
|
||||
|
||||
buf = cpu_to_be32(value);
|
||||
migrate_send_rp_message(mis, MIG_RP_MSG_RESUME_ACK, sizeof(buf), &buf);
|
||||
}
|
||||
|
||||
MigrationCapabilityStatusList *qmp_query_migrate_capabilities(Error **errp)
|
||||
{
|
||||
MigrationCapabilityStatusList *head = NULL;
|
||||
@ -569,6 +652,8 @@ static bool migration_is_setup_or_active(int state)
|
||||
switch (state) {
|
||||
case MIGRATION_STATUS_ACTIVE:
|
||||
case MIGRATION_STATUS_POSTCOPY_ACTIVE:
|
||||
case MIGRATION_STATUS_POSTCOPY_PAUSED:
|
||||
case MIGRATION_STATUS_POSTCOPY_RECOVER:
|
||||
case MIGRATION_STATUS_SETUP:
|
||||
case MIGRATION_STATUS_PRE_SWITCHOVER:
|
||||
case MIGRATION_STATUS_DEVICE:
|
||||
@ -649,6 +734,8 @@ static void fill_source_migration_info(MigrationInfo *info)
|
||||
case MIGRATION_STATUS_POSTCOPY_ACTIVE:
|
||||
case MIGRATION_STATUS_PRE_SWITCHOVER:
|
||||
case MIGRATION_STATUS_DEVICE:
|
||||
case MIGRATION_STATUS_POSTCOPY_PAUSED:
|
||||
case MIGRATION_STATUS_POSTCOPY_RECOVER:
|
||||
/* TODO add some postcopy stats */
|
||||
info->has_status = true;
|
||||
info->has_total_time = true;
|
||||
@ -1147,6 +1234,7 @@ static void migrate_fd_cleanup(void *opaque)
|
||||
|
||||
if (s->to_dst_file) {
|
||||
Error *local_err = NULL;
|
||||
QEMUFile *tmp;
|
||||
|
||||
trace_migrate_fd_cleanup();
|
||||
qemu_mutex_unlock_iothread();
|
||||
@ -1159,8 +1247,15 @@ static void migrate_fd_cleanup(void *opaque)
|
||||
if (multifd_save_cleanup(&local_err) != 0) {
|
||||
error_report_err(local_err);
|
||||
}
|
||||
qemu_fclose(s->to_dst_file);
|
||||
qemu_mutex_lock(&s->qemu_file_lock);
|
||||
tmp = s->to_dst_file;
|
||||
s->to_dst_file = NULL;
|
||||
qemu_mutex_unlock(&s->qemu_file_lock);
|
||||
/*
|
||||
* Close the file handle without the lock to make sure the
|
||||
* critical section won't block for long.
|
||||
*/
|
||||
qemu_fclose(tmp);
|
||||
}
|
||||
|
||||
assert((s->state != MIGRATION_STATUS_ACTIVE) &&
|
||||
@ -1388,6 +1483,59 @@ void qmp_migrate_incoming(const char *uri, Error **errp)
|
||||
once = false;
|
||||
}
|
||||
|
||||
void qmp_migrate_recover(const char *uri, Error **errp)
|
||||
{
|
||||
MigrationIncomingState *mis = migration_incoming_get_current();
|
||||
|
||||
if (mis->state != MIGRATION_STATUS_POSTCOPY_PAUSED) {
|
||||
error_setg(errp, "Migrate recover can only be run "
|
||||
"when postcopy is paused.");
|
||||
return;
|
||||
}
|
||||
|
||||
if (atomic_cmpxchg(&mis->postcopy_recover_triggered,
|
||||
false, true) == true) {
|
||||
error_setg(errp, "Migrate recovery is triggered already");
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Note that this call will never start a real migration; it will
|
||||
* only re-setup the migration stream and poke existing migration
|
||||
* to continue using that newly established channel.
|
||||
*/
|
||||
qemu_start_incoming_migration(uri, errp);
|
||||
}
|
||||
|
||||
void qmp_migrate_pause(Error **errp)
|
||||
{
|
||||
MigrationState *ms = migrate_get_current();
|
||||
MigrationIncomingState *mis = migration_incoming_get_current();
|
||||
int ret;
|
||||
|
||||
if (ms->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
|
||||
/* Source side, during postcopy */
|
||||
qemu_mutex_lock(&ms->qemu_file_lock);
|
||||
ret = qemu_file_shutdown(ms->to_dst_file);
|
||||
qemu_mutex_unlock(&ms->qemu_file_lock);
|
||||
if (ret) {
|
||||
error_setg(errp, "Failed to pause source migration");
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (mis->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
|
||||
ret = qemu_file_shutdown(mis->from_src_file);
|
||||
if (ret) {
|
||||
error_setg(errp, "Failed to pause destination migration");
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
error_setg(errp, "migrate-pause is currently only supported "
|
||||
"during postcopy-active state");
|
||||
}
|
||||
|
||||
bool migration_is_blocked(Error **errp)
|
||||
{
|
||||
if (qemu_savevm_state_blocked(errp)) {
|
||||
@ -1402,49 +1550,75 @@ bool migration_is_blocked(Error **errp)
|
||||
return false;
|
||||
}
|
||||
|
||||
void qmp_migrate(const char *uri, bool has_blk, bool blk,
|
||||
bool has_inc, bool inc, bool has_detach, bool detach,
|
||||
Error **errp)
|
||||
/* Returns true if continue to migrate, or false if error detected */
|
||||
static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc,
|
||||
bool resume, Error **errp)
|
||||
{
|
||||
Error *local_err = NULL;
|
||||
MigrationState *s = migrate_get_current();
|
||||
const char *p;
|
||||
|
||||
if (resume) {
|
||||
if (s->state != MIGRATION_STATUS_POSTCOPY_PAUSED) {
|
||||
error_setg(errp, "Cannot resume if there is no "
|
||||
"paused migration");
|
||||
return false;
|
||||
}
|
||||
/* This is a resume, skip init status */
|
||||
return true;
|
||||
}
|
||||
|
||||
if (migration_is_setup_or_active(s->state) ||
|
||||
s->state == MIGRATION_STATUS_CANCELLING ||
|
||||
s->state == MIGRATION_STATUS_COLO) {
|
||||
error_setg(errp, QERR_MIGRATION_ACTIVE);
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
|
||||
if (runstate_check(RUN_STATE_INMIGRATE)) {
|
||||
error_setg(errp, "Guest is waiting for an incoming migration");
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
|
||||
if (migration_is_blocked(errp)) {
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
|
||||
if ((has_blk && blk) || (has_inc && inc)) {
|
||||
if (blk || blk_inc) {
|
||||
if (migrate_use_block() || migrate_use_block_incremental()) {
|
||||
error_setg(errp, "Command options are incompatible with "
|
||||
"current migration capabilities");
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
migrate_set_block_enabled(true, &local_err);
|
||||
if (local_err) {
|
||||
error_propagate(errp, local_err);
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
s->must_remove_block_options = true;
|
||||
}
|
||||
|
||||
if (has_inc && inc) {
|
||||
if (blk_inc) {
|
||||
migrate_set_block_incremental(s, true);
|
||||
}
|
||||
|
||||
migrate_init(s);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void qmp_migrate(const char *uri, bool has_blk, bool blk,
|
||||
bool has_inc, bool inc, bool has_detach, bool detach,
|
||||
bool has_resume, bool resume, Error **errp)
|
||||
{
|
||||
Error *local_err = NULL;
|
||||
MigrationState *s = migrate_get_current();
|
||||
const char *p;
|
||||
|
||||
if (!migrate_prepare(s, has_blk && blk, has_inc && inc,
|
||||
has_resume && resume, errp)) {
|
||||
/* Error detected, put into errp */
|
||||
return;
|
||||
}
|
||||
|
||||
if (strstart(uri, "tcp:", &p)) {
|
||||
tcp_start_outgoing_migration(s, p, &local_err);
|
||||
#ifdef CONFIG_RDMA
|
||||
@ -1739,6 +1913,8 @@ static struct rp_cmd_args {
|
||||
[MIG_RP_MSG_PONG] = { .len = 4, .name = "PONG" },
|
||||
[MIG_RP_MSG_REQ_PAGES] = { .len = 12, .name = "REQ_PAGES" },
|
||||
[MIG_RP_MSG_REQ_PAGES_ID] = { .len = -1, .name = "REQ_PAGES_ID" },
|
||||
[MIG_RP_MSG_RECV_BITMAP] = { .len = -1, .name = "RECV_BITMAP" },
|
||||
[MIG_RP_MSG_RESUME_ACK] = { .len = 4, .name = "RESUME_ACK" },
|
||||
[MIG_RP_MSG_MAX] = { .len = -1, .name = "MAX" },
|
||||
};
|
||||
|
||||
@ -1771,6 +1947,51 @@ static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname,
|
||||
}
|
||||
}
|
||||
|
||||
/* Return true to retry, false to quit */
|
||||
static bool postcopy_pause_return_path_thread(MigrationState *s)
|
||||
{
|
||||
trace_postcopy_pause_return_path();
|
||||
|
||||
qemu_sem_wait(&s->postcopy_pause_rp_sem);
|
||||
|
||||
trace_postcopy_pause_return_path_continued();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static int migrate_handle_rp_recv_bitmap(MigrationState *s, char *block_name)
|
||||
{
|
||||
RAMBlock *block = qemu_ram_block_by_name(block_name);
|
||||
|
||||
if (!block) {
|
||||
error_report("%s: invalid block name '%s'", __func__, block_name);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* Fetch the received bitmap and refresh the dirty bitmap */
|
||||
return ram_dirty_bitmap_reload(s, block);
|
||||
}
|
||||
|
||||
static int migrate_handle_rp_resume_ack(MigrationState *s, uint32_t value)
|
||||
{
|
||||
trace_source_return_path_thread_resume_ack(value);
|
||||
|
||||
if (value != MIGRATION_RESUME_ACK_VALUE) {
|
||||
error_report("%s: illegal resume_ack value %"PRIu32,
|
||||
__func__, value);
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Now both sides are active. */
|
||||
migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_RECOVER,
|
||||
MIGRATION_STATUS_POSTCOPY_ACTIVE);
|
||||
|
||||
/* Notify send thread that time to continue send pages */
|
||||
qemu_sem_post(&s->rp_state.rp_sem);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Handles messages sent on the return path towards the source VM
|
||||
*
|
||||
@ -1787,6 +2008,8 @@ static void *source_return_path_thread(void *opaque)
|
||||
int res;
|
||||
|
||||
trace_source_return_path_thread_entry();
|
||||
|
||||
retry:
|
||||
while (!ms->rp_state.error && !qemu_file_get_error(rp) &&
|
||||
migration_is_setup_or_active(ms->state)) {
|
||||
trace_source_return_path_thread_loop_top();
|
||||
@ -1874,23 +2097,61 @@ static void *source_return_path_thread(void *opaque)
|
||||
migrate_handle_rp_req_pages(ms, (char *)&buf[13], start, len);
|
||||
break;
|
||||
|
||||
case MIG_RP_MSG_RECV_BITMAP:
|
||||
if (header_len < 1) {
|
||||
error_report("%s: missing block name", __func__);
|
||||
mark_source_rp_bad(ms);
|
||||
goto out;
|
||||
}
|
||||
/* Format: len (1B) + idstr (<255B). This ends the idstr. */
|
||||
buf[buf[0] + 1] = '\0';
|
||||
if (migrate_handle_rp_recv_bitmap(ms, (char *)(buf + 1))) {
|
||||
mark_source_rp_bad(ms);
|
||||
goto out;
|
||||
}
|
||||
break;
|
||||
|
||||
case MIG_RP_MSG_RESUME_ACK:
|
||||
tmp32 = ldl_be_p(buf);
|
||||
if (migrate_handle_rp_resume_ack(ms, tmp32)) {
|
||||
mark_source_rp_bad(ms);
|
||||
goto out;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (qemu_file_get_error(rp)) {
|
||||
|
||||
out:
|
||||
res = qemu_file_get_error(rp);
|
||||
if (res) {
|
||||
if (res == -EIO) {
|
||||
/*
|
||||
* Maybe there is something we can do: it looks like a
|
||||
* network down issue, and we pause for a recovery.
|
||||
*/
|
||||
if (postcopy_pause_return_path_thread(ms)) {
|
||||
/* Reload rp, reset the rest */
|
||||
rp = ms->rp_state.from_dst_file;
|
||||
ms->rp_state.error = false;
|
||||
goto retry;
|
||||
}
|
||||
}
|
||||
|
||||
trace_source_return_path_thread_bad_end();
|
||||
mark_source_rp_bad(ms);
|
||||
}
|
||||
|
||||
trace_source_return_path_thread_end();
|
||||
out:
|
||||
ms->rp_state.from_dst_file = NULL;
|
||||
qemu_fclose(rp);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static int open_return_path_on_source(MigrationState *ms)
|
||||
static int open_return_path_on_source(MigrationState *ms,
|
||||
bool create_thread)
|
||||
{
|
||||
|
||||
ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->to_dst_file);
|
||||
@ -1899,6 +2160,12 @@ static int open_return_path_on_source(MigrationState *ms)
|
||||
}
|
||||
|
||||
trace_open_return_path_on_source();
|
||||
|
||||
if (!create_thread) {
|
||||
/* We're done */
|
||||
return 0;
|
||||
}
|
||||
|
||||
qemu_thread_create(&ms->rp_state.rp_thread, "return path",
|
||||
source_return_path_thread, ms, QEMU_THREAD_JOINABLE);
|
||||
|
||||
@ -2238,6 +2505,156 @@ bool migrate_colo_enabled(void)
|
||||
return s->enabled_capabilities[MIGRATION_CAPABILITY_X_COLO];
|
||||
}
|
||||
|
||||
typedef enum MigThrError {
|
||||
/* No error detected */
|
||||
MIG_THR_ERR_NONE = 0,
|
||||
/* Detected error, but resumed successfully */
|
||||
MIG_THR_ERR_RECOVERED = 1,
|
||||
/* Detected fatal error, need to exit */
|
||||
MIG_THR_ERR_FATAL = 2,
|
||||
} MigThrError;
|
||||
|
||||
static int postcopy_resume_handshake(MigrationState *s)
|
||||
{
|
||||
qemu_savevm_send_postcopy_resume(s->to_dst_file);
|
||||
|
||||
while (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
|
||||
qemu_sem_wait(&s->rp_state.rp_sem);
|
||||
}
|
||||
|
||||
if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Return zero if success, or <0 for error */
|
||||
static int postcopy_do_resume(MigrationState *s)
|
||||
{
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* Call all the resume_prepare() hooks, so that modules can be
|
||||
* ready for the migration resume.
|
||||
*/
|
||||
ret = qemu_savevm_state_resume_prepare(s);
|
||||
if (ret) {
|
||||
error_report("%s: resume_prepare() failure detected: %d",
|
||||
__func__, ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Last handshake with destination on the resume (destination will
|
||||
* switch to postcopy-active afterwards)
|
||||
*/
|
||||
ret = postcopy_resume_handshake(s);
|
||||
if (ret) {
|
||||
error_report("%s: handshake failed: %d", __func__, ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* We don't return until we are in a safe state to continue current
|
||||
* postcopy migration. Returns MIG_THR_ERR_RECOVERED if recovered, or
|
||||
* MIG_THR_ERR_FATAL if unrecovery failure happened.
|
||||
*/
|
||||
static MigThrError postcopy_pause(MigrationState *s)
|
||||
{
|
||||
assert(s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
|
||||
|
||||
while (true) {
|
||||
QEMUFile *file;
|
||||
|
||||
migrate_set_state(&s->state, s->state,
|
||||
MIGRATION_STATUS_POSTCOPY_PAUSED);
|
||||
|
||||
/* Current channel is possibly broken. Release it. */
|
||||
assert(s->to_dst_file);
|
||||
qemu_mutex_lock(&s->qemu_file_lock);
|
||||
file = s->to_dst_file;
|
||||
s->to_dst_file = NULL;
|
||||
qemu_mutex_unlock(&s->qemu_file_lock);
|
||||
|
||||
qemu_file_shutdown(file);
|
||||
qemu_fclose(file);
|
||||
|
||||
error_report("Detected IO failure for postcopy. "
|
||||
"Migration paused.");
|
||||
|
||||
/*
|
||||
* We wait until things fixed up. Then someone will setup the
|
||||
* status back for us.
|
||||
*/
|
||||
while (s->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
|
||||
qemu_sem_wait(&s->postcopy_pause_sem);
|
||||
}
|
||||
|
||||
if (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
|
||||
/* Woken up by a recover procedure. Give it a shot */
|
||||
|
||||
/*
|
||||
* Firstly, let's wake up the return path now, with a new
|
||||
* return path channel.
|
||||
*/
|
||||
qemu_sem_post(&s->postcopy_pause_rp_sem);
|
||||
|
||||
/* Do the resume logic */
|
||||
if (postcopy_do_resume(s) == 0) {
|
||||
/* Let's continue! */
|
||||
trace_postcopy_pause_continued();
|
||||
return MIG_THR_ERR_RECOVERED;
|
||||
} else {
|
||||
/*
|
||||
* Something wrong happened during the recovery, let's
|
||||
* pause again. Pause is always better than throwing
|
||||
* data away.
|
||||
*/
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
/* This is not right... Time to quit. */
|
||||
return MIG_THR_ERR_FATAL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static MigThrError migration_detect_error(MigrationState *s)
|
||||
{
|
||||
int ret;
|
||||
|
||||
/* Try to detect any file errors */
|
||||
ret = qemu_file_get_error(s->to_dst_file);
|
||||
|
||||
if (!ret) {
|
||||
/* Everything is fine */
|
||||
return MIG_THR_ERR_NONE;
|
||||
}
|
||||
|
||||
if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE && ret == -EIO) {
|
||||
/*
|
||||
* For postcopy, we allow the network to be down for a
|
||||
* while. After that, it can be continued by a
|
||||
* recovery phase.
|
||||
*/
|
||||
return postcopy_pause(s);
|
||||
} else {
|
||||
/*
|
||||
* For precopy (or postcopy with error outside IO), we fail
|
||||
* with no time.
|
||||
*/
|
||||
migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
|
||||
trace_migration_thread_file_err();
|
||||
|
||||
/* Time to stop the migration, now. */
|
||||
return MIG_THR_ERR_FATAL;
|
||||
}
|
||||
}
|
||||
|
||||
static void migration_calculate_complete(MigrationState *s)
|
||||
{
|
||||
uint64_t bytes = qemu_ftell(s->to_dst_file);
|
||||
@ -2394,6 +2811,7 @@ static void *migration_thread(void *opaque)
|
||||
{
|
||||
MigrationState *s = opaque;
|
||||
int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
|
||||
MigThrError thr_error;
|
||||
|
||||
rcu_register_thread();
|
||||
|
||||
@ -2443,13 +2861,22 @@ static void *migration_thread(void *opaque)
|
||||
}
|
||||
}
|
||||
|
||||
if (qemu_file_get_error(s->to_dst_file)) {
|
||||
if (migration_is_setup_or_active(s->state)) {
|
||||
migrate_set_state(&s->state, s->state,
|
||||
MIGRATION_STATUS_FAILED);
|
||||
}
|
||||
trace_migration_thread_file_err();
|
||||
/*
|
||||
* Try to detect any kind of failures, and see whether we
|
||||
* should stop the migration now.
|
||||
*/
|
||||
thr_error = migration_detect_error(s);
|
||||
if (thr_error == MIG_THR_ERR_FATAL) {
|
||||
/* Stop migration */
|
||||
break;
|
||||
} else if (thr_error == MIG_THR_ERR_RECOVERED) {
|
||||
/*
|
||||
* Just recovered from a e.g. network failure, reset all
|
||||
* the local variables. This is important to avoid
|
||||
* breaking transferred_bytes and bandwidth calculation
|
||||
*/
|
||||
s->iteration_start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
|
||||
s->iteration_initial_bytes = 0;
|
||||
}
|
||||
|
||||
current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
|
||||
@ -2471,6 +2898,9 @@ static void *migration_thread(void *opaque)
|
||||
|
||||
void migrate_fd_connect(MigrationState *s, Error *error_in)
|
||||
{
|
||||
int64_t rate_limit;
|
||||
bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED;
|
||||
|
||||
s->expected_downtime = s->parameters.downtime_limit;
|
||||
s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup, s);
|
||||
if (error_in) {
|
||||
@ -2479,12 +2909,21 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
|
||||
return;
|
||||
}
|
||||
|
||||
qemu_file_set_blocking(s->to_dst_file, true);
|
||||
qemu_file_set_rate_limit(s->to_dst_file,
|
||||
s->parameters.max_bandwidth / XFER_LIMIT_RATIO);
|
||||
if (resume) {
|
||||
/* This is a resumed migration */
|
||||
rate_limit = INT64_MAX;
|
||||
} else {
|
||||
/* This is a fresh new migration */
|
||||
rate_limit = s->parameters.max_bandwidth / XFER_LIMIT_RATIO;
|
||||
s->expected_downtime = s->parameters.downtime_limit;
|
||||
s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup, s);
|
||||
|
||||
/* Notify before starting migration thread */
|
||||
notifier_list_notify(&migration_state_notifiers, s);
|
||||
/* Notify before starting migration thread */
|
||||
notifier_list_notify(&migration_state_notifiers, s);
|
||||
}
|
||||
|
||||
qemu_file_set_rate_limit(s->to_dst_file, rate_limit);
|
||||
qemu_file_set_blocking(s->to_dst_file, true);
|
||||
|
||||
/*
|
||||
* Open the return path. For postcopy, it is used exclusively. For
|
||||
@ -2492,15 +2931,22 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
|
||||
* QEMU uses the return path.
|
||||
*/
|
||||
if (migrate_postcopy_ram() || migrate_use_return_path()) {
|
||||
if (open_return_path_on_source(s)) {
|
||||
if (open_return_path_on_source(s, !resume)) {
|
||||
error_report("Unable to open return-path for postcopy");
|
||||
migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
|
||||
MIGRATION_STATUS_FAILED);
|
||||
migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
|
||||
migrate_fd_cleanup(s);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (resume) {
|
||||
/* Wakeup the main migration thread to do the recovery */
|
||||
migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
|
||||
MIGRATION_STATUS_POSTCOPY_RECOVER);
|
||||
qemu_sem_post(&s->postcopy_pause_sem);
|
||||
return;
|
||||
}
|
||||
|
||||
if (multifd_save_setup() != 0) {
|
||||
migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
|
||||
MIGRATION_STATUS_FAILED);
|
||||
@ -2604,9 +3050,13 @@ static void migration_instance_finalize(Object *obj)
|
||||
MigrationParameters *params = &ms->parameters;
|
||||
|
||||
qemu_mutex_destroy(&ms->error_mutex);
|
||||
qemu_mutex_destroy(&ms->qemu_file_lock);
|
||||
g_free(params->tls_hostname);
|
||||
g_free(params->tls_creds);
|
||||
qemu_sem_destroy(&ms->pause_sem);
|
||||
qemu_sem_destroy(&ms->postcopy_pause_sem);
|
||||
qemu_sem_destroy(&ms->postcopy_pause_rp_sem);
|
||||
qemu_sem_destroy(&ms->rp_state.rp_sem);
|
||||
error_free(ms->error);
|
||||
}
|
||||
|
||||
@ -2636,6 +3086,11 @@ static void migration_instance_init(Object *obj)
|
||||
params->has_x_multifd_channels = true;
|
||||
params->has_x_multifd_page_count = true;
|
||||
params->has_xbzrle_cache_size = true;
|
||||
|
||||
qemu_sem_init(&ms->postcopy_pause_sem, 0);
|
||||
qemu_sem_init(&ms->postcopy_pause_rp_sem, 0);
|
||||
qemu_sem_init(&ms->rp_state.rp_sem, 0);
|
||||
qemu_mutex_init(&ms->qemu_file_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -24,6 +24,8 @@
|
||||
|
||||
struct PostcopyBlocktimeContext;
|
||||
|
||||
#define MIGRATION_RESUME_ACK_VALUE (1)
|
||||
|
||||
/* State for the incoming migration */
|
||||
struct MigrationIncomingState {
|
||||
QEMUFile *from_src_file;
|
||||
@ -73,6 +75,11 @@ struct MigrationIncomingState {
|
||||
* live migration, to calculate vCPU block time
|
||||
* */
|
||||
struct PostcopyBlocktimeContext *blocktime_ctx;
|
||||
|
||||
/* notify PAUSED postcopy incoming migrations to try to continue */
|
||||
bool postcopy_recover_triggered;
|
||||
QemuSemaphore postcopy_pause_sem_dst;
|
||||
QemuSemaphore postcopy_pause_sem_fault;
|
||||
};
|
||||
|
||||
MigrationIncomingState *migration_incoming_get_current(void);
|
||||
@ -107,6 +114,12 @@ struct MigrationState
|
||||
QemuThread thread;
|
||||
QEMUBH *cleanup_bh;
|
||||
QEMUFile *to_dst_file;
|
||||
/*
|
||||
* Protects to_dst_file pointer. We need to make sure we won't
|
||||
* yield or hang during the critical section, since this lock will
|
||||
* be used in OOB command handler.
|
||||
*/
|
||||
QemuMutex qemu_file_lock;
|
||||
|
||||
/* bytes already send at the beggining of current interation */
|
||||
uint64_t iteration_initial_bytes;
|
||||
@ -129,6 +142,7 @@ struct MigrationState
|
||||
QEMUFile *from_dst_file;
|
||||
QemuThread rp_thread;
|
||||
bool error;
|
||||
QemuSemaphore rp_sem;
|
||||
} rp_state;
|
||||
|
||||
double mbps;
|
||||
@ -194,12 +208,17 @@ struct MigrationState
|
||||
bool send_configuration;
|
||||
/* Whether we send section footer during migration */
|
||||
bool send_section_footer;
|
||||
|
||||
/* Needed by postcopy-pause state */
|
||||
QemuSemaphore postcopy_pause_sem;
|
||||
QemuSemaphore postcopy_pause_rp_sem;
|
||||
};
|
||||
|
||||
void migrate_set_state(int *state, int old_state, int new_state);
|
||||
|
||||
void migration_fd_process_incoming(QEMUFile *f);
|
||||
void migration_ioc_process_incoming(QIOChannel *ioc);
|
||||
void migration_incoming_process(void);
|
||||
|
||||
bool migration_has_all_channels(void);
|
||||
|
||||
@ -251,6 +270,9 @@ void migrate_send_rp_pong(MigrationIncomingState *mis,
|
||||
uint32_t value);
|
||||
int migrate_send_rp_req_pages(MigrationIncomingState *mis, const char* rbname,
|
||||
ram_addr_t start, size_t len);
|
||||
void migrate_send_rp_recv_bitmap(MigrationIncomingState *mis,
|
||||
char *block_name);
|
||||
void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value);
|
||||
|
||||
void dirty_bitmap_mig_before_vm_start(void);
|
||||
void init_dirty_bitmap_incoming_migration(void);
|
||||
|
@ -830,6 +830,17 @@ static void mark_postcopy_blocktime_end(uintptr_t addr)
|
||||
affected_cpu);
|
||||
}
|
||||
|
||||
static bool postcopy_pause_fault_thread(MigrationIncomingState *mis)
|
||||
{
|
||||
trace_postcopy_pause_fault_thread();
|
||||
|
||||
qemu_sem_wait(&mis->postcopy_pause_sem_fault);
|
||||
|
||||
trace_postcopy_pause_fault_thread_continued();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle faults detected by the USERFAULT markings
|
||||
*/
|
||||
@ -880,6 +891,22 @@ static void *postcopy_ram_fault_thread(void *opaque)
|
||||
break;
|
||||
}
|
||||
|
||||
if (!mis->to_src_file) {
|
||||
/*
|
||||
* Possibly someone tells us that the return path is
|
||||
* broken already using the event. We should hold until
|
||||
* the channel is rebuilt.
|
||||
*/
|
||||
if (postcopy_pause_fault_thread(mis)) {
|
||||
mis->last_rb = NULL;
|
||||
/* Continue to read the userfaultfd */
|
||||
} else {
|
||||
error_report("%s: paused but don't allow to continue",
|
||||
__func__);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (pfd[1].revents) {
|
||||
uint64_t tmp64 = 0;
|
||||
|
||||
@ -942,18 +969,37 @@ static void *postcopy_ram_fault_thread(void *opaque)
|
||||
(uintptr_t)(msg.arg.pagefault.address),
|
||||
msg.arg.pagefault.feat.ptid, rb);
|
||||
|
||||
retry:
|
||||
/*
|
||||
* Send the request to the source - we want to request one
|
||||
* of our host page sizes (which is >= TPS)
|
||||
*/
|
||||
if (rb != mis->last_rb) {
|
||||
mis->last_rb = rb;
|
||||
migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
|
||||
rb_offset, qemu_ram_pagesize(rb));
|
||||
ret = migrate_send_rp_req_pages(mis,
|
||||
qemu_ram_get_idstr(rb),
|
||||
rb_offset,
|
||||
qemu_ram_pagesize(rb));
|
||||
} else {
|
||||
/* Save some space */
|
||||
migrate_send_rp_req_pages(mis, NULL,
|
||||
rb_offset, qemu_ram_pagesize(rb));
|
||||
ret = migrate_send_rp_req_pages(mis,
|
||||
NULL,
|
||||
rb_offset,
|
||||
qemu_ram_pagesize(rb));
|
||||
}
|
||||
|
||||
if (ret) {
|
||||
/* May be network failure, try to wait for recovery */
|
||||
if (ret == -EIO && postcopy_pause_fault_thread(mis)) {
|
||||
/* We got reconnected somehow, try to continue */
|
||||
mis->last_rb = NULL;
|
||||
goto retry;
|
||||
} else {
|
||||
/* This is a unavoidable fault */
|
||||
error_report("%s: migrate_send_rp_req_pages() get %d",
|
||||
__func__, ret);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
500
migration/ram.c
500
migration/ram.c
@ -36,6 +36,7 @@
|
||||
#include "xbzrle.h"
|
||||
#include "ram.h"
|
||||
#include "migration.h"
|
||||
#include "socket.h"
|
||||
#include "migration/register.h"
|
||||
#include "migration/misc.h"
|
||||
#include "qemu-file.h"
|
||||
@ -51,6 +52,9 @@
|
||||
#include "qemu/rcu_queue.h"
|
||||
#include "migration/colo.h"
|
||||
#include "migration/block.h"
|
||||
#include "sysemu/sysemu.h"
|
||||
#include "qemu/uuid.h"
|
||||
#include "savevm.h"
|
||||
|
||||
/***********************************************************/
|
||||
/* ram save/restore */
|
||||
@ -187,6 +191,70 @@ void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
|
||||
nr);
|
||||
}
|
||||
|
||||
#define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
|
||||
|
||||
/*
|
||||
* Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
|
||||
*
|
||||
* Returns >0 if success with sent bytes, or <0 if error.
|
||||
*/
|
||||
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
|
||||
const char *block_name)
|
||||
{
|
||||
RAMBlock *block = qemu_ram_block_by_name(block_name);
|
||||
unsigned long *le_bitmap, nbits;
|
||||
uint64_t size;
|
||||
|
||||
if (!block) {
|
||||
error_report("%s: invalid block name: %s", __func__, block_name);
|
||||
return -1;
|
||||
}
|
||||
|
||||
nbits = block->used_length >> TARGET_PAGE_BITS;
|
||||
|
||||
/*
|
||||
* Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
|
||||
* machines we may need 4 more bytes for padding (see below
|
||||
* comment). So extend it a bit before hand.
|
||||
*/
|
||||
le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
|
||||
|
||||
/*
|
||||
* Always use little endian when sending the bitmap. This is
|
||||
* required that when source and destination VMs are not using the
|
||||
* same endianess. (Note: big endian won't work.)
|
||||
*/
|
||||
bitmap_to_le(le_bitmap, block->receivedmap, nbits);
|
||||
|
||||
/* Size of the bitmap, in bytes */
|
||||
size = nbits / 8;
|
||||
|
||||
/*
|
||||
* size is always aligned to 8 bytes for 64bit machines, but it
|
||||
* may not be true for 32bit machines. We need this padding to
|
||||
* make sure the migration can survive even between 32bit and
|
||||
* 64bit machines.
|
||||
*/
|
||||
size = ROUND_UP(size, 8);
|
||||
|
||||
qemu_put_be64(file, size);
|
||||
qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
|
||||
/*
|
||||
* Mark as an end, in case the middle part is screwed up due to
|
||||
* some "misterious" reason.
|
||||
*/
|
||||
qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
|
||||
qemu_fflush(file);
|
||||
|
||||
free(le_bitmap);
|
||||
|
||||
if (qemu_file_get_error(file)) {
|
||||
return qemu_file_get_error(file);
|
||||
}
|
||||
|
||||
return size + sizeof(size);
|
||||
}
|
||||
|
||||
/*
|
||||
* An outstanding page request, on the source, having been received
|
||||
* and queued
|
||||
@ -432,15 +500,117 @@ exit:
|
||||
|
||||
/* Multiple fd's */
|
||||
|
||||
struct MultiFDSendParams {
|
||||
#define MULTIFD_MAGIC 0x11223344U
|
||||
#define MULTIFD_VERSION 1
|
||||
|
||||
typedef struct {
|
||||
uint32_t magic;
|
||||
uint32_t version;
|
||||
unsigned char uuid[16]; /* QemuUUID */
|
||||
uint8_t id;
|
||||
} __attribute__((packed)) MultiFDInit_t;
|
||||
|
||||
typedef struct {
|
||||
/* this fields are not changed once the thread is created */
|
||||
/* channel number */
|
||||
uint8_t id;
|
||||
/* channel thread name */
|
||||
char *name;
|
||||
/* channel thread id */
|
||||
QemuThread thread;
|
||||
/* communication channel */
|
||||
QIOChannel *c;
|
||||
/* sem where to wait for more work */
|
||||
QemuSemaphore sem;
|
||||
/* this mutex protects the following parameters */
|
||||
QemuMutex mutex;
|
||||
/* is this channel thread running */
|
||||
bool running;
|
||||
/* should this thread finish */
|
||||
bool quit;
|
||||
};
|
||||
typedef struct MultiFDSendParams MultiFDSendParams;
|
||||
} MultiFDSendParams;
|
||||
|
||||
typedef struct {
|
||||
/* this fields are not changed once the thread is created */
|
||||
/* channel number */
|
||||
uint8_t id;
|
||||
/* channel thread name */
|
||||
char *name;
|
||||
/* channel thread id */
|
||||
QemuThread thread;
|
||||
/* communication channel */
|
||||
QIOChannel *c;
|
||||
/* sem where to wait for more work */
|
||||
QemuSemaphore sem;
|
||||
/* this mutex protects the following parameters */
|
||||
QemuMutex mutex;
|
||||
/* is this channel thread running */
|
||||
bool running;
|
||||
/* should this thread finish */
|
||||
bool quit;
|
||||
} MultiFDRecvParams;
|
||||
|
||||
static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
|
||||
{
|
||||
MultiFDInit_t msg;
|
||||
int ret;
|
||||
|
||||
msg.magic = cpu_to_be32(MULTIFD_MAGIC);
|
||||
msg.version = cpu_to_be32(MULTIFD_VERSION);
|
||||
msg.id = p->id;
|
||||
memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
|
||||
|
||||
ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
|
||||
if (ret != 0) {
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
|
||||
{
|
||||
MultiFDInit_t msg;
|
||||
int ret;
|
||||
|
||||
ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
|
||||
if (ret != 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
be32_to_cpus(&msg.magic);
|
||||
be32_to_cpus(&msg.version);
|
||||
|
||||
if (msg.magic != MULTIFD_MAGIC) {
|
||||
error_setg(errp, "multifd: received packet magic %x "
|
||||
"expected %x", msg.magic, MULTIFD_MAGIC);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (msg.version != MULTIFD_VERSION) {
|
||||
error_setg(errp, "multifd: received packet version %d "
|
||||
"expected %d", msg.version, MULTIFD_VERSION);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
|
||||
char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
|
||||
char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
|
||||
|
||||
error_setg(errp, "multifd: received uuid '%s' and expected "
|
||||
"uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
|
||||
g_free(uuid);
|
||||
g_free(msg_uuid);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (msg.id > migrate_multifd_channels()) {
|
||||
error_setg(errp, "multifd: received channel version %d "
|
||||
"expected %d", msg.version, MULTIFD_VERSION);
|
||||
return -1;
|
||||
}
|
||||
|
||||
return msg.id;
|
||||
}
|
||||
|
||||
struct {
|
||||
MultiFDSendParams *params;
|
||||
@ -448,11 +618,23 @@ struct {
|
||||
int count;
|
||||
} *multifd_send_state;
|
||||
|
||||
static void terminate_multifd_send_threads(Error *errp)
|
||||
static void multifd_send_terminate_threads(Error *err)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < multifd_send_state->count; i++) {
|
||||
if (err) {
|
||||
MigrationState *s = migrate_get_current();
|
||||
migrate_set_error(s, err);
|
||||
if (s->state == MIGRATION_STATUS_SETUP ||
|
||||
s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
|
||||
s->state == MIGRATION_STATUS_DEVICE ||
|
||||
s->state == MIGRATION_STATUS_ACTIVE) {
|
||||
migrate_set_state(&s->state, s->state,
|
||||
MIGRATION_STATUS_FAILED);
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < migrate_multifd_channels(); i++) {
|
||||
MultiFDSendParams *p = &multifd_send_state->params[i];
|
||||
|
||||
qemu_mutex_lock(&p->mutex);
|
||||
@ -470,11 +652,15 @@ int multifd_save_cleanup(Error **errp)
|
||||
if (!migrate_use_multifd()) {
|
||||
return 0;
|
||||
}
|
||||
terminate_multifd_send_threads(NULL);
|
||||
for (i = 0; i < multifd_send_state->count; i++) {
|
||||
multifd_send_terminate_threads(NULL);
|
||||
for (i = 0; i < migrate_multifd_channels(); i++) {
|
||||
MultiFDSendParams *p = &multifd_send_state->params[i];
|
||||
|
||||
qemu_thread_join(&p->thread);
|
||||
if (p->running) {
|
||||
qemu_thread_join(&p->thread);
|
||||
}
|
||||
socket_send_channel_destroy(p->c);
|
||||
p->c = NULL;
|
||||
qemu_mutex_destroy(&p->mutex);
|
||||
qemu_sem_destroy(&p->sem);
|
||||
g_free(p->name);
|
||||
@ -490,6 +676,11 @@ int multifd_save_cleanup(Error **errp)
|
||||
static void *multifd_send_thread(void *opaque)
|
||||
{
|
||||
MultiFDSendParams *p = opaque;
|
||||
Error *local_err = NULL;
|
||||
|
||||
if (multifd_send_initial_packet(p, &local_err) < 0) {
|
||||
goto out;
|
||||
}
|
||||
|
||||
while (true) {
|
||||
qemu_mutex_lock(&p->mutex);
|
||||
@ -501,9 +692,39 @@ static void *multifd_send_thread(void *opaque)
|
||||
qemu_sem_wait(&p->sem);
|
||||
}
|
||||
|
||||
out:
|
||||
if (local_err) {
|
||||
multifd_send_terminate_threads(local_err);
|
||||
}
|
||||
|
||||
qemu_mutex_lock(&p->mutex);
|
||||
p->running = false;
|
||||
qemu_mutex_unlock(&p->mutex);
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
|
||||
{
|
||||
MultiFDSendParams *p = opaque;
|
||||
QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
|
||||
Error *local_err = NULL;
|
||||
|
||||
if (qio_task_propagate_error(task, &local_err)) {
|
||||
if (multifd_save_cleanup(&local_err) != 0) {
|
||||
migrate_set_error(migrate_get_current(), local_err);
|
||||
}
|
||||
} else {
|
||||
p->c = QIO_CHANNEL(sioc);
|
||||
qio_channel_set_delay(p->c, false);
|
||||
p->running = true;
|
||||
qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
|
||||
QEMU_THREAD_JOINABLE);
|
||||
|
||||
atomic_inc(&multifd_send_state->count);
|
||||
}
|
||||
}
|
||||
|
||||
int multifd_save_setup(void)
|
||||
{
|
||||
int thread_count;
|
||||
@ -515,7 +736,7 @@ int multifd_save_setup(void)
|
||||
thread_count = migrate_multifd_channels();
|
||||
multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
|
||||
multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
|
||||
multifd_send_state->count = 0;
|
||||
atomic_set(&multifd_send_state->count, 0);
|
||||
for (i = 0; i < thread_count; i++) {
|
||||
MultiFDSendParams *p = &multifd_send_state->params[i];
|
||||
|
||||
@ -524,35 +745,32 @@ int multifd_save_setup(void)
|
||||
p->quit = false;
|
||||
p->id = i;
|
||||
p->name = g_strdup_printf("multifdsend_%d", i);
|
||||
qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
|
||||
QEMU_THREAD_JOINABLE);
|
||||
|
||||
multifd_send_state->count++;
|
||||
socket_send_channel_create(multifd_new_send_channel_async, p);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct MultiFDRecvParams {
|
||||
uint8_t id;
|
||||
char *name;
|
||||
QemuThread thread;
|
||||
QemuSemaphore sem;
|
||||
QemuMutex mutex;
|
||||
bool quit;
|
||||
};
|
||||
typedef struct MultiFDRecvParams MultiFDRecvParams;
|
||||
|
||||
struct {
|
||||
MultiFDRecvParams *params;
|
||||
/* number of created threads */
|
||||
int count;
|
||||
} *multifd_recv_state;
|
||||
|
||||
static void terminate_multifd_recv_threads(Error *errp)
|
||||
static void multifd_recv_terminate_threads(Error *err)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < multifd_recv_state->count; i++) {
|
||||
if (err) {
|
||||
MigrationState *s = migrate_get_current();
|
||||
migrate_set_error(s, err);
|
||||
if (s->state == MIGRATION_STATUS_SETUP ||
|
||||
s->state == MIGRATION_STATUS_ACTIVE) {
|
||||
migrate_set_state(&s->state, s->state,
|
||||
MIGRATION_STATUS_FAILED);
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < migrate_multifd_channels(); i++) {
|
||||
MultiFDRecvParams *p = &multifd_recv_state->params[i];
|
||||
|
||||
qemu_mutex_lock(&p->mutex);
|
||||
@ -570,11 +788,15 @@ int multifd_load_cleanup(Error **errp)
|
||||
if (!migrate_use_multifd()) {
|
||||
return 0;
|
||||
}
|
||||
terminate_multifd_recv_threads(NULL);
|
||||
for (i = 0; i < multifd_recv_state->count; i++) {
|
||||
multifd_recv_terminate_threads(NULL);
|
||||
for (i = 0; i < migrate_multifd_channels(); i++) {
|
||||
MultiFDRecvParams *p = &multifd_recv_state->params[i];
|
||||
|
||||
qemu_thread_join(&p->thread);
|
||||
if (p->running) {
|
||||
qemu_thread_join(&p->thread);
|
||||
}
|
||||
object_unref(OBJECT(p->c));
|
||||
p->c = NULL;
|
||||
qemu_mutex_destroy(&p->mutex);
|
||||
qemu_sem_destroy(&p->sem);
|
||||
g_free(p->name);
|
||||
@ -602,6 +824,10 @@ static void *multifd_recv_thread(void *opaque)
|
||||
qemu_sem_wait(&p->sem);
|
||||
}
|
||||
|
||||
qemu_mutex_lock(&p->mutex);
|
||||
p->running = false;
|
||||
qemu_mutex_unlock(&p->mutex);
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -616,7 +842,7 @@ int multifd_load_setup(void)
|
||||
thread_count = migrate_multifd_channels();
|
||||
multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
|
||||
multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
|
||||
multifd_recv_state->count = 0;
|
||||
atomic_set(&multifd_recv_state->count, 0);
|
||||
for (i = 0; i < thread_count; i++) {
|
||||
MultiFDRecvParams *p = &multifd_recv_state->params[i];
|
||||
|
||||
@ -625,13 +851,52 @@ int multifd_load_setup(void)
|
||||
p->quit = false;
|
||||
p->id = i;
|
||||
p->name = g_strdup_printf("multifdrecv_%d", i);
|
||||
qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
|
||||
QEMU_THREAD_JOINABLE);
|
||||
multifd_recv_state->count++;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool multifd_recv_all_channels_created(void)
|
||||
{
|
||||
int thread_count = migrate_multifd_channels();
|
||||
|
||||
if (!migrate_use_multifd()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return thread_count == atomic_read(&multifd_recv_state->count);
|
||||
}
|
||||
|
||||
void multifd_recv_new_channel(QIOChannel *ioc)
|
||||
{
|
||||
MultiFDRecvParams *p;
|
||||
Error *local_err = NULL;
|
||||
int id;
|
||||
|
||||
id = multifd_recv_initial_packet(ioc, &local_err);
|
||||
if (id < 0) {
|
||||
multifd_recv_terminate_threads(local_err);
|
||||
return;
|
||||
}
|
||||
|
||||
p = &multifd_recv_state->params[id];
|
||||
if (p->c != NULL) {
|
||||
error_setg(&local_err, "multifd: received id '%d' already setup'",
|
||||
id);
|
||||
multifd_recv_terminate_threads(local_err);
|
||||
return;
|
||||
}
|
||||
p->c = ioc;
|
||||
object_ref(OBJECT(ioc));
|
||||
|
||||
p->running = true;
|
||||
qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
|
||||
QEMU_THREAD_JOINABLE);
|
||||
atomic_inc(&multifd_recv_state->count);
|
||||
if (multifd_recv_state->count == migrate_multifd_channels()) {
|
||||
migration_incoming_process();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* save_page_header: write page header to wire
|
||||
*
|
||||
@ -1490,7 +1755,7 @@ static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
|
||||
* CPU resource.
|
||||
*/
|
||||
if (block == rs->last_sent_block && save_page_use_compression(rs)) {
|
||||
res = compress_page_with_multi_thread(rs, block, offset);
|
||||
return compress_page_with_multi_thread(rs, block, offset);
|
||||
}
|
||||
|
||||
return ram_save_page(rs, pss, last_stage);
|
||||
@ -2226,6 +2491,41 @@ static int ram_init_all(RAMState **rsp)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
|
||||
{
|
||||
RAMBlock *block;
|
||||
uint64_t pages = 0;
|
||||
|
||||
/*
|
||||
* Postcopy is not using xbzrle/compression, so no need for that.
|
||||
* Also, since source are already halted, we don't need to care
|
||||
* about dirty page logging as well.
|
||||
*/
|
||||
|
||||
RAMBLOCK_FOREACH(block) {
|
||||
pages += bitmap_count_one(block->bmap,
|
||||
block->used_length >> TARGET_PAGE_BITS);
|
||||
}
|
||||
|
||||
/* This may not be aligned with current bitmaps. Recalculate. */
|
||||
rs->migration_dirty_pages = pages;
|
||||
|
||||
rs->last_seen_block = NULL;
|
||||
rs->last_sent_block = NULL;
|
||||
rs->last_page = 0;
|
||||
rs->last_version = ram_list.version;
|
||||
/*
|
||||
* Disable the bulk stage, otherwise we'll resend the whole RAM no
|
||||
* matter what we have sent.
|
||||
*/
|
||||
rs->ram_bulk_stage = false;
|
||||
|
||||
/* Update RAMState cache of output QEMUFile */
|
||||
rs->f = out;
|
||||
|
||||
trace_ram_state_resume_prepare(pages);
|
||||
}
|
||||
|
||||
/*
|
||||
* Each of ram_save_setup, ram_save_iterate and ram_save_complete has
|
||||
* long-running RCU critical section. When rcu-reclaims in the code
|
||||
@ -3100,6 +3400,139 @@ static bool ram_has_postcopy(void *opaque)
|
||||
return migrate_postcopy_ram();
|
||||
}
|
||||
|
||||
/* Sync all the dirty bitmap with destination VM. */
|
||||
static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
|
||||
{
|
||||
RAMBlock *block;
|
||||
QEMUFile *file = s->to_dst_file;
|
||||
int ramblock_count = 0;
|
||||
|
||||
trace_ram_dirty_bitmap_sync_start();
|
||||
|
||||
RAMBLOCK_FOREACH(block) {
|
||||
qemu_savevm_send_recv_bitmap(file, block->idstr);
|
||||
trace_ram_dirty_bitmap_request(block->idstr);
|
||||
ramblock_count++;
|
||||
}
|
||||
|
||||
trace_ram_dirty_bitmap_sync_wait();
|
||||
|
||||
/* Wait until all the ramblocks' dirty bitmap synced */
|
||||
while (ramblock_count--) {
|
||||
qemu_sem_wait(&s->rp_state.rp_sem);
|
||||
}
|
||||
|
||||
trace_ram_dirty_bitmap_sync_complete();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void ram_dirty_bitmap_reload_notify(MigrationState *s)
|
||||
{
|
||||
qemu_sem_post(&s->rp_state.rp_sem);
|
||||
}
|
||||
|
||||
/*
|
||||
* Read the received bitmap, revert it as the initial dirty bitmap.
|
||||
* This is only used when the postcopy migration is paused but wants
|
||||
* to resume from a middle point.
|
||||
*/
|
||||
int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
|
||||
{
|
||||
int ret = -EINVAL;
|
||||
QEMUFile *file = s->rp_state.from_dst_file;
|
||||
unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
|
||||
uint64_t local_size = nbits / 8;
|
||||
uint64_t size, end_mark;
|
||||
|
||||
trace_ram_dirty_bitmap_reload_begin(block->idstr);
|
||||
|
||||
if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
|
||||
error_report("%s: incorrect state %s", __func__,
|
||||
MigrationStatus_str(s->state));
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Note: see comments in ramblock_recv_bitmap_send() on why we
|
||||
* need the endianess convertion, and the paddings.
|
||||
*/
|
||||
local_size = ROUND_UP(local_size, 8);
|
||||
|
||||
/* Add paddings */
|
||||
le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
|
||||
|
||||
size = qemu_get_be64(file);
|
||||
|
||||
/* The size of the bitmap should match with our ramblock */
|
||||
if (size != local_size) {
|
||||
error_report("%s: ramblock '%s' bitmap size mismatch "
|
||||
"(0x%"PRIx64" != 0x%"PRIx64")", __func__,
|
||||
block->idstr, size, local_size);
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
|
||||
end_mark = qemu_get_be64(file);
|
||||
|
||||
ret = qemu_file_get_error(file);
|
||||
if (ret || size != local_size) {
|
||||
error_report("%s: read bitmap failed for ramblock '%s': %d"
|
||||
" (size 0x%"PRIx64", got: 0x%"PRIx64")",
|
||||
__func__, block->idstr, ret, local_size, size);
|
||||
ret = -EIO;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
|
||||
error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIu64,
|
||||
__func__, block->idstr, end_mark);
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* Endianess convertion. We are during postcopy (though paused).
|
||||
* The dirty bitmap won't change. We can directly modify it.
|
||||
*/
|
||||
bitmap_from_le(block->bmap, le_bitmap, nbits);
|
||||
|
||||
/*
|
||||
* What we received is "received bitmap". Revert it as the initial
|
||||
* dirty bitmap for this ramblock.
|
||||
*/
|
||||
bitmap_complement(block->bmap, block->bmap, nbits);
|
||||
|
||||
trace_ram_dirty_bitmap_reload_complete(block->idstr);
|
||||
|
||||
/*
|
||||
* We succeeded to sync bitmap for current ramblock. If this is
|
||||
* the last one to sync, we need to notify the main send thread.
|
||||
*/
|
||||
ram_dirty_bitmap_reload_notify(s);
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
free(le_bitmap);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int ram_resume_prepare(MigrationState *s, void *opaque)
|
||||
{
|
||||
RAMState *rs = *(RAMState **)opaque;
|
||||
int ret;
|
||||
|
||||
ret = ram_dirty_bitmap_sync_all(s, rs);
|
||||
if (ret) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
ram_state_resume_prepare(rs, s->to_dst_file);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static SaveVMHandlers savevm_ram_handlers = {
|
||||
.save_setup = ram_save_setup,
|
||||
.save_live_iterate = ram_save_iterate,
|
||||
@ -3111,6 +3544,7 @@ static SaveVMHandlers savevm_ram_handlers = {
|
||||
.save_cleanup = ram_save_cleanup,
|
||||
.load_setup = ram_load_setup,
|
||||
.load_cleanup = ram_load_cleanup,
|
||||
.resume_prepare = ram_resume_prepare,
|
||||
};
|
||||
|
||||
void ram_mig_init(void)
|
||||
|
@ -32,6 +32,7 @@
|
||||
#include "qemu-common.h"
|
||||
#include "qapi/qapi-types-migration.h"
|
||||
#include "exec/cpu-common.h"
|
||||
#include "io/channel.h"
|
||||
|
||||
extern MigrationStats ram_counters;
|
||||
extern XBZRLECacheStats xbzrle_counters;
|
||||
@ -44,6 +45,8 @@ int multifd_save_setup(void);
|
||||
int multifd_save_cleanup(Error **errp);
|
||||
int multifd_load_setup(void);
|
||||
int multifd_load_cleanup(Error **errp);
|
||||
bool multifd_recv_all_channels_created(void);
|
||||
void multifd_recv_new_channel(QIOChannel *ioc);
|
||||
|
||||
uint64_t ram_pagesize_summary(void);
|
||||
int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len);
|
||||
@ -63,5 +66,8 @@ int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr);
|
||||
bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset);
|
||||
void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr);
|
||||
void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, size_t nr);
|
||||
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
|
||||
const char *block_name);
|
||||
int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *rb);
|
||||
|
||||
#endif
|
||||
|
@ -708,6 +708,9 @@ static int rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block)
|
||||
memcpy(local->block + block->index, old + (block->index + 1),
|
||||
sizeof(RDMALocalBlock) *
|
||||
(local->nb_blocks - (block->index + 1)));
|
||||
for (x = block->index; x < local->nb_blocks - 1; x++) {
|
||||
local->block[x].index--;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
assert(block == local->block);
|
||||
@ -3246,6 +3249,10 @@ static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque)
|
||||
qsort(rdma->local_ram_blocks.block,
|
||||
rdma->local_ram_blocks.nb_blocks,
|
||||
sizeof(RDMALocalBlock), dest_ram_sort_func);
|
||||
for (i = 0; i < local->nb_blocks; i++) {
|
||||
local->block[i].index = i;
|
||||
}
|
||||
|
||||
if (rdma->pin_all) {
|
||||
ret = qemu_rdma_reg_whole_ram_blocks(rdma);
|
||||
if (ret) {
|
||||
|
@ -80,7 +80,9 @@ enum qemu_vm_cmd {
|
||||
MIG_CMD_POSTCOPY_RAM_DISCARD, /* A list of pages to discard that
|
||||
were previously sent during
|
||||
precopy but are dirty. */
|
||||
MIG_CMD_POSTCOPY_RESUME, /* resume postcopy on dest */
|
||||
MIG_CMD_PACKAGED, /* Send a wrapped stream within this stream */
|
||||
MIG_CMD_RECV_BITMAP, /* Request for recved bitmap on dst */
|
||||
MIG_CMD_MAX
|
||||
};
|
||||
|
||||
@ -97,7 +99,9 @@ static struct mig_cmd_args {
|
||||
[MIG_CMD_POSTCOPY_RUN] = { .len = 0, .name = "POSTCOPY_RUN" },
|
||||
[MIG_CMD_POSTCOPY_RAM_DISCARD] = {
|
||||
.len = -1, .name = "POSTCOPY_RAM_DISCARD" },
|
||||
[MIG_CMD_POSTCOPY_RESUME] = { .len = 0, .name = "POSTCOPY_RESUME" },
|
||||
[MIG_CMD_PACKAGED] = { .len = 4, .name = "PACKAGED" },
|
||||
[MIG_CMD_RECV_BITMAP] = { .len = -1, .name = "RECV_BITMAP" },
|
||||
[MIG_CMD_MAX] = { .len = -1, .name = "MAX" },
|
||||
};
|
||||
|
||||
@ -956,6 +960,25 @@ void qemu_savevm_send_postcopy_run(QEMUFile *f)
|
||||
qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RUN, 0, NULL);
|
||||
}
|
||||
|
||||
void qemu_savevm_send_postcopy_resume(QEMUFile *f)
|
||||
{
|
||||
trace_savevm_send_postcopy_resume();
|
||||
qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RESUME, 0, NULL);
|
||||
}
|
||||
|
||||
void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name)
|
||||
{
|
||||
size_t len;
|
||||
char buf[256];
|
||||
|
||||
trace_savevm_send_recv_bitmap(block_name);
|
||||
|
||||
buf[0] = len = strlen(block_name);
|
||||
memcpy(buf + 1, block_name, len);
|
||||
|
||||
qemu_savevm_command_send(f, MIG_CMD_RECV_BITMAP, len + 1, (uint8_t *)buf);
|
||||
}
|
||||
|
||||
bool qemu_savevm_state_blocked(Error **errp)
|
||||
{
|
||||
SaveStateEntry *se;
|
||||
@ -1008,6 +1031,31 @@ void qemu_savevm_state_setup(QEMUFile *f)
|
||||
}
|
||||
}
|
||||
|
||||
int qemu_savevm_state_resume_prepare(MigrationState *s)
|
||||
{
|
||||
SaveStateEntry *se;
|
||||
int ret;
|
||||
|
||||
trace_savevm_state_resume_prepare();
|
||||
|
||||
QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
|
||||
if (!se->ops || !se->ops->resume_prepare) {
|
||||
continue;
|
||||
}
|
||||
if (se->ops && se->ops->is_active) {
|
||||
if (!se->ops->is_active(se->opaque)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
ret = se->ops->resume_prepare(s, se->opaque);
|
||||
if (ret < 0) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* this function has three return values:
|
||||
* negative: there was one error, and we have -errno.
|
||||
@ -1564,8 +1612,8 @@ static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
|
||||
*/
|
||||
static void *postcopy_ram_listen_thread(void *opaque)
|
||||
{
|
||||
QEMUFile *f = opaque;
|
||||
MigrationIncomingState *mis = migration_incoming_get_current();
|
||||
QEMUFile *f = mis->from_src_file;
|
||||
int load_res;
|
||||
|
||||
migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
|
||||
@ -1579,6 +1627,14 @@ static void *postcopy_ram_listen_thread(void *opaque)
|
||||
*/
|
||||
qemu_file_set_blocking(f, true);
|
||||
load_res = qemu_loadvm_state_main(f, mis);
|
||||
|
||||
/*
|
||||
* This is tricky, but, mis->from_src_file can change after it
|
||||
* returns, when postcopy recovery happened. In the future, we may
|
||||
* want a wrapper for the QEMUFile handle.
|
||||
*/
|
||||
f = mis->from_src_file;
|
||||
|
||||
/* And non-blocking again so we don't block in any cleanup */
|
||||
qemu_file_set_blocking(f, false);
|
||||
|
||||
@ -1668,7 +1724,7 @@ static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
|
||||
/* Start up the listening thread and wait for it to signal ready */
|
||||
qemu_sem_init(&mis->listen_thread_sem, 0);
|
||||
qemu_thread_create(&mis->listen_thread, "postcopy/listen",
|
||||
postcopy_ram_listen_thread, mis->from_src_file,
|
||||
postcopy_ram_listen_thread, NULL,
|
||||
QEMU_THREAD_DETACHED);
|
||||
qemu_sem_wait(&mis->listen_thread_sem);
|
||||
qemu_sem_destroy(&mis->listen_thread_sem);
|
||||
@ -1745,6 +1801,31 @@ static int loadvm_postcopy_handle_run(MigrationIncomingState *mis)
|
||||
return LOADVM_QUIT;
|
||||
}
|
||||
|
||||
static int loadvm_postcopy_handle_resume(MigrationIncomingState *mis)
|
||||
{
|
||||
if (mis->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
|
||||
error_report("%s: illegal resume received", __func__);
|
||||
/* Don't fail the load, only for this. */
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* This means source VM is ready to resume the postcopy migration.
|
||||
* It's time to switch state and release the fault thread to
|
||||
* continue service page faults.
|
||||
*/
|
||||
migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_RECOVER,
|
||||
MIGRATION_STATUS_POSTCOPY_ACTIVE);
|
||||
qemu_sem_post(&mis->postcopy_pause_sem_fault);
|
||||
|
||||
trace_loadvm_postcopy_handle_resume();
|
||||
|
||||
/* Tell source that "we are ready" */
|
||||
migrate_send_rp_resume_ack(mis, MIGRATION_RESUME_ACK_VALUE);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Immediately following this command is a blob of data containing an embedded
|
||||
* chunk of migration stream; read it and load it.
|
||||
@ -1793,6 +1874,49 @@ static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis)
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle request that source requests for recved_bitmap on
|
||||
* destination. Payload format:
|
||||
*
|
||||
* len (1 byte) + ramblock_name (<255 bytes)
|
||||
*/
|
||||
static int loadvm_handle_recv_bitmap(MigrationIncomingState *mis,
|
||||
uint16_t len)
|
||||
{
|
||||
QEMUFile *file = mis->from_src_file;
|
||||
RAMBlock *rb;
|
||||
char block_name[256];
|
||||
size_t cnt;
|
||||
|
||||
cnt = qemu_get_counted_string(file, block_name);
|
||||
if (!cnt) {
|
||||
error_report("%s: failed to read block name", __func__);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* Validate before using the data */
|
||||
if (qemu_file_get_error(file)) {
|
||||
return qemu_file_get_error(file);
|
||||
}
|
||||
|
||||
if (len != cnt + 1) {
|
||||
error_report("%s: invalid payload length (%d)", __func__, len);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
rb = qemu_ram_block_by_name(block_name);
|
||||
if (!rb) {
|
||||
error_report("%s: block '%s' not found", __func__, block_name);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
migrate_send_rp_recv_bitmap(mis, block_name);
|
||||
|
||||
trace_loadvm_handle_recv_bitmap(block_name);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Process an incoming 'QEMU_VM_COMMAND'
|
||||
* 0 just a normal return
|
||||
@ -1866,6 +1990,12 @@ static int loadvm_process_command(QEMUFile *f)
|
||||
|
||||
case MIG_CMD_POSTCOPY_RAM_DISCARD:
|
||||
return loadvm_postcopy_ram_handle_discard(mis, len);
|
||||
|
||||
case MIG_CMD_POSTCOPY_RESUME:
|
||||
return loadvm_postcopy_handle_resume(mis);
|
||||
|
||||
case MIG_CMD_RECV_BITMAP:
|
||||
return loadvm_handle_recv_bitmap(mis, len);
|
||||
}
|
||||
|
||||
return 0;
|
||||
@ -2055,11 +2185,50 @@ void qemu_loadvm_state_cleanup(void)
|
||||
}
|
||||
}
|
||||
|
||||
/* Return true if we should continue the migration, or false. */
|
||||
static bool postcopy_pause_incoming(MigrationIncomingState *mis)
|
||||
{
|
||||
trace_postcopy_pause_incoming();
|
||||
|
||||
/* Clear the triggered bit to allow one recovery */
|
||||
mis->postcopy_recover_triggered = false;
|
||||
|
||||
migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
|
||||
MIGRATION_STATUS_POSTCOPY_PAUSED);
|
||||
|
||||
assert(mis->from_src_file);
|
||||
qemu_file_shutdown(mis->from_src_file);
|
||||
qemu_fclose(mis->from_src_file);
|
||||
mis->from_src_file = NULL;
|
||||
|
||||
assert(mis->to_src_file);
|
||||
qemu_file_shutdown(mis->to_src_file);
|
||||
qemu_mutex_lock(&mis->rp_mutex);
|
||||
qemu_fclose(mis->to_src_file);
|
||||
mis->to_src_file = NULL;
|
||||
qemu_mutex_unlock(&mis->rp_mutex);
|
||||
|
||||
/* Notify the fault thread for the invalidated file handle */
|
||||
postcopy_fault_thread_notify(mis);
|
||||
|
||||
error_report("Detected IO failure for postcopy. "
|
||||
"Migration paused.");
|
||||
|
||||
while (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
|
||||
qemu_sem_wait(&mis->postcopy_pause_sem_dst);
|
||||
}
|
||||
|
||||
trace_postcopy_pause_incoming_continued();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
|
||||
{
|
||||
uint8_t section_type;
|
||||
int ret = 0;
|
||||
|
||||
retry:
|
||||
while (true) {
|
||||
section_type = qemu_get_byte(f);
|
||||
|
||||
@ -2104,6 +2273,24 @@ static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
|
||||
out:
|
||||
if (ret < 0) {
|
||||
qemu_file_set_error(f, ret);
|
||||
|
||||
/*
|
||||
* Detect whether it is:
|
||||
*
|
||||
* 1. postcopy running (after receiving all device data, which
|
||||
* must be in POSTCOPY_INCOMING_RUNNING state. Note that
|
||||
* POSTCOPY_INCOMING_LISTENING is still not enough, it's
|
||||
* still receiving device states).
|
||||
* 2. network failure (-EIO)
|
||||
*
|
||||
* If so, we try to wait for a recovery.
|
||||
*/
|
||||
if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING &&
|
||||
ret == -EIO && postcopy_pause_incoming(mis)) {
|
||||
/* Reset f to point to the newly created channel */
|
||||
f = mis->from_src_file;
|
||||
goto retry;
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
@ -31,6 +31,7 @@
|
||||
|
||||
bool qemu_savevm_state_blocked(Error **errp);
|
||||
void qemu_savevm_state_setup(QEMUFile *f);
|
||||
int qemu_savevm_state_resume_prepare(MigrationState *s);
|
||||
void qemu_savevm_state_header(QEMUFile *f);
|
||||
int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy);
|
||||
void qemu_savevm_state_cleanup(void);
|
||||
@ -47,6 +48,8 @@ int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len);
|
||||
void qemu_savevm_send_postcopy_advise(QEMUFile *f);
|
||||
void qemu_savevm_send_postcopy_listen(QEMUFile *f);
|
||||
void qemu_savevm_send_postcopy_run(QEMUFile *f);
|
||||
void qemu_savevm_send_postcopy_resume(QEMUFile *f);
|
||||
void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name);
|
||||
|
||||
void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
|
||||
uint16_t len,
|
||||
|
@ -28,6 +28,28 @@
|
||||
#include "trace.h"
|
||||
|
||||
|
||||
struct SocketOutgoingArgs {
|
||||
SocketAddress *saddr;
|
||||
} outgoing_args;
|
||||
|
||||
void socket_send_channel_create(QIOTaskFunc f, void *data)
|
||||
{
|
||||
QIOChannelSocket *sioc = qio_channel_socket_new();
|
||||
qio_channel_socket_connect_async(sioc, outgoing_args.saddr,
|
||||
f, data, NULL, NULL);
|
||||
}
|
||||
|
||||
int socket_send_channel_destroy(QIOChannel *send)
|
||||
{
|
||||
/* Remove channel */
|
||||
object_unref(OBJECT(send));
|
||||
if (outgoing_args.saddr) {
|
||||
qapi_free_SocketAddress(outgoing_args.saddr);
|
||||
outgoing_args.saddr = NULL;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static SocketAddress *tcp_build_address(const char *host_port, Error **errp)
|
||||
{
|
||||
SocketAddress *saddr;
|
||||
@ -95,6 +117,11 @@ static void socket_start_outgoing_migration(MigrationState *s,
|
||||
struct SocketConnectData *data = g_new0(struct SocketConnectData, 1);
|
||||
|
||||
data->s = s;
|
||||
|
||||
/* in case previous migration leaked it */
|
||||
qapi_free_SocketAddress(outgoing_args.saddr);
|
||||
outgoing_args.saddr = saddr;
|
||||
|
||||
if (saddr->type == SOCKET_ADDRESS_TYPE_INET) {
|
||||
data->hostname = g_strdup(saddr->u.inet.host);
|
||||
}
|
||||
@ -106,7 +133,6 @@ static void socket_start_outgoing_migration(MigrationState *s,
|
||||
data,
|
||||
socket_connect_data_free,
|
||||
NULL);
|
||||
qapi_free_SocketAddress(saddr);
|
||||
}
|
||||
|
||||
void tcp_start_outgoing_migration(MigrationState *s,
|
||||
@ -144,6 +170,10 @@ static void socket_accept_incoming_migration(QIONetListener *listener,
|
||||
qio_net_listener_disconnect(listener);
|
||||
|
||||
object_unref(OBJECT(listener));
|
||||
|
||||
if (!migrate_use_multifd()) {
|
||||
migration_incoming_process();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -160,9 +190,10 @@ static void socket_start_incoming_migration(SocketAddress *saddr,
|
||||
return;
|
||||
}
|
||||
|
||||
qio_net_listener_set_client_func(listener,
|
||||
socket_accept_incoming_migration,
|
||||
NULL, NULL);
|
||||
qio_net_listener_set_client_func_full(listener,
|
||||
socket_accept_incoming_migration,
|
||||
NULL, NULL,
|
||||
g_main_context_get_thread_default());
|
||||
}
|
||||
|
||||
void tcp_start_incoming_migration(const char *host_port, Error **errp)
|
||||
|
@ -16,6 +16,13 @@
|
||||
|
||||
#ifndef QEMU_MIGRATION_SOCKET_H
|
||||
#define QEMU_MIGRATION_SOCKET_H
|
||||
|
||||
#include "io/channel.h"
|
||||
#include "io/task.h"
|
||||
|
||||
void socket_send_channel_create(QIOTaskFunc f, void *data);
|
||||
int socket_send_channel_destroy(QIOChannel *send);
|
||||
|
||||
void tcp_start_incoming_migration(const char *host_port, Error **errp);
|
||||
|
||||
void tcp_start_outgoing_migration(MigrationState *s, const char *host_port,
|
||||
|
@ -12,11 +12,13 @@ loadvm_state_cleanup(void) ""
|
||||
loadvm_handle_cmd_packaged(unsigned int length) "%u"
|
||||
loadvm_handle_cmd_packaged_main(int ret) "%d"
|
||||
loadvm_handle_cmd_packaged_received(int ret) "%d"
|
||||
loadvm_handle_recv_bitmap(char *s) "%s"
|
||||
loadvm_postcopy_handle_advise(void) ""
|
||||
loadvm_postcopy_handle_listen(void) ""
|
||||
loadvm_postcopy_handle_run(void) ""
|
||||
loadvm_postcopy_handle_run_cpu_sync(void) ""
|
||||
loadvm_postcopy_handle_run_vmstart(void) ""
|
||||
loadvm_postcopy_handle_resume(void) ""
|
||||
loadvm_postcopy_ram_handle_discard(void) ""
|
||||
loadvm_postcopy_ram_handle_discard_end(void) ""
|
||||
loadvm_postcopy_ram_handle_discard_header(const char *ramid, uint16_t len) "%s: %ud"
|
||||
@ -34,7 +36,10 @@ savevm_send_open_return_path(void) ""
|
||||
savevm_send_ping(uint32_t val) "0x%x"
|
||||
savevm_send_postcopy_listen(void) ""
|
||||
savevm_send_postcopy_run(void) ""
|
||||
savevm_send_postcopy_resume(void) ""
|
||||
savevm_send_recv_bitmap(char *name) "%s"
|
||||
savevm_state_setup(void) ""
|
||||
savevm_state_resume_prepare(void) ""
|
||||
savevm_state_header(void) ""
|
||||
savevm_state_iterate(void) ""
|
||||
savevm_state_cleanup(void) ""
|
||||
@ -77,6 +82,13 @@ ram_load_postcopy_loop(uint64_t addr, int flags) "@%" PRIx64 " %x"
|
||||
ram_postcopy_send_discard_bitmap(void) ""
|
||||
ram_save_page(const char *rbname, uint64_t offset, void *host) "%s: offset: 0x%" PRIx64 " host: %p"
|
||||
ram_save_queue_pages(const char *rbname, size_t start, size_t len) "%s: start: 0x%zx len: 0x%zx"
|
||||
ram_dirty_bitmap_request(char *str) "%s"
|
||||
ram_dirty_bitmap_reload_begin(char *str) "%s"
|
||||
ram_dirty_bitmap_reload_complete(char *str) "%s"
|
||||
ram_dirty_bitmap_sync_start(void) ""
|
||||
ram_dirty_bitmap_sync_wait(void) ""
|
||||
ram_dirty_bitmap_sync_complete(void) ""
|
||||
ram_state_resume_prepare(uint64_t v) "%" PRId64
|
||||
|
||||
# migration/migration.c
|
||||
await_return_path_close_on_source_close(void) ""
|
||||
@ -88,6 +100,7 @@ migrate_fd_cancel(void) ""
|
||||
migrate_handle_rp_req_pages(const char *rbname, size_t start, size_t len) "in %s at 0x%zx len 0x%zx"
|
||||
migrate_pending(uint64_t size, uint64_t max, uint64_t pre, uint64_t compat, uint64_t post) "pending size %" PRIu64 " max %" PRIu64 " (pre = %" PRIu64 " compat=%" PRIu64 " post=%" PRIu64 ")"
|
||||
migrate_send_rp_message(int msg_type, uint16_t len) "%d: len %d"
|
||||
migrate_send_rp_recv_bitmap(char *name, int64_t size) "block '%s' size 0x%"PRIi64
|
||||
migration_completion_file_err(void) ""
|
||||
migration_completion_postcopy_end(void) ""
|
||||
migration_completion_postcopy_end_after_complete(void) ""
|
||||
@ -99,6 +112,13 @@ migration_thread_setup_complete(void) ""
|
||||
open_return_path_on_source(void) ""
|
||||
open_return_path_on_source_continue(void) ""
|
||||
postcopy_start(void) ""
|
||||
postcopy_pause_return_path(void) ""
|
||||
postcopy_pause_return_path_continued(void) ""
|
||||
postcopy_pause_fault_thread(void) ""
|
||||
postcopy_pause_fault_thread_continued(void) ""
|
||||
postcopy_pause_continued(void) ""
|
||||
postcopy_pause_incoming(void) ""
|
||||
postcopy_pause_incoming_continued(void) ""
|
||||
postcopy_start_set_run(void) ""
|
||||
source_return_path_thread_bad_end(void) ""
|
||||
source_return_path_thread_end(void) ""
|
||||
@ -106,6 +126,7 @@ source_return_path_thread_entry(void) ""
|
||||
source_return_path_thread_loop_top(void) ""
|
||||
source_return_path_thread_pong(uint32_t val) "0x%x"
|
||||
source_return_path_thread_shut(uint32_t val) "0x%x"
|
||||
source_return_path_thread_resume_ack(uint32_t v) "%"PRIu32
|
||||
migrate_global_state_post_load(const char *state) "loaded state: %s"
|
||||
migrate_global_state_pre_save(const char *state) "saved state: %s"
|
||||
migration_thread_low_pending(uint64_t pending) "%" PRIu64
|
||||
|
@ -89,6 +89,10 @@
|
||||
#
|
||||
# @postcopy-active: like active, but now in postcopy mode. (since 2.5)
|
||||
#
|
||||
# @postcopy-paused: during postcopy but paused. (since 2.13)
|
||||
#
|
||||
# @postcopy-recover: trying to recover from a paused postcopy. (since 2.13)
|
||||
#
|
||||
# @completed: migration is finished.
|
||||
#
|
||||
# @failed: some error occurred during migration process.
|
||||
@ -106,7 +110,8 @@
|
||||
##
|
||||
{ 'enum': 'MigrationStatus',
|
||||
'data': [ 'none', 'setup', 'cancelling', 'cancelled',
|
||||
'active', 'postcopy-active', 'completed', 'failed', 'colo',
|
||||
'active', 'postcopy-active', 'postcopy-paused',
|
||||
'postcopy-recover', 'completed', 'failed', 'colo',
|
||||
'pre-switchover', 'device' ] }
|
||||
|
||||
##
|
||||
@ -157,11 +162,13 @@
|
||||
# error strings. (Since 2.7)
|
||||
#
|
||||
# @postcopy-blocktime: total time when all vCPU were blocked during postcopy
|
||||
# live migration (Since 2.13)
|
||||
# live migration. This is only present when the postcopy-blocktime
|
||||
# migration capability is enabled. (Since 2.13)
|
||||
#
|
||||
# @postcopy-vcpu-blocktime: list of the postcopy blocktime per vCPU (Since 2.13)
|
||||
# @postcopy-vcpu-blocktime: list of the postcopy blocktime per vCPU. This is
|
||||
# only present when the postcopy-blocktime migration capability
|
||||
# is enabled. (Since 2.13)
|
||||
#
|
||||
|
||||
#
|
||||
# Since: 0.14.0
|
||||
##
|
||||
@ -363,7 +370,6 @@
|
||||
#
|
||||
# @x-multifd: Use more than one fd for migration (since 2.11)
|
||||
#
|
||||
#
|
||||
# @dirty-bitmaps: If enabled, QEMU will migrate named dirty bitmaps.
|
||||
# (since 2.12)
|
||||
#
|
||||
@ -1028,6 +1034,8 @@
|
||||
# @detach: this argument exists only for compatibility reasons and
|
||||
# is ignored by QEMU
|
||||
#
|
||||
# @resume: resume one paused migration, default "off". (since 2.13)
|
||||
#
|
||||
# Returns: nothing on success
|
||||
#
|
||||
# Since: 0.14.0
|
||||
@ -1049,7 +1057,8 @@
|
||||
#
|
||||
##
|
||||
{ 'command': 'migrate',
|
||||
'data': {'uri': 'str', '*blk': 'bool', '*inc': 'bool', '*detach': 'bool' } }
|
||||
'data': {'uri': 'str', '*blk': 'bool', '*inc': 'bool',
|
||||
'*detach': 'bool', '*resume': 'bool' } }
|
||||
|
||||
##
|
||||
# @migrate-incoming:
|
||||
@ -1183,3 +1192,39 @@
|
||||
# Since: 2.9
|
||||
##
|
||||
{ 'command': 'xen-colo-do-checkpoint' }
|
||||
|
||||
##
|
||||
# @migrate-recover:
|
||||
#
|
||||
# Provide a recovery migration stream URI.
|
||||
#
|
||||
# @uri: the URI to be used for the recovery of migration stream.
|
||||
#
|
||||
# Returns: nothing.
|
||||
#
|
||||
# Example:
|
||||
#
|
||||
# -> { "execute": "migrate-recover",
|
||||
# "arguments": { "uri": "tcp:192.168.1.200:12345" } }
|
||||
# <- { "return": {} }
|
||||
#
|
||||
# Since: 2.13
|
||||
##
|
||||
{ 'command': 'migrate-recover', 'data': { 'uri': 'str' },
|
||||
'allow-oob': true }
|
||||
|
||||
##
|
||||
# @migrate-pause:
|
||||
#
|
||||
# Pause a migration. Currently it only supports postcopy.
|
||||
#
|
||||
# Returns: nothing.
|
||||
#
|
||||
# Example:
|
||||
#
|
||||
# -> { "execute": "migrate-pause" }
|
||||
# <- { "return": {} }
|
||||
#
|
||||
# Since: 2.13
|
||||
##
|
||||
{ 'command': 'migrate-pause', 'allow-oob': true }
|
||||
|
@ -19,9 +19,6 @@
|
||||
#include "qemu/sockets.h"
|
||||
#include "chardev/char.h"
|
||||
#include "sysemu/sysemu.h"
|
||||
#include "hw/nvram/chrp_nvram.h"
|
||||
|
||||
#define MIN_NVRAM_SIZE 8192 /* from spapr_nvram.c */
|
||||
|
||||
const unsigned start_address = 1024 * 1024;
|
||||
const unsigned end_address = 100 * 1024 * 1024;
|
||||
@ -92,36 +89,6 @@ static void init_bootfile_x86(const char *bootpath)
|
||||
fclose(bootfile);
|
||||
}
|
||||
|
||||
static void init_bootfile_ppc(const char *bootpath)
|
||||
{
|
||||
FILE *bootfile;
|
||||
char buf[MIN_NVRAM_SIZE];
|
||||
ChrpNvramPartHdr *header = (ChrpNvramPartHdr *)buf;
|
||||
|
||||
memset(buf, 0, MIN_NVRAM_SIZE);
|
||||
|
||||
/* Create a "common" partition in nvram to store boot-command property */
|
||||
|
||||
header->signature = CHRP_NVPART_SYSTEM;
|
||||
memcpy(header->name, "common", 6);
|
||||
chrp_nvram_finish_partition(header, MIN_NVRAM_SIZE);
|
||||
|
||||
/* FW_MAX_SIZE is 4MB, but slof.bin is only 900KB,
|
||||
* so let's modify memory between 1MB and 100MB
|
||||
* to do like PC bootsector
|
||||
*/
|
||||
|
||||
sprintf(buf + 16,
|
||||
"boot-command=hex .\" _\" begin %x %x do i c@ 1 + i c! 1000 +loop "
|
||||
".\" B\" 0 until", end_address, start_address);
|
||||
|
||||
/* Write partition to the NVRAM file */
|
||||
|
||||
bootfile = fopen(bootpath, "wb");
|
||||
g_assert_cmpint(fwrite(buf, MIN_NVRAM_SIZE, 1, bootfile), ==, 1);
|
||||
fclose(bootfile);
|
||||
}
|
||||
|
||||
/*
|
||||
* Wait for some output in the serial output file,
|
||||
* we get an 'A' followed by an endless string of 'B's
|
||||
@ -422,12 +389,14 @@ static void test_migrate_start(QTestState **from, QTestState **to,
|
||||
if (access("/sys/module/kvm_hv", F_OK)) {
|
||||
accel = "tcg";
|
||||
}
|
||||
init_bootfile_ppc(bootpath);
|
||||
cmd_src = g_strdup_printf("-machine accel=%s -m 256M"
|
||||
" -name source,debug-threads=on"
|
||||
" -serial file:%s/src_serial"
|
||||
" -drive file=%s,if=pflash,format=raw",
|
||||
accel, tmpfs, bootpath);
|
||||
" -prom-env '"
|
||||
"boot-command=hex .\" _\" begin %x %x "
|
||||
"do i c@ 1 + i c! 1000 +loop .\" B\" 0 "
|
||||
"until'", accel, tmpfs, end_address,
|
||||
start_address);
|
||||
cmd_dst = g_strdup_printf("-machine accel=%s -m 256M"
|
||||
" -name target,debug-threads=on"
|
||||
" -serial file:%s/dest_serial"
|
||||
@ -536,7 +505,7 @@ static void test_deprecated(void)
|
||||
qtest_quit(from);
|
||||
}
|
||||
|
||||
static void test_migrate(void)
|
||||
static void test_postcopy(void)
|
||||
{
|
||||
char *uri = g_strdup_printf("unix:%s/migsocket", tmpfs);
|
||||
QTestState *from, *to;
|
||||
@ -611,6 +580,45 @@ static void test_baddest(void)
|
||||
test_migrate_end(from, to, false);
|
||||
}
|
||||
|
||||
static void test_precopy_unix(void)
|
||||
{
|
||||
char *uri = g_strdup_printf("unix:%s/migsocket", tmpfs);
|
||||
QTestState *from, *to;
|
||||
|
||||
test_migrate_start(&from, &to, uri, false);
|
||||
|
||||
/* We want to pick a speed slow enough that the test completes
|
||||
* quickly, but that it doesn't complete precopy even on a slow
|
||||
* machine, so also set the downtime.
|
||||
*/
|
||||
/* 1 ms should make it not converge*/
|
||||
migrate_set_parameter(from, "downtime-limit", "1");
|
||||
/* 1GB/s */
|
||||
migrate_set_parameter(from, "max-bandwidth", "1000000000");
|
||||
|
||||
/* Wait for the first serial output from the source */
|
||||
wait_for_serial("src_serial");
|
||||
|
||||
migrate(from, uri);
|
||||
|
||||
wait_for_migration_pass(from);
|
||||
|
||||
/* 300 ms should converge */
|
||||
migrate_set_parameter(from, "downtime-limit", "300");
|
||||
|
||||
if (!got_stop) {
|
||||
qtest_qmp_eventwait(from, "STOP");
|
||||
}
|
||||
|
||||
qtest_qmp_eventwait(to, "RESUME");
|
||||
|
||||
wait_for_serial("dest_serial");
|
||||
wait_for_migration_complete(from);
|
||||
|
||||
test_migrate_end(from, to, true);
|
||||
g_free(uri);
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
char template[] = "/tmp/migration-test-XXXXXX";
|
||||
@ -630,9 +638,10 @@ int main(int argc, char **argv)
|
||||
|
||||
module_call_init(MODULE_INIT_QOM);
|
||||
|
||||
qtest_add_func("/migration/postcopy/unix", test_migrate);
|
||||
qtest_add_func("/migration/postcopy/unix", test_postcopy);
|
||||
qtest_add_func("/migration/deprecated", test_deprecated);
|
||||
qtest_add_func("/migration/bad_dest", test_baddest);
|
||||
qtest_add_func("/migration/precopy/unix", test_precopy_unix);
|
||||
|
||||
ret = g_test_run();
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user