migration: allow fault thread to pause
Allow the fault thread to stop handling page faults temporarily. When a network failure happens (and we expect a recovery afterwards), we should not allow the fault thread to continue sending page requests to the source; instead, it should halt until the connection is rebuilt. When the destination main thread notices the failure, it kicks the fault thread to switch to the paused state. Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com> Signed-off-by: Peter Xu <peterx@redhat.com> Message-Id: <20180502104740.12123-7-peterx@redhat.com> Signed-off-by: Juan Quintela <quintela@redhat.com>
This commit is contained in:
parent
14b1742eaa
commit
3a7804c306
@ -160,6 +160,7 @@ MigrationIncomingState *migration_incoming_get_current(void)
|
||||
qemu_mutex_init(&mis_current.rp_mutex);
|
||||
qemu_event_init(&mis_current.main_thread_load_event, false);
|
||||
qemu_sem_init(&mis_current.postcopy_pause_sem_dst, 0);
|
||||
qemu_sem_init(&mis_current.postcopy_pause_sem_fault, 0);
|
||||
|
||||
init_dirty_bitmap_incoming_migration();
|
||||
|
||||
|
@ -76,6 +76,7 @@ struct MigrationIncomingState {
|
||||
|
||||
/* notify PAUSED postcopy incoming migrations to try to continue */
|
||||
QemuSemaphore postcopy_pause_sem_dst;
|
||||
QemuSemaphore postcopy_pause_sem_fault;
|
||||
};
|
||||
|
||||
MigrationIncomingState *migration_incoming_get_current(void);
|
||||
|
@ -830,6 +830,17 @@ static void mark_postcopy_blocktime_end(uintptr_t addr)
|
||||
affected_cpu);
|
||||
}
|
||||
|
||||
/*
 * Pause the postcopy fault thread.
 *
 * Emits the postcopy_pause_fault_thread trace event, blocks in
 * qemu_sem_wait() on mis->postcopy_pause_sem_fault, and then emits the
 * postcopy_pause_fault_thread_continued trace event once the wait returns.
 *
 * @mis: incoming migration state holding the semaphore to wait on.
 *
 * Returns: always true in this version; the callers in
 * postcopy_ram_fault_thread() treat true as "continue handling faults".
 *
 * NOTE(review): the code that posts postcopy_pause_sem_fault is not
 * visible in this change -- presumably the recovery path; confirm.
 */
static bool postcopy_pause_fault_thread(MigrationIncomingState *mis)
|
||||
{
|
||||
trace_postcopy_pause_fault_thread();
|
||||
|
||||
/* Block here until the semaphore is posted. */
qemu_sem_wait(&mis->postcopy_pause_sem_fault);
|
||||
|
||||
trace_postcopy_pause_fault_thread_continued();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle faults detected by the USERFAULT markings
|
||||
*/
|
||||
@ -880,6 +891,22 @@ static void *postcopy_ram_fault_thread(void *opaque)
|
||||
break;
|
||||
}
|
||||
|
||||
if (!mis->to_src_file) {
|
||||
/*
|
||||
* Possibly someone tells us that the return path is
|
||||
* broken already using the event. We should hold until
|
||||
* the channel is rebuilt.
|
||||
*/
|
||||
if (postcopy_pause_fault_thread(mis)) {
|
||||
mis->last_rb = NULL;
|
||||
/* Continue to read the userfaultfd */
|
||||
} else {
|
||||
error_report("%s: paused but don't allow to continue",
|
||||
__func__);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (pfd[1].revents) {
|
||||
uint64_t tmp64 = 0;
|
||||
|
||||
@ -942,18 +969,37 @@ static void *postcopy_ram_fault_thread(void *opaque)
|
||||
(uintptr_t)(msg.arg.pagefault.address),
|
||||
msg.arg.pagefault.feat.ptid, rb);
|
||||
|
||||
retry:
|
||||
/*
|
||||
* Send the request to the source - we want to request one
|
||||
* of our host page sizes (which is >= TPS)
|
||||
*/
|
||||
if (rb != mis->last_rb) {
|
||||
mis->last_rb = rb;
|
||||
migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
|
||||
rb_offset, qemu_ram_pagesize(rb));
|
||||
ret = migrate_send_rp_req_pages(mis,
|
||||
qemu_ram_get_idstr(rb),
|
||||
rb_offset,
|
||||
qemu_ram_pagesize(rb));
|
||||
} else {
|
||||
/* Save some space */
|
||||
migrate_send_rp_req_pages(mis, NULL,
|
||||
rb_offset, qemu_ram_pagesize(rb));
|
||||
ret = migrate_send_rp_req_pages(mis,
|
||||
NULL,
|
||||
rb_offset,
|
||||
qemu_ram_pagesize(rb));
|
||||
}
|
||||
|
||||
if (ret) {
|
||||
/* May be network failure, try to wait for recovery */
|
||||
if (ret == -EIO && postcopy_pause_fault_thread(mis)) {
|
||||
/* We got reconnected somehow, try to continue */
|
||||
mis->last_rb = NULL;
|
||||
goto retry;
|
||||
} else {
|
||||
/* This is an unavoidable fault */
|
||||
error_report("%s: migrate_send_rp_req_pages() get %d",
|
||||
__func__, ret);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2083,6 +2083,9 @@ static bool postcopy_pause_incoming(MigrationIncomingState *mis)
|
||||
mis->to_src_file = NULL;
|
||||
qemu_mutex_unlock(&mis->rp_mutex);
|
||||
|
||||
/* Notify the fault thread for the invalidated file handle */
|
||||
postcopy_fault_thread_notify(mis);
|
||||
|
||||
error_report("Detected IO failure for postcopy. "
|
||||
"Migration paused.");
|
||||
|
||||
|
@ -101,6 +101,8 @@ open_return_path_on_source_continue(void) ""
|
||||
postcopy_start(void) ""
|
||||
postcopy_pause_return_path(void) ""
|
||||
postcopy_pause_return_path_continued(void) ""
|
||||
postcopy_pause_fault_thread(void) ""
|
||||
postcopy_pause_fault_thread_continued(void) ""
|
||||
postcopy_pause_continued(void) ""
|
||||
postcopy_pause_incoming(void) ""
|
||||
postcopy_pause_incoming_continued(void) ""
|
||||
|
Loading…
Reference in New Issue
Block a user