diff --git a/meson.build b/meson.build
index 5e7946776d..9ed9a993e2 100644
--- a/meson.build
+++ b/meson.build
@@ -1530,6 +1530,12 @@ config_host_data.set('HAVE_COPY_FILE_RANGE', cc.has_function('copy_file_range'))
 config_host_data.set('HAVE_OPENPTY', cc.has_function('openpty', dependencies: util))
 config_host_data.set('HAVE_STRCHRNUL', cc.has_function('strchrnul'))
 config_host_data.set('HAVE_SYSTEM_FUNCTION', cc.has_function('system', prefix: '#include <stdlib.h>'))
+if rdma.found()
+  config_host_data.set('HAVE_IBV_ADVISE_MR',
+                       cc.has_function('ibv_advise_mr',
+                                       args: config_host['RDMA_LIBS'].split(),
+                                       prefix: '#include <infiniband/verbs.h>'))
+endif
 
 # has_header_symbol
 config_host_data.set('CONFIG_BYTESWAP_H',
diff --git a/migration/migration.c b/migration/migration.c
index 6ac807ef3d..9172686b89 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -453,10 +453,12 @@ static void qemu_start_incoming_migration(const char *uri, Error **errp)
 {
     const char *p = NULL;
 
+    migrate_protocol_allow_multifd(false); /* reset it anyway */
     qapi_event_send_migration(MIGRATION_STATUS_SETUP);
     if (strstart(uri, "tcp:", &p) ||
         strstart(uri, "unix:", NULL) ||
         strstart(uri, "vsock:", NULL)) {
+        migrate_protocol_allow_multifd(true);
         socket_start_incoming_migration(p ? p : uri, errp);
 #ifdef CONFIG_RDMA
     } else if (strstart(uri, "rdma:", &p)) {
@@ -1235,6 +1237,14 @@ static bool migrate_caps_check(bool *cap_list,
         }
     }
 
+    /* incoming side only */
+    if (runstate_check(RUN_STATE_INMIGRATE) &&
+        !migrate_multifd_is_allowed() &&
+        cap_list[MIGRATION_CAPABILITY_MULTIFD]) {
+        error_setg(errp, "multifd is not supported by current protocol");
+        return false;
+    }
+
     return true;
 }
 
@@ -2280,9 +2290,11 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
         }
     }
 
+    migrate_protocol_allow_multifd(false);
     if (strstart(uri, "tcp:", &p) ||
         strstart(uri, "unix:", NULL) ||
         strstart(uri, "vsock:", NULL)) {
+        migrate_protocol_allow_multifd(true);
         socket_start_outgoing_migration(s, p ? p : uri, &local_err);
 #ifdef CONFIG_RDMA
     } else if (strstart(uri, "rdma:", &p)) {
diff --git a/migration/multifd.c b/migration/multifd.c
index 377da78f5b..7c9deb1921 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -531,7 +531,7 @@ void multifd_save_cleanup(void)
 {
     int i;
 
-    if (!migrate_use_multifd()) {
+    if (!migrate_use_multifd() || !migrate_multifd_is_allowed()) {
         return;
     }
     multifd_send_terminate_threads(NULL);
@@ -546,6 +546,9 @@ void multifd_save_cleanup(void)
         MultiFDSendParams *p = &multifd_send_state->params[i];
         Error *local_err = NULL;
 
+        if (p->registered_yank) {
+            migration_ioc_unregister_yank(p->c);
+        }
         socket_send_channel_destroy(p->c);
         p->c = NULL;
         qemu_mutex_destroy(&p->mutex);
@@ -813,7 +816,8 @@ static bool multifd_channel_connect(MultiFDSendParams *p,
             return false;
         }
     } else {
-        /* update for tls qio channel */
+        migration_ioc_register_yank(ioc);
+        p->registered_yank = true;
         p->c = ioc;
         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
                            QEMU_THREAD_JOINABLE);
@@ -864,6 +868,17 @@ cleanup:
     multifd_new_send_channel_cleanup(p, sioc, local_err);
 }
 
+static bool migrate_allow_multifd = true;
+void migrate_protocol_allow_multifd(bool allow)
+{
+    migrate_allow_multifd = allow;
+}
+
+bool migrate_multifd_is_allowed(void)
+{
+    return migrate_allow_multifd;
+}
+
 int multifd_save_setup(Error **errp)
 {
     int thread_count;
@@ -874,6 +889,11 @@ int multifd_save_setup(Error **errp)
     if (!migrate_use_multifd()) {
         return 0;
     }
+    if (!migrate_multifd_is_allowed()) {
+        error_setg(errp, "multifd is not supported by current protocol");
+        return -1;
+    }
+
     s = migrate_get_current();
     thread_count = migrate_multifd_channels();
     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
@@ -967,7 +987,7 @@ int multifd_load_cleanup(Error **errp)
 {
     int i;
 
-    if (!migrate_use_multifd()) {
+    if (!migrate_use_multifd() || !migrate_multifd_is_allowed()) {
         return 0;
     }
     multifd_recv_terminate_threads(NULL);
@@ -987,10 +1007,7 @@ int multifd_load_cleanup(Error **errp)
     for (i = 0; i < migrate_multifd_channels(); i++) {
         MultiFDRecvParams *p = &multifd_recv_state->params[i];
 
-        if (OBJECT(p->c)->ref == 1) {
-            migration_ioc_unregister_yank(p->c);
-        }
-
+        migration_ioc_unregister_yank(p->c);
         object_unref(OBJECT(p->c));
         p->c = NULL;
         qemu_mutex_destroy(&p->mutex);
@@ -1119,6 +1136,10 @@ int multifd_load_setup(Error **errp)
     if (!migrate_use_multifd()) {
         return 0;
     }
+    if (!migrate_multifd_is_allowed()) {
+        error_setg(errp, "multifd is not supported by current protocol");
+        return -1;
+    }
     thread_count = migrate_multifd_channels();
     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
diff --git a/migration/multifd.h b/migration/multifd.h
index 8d6751f5ed..15c50ca0b2 100644
--- a/migration/multifd.h
+++ b/migration/multifd.h
@@ -13,6 +13,8 @@
 #ifndef QEMU_MIGRATION_MULTIFD_H
 #define QEMU_MIGRATION_MULTIFD_H
 
+bool migrate_multifd_is_allowed(void);
+void migrate_protocol_allow_multifd(bool allow);
 int multifd_save_setup(Error **errp);
 void multifd_save_cleanup(void);
 int multifd_load_setup(Error **errp);
@@ -85,6 +87,8 @@ typedef struct {
     bool running;
     /* should this thread finish */
     bool quit;
+    /* is the yank function registered */
+    bool registered_yank;
     /* thread has work to do */
     int pending_job;
     /* array of pages to sent */
diff --git a/migration/ram.c b/migration/ram.c
index 7a43bfd7af..bb908822d5 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -789,8 +789,7 @@ unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
     return find_next_bit(bitmap, size, start);
 }
 
-static void migration_clear_memory_region_dirty_bitmap(RAMState *rs,
-                                                       RAMBlock *rb,
+static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
                                                        unsigned long page)
 {
     uint8_t shift;
@@ -818,8 +817,7 @@ static void migration_clear_memory_region_dirty_bitmap(RAMState *rs,
 }
 
 static void
-migration_clear_memory_region_dirty_bitmap_range(RAMState *rs,
-                                                 RAMBlock *rb,
+migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
                                                  unsigned long start,
                                                  unsigned long npages)
 {
@@ -832,7 +830,7 @@ migration_clear_memory_region_dirty_bitmap_range(RAMState *rs,
      * exclusive.
      */
     for (i = chunk_start; i < chunk_end; i += chunk_pages) {
-        migration_clear_memory_region_dirty_bitmap(rs, rb, i);
+        migration_clear_memory_region_dirty_bitmap(rb, i);
     }
 }
 
@@ -850,7 +848,7 @@ static inline bool migration_bitmap_clear_dirty(RAMState *rs,
      * the page in the chunk we clear the remote dirty bitmap for all.
      * Clearing it earlier won't be a problem, but too late will.
      */
-    migration_clear_memory_region_dirty_bitmap(rs, rb, page);
+    migration_clear_memory_region_dirty_bitmap(rb, page);
 
     ret = test_and_clear_bit(page, rb->bmap);
     if (ret) {
@@ -2777,8 +2775,7 @@ void qemu_guest_free_page_hint(void *addr, size_t len)
          * are initially set. Otherwise those skipped pages will be sent in
          * the next round after syncing from the memory region bitmap.
          */
-        migration_clear_memory_region_dirty_bitmap_range(ram_state, block,
-                                                         start, npages);
+        migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
         ram_state->migration_dirty_pages -=
                       bitmap_count_one_with_offset(block->bmap, start, npages);
         bitmap_clear(block->bmap, start, npages);
diff --git a/migration/rdma.c b/migration/rdma.c
index 5c2d113aa9..2a3c7889b9 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -1117,19 +1117,82 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma)
     return 0;
 }
 
+/* Check whether On-Demand Paging is supported by the RDMA device */
+static bool rdma_support_odp(struct ibv_context *dev)
+{
+    struct ibv_device_attr_ex attr = {0};
+    int ret = ibv_query_device_ex(dev, NULL, &attr);
+    if (ret) {
+        return false;
+    }
+
+    if (attr.odp_caps.general_caps & IBV_ODP_SUPPORT) {
+        return true;
+    }
+
+    return false;
+}
+
+/*
+ * ibv_advise_mr to avoid RNR NAK error as far as possible.
+ * The responder mr registering with ODP will send RNR NAK back to
+ * the requester in the face of the page fault.
+ */
+static void qemu_rdma_advise_prefetch_mr(struct ibv_pd *pd, uint64_t addr,
+                                         uint32_t len, uint32_t lkey,
+                                         const char *name, bool wr)
+{
+#ifdef HAVE_IBV_ADVISE_MR
+    int ret;
+    int advice = wr ? IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE :
+                 IBV_ADVISE_MR_ADVICE_PREFETCH;
+    struct ibv_sge sg_list = {.lkey = lkey, .addr = addr, .length = len};
+
+    ret = ibv_advise_mr(pd, advice,
+                        IBV_ADVISE_MR_FLAG_FLUSH, &sg_list, 1);
+    /* ignore the error */
+    if (ret) {
+        trace_qemu_rdma_advise_mr(name, len, addr, strerror(errno));
+    } else {
+        trace_qemu_rdma_advise_mr(name, len, addr, "succeeded");
+    }
+#endif
+}
+
 static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
 {
     int i;
     RDMALocalBlocks *local = &rdma->local_ram_blocks;
 
     for (i = 0; i < local->nb_blocks; i++) {
+        int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;
+
         local->block[i].mr =
             ibv_reg_mr(rdma->pd,
                        local->block[i].local_host_addr,
-                       local->block[i].length,
-                       IBV_ACCESS_LOCAL_WRITE |
-                       IBV_ACCESS_REMOTE_WRITE
+                       local->block[i].length, access
                        );
+
+        if (!local->block[i].mr &&
+            errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
+            access |= IBV_ACCESS_ON_DEMAND;
+            /* register ODP mr */
+            local->block[i].mr =
+                ibv_reg_mr(rdma->pd,
+                           local->block[i].local_host_addr,
+                           local->block[i].length, access);
+            trace_qemu_rdma_register_odp_mr(local->block[i].block_name);
+
+            if (local->block[i].mr) {
+                qemu_rdma_advise_prefetch_mr(rdma->pd,
+                            (uintptr_t)local->block[i].local_host_addr,
+                            local->block[i].length,
+                            local->block[i].mr->lkey,
+                            local->block[i].block_name,
+                            true);
+            }
+        }
+
         if (!local->block[i].mr) {
             perror("Failed to register local dest ram block!");
             break;
@@ -1215,28 +1278,40 @@ static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
      */
     if (!block->pmr[chunk]) {
         uint64_t len = chunk_end - chunk_start;
+        int access = rkey ? IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE :
+                     0;
 
         trace_qemu_rdma_register_and_get_keys(len, chunk_start);
 
-        block->pmr[chunk] = ibv_reg_mr(rdma->pd,
-                chunk_start, len,
-                (rkey ? (IBV_ACCESS_LOCAL_WRITE |
-                        IBV_ACCESS_REMOTE_WRITE) : 0));
+        block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
+        if (!block->pmr[chunk] &&
+            errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
+            access |= IBV_ACCESS_ON_DEMAND;
+            /* register ODP mr */
+            block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
+            trace_qemu_rdma_register_odp_mr(block->block_name);
 
-        if (!block->pmr[chunk]) {
-            perror("Failed to register chunk!");
-            fprintf(stderr, "Chunk details: block: %d chunk index %d"
-                            " start %" PRIuPTR " end %" PRIuPTR
-                            " host %" PRIuPTR
-                            " local %" PRIuPTR " registrations: %d\n",
-                            block->index, chunk, (uintptr_t)chunk_start,
-                            (uintptr_t)chunk_end, host_addr,
-                            (uintptr_t)block->local_host_addr,
-                            rdma->total_registrations);
-            return -1;
+            if (block->pmr[chunk]) {
+                qemu_rdma_advise_prefetch_mr(rdma->pd, (uintptr_t)chunk_start,
+                                            len, block->pmr[chunk]->lkey,
+                                            block->block_name, rkey);
+
+            }
         }
-        rdma->total_registrations++;
     }
+    if (!block->pmr[chunk]) {
+        perror("Failed to register chunk!");
+        fprintf(stderr, "Chunk details: block: %d chunk index %d"
+                        " start %" PRIuPTR " end %" PRIuPTR
+                        " host %" PRIuPTR
+                        " local %" PRIuPTR " registrations: %d\n",
+                        block->index, chunk, (uintptr_t)chunk_start,
+                        (uintptr_t)chunk_end, host_addr,
+                        (uintptr_t)block->local_host_addr,
+                        rdma->total_registrations);
+        return -1;
+    }
+    rdma->total_registrations++;
 
     if (lkey) {
         *lkey = block->pmr[chunk]->lkey;
diff --git a/migration/trace-events b/migration/trace-events
index a1c0f034ab..a8ae163707 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -212,6 +212,8 @@ qemu_rdma_poll_write(const char *compstr, int64_t comp, int left, uint64_t block
 qemu_rdma_poll_other(const char *compstr, int64_t comp, int left) "other completion %s (%" PRId64 ") received left %d"
 qemu_rdma_post_send_control(const char *desc) "CONTROL: sending %s.."
 qemu_rdma_register_and_get_keys(uint64_t len, void *start) "Registering %" PRIu64 " bytes @ %p"
+qemu_rdma_register_odp_mr(const char *name) "Try to register On-Demand Paging memory region: %s"
+qemu_rdma_advise_mr(const char *name, uint32_t len, uint64_t addr, const char *res) "Try to advise block %s prefetch at %" PRIu32 "@0x%" PRIx64 ": %s"
 qemu_rdma_registration_handle_compress(int64_t length, int index, int64_t offset) "Zapping zero chunk: %" PRId64 " bytes, index %d, offset %" PRId64
 qemu_rdma_registration_handle_finished(void) ""
 qemu_rdma_registration_handle_ram_blocks(void) ""
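Note (not part of the patch): the rdma.c hunks above implement a registration fallback: try a plain ibv_reg_mr() first, retry with IBV_ACCESS_ON_DEMAND when the device returns ENOTSUP and advertises ODP, then issue a best-effort ibv_advise_mr() prefetch so the responder is less likely to take page faults (and answer with RNR NAKs) on first access. Below is a minimal standalone sketch of that pattern against libibverbs; the helper names device_supports_odp and register_ram_block_mr are illustrative only, and the HAVE_IBV_ADVISE_MR guard is assumed to come from a build-time probe like the meson check added above.

#include <errno.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <infiniband/verbs.h>

/* Query the extended device attributes and test the general ODP capability. */
static bool device_supports_odp(struct ibv_context *ctx)
{
    struct ibv_device_attr_ex attr = {0};

    if (ibv_query_device_ex(ctx, NULL, &attr)) {
        return false;
    }
    return attr.odp_caps.general_caps & IBV_ODP_SUPPORT;
}

/*
 * Register one RAM block: pinned MR first, ODP MR as a fallback, then a
 * best-effort prefetch so pages are populated before the peer writes them.
 */
static struct ibv_mr *register_ram_block_mr(struct ibv_context *ctx,
                                            struct ibv_pd *pd,
                                            void *addr, size_t len)
{
    int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;
    struct ibv_mr *mr = ibv_reg_mr(pd, addr, len, access);

    if (!mr && errno == ENOTSUP && device_supports_odp(ctx)) {
        /* The device rejected pinned registration; retry with ODP. */
        access |= IBV_ACCESS_ON_DEMAND;
        mr = ibv_reg_mr(pd, addr, len, access);
    }

#ifdef HAVE_IBV_ADVISE_MR   /* assumed build-time probe, as in the meson hunk */
    if (mr && (access & IBV_ACCESS_ON_DEMAND)) {
        struct ibv_sge sge = {
            .addr = (uintptr_t)addr,
            .length = (uint32_t)len,
            .lkey = mr->lkey,
        };
        int ret = ibv_advise_mr(pd, IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE,
                                IBV_ADVISE_MR_FLAG_FLUSH, &sge, 1);
        if (ret) {
            /* Prefetch is only an optimization; report and carry on. */
            fprintf(stderr, "prefetch advise failed: %s\n", strerror(ret));
        }
    }
#endif
    return mr;
}

As in the patch, the advise failure is deliberately non-fatal: a missed prefetch only means the first access may fault and be resolved by ODP later.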