virtio,vhost,pci,pc: features, cleanups
SRAT tables for DIMM devices new virtio net flags for speed/duplex post-copy migration support in vhost cleanups in pci Signed-off-by: Michael S. Tsirkin <mst@redhat.com> -----BEGIN PGP SIGNATURE----- iQEcBAABAgAGBQJasR1rAAoJECgfDbjSjVRpOocH/R9A3g/TkpGjmLzJBrrX1NGO I/iq0ttHjqg4OBIChA4BHHjXwYUMs7XQn26B3efrk1otLAJhuqntZIIo3uU0WraA 5J+4DT46ogs5rZWNzDCZ0zAkSaATDA6h9Nfh7TvPc9Q2WpcIT0cTa/jOtrxRc9Vq 32hbUKtJSpNxRjwbZvk6YV21HtWo3Tktdaj9IeTQTN0/gfMyOMdgxta3+bymicbJ FuF9ybHcpXvrEctHhXHIL4/YVGEH/4shagZ4JVzv1dVdLeHLZtPomdf7+oc0+07m Qs+yV0HeRS5Zxt7w5blGLC4zDXczT/bUx8oln0Tz5MV7RR/+C2HwMOHC69gfpSc= =vomK -----END PGP SIGNATURE----- Merge remote-tracking branch 'remotes/mst/tags/for_upstream' into staging virtio,vhost,pci,pc: features, cleanups SRAT tables for DIMM devices new virtio net flags for speed/duplex post-copy migration support in vhost cleanups in pci Signed-off-by: Michael S. Tsirkin <mst@redhat.com> # gpg: Signature made Tue 20 Mar 2018 14:40:43 GMT # gpg: using RSA key 281F0DB8D28D5469 # gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>" # gpg: aka "Michael S. 
Tsirkin <mst@redhat.com>" # Primary key fingerprint: 0270 606B 6F3C DF3D 0B17 0970 C350 3912 AFBE 8E67 # Subkey fingerprint: 5D09 FD08 71C8 F85B 94CA 8A0D 281F 0DB8 D28D 5469 * remotes/mst/tags/for_upstream: (51 commits) postcopy shared docs libvhost-user: Claim support for postcopy postcopy: Allow shared memory vhost: Huge page align and merge vhost+postcopy: Wire up POSTCOPY_END notify vhost-user: Add VHOST_USER_POSTCOPY_END message libvhost-user: mprotect & madvises for postcopy vhost+postcopy: Call wakeups vhost+postcopy: Add vhost waker postcopy: postcopy_notify_shared_wake postcopy: helper for waking shared vhost+postcopy: Resolve client address postcopy-ram: add a stub for postcopy_request_shared_page vhost+postcopy: Helper to send requests to source for shared pages vhost+postcopy: Stash RAMBlock and offset vhost+postcopy: Send address back to qemu libvhost-user+postcopy: Register new regions with the ufd migration/ram: ramblock_recv_bitmap_test_byte_offset postcopy+vhost-user: Split set_mem_table for postcopy vhost+postcopy: Transmit 'listen' to slave ... Signed-off-by: Peter Maydell <peter.maydell@linaro.org> # Conflicts: # scripts/update-linux-headers.sh
This commit is contained in:
commit
ed627b2ad3
4
Makefile
4
Makefile
@ -777,7 +777,6 @@ bepo cz
|
||||
ifdef INSTALL_BLOBS
|
||||
BLOBS=bios.bin bios-256k.bin sgabios.bin vgabios.bin vgabios-cirrus.bin \
|
||||
vgabios-stdvga.bin vgabios-vmware.bin vgabios-qxl.bin vgabios-virtio.bin \
|
||||
acpi-dsdt.aml \
|
||||
ppc_rom.bin openbios-sparc32 openbios-sparc64 openbios-ppc QEMU,tcx.bin QEMU,cgthree.bin \
|
||||
pxe-e1000.rom pxe-eepro100.rom pxe-ne2k_pci.rom \
|
||||
pxe-pcnet.rom pxe-rtl8139.rom pxe-virtio.rom \
|
||||
@ -1048,6 +1047,9 @@ endif
|
||||
include $(SRC_PATH)/tests/docker/Makefile.include
|
||||
include $(SRC_PATH)/tests/vm/Makefile.include
|
||||
|
||||
# Debugging aid: echo the value of $(GENERATED_FILES).
printgen:
|
||||
@echo $(GENERATED_FILES)
|
||||
|
||||
.PHONY: help
|
||||
help:
|
||||
@echo 'Generic targets:'
|
||||
|
@ -26,9 +26,20 @@
|
||||
#include <sys/socket.h>
|
||||
#include <sys/eventfd.h>
|
||||
#include <sys/mman.h>
|
||||
#include "qemu/compiler.h"
|
||||
|
||||
#if defined(__linux__)
|
||||
#include <sys/syscall.h>
|
||||
#include <fcntl.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <linux/vhost.h>
|
||||
|
||||
#include "qemu/compiler.h"
|
||||
#ifdef __NR_userfaultfd
|
||||
#include <linux/userfaultfd.h>
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#include "qemu/atomic.h"
|
||||
|
||||
#include "libvhost-user.h"
|
||||
@ -86,6 +97,9 @@ vu_request_to_string(unsigned int req)
|
||||
REQ(VHOST_USER_SET_VRING_ENDIAN),
|
||||
REQ(VHOST_USER_GET_CONFIG),
|
||||
REQ(VHOST_USER_SET_CONFIG),
|
||||
REQ(VHOST_USER_POSTCOPY_ADVISE),
|
||||
REQ(VHOST_USER_POSTCOPY_LISTEN),
|
||||
REQ(VHOST_USER_POSTCOPY_END),
|
||||
REQ(VHOST_USER_MAX),
|
||||
};
|
||||
#undef REQ
|
||||
@ -171,6 +185,35 @@ vmsg_close_fds(VhostUserMsg *vmsg)
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Probe for usable userfault support.
 *
 * When built on Linux with headers that define __NR_userfaultfd,
 * UFFD_FEATURE_MISSING_SHMEM and UFFD_FEATURE_MISSING_HUGETLBFS, open a
 * temporary userfaultfd and ask the running kernel (UFFDIO_API) for both
 * features.  The temporary descriptor is always closed again.
 *
 * Returns true only if both the syscall and the UFFDIO_API ioctl succeed;
 * returns false otherwise, including when the build lacks the definitions.
 */
static bool
have_userfault(void)
{
#if defined(__linux__) && defined(__NR_userfaultfd) &&\
    defined(UFFD_FEATURE_MISSING_SHMEM) &&\
    defined(UFFD_FEATURE_MISSING_HUGETLBFS)
    struct uffdio_api probe;
    bool supported;
    int probe_fd;

    /* Now test the kernel we're running on really has the features */
    probe_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (probe_fd < 0) {
        return false;
    }

    probe.api = UFFD_API;
    probe.features = UFFD_FEATURE_MISSING_SHMEM |
                     UFFD_FEATURE_MISSING_HUGETLBFS;
    supported = ioctl(probe_fd, UFFDIO_API, &probe) == 0;

    /* The descriptor was only needed for the probe */
    close(probe_fd);
    return supported;
#else
    return false;
#endif
}
|
||||
|
||||
static bool
|
||||
vu_message_read(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
|
||||
{
|
||||
@ -245,6 +288,31 @@ vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
|
||||
{
|
||||
int rc;
|
||||
uint8_t *p = (uint8_t *)vmsg;
|
||||
char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { };
|
||||
struct iovec iov = {
|
||||
.iov_base = (char *)vmsg,
|
||||
.iov_len = VHOST_USER_HDR_SIZE,
|
||||
};
|
||||
struct msghdr msg = {
|
||||
.msg_iov = &iov,
|
||||
.msg_iovlen = 1,
|
||||
.msg_control = control,
|
||||
};
|
||||
struct cmsghdr *cmsg;
|
||||
|
||||
memset(control, 0, sizeof(control));
|
||||
assert(vmsg->fd_num <= VHOST_MEMORY_MAX_NREGIONS);
|
||||
if (vmsg->fd_num > 0) {
|
||||
size_t fdsize = vmsg->fd_num * sizeof(int);
|
||||
msg.msg_controllen = CMSG_SPACE(fdsize);
|
||||
cmsg = CMSG_FIRSTHDR(&msg);
|
||||
cmsg->cmsg_len = CMSG_LEN(fdsize);
|
||||
cmsg->cmsg_level = SOL_SOCKET;
|
||||
cmsg->cmsg_type = SCM_RIGHTS;
|
||||
memcpy(CMSG_DATA(cmsg), vmsg->fds, fdsize);
|
||||
} else {
|
||||
msg.msg_controllen = 0;
|
||||
}
|
||||
|
||||
/* Set the version in the flags when sending the reply */
|
||||
vmsg->flags &= ~VHOST_USER_VERSION_MASK;
|
||||
@ -252,7 +320,7 @@ vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
|
||||
vmsg->flags |= VHOST_USER_REPLY_MASK;
|
||||
|
||||
do {
|
||||
rc = write(conn_fd, p, VHOST_USER_HDR_SIZE);
|
||||
rc = sendmsg(conn_fd, &msg, 0);
|
||||
} while (rc < 0 && (errno == EINTR || errno == EAGAIN));
|
||||
|
||||
do {
|
||||
@ -345,6 +413,7 @@ vu_get_features_exec(VuDev *dev, VhostUserMsg *vmsg)
|
||||
}
|
||||
|
||||
vmsg->size = sizeof(vmsg->payload.u64);
|
||||
vmsg->fd_num = 0;
|
||||
|
||||
DPRINT("Sending back to guest u64: 0x%016"PRIx64"\n", vmsg->payload.u64);
|
||||
|
||||
@ -409,6 +478,148 @@ vu_reset_device_exec(VuDev *dev, VhostUserMsg *vmsg)
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool
|
||||
vu_set_mem_table_exec_postcopy(VuDev *dev, VhostUserMsg *vmsg)
|
||||
{
|
||||
int i;
|
||||
VhostUserMemory *memory = &vmsg->payload.memory;
|
||||
dev->nregions = memory->nregions;
|
||||
|
||||
DPRINT("Nregions: %d\n", memory->nregions);
|
||||
for (i = 0; i < dev->nregions; i++) {
|
||||
void *mmap_addr;
|
||||
VhostUserMemoryRegion *msg_region = &memory->regions[i];
|
||||
VuDevRegion *dev_region = &dev->regions[i];
|
||||
|
||||
DPRINT("Region %d\n", i);
|
||||
DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n",
|
||||
msg_region->guest_phys_addr);
|
||||
DPRINT(" memory_size: 0x%016"PRIx64"\n",
|
||||
msg_region->memory_size);
|
||||
DPRINT(" userspace_addr 0x%016"PRIx64"\n",
|
||||
msg_region->userspace_addr);
|
||||
DPRINT(" mmap_offset 0x%016"PRIx64"\n",
|
||||
msg_region->mmap_offset);
|
||||
|
||||
dev_region->gpa = msg_region->guest_phys_addr;
|
||||
dev_region->size = msg_region->memory_size;
|
||||
dev_region->qva = msg_region->userspace_addr;
|
||||
dev_region->mmap_offset = msg_region->mmap_offset;
|
||||
|
||||
/* We don't use offset argument of mmap() since the
|
||||
* mapped address has to be page aligned, and we use huge
|
||||
* pages.
|
||||
* In postcopy we're using PROT_NONE here to catch anyone
|
||||
* accessing it before we userfault
|
||||
*/
|
||||
mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset,
|
||||
PROT_NONE, MAP_SHARED,
|
||||
vmsg->fds[i], 0);
|
||||
|
||||
if (mmap_addr == MAP_FAILED) {
|
||||
vu_panic(dev, "region mmap error: %s", strerror(errno));
|
||||
} else {
|
||||
dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
|
||||
DPRINT(" mmap_addr: 0x%016"PRIx64"\n",
|
||||
dev_region->mmap_addr);
|
||||
}
|
||||
|
||||
/* Return the address to QEMU so that it can translate the ufd
|
||||
* fault addresses back.
|
||||
*/
|
||||
msg_region->userspace_addr = (uintptr_t)(mmap_addr +
|
||||
dev_region->mmap_offset);
|
||||
close(vmsg->fds[i]);
|
||||
}
|
||||
|
||||
/* Send the message back to qemu with the addresses filled in */
|
||||
vmsg->fd_num = 0;
|
||||
if (!vu_message_write(dev, dev->sock, vmsg)) {
|
||||
vu_panic(dev, "failed to respond to set-mem-table for postcopy");
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Wait for QEMU to confirm that it's registered the handler for the
|
||||
* faults.
|
||||
*/
|
||||
if (!vu_message_read(dev, dev->sock, vmsg) ||
|
||||
vmsg->size != sizeof(vmsg->payload.u64) ||
|
||||
vmsg->payload.u64 != 0) {
|
||||
vu_panic(dev, "failed to receive valid ack for postcopy set-mem-table");
|
||||
return false;
|
||||
}
|
||||
|
||||
/* OK, now we can go and register the memory and generate faults */
|
||||
for (i = 0; i < dev->nregions; i++) {
|
||||
VuDevRegion *dev_region = &dev->regions[i];
|
||||
int ret;
|
||||
#ifdef UFFDIO_REGISTER
|
||||
/* We should already have an open ufd. Mark each memory
|
||||
* range as ufd.
|
||||
* Discard any mapping we have here; note I can't use MADV_REMOVE
|
||||
* or fallocate to make the hole since I don't want to lose
|
||||
* data that's already arrived in the shared process.
|
||||
* TODO: How to do hugepage
|
||||
*/
|
||||
ret = madvise((void *)dev_region->mmap_addr,
|
||||
dev_region->size + dev_region->mmap_offset,
|
||||
MADV_DONTNEED);
|
||||
if (ret) {
|
||||
fprintf(stderr,
|
||||
"%s: Failed to madvise(DONTNEED) region %d: %s\n",
|
||||
__func__, i, strerror(errno));
|
||||
}
|
||||
/* Turn off transparent hugepages so we dont get lose wakeups
|
||||
* in neighbouring pages.
|
||||
* TODO: Turn this backon later.
|
||||
*/
|
||||
ret = madvise((void *)dev_region->mmap_addr,
|
||||
dev_region->size + dev_region->mmap_offset,
|
||||
MADV_NOHUGEPAGE);
|
||||
if (ret) {
|
||||
/* Note: This can happen legally on kernels that are configured
|
||||
* without madvise'able hugepages
|
||||
*/
|
||||
fprintf(stderr,
|
||||
"%s: Failed to madvise(NOHUGEPAGE) region %d: %s\n",
|
||||
__func__, i, strerror(errno));
|
||||
}
|
||||
struct uffdio_register reg_struct;
|
||||
reg_struct.range.start = (uintptr_t)dev_region->mmap_addr;
|
||||
reg_struct.range.len = dev_region->size + dev_region->mmap_offset;
|
||||
reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
|
||||
|
||||
if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER, ®_struct)) {
|
||||
vu_panic(dev, "%s: Failed to userfault region %d "
|
||||
"@%p + size:%zx offset: %zx: (ufd=%d)%s\n",
|
||||
__func__, i,
|
||||
dev_region->mmap_addr,
|
||||
dev_region->size, dev_region->mmap_offset,
|
||||
dev->postcopy_ufd, strerror(errno));
|
||||
return false;
|
||||
}
|
||||
if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
|
||||
vu_panic(dev, "%s Region (%d) doesn't support COPY",
|
||||
__func__, i);
|
||||
return false;
|
||||
}
|
||||
DPRINT("%s: region %d: Registered userfault for %llx + %llx\n",
|
||||
__func__, i, reg_struct.range.start, reg_struct.range.len);
|
||||
/* Now it's registered we can let the client at it */
|
||||
if (mprotect((void *)dev_region->mmap_addr,
|
||||
dev_region->size + dev_region->mmap_offset,
|
||||
PROT_READ | PROT_WRITE)) {
|
||||
vu_panic(dev, "failed to mprotect region %d for postcopy (%s)",
|
||||
i, strerror(errno));
|
||||
return false;
|
||||
}
|
||||
/* TODO: Stash 'zero' support flags somewhere */
|
||||
#endif
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool
|
||||
vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg)
|
||||
{
|
||||
@ -425,6 +636,10 @@ vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg)
|
||||
}
|
||||
dev->nregions = memory->nregions;
|
||||
|
||||
if (dev->postcopy_listening) {
|
||||
return vu_set_mem_table_exec_postcopy(dev, vmsg);
|
||||
}
|
||||
|
||||
DPRINT("Nregions: %d\n", memory->nregions);
|
||||
for (i = 0; i < dev->nregions; i++) {
|
||||
void *mmap_addr;
|
||||
@ -500,6 +715,7 @@ vu_set_log_base_exec(VuDev *dev, VhostUserMsg *vmsg)
|
||||
dev->log_size = log_mmap_size;
|
||||
|
||||
vmsg->size = sizeof(vmsg->payload.u64);
|
||||
vmsg->fd_num = 0;
|
||||
|
||||
return true;
|
||||
}
|
||||
@ -752,12 +968,17 @@ vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg)
|
||||
uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD |
|
||||
1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ;
|
||||
|
||||
if (have_userfault()) {
|
||||
features |= 1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT;
|
||||
}
|
||||
|
||||
if (dev->iface->get_protocol_features) {
|
||||
features |= dev->iface->get_protocol_features(dev);
|
||||
}
|
||||
|
||||
vmsg->payload.u64 = features;
|
||||
vmsg->size = sizeof(vmsg->payload.u64);
|
||||
vmsg->fd_num = 0;
|
||||
|
||||
return true;
|
||||
}
|
||||
@ -856,6 +1077,77 @@ vu_set_config(VuDev *dev, VhostUserMsg *vmsg)
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool
|
||||
vu_set_postcopy_advise(VuDev *dev, VhostUserMsg *vmsg)
|
||||
{
|
||||
dev->postcopy_ufd = -1;
|
||||
#ifdef UFFDIO_API
|
||||
struct uffdio_api api_struct;
|
||||
|
||||
dev->postcopy_ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
|
||||
vmsg->size = 0;
|
||||
#endif
|
||||
|
||||
if (dev->postcopy_ufd == -1) {
|
||||
vu_panic(dev, "Userfaultfd not available: %s", strerror(errno));
|
||||
goto out;
|
||||
}
|
||||
|
||||
#ifdef UFFDIO_API
|
||||
api_struct.api = UFFD_API;
|
||||
api_struct.features = 0;
|
||||
if (ioctl(dev->postcopy_ufd, UFFDIO_API, &api_struct)) {
|
||||
vu_panic(dev, "Failed UFFDIO_API: %s", strerror(errno));
|
||||
close(dev->postcopy_ufd);
|
||||
dev->postcopy_ufd = -1;
|
||||
goto out;
|
||||
}
|
||||
/* TODO: Stash feature flags somewhere */
|
||||
#endif
|
||||
|
||||
out:
|
||||
/* Return a ufd to the QEMU */
|
||||
vmsg->fd_num = 1;
|
||||
vmsg->fds[0] = dev->postcopy_ufd;
|
||||
return true; /* = send a reply */
|
||||
}
|
||||
|
||||
static bool
|
||||
vu_set_postcopy_listen(VuDev *dev, VhostUserMsg *vmsg)
|
||||
{
|
||||
vmsg->payload.u64 = -1;
|
||||
vmsg->size = sizeof(vmsg->payload.u64);
|
||||
|
||||
if (dev->nregions) {
|
||||
vu_panic(dev, "Regions already registered at postcopy-listen");
|
||||
return true;
|
||||
}
|
||||
dev->postcopy_listening = true;
|
||||
|
||||
vmsg->flags = VHOST_USER_VERSION | VHOST_USER_REPLY_MASK;
|
||||
vmsg->payload.u64 = 0; /* Success */
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
vu_set_postcopy_end(VuDev *dev, VhostUserMsg *vmsg)
|
||||
{
|
||||
DPRINT("%s: Entry\n", __func__);
|
||||
dev->postcopy_listening = false;
|
||||
if (dev->postcopy_ufd > 0) {
|
||||
close(dev->postcopy_ufd);
|
||||
dev->postcopy_ufd = -1;
|
||||
DPRINT("%s: Done close\n", __func__);
|
||||
}
|
||||
|
||||
vmsg->fd_num = 0;
|
||||
vmsg->payload.u64 = 0;
|
||||
vmsg->size = sizeof(vmsg->payload.u64);
|
||||
vmsg->flags = VHOST_USER_VERSION | VHOST_USER_REPLY_MASK;
|
||||
DPRINT("%s: exit\n", __func__);
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
|
||||
{
|
||||
@ -927,6 +1219,12 @@ vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
|
||||
return vu_set_config(dev, vmsg);
|
||||
case VHOST_USER_NONE:
|
||||
break;
|
||||
case VHOST_USER_POSTCOPY_ADVISE:
|
||||
return vu_set_postcopy_advise(dev, vmsg);
|
||||
case VHOST_USER_POSTCOPY_LISTEN:
|
||||
return vu_set_postcopy_listen(dev, vmsg);
|
||||
case VHOST_USER_POSTCOPY_END:
|
||||
return vu_set_postcopy_end(dev, vmsg);
|
||||
default:
|
||||
vmsg_close_fds(vmsg);
|
||||
vu_panic(dev, "Unhandled request: %d", vmsg->request);
|
||||
|
@ -48,6 +48,8 @@ enum VhostUserProtocolFeature {
|
||||
VHOST_USER_PROTOCOL_F_NET_MTU = 4,
|
||||
VHOST_USER_PROTOCOL_F_SLAVE_REQ = 5,
|
||||
VHOST_USER_PROTOCOL_F_CROSS_ENDIAN = 6,
|
||||
VHOST_USER_PROTOCOL_F_CRYPTO_SESSION = 7,
|
||||
VHOST_USER_PROTOCOL_F_PAGEFAULT = 8,
|
||||
|
||||
VHOST_USER_PROTOCOL_F_MAX
|
||||
};
|
||||
@ -81,6 +83,11 @@ typedef enum VhostUserRequest {
|
||||
VHOST_USER_SET_VRING_ENDIAN = 23,
|
||||
VHOST_USER_GET_CONFIG = 24,
|
||||
VHOST_USER_SET_CONFIG = 25,
|
||||
VHOST_USER_CREATE_CRYPTO_SESSION = 26,
|
||||
VHOST_USER_CLOSE_CRYPTO_SESSION = 27,
|
||||
VHOST_USER_POSTCOPY_ADVISE = 28,
|
||||
VHOST_USER_POSTCOPY_LISTEN = 29,
|
||||
VHOST_USER_POSTCOPY_END = 30,
|
||||
VHOST_USER_MAX
|
||||
} VhostUserRequest;
|
||||
|
||||
@ -277,6 +284,10 @@ struct VuDev {
|
||||
* re-initialize */
|
||||
vu_panic_cb panic;
|
||||
const VuDevIface *iface;
|
||||
|
||||
/* Postcopy data */
|
||||
int postcopy_ufd;
|
||||
bool postcopy_listening;
|
||||
};
|
||||
|
||||
typedef struct VuVirtqElement {
|
||||
|
@ -577,3 +577,44 @@ Postcopy now works with hugetlbfs backed memory:
|
||||
hugepages works well, however 1GB hugepages are likely to be problematic
|
||||
since it takes ~1 second to transfer a 1GB hugepage across a 10Gbps link,
|
||||
and until the full page is transferred the destination thread is blocked.
|
||||
|
||||
Postcopy with shared memory
|
||||
---------------------------
|
||||
|
||||
Postcopy migration with shared memory needs explicit support from the other
|
||||
processes that share memory and from QEMU. There are restrictions on the types
|
||||
of shared memory that userfault can support.
|
||||
|
||||
The Linux kernel userfault support works on `/dev/shm` memory and on `hugetlbfs`
|
||||
(although the kernel doesn't provide an equivalent to `madvise(MADV_DONTNEED)`
|
||||
for hugetlbfs which may be a problem in some configurations).
|
||||
|
||||
The vhost-user code in QEMU supports clients that have Postcopy support,
|
||||
and the `vhost-user-bridge` (in `tests/`) and the DPDK package have changes
|
||||
to support postcopy.
|
||||
|
||||
The client needs to open a userfaultfd and register the areas
|
||||
of memory that it maps with userfault. The client must then pass the
|
||||
userfaultfd back to QEMU together with a mapping table that allows
|
||||
fault addresses in the client's address space to be converted back to
|
||||
RAMBlock/offsets. The client's userfaultfd is added to the postcopy
|
||||
fault-thread and page requests are made on behalf of the client by QEMU.
|
||||
QEMU performs 'wake' operations on the client's userfaultfd to allow it
|
||||
to continue after a page has arrived.
|
||||
|
||||
.. note::
|
||||
There are two future improvements that would be nice:
|
||||
a) Some way to make QEMU ignorant of the addresses in the client's
|
||||
address space
|
||||
b) Avoiding the need for QEMU to perform ufd-wake calls after the
|
||||
pages have arrived
|
||||
|
||||
Retro-fitting postcopy to existing clients is possible:
|
||||
a) A mechanism is needed for the registration with userfault as above,
|
||||
and the registration needs to be coordinated with the phases of
|
||||
postcopy. In vhost-user extra messages are added to the existing
|
||||
control channel.
|
||||
b) Any thread that can block due to guest memory accesses must be
|
||||
identified and the implication understood; for example if the
|
||||
guest memory access is made while holding a lock then all other
|
||||
threads waiting for that lock will also be blocked.
|
||||
|
@ -290,6 +290,15 @@ Once the source has finished migration, rings will be stopped by
|
||||
the source. No further update must be done before rings are
|
||||
restarted.
|
||||
|
||||
In postcopy migration the slave is started before all the memory has been
|
||||
received from the source host, and care must be taken to avoid accessing pages
|
||||
that have yet to be received. The slave opens a 'userfault'-fd and registers
|
||||
the memory with it; this fd is then passed back over to the master.
|
||||
The master services requests on the userfaultfd for pages that are accessed
|
||||
and when the page is available it performs WAKE ioctls on the userfaultfd
|
||||
to wake the stalled slave. The client indicates support for this via the
|
||||
VHOST_USER_PROTOCOL_F_PAGEFAULT feature.
|
||||
|
||||
Memory access
|
||||
-------------
|
||||
|
||||
@ -369,6 +378,7 @@ Protocol features
|
||||
#define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5
|
||||
#define VHOST_USER_PROTOCOL_F_CROSS_ENDIAN 6
|
||||
#define VHOST_USER_PROTOCOL_F_CRYPTO_SESSION 7
|
||||
#define VHOST_USER_PROTOCOL_F_PAGEFAULT 8
|
||||
|
||||
Master message types
|
||||
--------------------
|
||||
@ -445,12 +455,21 @@ Master message types
|
||||
Id: 5
|
||||
Equivalent ioctl: VHOST_SET_MEM_TABLE
|
||||
Master payload: memory regions description
|
||||
Slave payload: (postcopy only) memory regions description
|
||||
|
||||
Sets the memory map regions on the slave so it can translate the vring
|
||||
addresses. In the ancillary data there is an array of file descriptors
|
||||
for each memory mapped region. The size and ordering of the fds matches
|
||||
the number and ordering of memory regions.
|
||||
|
||||
When VHOST_USER_POSTCOPY_LISTEN has been received, SET_MEM_TABLE replies with
|
||||
the bases of the memory mapped regions to the master. The slave must
|
||||
have mmap'd the regions but not yet accessed them and should not yet generate
|
||||
a userfault event. Note NEED_REPLY_MASK is not set in this case.
|
||||
QEMU will then reply back to the list of mappings with an empty
|
||||
VHOST_USER_SET_MEM_TABLE as an acknowledgment; only upon reception of this
|
||||
message may the guest start accessing the memory and generating faults.
|
||||
|
||||
* VHOST_USER_SET_LOG_BASE
|
||||
|
||||
Id: 6
|
||||
@ -689,6 +708,39 @@ Master message types
|
||||
feature has been successfully negotiated.
|
||||
It's a required feature for crypto devices.
|
||||
|
||||
* VHOST_USER_POSTCOPY_ADVISE
|
||||
Id: 28
|
||||
Master payload: N/A
|
||||
Slave payload: userfault fd
|
||||
|
||||
When VHOST_USER_PROTOCOL_F_PAGEFAULT is supported, the
|
||||
master advises slave that a migration with postcopy enabled is underway,
|
||||
the slave must open a userfaultfd for later use.
|
||||
Note that at this stage the migration is still in precopy mode.
|
||||
|
||||
* VHOST_USER_POSTCOPY_LISTEN
|
||||
Id: 29
|
||||
Master payload: N/A
|
||||
|
||||
Master advises slave that a transition to postcopy mode has happened.
|
||||
The slave must ensure that shared memory is registered with userfaultfd
|
||||
to cause faulting of non-present pages.
|
||||
|
||||
This is always sent sometime after a VHOST_USER_POSTCOPY_ADVISE, and
|
||||
thus only when VHOST_USER_PROTOCOL_F_PAGEFAULT is supported.
|
||||
|
||||
* VHOST_USER_POSTCOPY_END
|
||||
Id: 30
|
||||
Slave payload: u64
|
||||
|
||||
Master advises that postcopy migration has now completed. The
|
||||
slave must disable the userfaultfd. The response is an acknowledgement
|
||||
only.
|
||||
When VHOST_USER_PROTOCOL_F_PAGEFAULT is supported, this message
|
||||
is sent at the end of the migration, after VHOST_USER_POSTCOPY_LISTEN
|
||||
was previously sent.
|
||||
The value returned is an error indication; 0 is success.
|
||||
|
||||
Slave message types
|
||||
-------------------
|
||||
|
||||
|
90
exec.c
90
exec.c
@ -99,6 +99,11 @@ static MemoryRegion io_mem_unassigned;
|
||||
*/
|
||||
#define RAM_RESIZEABLE (1 << 2)
|
||||
|
||||
/* UFFDIO_ZEROPAGE is available on this RAMBlock to atomically
|
||||
* zero the page and wake waiting processes.
|
||||
* (Set during postcopy)
|
||||
*/
|
||||
#define RAM_UF_ZEROPAGE (1 << 3)
|
||||
#endif
|
||||
|
||||
#ifdef TARGET_PAGE_BITS_VARY
|
||||
@ -1790,6 +1795,17 @@ bool qemu_ram_is_shared(RAMBlock *rb)
|
||||
return rb->flags & RAM_SHARED;
|
||||
}
|
||||
|
||||
/* Note: Only set at the start of postcopy */
|
||||
bool qemu_ram_is_uf_zeroable(RAMBlock *rb)
|
||||
{
|
||||
return rb->flags & RAM_UF_ZEROPAGE;
|
||||
}
|
||||
|
||||
/* Set the RAM_UF_ZEROPAGE flag on @rb; the other flag bits are kept. */
void qemu_ram_set_uf_zeroable(RAMBlock *rb)
{
    rb->flags = rb->flags | RAM_UF_ZEROPAGE;
}
|
||||
|
||||
/* Called with iothread lock held. */
|
||||
void qemu_ram_set_idstr(RAMBlock *new_block, const char *name, DeviceState *dev)
|
||||
{
|
||||
@ -2320,6 +2336,16 @@ static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr,
|
||||
return ramblock_ptr(block, addr);
|
||||
}
|
||||
|
||||
/* Return the offset of a hostpointer within a ramblock */
|
||||
ram_addr_t qemu_ram_block_host_offset(RAMBlock *rb, void *host)
|
||||
{
|
||||
ram_addr_t res = (uint8_t *)host - (uint8_t *)rb->host;
|
||||
assert((uintptr_t)host >= (uintptr_t)rb->host);
|
||||
assert(res < rb->max_length);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
/*
|
||||
* Translates a host ptr back to a RAMBlock, a ram_addr and an offset
|
||||
* in that RAMBlock.
|
||||
@ -3744,6 +3770,7 @@ int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
|
||||
}
|
||||
|
||||
if ((start + length) <= rb->used_length) {
|
||||
bool need_madvise, need_fallocate;
|
||||
uint8_t *host_endaddr = host_startaddr + length;
|
||||
if ((uintptr_t)host_endaddr & (rb->page_size - 1)) {
|
||||
error_report("ram_block_discard_range: Unaligned end address: %p",
|
||||
@ -3753,29 +3780,60 @@ int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
|
||||
|
||||
errno = ENOTSUP; /* If we are missing MADVISE etc */
|
||||
|
||||
if (rb->page_size == qemu_host_page_size) {
|
||||
#if defined(CONFIG_MADVISE)
|
||||
/* Note: We need the madvise MADV_DONTNEED behaviour of definitely
|
||||
* freeing the page.
|
||||
*/
|
||||
ret = madvise(host_startaddr, length, MADV_DONTNEED);
|
||||
#endif
|
||||
} else {
|
||||
/* Huge page case - unfortunately it can't do DONTNEED, but
|
||||
* it can do the equivalent by FALLOC_FL_PUNCH_HOLE in the
|
||||
* huge page file.
|
||||
/* The logic here is messy;
|
||||
* madvise DONTNEED fails for hugepages
|
||||
* fallocate works on hugepages and shmem
|
||||
*/
|
||||
need_madvise = (rb->page_size == qemu_host_page_size);
|
||||
need_fallocate = rb->fd != -1;
|
||||
if (need_fallocate) {
|
||||
/* For a file, this causes the area of the file to be zero'd
|
||||
* if read, and for hugetlbfs also causes it to be unmapped
|
||||
* so a userfault will trigger.
|
||||
*/
|
||||
#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
|
||||
ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
|
||||
start, length);
|
||||
#endif
|
||||
}
|
||||
if (ret) {
|
||||
ret = -errno;
|
||||
error_report("ram_block_discard_range: Failed to discard range "
|
||||
if (ret) {
|
||||
ret = -errno;
|
||||
error_report("ram_block_discard_range: Failed to fallocate "
|
||||
"%s:%" PRIx64 " +%zx (%d)",
|
||||
rb->idstr, start, length, ret);
|
||||
goto err;
|
||||
}
|
||||
#else
|
||||
ret = -ENOSYS;
|
||||
error_report("ram_block_discard_range: fallocate not available/file"
|
||||
"%s:%" PRIx64 " +%zx (%d)",
|
||||
rb->idstr, start, length, ret);
|
||||
goto err;
|
||||
#endif
|
||||
}
|
||||
if (need_madvise) {
|
||||
/* For normal RAM this causes it to be unmapped,
|
||||
* for shared memory it causes the local mapping to disappear
|
||||
* and to fall back on the file contents (which we just
|
||||
* fallocate'd away).
|
||||
*/
|
||||
#if defined(CONFIG_MADVISE)
|
||||
ret = madvise(host_startaddr, length, MADV_DONTNEED);
|
||||
if (ret) {
|
||||
ret = -errno;
|
||||
error_report("ram_block_discard_range: Failed to discard range "
|
||||
"%s:%" PRIx64 " +%zx (%d)",
|
||||
rb->idstr, start, length, ret);
|
||||
goto err;
|
||||
}
|
||||
#else
|
||||
ret = -ENOSYS;
|
||||
error_report("ram_block_discard_range: MADVISE not available"
|
||||
"%s:%" PRIx64 " +%zx (%d)",
|
||||
rb->idstr, start, length, ret);
|
||||
goto err;
|
||||
#endif
|
||||
}
|
||||
trace_ram_block_discard_range(rb->idstr, host_startaddr, length,
|
||||
need_madvise, need_fallocate, ret);
|
||||
} else {
|
||||
error_report("ram_block_discard_range: Overrun block '%s' (%" PRIu64
|
||||
"/%zx/" RAM_ADDR_FMT")",
|
||||
|
14
hmp.c
14
hmp.c
@ -2423,7 +2423,18 @@ void hmp_info_memory_devices(Monitor *mon, const QDict *qdict)
|
||||
switch (value->type) {
|
||||
case MEMORY_DEVICE_INFO_KIND_DIMM:
|
||||
di = value->u.dimm.data;
|
||||
break;
|
||||
|
||||
case MEMORY_DEVICE_INFO_KIND_NVDIMM:
|
||||
di = value->u.nvdimm.data;
|
||||
break;
|
||||
|
||||
default:
|
||||
di = NULL;
|
||||
break;
|
||||
}
|
||||
|
||||
if (di) {
|
||||
monitor_printf(mon, "Memory device [%s]: \"%s\"\n",
|
||||
MemoryDeviceInfoKind_str(value->type),
|
||||
di->id ? di->id : "");
|
||||
@ -2436,9 +2447,6 @@ void hmp_info_memory_devices(Monitor *mon, const QDict *qdict)
|
||||
di->hotplugged ? "true" : "false");
|
||||
monitor_printf(mon, " hotpluggable: %s\n",
|
||||
di->hotpluggable ? "true" : "false");
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -258,6 +258,22 @@ static void build_append_int(GArray *table, uint64_t value)
|
||||
}
|
||||
}
|
||||
|
||||
/* Generic Address Structure (GAS)
|
||||
* ACPI 2.0/3.0: 5.2.3.1 Generic Address Structure
|
||||
* 2.0 compat note:
|
||||
* @access_width must be 0, see ACPI 2.0:Table 5-1
|
||||
*/
|
||||
void build_append_gas(GArray *table, AmlAddressSpace as,
|
||||
uint8_t bit_width, uint8_t bit_offset,
|
||||
uint8_t access_width, uint64_t address)
|
||||
{
|
||||
build_append_int_noprefix(table, as, 1);
|
||||
build_append_int_noprefix(table, bit_width, 1);
|
||||
build_append_int_noprefix(table, bit_offset, 1);
|
||||
build_append_int_noprefix(table, access_width, 1);
|
||||
build_append_int_noprefix(table, address, 8);
|
||||
}
|
||||
|
||||
/*
|
||||
* Build NAME(XXXX, 0x00000000) where 0x00000000 is encoded as a dword,
|
||||
* and return the offset to 0x00000000 for runtime patching.
|
||||
@ -1662,3 +1678,127 @@ void build_slit(GArray *table_data, BIOSLinker *linker)
|
||||
"SLIT",
|
||||
table_data->len - slit_start, 1, NULL, NULL);
|
||||
}
|
||||
|
||||
/* build rev1/rev3/rev5.1 FADT */
|
||||
void build_fadt(GArray *tbl, BIOSLinker *linker, const AcpiFadtData *f,
|
||||
const char *oem_id, const char *oem_table_id)
|
||||
{
|
||||
int off;
|
||||
int fadt_start = tbl->len;
|
||||
|
||||
acpi_data_push(tbl, sizeof(AcpiTableHeader));
|
||||
|
||||
/* FACS address to be filled by Guest linker at runtime */
|
||||
off = tbl->len;
|
||||
build_append_int_noprefix(tbl, 0, 4); /* FIRMWARE_CTRL */
|
||||
if (f->facs_tbl_offset) { /* don't patch if not supported by platform */
|
||||
bios_linker_loader_add_pointer(linker,
|
||||
ACPI_BUILD_TABLE_FILE, off, 4,
|
||||
ACPI_BUILD_TABLE_FILE, *f->facs_tbl_offset);
|
||||
}
|
||||
|
||||
/* DSDT address to be filled by Guest linker at runtime */
|
||||
off = tbl->len;
|
||||
build_append_int_noprefix(tbl, 0, 4); /* DSDT */
|
||||
if (f->dsdt_tbl_offset) { /* don't patch if not supported by platform */
|
||||
bios_linker_loader_add_pointer(linker,
|
||||
ACPI_BUILD_TABLE_FILE, off, 4,
|
||||
ACPI_BUILD_TABLE_FILE, *f->dsdt_tbl_offset);
|
||||
}
|
||||
|
||||
/* ACPI1.0: INT_MODEL, ACPI2.0+: Reserved */
|
||||
build_append_int_noprefix(tbl, f->int_model /* Multiple APIC */, 1);
|
||||
/* Preferred_PM_Profile */
|
||||
build_append_int_noprefix(tbl, 0 /* Unspecified */, 1);
|
||||
build_append_int_noprefix(tbl, f->sci_int, 2); /* SCI_INT */
|
||||
build_append_int_noprefix(tbl, f->smi_cmd, 4); /* SMI_CMD */
|
||||
build_append_int_noprefix(tbl, f->acpi_enable_cmd, 1); /* ACPI_ENABLE */
|
||||
build_append_int_noprefix(tbl, f->acpi_disable_cmd, 1); /* ACPI_DISABLE */
|
||||
build_append_int_noprefix(tbl, 0 /* not supported */, 1); /* S4BIOS_REQ */
|
||||
/* ACPI1.0: Reserved, ACPI2.0+: PSTATE_CNT */
|
||||
build_append_int_noprefix(tbl, 0, 1);
|
||||
build_append_int_noprefix(tbl, f->pm1a_evt.address, 4); /* PM1a_EVT_BLK */
|
||||
build_append_int_noprefix(tbl, 0, 4); /* PM1b_EVT_BLK */
|
||||
build_append_int_noprefix(tbl, f->pm1a_cnt.address, 4); /* PM1a_CNT_BLK */
|
||||
build_append_int_noprefix(tbl, 0, 4); /* PM1b_CNT_BLK */
|
||||
build_append_int_noprefix(tbl, 0, 4); /* PM2_CNT_BLK */
|
||||
build_append_int_noprefix(tbl, f->pm_tmr.address, 4); /* PM_TMR_BLK */
|
||||
build_append_int_noprefix(tbl, f->gpe0_blk.address, 4); /* GPE0_BLK */
|
||||
build_append_int_noprefix(tbl, 0, 4); /* GPE1_BLK */
|
||||
/* PM1_EVT_LEN */
|
||||
build_append_int_noprefix(tbl, f->pm1a_evt.bit_width / 8, 1);
|
||||
/* PM1_CNT_LEN */
|
||||
build_append_int_noprefix(tbl, f->pm1a_cnt.bit_width / 8, 1);
|
||||
build_append_int_noprefix(tbl, 0, 1); /* PM2_CNT_LEN */
|
||||
build_append_int_noprefix(tbl, f->pm_tmr.bit_width / 8, 1); /* PM_TMR_LEN */
|
||||
/* GPE0_BLK_LEN */
|
||||
build_append_int_noprefix(tbl, f->gpe0_blk.bit_width / 8, 1);
|
||||
build_append_int_noprefix(tbl, 0, 1); /* GPE1_BLK_LEN */
|
||||
build_append_int_noprefix(tbl, 0, 1); /* GPE1_BASE */
|
||||
build_append_int_noprefix(tbl, 0, 1); /* CST_CNT */
|
||||
build_append_int_noprefix(tbl, f->plvl2_lat, 2); /* P_LVL2_LAT */
|
||||
build_append_int_noprefix(tbl, f->plvl3_lat, 2); /* P_LVL3_LAT */
|
||||
build_append_int_noprefix(tbl, 0, 2); /* FLUSH_SIZE */
|
||||
build_append_int_noprefix(tbl, 0, 2); /* FLUSH_STRIDE */
|
||||
build_append_int_noprefix(tbl, 0, 1); /* DUTY_OFFSET */
|
||||
build_append_int_noprefix(tbl, 0, 1); /* DUTY_WIDTH */
|
||||
build_append_int_noprefix(tbl, 0, 1); /* DAY_ALRM */
|
||||
build_append_int_noprefix(tbl, 0, 1); /* MON_ALRM */
|
||||
build_append_int_noprefix(tbl, f->rtc_century, 1); /* CENTURY */
|
||||
build_append_int_noprefix(tbl, 0, 2); /* IAPC_BOOT_ARCH */
|
||||
build_append_int_noprefix(tbl, 0, 1); /* Reserved */
|
||||
build_append_int_noprefix(tbl, f->flags, 4); /* Flags */
|
||||
|
||||
if (f->rev == 1) {
|
||||
goto build_hdr;
|
||||
}
|
||||
|
||||
build_append_gas_from_struct(tbl, &f->reset_reg); /* RESET_REG */
|
||||
build_append_int_noprefix(tbl, f->reset_val, 1); /* RESET_VALUE */
|
||||
/* Since ACPI 5.1 */
|
||||
if ((f->rev >= 6) || ((f->rev == 5) && f->minor_ver > 0)) {
|
||||
build_append_int_noprefix(tbl, f->arm_boot_arch, 2); /* ARM_BOOT_ARCH */
|
||||
/* FADT Minor Version */
|
||||
build_append_int_noprefix(tbl, f->minor_ver, 1);
|
||||
} else {
|
||||
build_append_int_noprefix(tbl, 0, 3); /* Reserved upto ACPI 5.0 */
|
||||
}
|
||||
build_append_int_noprefix(tbl, 0, 8); /* X_FIRMWARE_CTRL */
|
||||
|
||||
/* XDSDT address to be filled by Guest linker at runtime */
|
||||
off = tbl->len;
|
||||
build_append_int_noprefix(tbl, 0, 8); /* X_DSDT */
|
||||
if (f->xdsdt_tbl_offset) {
|
||||
bios_linker_loader_add_pointer(linker,
|
||||
ACPI_BUILD_TABLE_FILE, off, 8,
|
||||
ACPI_BUILD_TABLE_FILE, *f->xdsdt_tbl_offset);
|
||||
}
|
||||
|
||||
build_append_gas_from_struct(tbl, &f->pm1a_evt); /* X_PM1a_EVT_BLK */
|
||||
/* X_PM1b_EVT_BLK */
|
||||
build_append_gas(tbl, AML_AS_SYSTEM_MEMORY, 0 , 0, 0, 0);
|
||||
build_append_gas_from_struct(tbl, &f->pm1a_cnt); /* X_PM1a_CNT_BLK */
|
||||
/* X_PM1b_CNT_BLK */
|
||||
build_append_gas(tbl, AML_AS_SYSTEM_MEMORY, 0 , 0, 0, 0);
|
||||
/* X_PM2_CNT_BLK */
|
||||
build_append_gas(tbl, AML_AS_SYSTEM_MEMORY, 0 , 0, 0, 0);
|
||||
build_append_gas_from_struct(tbl, &f->pm_tmr); /* X_PM_TMR_BLK */
|
||||
build_append_gas_from_struct(tbl, &f->gpe0_blk); /* X_GPE0_BLK */
|
||||
build_append_gas(tbl, AML_AS_SYSTEM_MEMORY, 0 , 0, 0, 0); /* X_GPE1_BLK */
|
||||
|
||||
if (f->rev <= 4) {
|
||||
goto build_hdr;
|
||||
}
|
||||
|
||||
/* SLEEP_CONTROL_REG */
|
||||
build_append_gas(tbl, AML_AS_SYSTEM_MEMORY, 0 , 0, 0, 0);
|
||||
/* SLEEP_STATUS_REG */
|
||||
build_append_gas(tbl, AML_AS_SYSTEM_MEMORY, 0 , 0, 0, 0);
|
||||
|
||||
/* TODO: extra fields need to be added to support revisions above rev5 */
|
||||
assert(f->rev == 5);
|
||||
|
||||
build_hdr:
|
||||
build_header(linker, tbl, (void *)(tbl->data + fadt_start),
|
||||
"FACP", tbl->len - fadt_start, f->rev, oem_id, oem_table_id);
|
||||
}
|
||||
|
@ -651,42 +651,33 @@ build_madt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
|
||||
}
|
||||
|
||||
/* FADT */
|
||||
static void build_fadt(GArray *table_data, BIOSLinker *linker,
|
||||
VirtMachineState *vms, unsigned dsdt_tbl_offset)
|
||||
static void build_fadt_rev5(GArray *table_data, BIOSLinker *linker,
|
||||
VirtMachineState *vms, unsigned dsdt_tbl_offset)
|
||||
{
|
||||
int fadt_start = table_data->len;
|
||||
AcpiFadtDescriptorRev5_1 *fadt = acpi_data_push(table_data, sizeof(*fadt));
|
||||
unsigned xdsdt_entry_offset = (char *)&fadt->x_dsdt - table_data->data;
|
||||
uint16_t bootflags;
|
||||
/* ACPI v5.1 */
|
||||
AcpiFadtData fadt = {
|
||||
.rev = 5,
|
||||
.minor_ver = 1,
|
||||
.flags = 1 << ACPI_FADT_F_HW_REDUCED_ACPI,
|
||||
.xdsdt_tbl_offset = &dsdt_tbl_offset,
|
||||
};
|
||||
|
||||
switch (vms->psci_conduit) {
|
||||
case QEMU_PSCI_CONDUIT_DISABLED:
|
||||
bootflags = 0;
|
||||
fadt.arm_boot_arch = 0;
|
||||
break;
|
||||
case QEMU_PSCI_CONDUIT_HVC:
|
||||
bootflags = ACPI_FADT_ARM_PSCI_COMPLIANT | ACPI_FADT_ARM_PSCI_USE_HVC;
|
||||
fadt.arm_boot_arch = ACPI_FADT_ARM_PSCI_COMPLIANT |
|
||||
ACPI_FADT_ARM_PSCI_USE_HVC;
|
||||
break;
|
||||
case QEMU_PSCI_CONDUIT_SMC:
|
||||
bootflags = ACPI_FADT_ARM_PSCI_COMPLIANT;
|
||||
fadt.arm_boot_arch = ACPI_FADT_ARM_PSCI_COMPLIANT;
|
||||
break;
|
||||
default:
|
||||
g_assert_not_reached();
|
||||
}
|
||||
|
||||
/* Hardware Reduced = 1 and use PSCI 0.2+ */
|
||||
fadt->flags = cpu_to_le32(1 << ACPI_FADT_F_HW_REDUCED_ACPI);
|
||||
fadt->arm_boot_flags = cpu_to_le16(bootflags);
|
||||
|
||||
/* ACPI v5.1 (fadt->revision.fadt->minor_revision) */
|
||||
fadt->minor_revision = 0x1;
|
||||
|
||||
/* DSDT address to be filled by Guest linker */
|
||||
bios_linker_loader_add_pointer(linker,
|
||||
ACPI_BUILD_TABLE_FILE, xdsdt_entry_offset, sizeof(fadt->x_dsdt),
|
||||
ACPI_BUILD_TABLE_FILE, dsdt_tbl_offset);
|
||||
|
||||
build_header(linker, table_data, (void *)(table_data->data + fadt_start),
|
||||
"FACP", table_data->len - fadt_start, 5, NULL, NULL);
|
||||
build_fadt(table_data, linker, &fadt, NULL, NULL);
|
||||
}
|
||||
|
||||
/* DSDT */
|
||||
@ -761,7 +752,7 @@ void virt_acpi_build(VirtMachineState *vms, AcpiBuildTables *tables)
|
||||
|
||||
/* FADT MADT GTDT MCFG SPCR pointed to by RSDT */
|
||||
acpi_add_table(table_offsets, tables_blob);
|
||||
build_fadt(tables_blob, tables->linker, vms, dsdt);
|
||||
build_fadt_rev5(tables_blob, tables->linker, vms, dsdt);
|
||||
|
||||
acpi_add_table(table_offsets, tables_blob);
|
||||
build_madt(tables_blob, tables->linker, vms);
|
||||
|
@ -91,17 +91,11 @@ typedef struct AcpiMcfgInfo {
|
||||
} AcpiMcfgInfo;
|
||||
|
||||
typedef struct AcpiPmInfo {
|
||||
bool force_rev1_fadt;
|
||||
bool s3_disabled;
|
||||
bool s4_disabled;
|
||||
bool pcihp_bridge_en;
|
||||
uint8_t s4_val;
|
||||
uint16_t sci_int;
|
||||
uint8_t acpi_enable_cmd;
|
||||
uint8_t acpi_disable_cmd;
|
||||
uint32_t gpe0_blk;
|
||||
uint32_t gpe0_blk_len;
|
||||
uint32_t io_base;
|
||||
AcpiFadtData fadt;
|
||||
uint16_t cpu_hp_io_base;
|
||||
uint16_t pcihp_io_base;
|
||||
uint16_t pcihp_io_len;
|
||||
@ -124,21 +118,59 @@ typedef struct AcpiBuildPciBusHotplugState {
|
||||
bool pcihp_bridge_en;
|
||||
} AcpiBuildPciBusHotplugState;
|
||||
|
||||
/*
 * Fill @data with the FADT settings shared by the PC machine types.
 *
 * The SCI interrupt, ACPI enable/disable commands, PM I/O base and GPE0
 * block address/length are read from the properties of @o; the remaining
 * values are constants. Every AcpiFadtData field not named below ends up
 * zero, since the whole structure is overwritten.
 */
static void init_common_fadt_data(Object *o, AcpiFadtData *data)
{
    const AmlAddressSpace io_space = AML_AS_SYSTEM_IO;
    uint32_t pm_base = object_property_get_uint(o, ACPI_PM_PROP_PM_IO_BASE,
                                                NULL);
    uint64_t sci = object_property_get_uint(o, ACPI_PM_PROP_SCI_INT, NULL);
    uint64_t enable_cmd =
        object_property_get_uint(o, ACPI_PM_PROP_ACPI_ENABLE_CMD, NULL);
    uint64_t disable_cmd =
        object_property_get_uint(o, ACPI_PM_PROP_ACPI_DISABLE_CMD, NULL);
    uint64_t gpe0_len =
        object_property_get_uint(o, ACPI_PM_PROP_GPE0_BLK_LEN, NULL);
    uint64_t gpe0_addr =
        object_property_get_uint(o, ACPI_PM_PROP_GPE0_BLK, NULL);

    *data = (AcpiFadtData) {
        .rev = 3,
        .flags =
            (1 << ACPI_FADT_F_WBINVD) |
            (1 << ACPI_FADT_F_PROC_C1) |
            (1 << ACPI_FADT_F_SLP_BUTTON) |
            (1 << ACPI_FADT_F_RTC_S4) |
            (1 << ACPI_FADT_F_USE_PLATFORM_CLOCK) |
            /*
             * "Flat Logical" APIC destination mode tops out at 8 CPUs;
             * beyond that "Clustered Logical" mode has to be used.
             */
            ((max_cpus > 8) ? (1 << ACPI_FADT_F_FORCE_APIC_CLUSTER_MODEL) : 0),
        .int_model = 1 /* Multiple APIC */,
        .rtc_century = RTC_CENTURY,
        .plvl2_lat = 0xfff /* C2 state not supported */,
        .plvl3_lat = 0xfff /* C3 state not supported */,
        .smi_cmd = ACPI_PORT_SMI_CMD,
        .sci_int = sci,
        .acpi_enable_cmd = enable_cmd,
        .acpi_disable_cmd = disable_cmd,
        /* event, control and timer blocks sit at +0, +4 and +8 of the PM base */
        .pm1a_evt = {
            .space_id = io_space,
            .bit_width = 4 * 8,
            .address = pm_base,
        },
        .pm1a_cnt = {
            .space_id = io_space,
            .bit_width = 2 * 8,
            .address = pm_base + 0x04,
        },
        .pm_tmr = {
            .space_id = io_space,
            .bit_width = 4 * 8,
            .address = pm_base + 0x08,
        },
        /* the GPE0 length property is multiplied by 8 to get a bit width */
        .gpe0_blk = {
            .space_id = io_space,
            .bit_width = gpe0_len * 8,
            .address = gpe0_addr,
        },
    };
}
|
||||
|
||||
static void acpi_get_pm_info(AcpiPmInfo *pm)
|
||||
{
|
||||
Object *piix = piix4_pm_find();
|
||||
Object *lpc = ich9_lpc_find();
|
||||
Object *obj = NULL;
|
||||
Object *obj = piix ? piix : lpc;
|
||||
QObject *o;
|
||||
|
||||
pm->force_rev1_fadt = false;
|
||||
pm->cpu_hp_io_base = 0;
|
||||
pm->pcihp_io_base = 0;
|
||||
pm->pcihp_io_len = 0;
|
||||
|
||||
init_common_fadt_data(obj, &pm->fadt);
|
||||
if (piix) {
|
||||
/* w2k requires FADT(rev1) or it won't boot, keep PC compatible */
|
||||
pm->force_rev1_fadt = true;
|
||||
obj = piix;
|
||||
pm->fadt.rev = 1;
|
||||
pm->cpu_hp_io_base = PIIX4_CPU_HOTPLUG_IO_BASE;
|
||||
pm->pcihp_io_base =
|
||||
object_property_get_uint(obj, ACPI_PCIHP_IO_BASE_PROP, NULL);
|
||||
@ -146,11 +178,19 @@ static void acpi_get_pm_info(AcpiPmInfo *pm)
|
||||
object_property_get_uint(obj, ACPI_PCIHP_IO_LEN_PROP, NULL);
|
||||
}
|
||||
if (lpc) {
|
||||
obj = lpc;
|
||||
struct AcpiGenericAddress r = { .space_id = AML_AS_SYSTEM_IO,
|
||||
.bit_width = 8, .address = ICH9_RST_CNT_IOPORT };
|
||||
pm->fadt.reset_reg = r;
|
||||
pm->fadt.reset_val = 0xf;
|
||||
pm->fadt.flags |= 1 << ACPI_FADT_F_RESET_REG_SUP;
|
||||
pm->cpu_hp_io_base = ICH9_CPU_HOTPLUG_IO_BASE;
|
||||
}
|
||||
assert(obj);
|
||||
|
||||
/* The above need not be conditional on machine type because the reset port
|
||||
* happens to be the same on PIIX (pc) and ICH9 (q35). */
|
||||
QEMU_BUILD_BUG_ON(ICH9_RST_CNT_IOPORT != RCR_IOPORT);
|
||||
|
||||
/* Fill in optional s3/s4 related properties */
|
||||
o = object_property_get_qobject(obj, ACPI_PM_PROP_S3_DISABLED, NULL);
|
||||
if (o) {
|
||||
@ -174,22 +214,6 @@ static void acpi_get_pm_info(AcpiPmInfo *pm)
|
||||
}
|
||||
qobject_decref(o);
|
||||
|
||||
/* Fill in mandatory properties */
|
||||
pm->sci_int = object_property_get_uint(obj, ACPI_PM_PROP_SCI_INT, NULL);
|
||||
|
||||
pm->acpi_enable_cmd = object_property_get_uint(obj,
|
||||
ACPI_PM_PROP_ACPI_ENABLE_CMD,
|
||||
NULL);
|
||||
pm->acpi_disable_cmd =
|
||||
object_property_get_uint(obj,
|
||||
ACPI_PM_PROP_ACPI_DISABLE_CMD,
|
||||
NULL);
|
||||
pm->io_base = object_property_get_uint(obj, ACPI_PM_PROP_PM_IO_BASE,
|
||||
NULL);
|
||||
pm->gpe0_blk = object_property_get_uint(obj, ACPI_PM_PROP_GPE0_BLK,
|
||||
NULL);
|
||||
pm->gpe0_blk_len = object_property_get_uint(obj, ACPI_PM_PROP_GPE0_BLK_LEN,
|
||||
NULL);
|
||||
pm->pcihp_bridge_en =
|
||||
object_property_get_bool(obj, "acpi-pci-hotplug-with-bridge-support",
|
||||
NULL);
|
||||
@ -257,8 +281,6 @@ static void acpi_get_pci_holes(Range *hole, Range *hole64)
|
||||
NULL));
|
||||
}
|
||||
|
||||
#define ACPI_PORT_SMI_CMD 0x00b2 /* TODO: this is APM_CNT_IOPORT */
|
||||
|
||||
static void acpi_align_size(GArray *blob, unsigned align)
|
||||
{
|
||||
/* Align size to multiple of given size. This reduces the chance
|
||||
@ -276,106 +298,6 @@ build_facs(GArray *table_data, BIOSLinker *linker)
|
||||
facs->length = cpu_to_le32(sizeof(*facs));
|
||||
}
|
||||
|
||||
/* Load chipset information in FADT */
|
||||
static void fadt_setup(AcpiFadtDescriptorRev3 *fadt, AcpiPmInfo *pm)
|
||||
{
|
||||
fadt->model = 1;
|
||||
fadt->reserved1 = 0;
|
||||
fadt->sci_int = cpu_to_le16(pm->sci_int);
|
||||
fadt->smi_cmd = cpu_to_le32(ACPI_PORT_SMI_CMD);
|
||||
fadt->acpi_enable = pm->acpi_enable_cmd;
|
||||
fadt->acpi_disable = pm->acpi_disable_cmd;
|
||||
/* EVT, CNT, TMR offset matches hw/acpi/core.c */
|
||||
fadt->pm1a_evt_blk = cpu_to_le32(pm->io_base);
|
||||
fadt->pm1a_cnt_blk = cpu_to_le32(pm->io_base + 0x04);
|
||||
fadt->pm_tmr_blk = cpu_to_le32(pm->io_base + 0x08);
|
||||
fadt->gpe0_blk = cpu_to_le32(pm->gpe0_blk);
|
||||
/* EVT, CNT, TMR length matches hw/acpi/core.c */
|
||||
fadt->pm1_evt_len = 4;
|
||||
fadt->pm1_cnt_len = 2;
|
||||
fadt->pm_tmr_len = 4;
|
||||
fadt->gpe0_blk_len = pm->gpe0_blk_len;
|
||||
fadt->plvl2_lat = cpu_to_le16(0xfff); /* C2 state not supported */
|
||||
fadt->plvl3_lat = cpu_to_le16(0xfff); /* C3 state not supported */
|
||||
fadt->flags = cpu_to_le32((1 << ACPI_FADT_F_WBINVD) |
|
||||
(1 << ACPI_FADT_F_PROC_C1) |
|
||||
(1 << ACPI_FADT_F_SLP_BUTTON) |
|
||||
(1 << ACPI_FADT_F_RTC_S4));
|
||||
fadt->flags |= cpu_to_le32(1 << ACPI_FADT_F_USE_PLATFORM_CLOCK);
|
||||
/* APIC destination mode ("Flat Logical") has an upper limit of 8 CPUs
|
||||
* For more than 8 CPUs, "Clustered Logical" mode has to be used
|
||||
*/
|
||||
if (max_cpus > 8) {
|
||||
fadt->flags |= cpu_to_le32(1 << ACPI_FADT_F_FORCE_APIC_CLUSTER_MODEL);
|
||||
}
|
||||
fadt->century = RTC_CENTURY;
|
||||
if (pm->force_rev1_fadt) {
|
||||
return;
|
||||
}
|
||||
|
||||
fadt->flags |= cpu_to_le32(1 << ACPI_FADT_F_RESET_REG_SUP);
|
||||
fadt->reset_value = 0xf;
|
||||
fadt->reset_register.space_id = AML_SYSTEM_IO;
|
||||
fadt->reset_register.bit_width = 8;
|
||||
fadt->reset_register.address = cpu_to_le64(ICH9_RST_CNT_IOPORT);
|
||||
/* The above need not be conditional on machine type because the reset port
|
||||
* happens to be the same on PIIX (pc) and ICH9 (q35). */
|
||||
QEMU_BUILD_BUG_ON(ICH9_RST_CNT_IOPORT != RCR_IOPORT);
|
||||
|
||||
fadt->xpm1a_event_block.space_id = AML_SYSTEM_IO;
|
||||
fadt->xpm1a_event_block.bit_width = fadt->pm1_evt_len * 8;
|
||||
fadt->xpm1a_event_block.address = cpu_to_le64(pm->io_base);
|
||||
|
||||
fadt->xpm1a_control_block.space_id = AML_SYSTEM_IO;
|
||||
fadt->xpm1a_control_block.bit_width = fadt->pm1_cnt_len * 8;
|
||||
fadt->xpm1a_control_block.address = cpu_to_le64(pm->io_base + 0x4);
|
||||
|
||||
fadt->xpm_timer_block.space_id = AML_SYSTEM_IO;
|
||||
fadt->xpm_timer_block.bit_width = fadt->pm_tmr_len * 8;
|
||||
fadt->xpm_timer_block.address = cpu_to_le64(pm->io_base + 0x8);
|
||||
|
||||
fadt->xgpe0_block.space_id = AML_SYSTEM_IO;
|
||||
fadt->xgpe0_block.bit_width = pm->gpe0_blk_len * 8;
|
||||
fadt->xgpe0_block.address = cpu_to_le64(pm->gpe0_blk);
|
||||
}
|
||||
|
||||
|
||||
/* FADT */
|
||||
static void
|
||||
build_fadt(GArray *table_data, BIOSLinker *linker, AcpiPmInfo *pm,
|
||||
unsigned facs_tbl_offset, unsigned dsdt_tbl_offset,
|
||||
const char *oem_id, const char *oem_table_id)
|
||||
{
|
||||
AcpiFadtDescriptorRev3 *fadt = acpi_data_push(table_data, sizeof(*fadt));
|
||||
unsigned fw_ctrl_offset = (char *)&fadt->firmware_ctrl - table_data->data;
|
||||
unsigned dsdt_entry_offset = (char *)&fadt->dsdt - table_data->data;
|
||||
unsigned xdsdt_entry_offset = (char *)&fadt->x_dsdt - table_data->data;
|
||||
int fadt_size = sizeof(*fadt);
|
||||
int rev = 3;
|
||||
|
||||
/* FACS address to be filled by Guest linker */
|
||||
bios_linker_loader_add_pointer(linker,
|
||||
ACPI_BUILD_TABLE_FILE, fw_ctrl_offset, sizeof(fadt->firmware_ctrl),
|
||||
ACPI_BUILD_TABLE_FILE, facs_tbl_offset);
|
||||
|
||||
/* DSDT address to be filled by Guest linker */
|
||||
fadt_setup(fadt, pm);
|
||||
bios_linker_loader_add_pointer(linker,
|
||||
ACPI_BUILD_TABLE_FILE, dsdt_entry_offset, sizeof(fadt->dsdt),
|
||||
ACPI_BUILD_TABLE_FILE, dsdt_tbl_offset);
|
||||
if (pm->force_rev1_fadt) {
|
||||
rev = 1;
|
||||
fadt_size = offsetof(typeof(*fadt), reset_register);
|
||||
} else {
|
||||
bios_linker_loader_add_pointer(linker,
|
||||
ACPI_BUILD_TABLE_FILE, xdsdt_entry_offset, sizeof(fadt->x_dsdt),
|
||||
ACPI_BUILD_TABLE_FILE, dsdt_tbl_offset);
|
||||
}
|
||||
|
||||
build_header(linker, table_data,
|
||||
(void *)fadt, "FACP", fadt_size, rev, oem_id, oem_table_id);
|
||||
}
|
||||
|
||||
void pc_madt_cpu_entry(AcpiDeviceIf *adev, int uid,
|
||||
const CPUArchIdList *apic_ids, GArray *entry)
|
||||
{
|
||||
@ -2053,7 +1975,12 @@ build_dsdt(GArray *table_data, BIOSLinker *linker,
|
||||
aml_append(dev, aml_name_decl("_STA", aml_int(0xB)));
|
||||
crs = aml_resource_template();
|
||||
aml_append(crs,
|
||||
aml_io(AML_DECODE16, pm->gpe0_blk, pm->gpe0_blk, 1, pm->gpe0_blk_len)
|
||||
aml_io(
|
||||
AML_DECODE16,
|
||||
pm->fadt.gpe0_blk.address,
|
||||
pm->fadt.gpe0_blk.address,
|
||||
1,
|
||||
pm->fadt.gpe0_blk.bit_width / 8)
|
||||
);
|
||||
aml_append(dev, aml_name_decl("_CRS", crs));
|
||||
aml_append(scope, dev);
|
||||
@ -2323,6 +2250,55 @@ build_tpm2(GArray *table_data, BIOSLinker *linker, GArray *tcpalog)
|
||||
#define HOLE_640K_START (640 * 1024)
|
||||
#define HOLE_640K_END (1024 * 1024)
|
||||
|
||||
/*
 * Describe the hotpluggable memory region [@base, @base + @len) in the SRAT.
 *
 * One memory affinity entry is appended to @table_data for every device
 * returned by qmp_pc_dimm_device_list(), using the device's own node and
 * flags (ENABLED, plus HOTPLUGGABLE if the device is hotpluggable and
 * NON_VOLATILE if it is an NVDIMM). Address ranges not covered by a device
 * get a HOTPLUGGABLE | ENABLED entry assigned to @default_node.
 *
 * NOTE(review): this assumes the device list is ordered by ascending address
 * and that devices do not overlap -- confirm against pc_dimm_built_list().
 */
static void build_srat_hotpluggable_memory(GArray *table_data, uint64_t base,
                                           uint64_t len, int default_node)
{
    MemoryDeviceInfoList *info_list = qmp_pc_dimm_device_list();
    MemoryDeviceInfoList *info = info_list;
    uint64_t end = base + len;
    uint64_t cur = base;

    while (cur < end) {
        AcpiSratMemoryAffinity *numamem;
        MemoryAffinityFlags flags = MEM_AFFINITY_ENABLED;
        MemoryDeviceInfo *mi;
        PCDIMMDeviceInfo *di;
        bool is_nvdimm;

        if (!info) {
            /* no more devices: the rest of the region is one empty range */
            numamem = acpi_data_push(table_data, sizeof *numamem);
            build_srat_memory(numamem, cur, end - cur, default_node,
                              MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED);
            break;
        }

        mi = info->value;
        is_nvdimm = (mi->type == MEMORY_DEVICE_INFO_KIND_NVDIMM);
        di = !is_nvdimm ? mi->u.dimm.data : mi->u.nvdimm.data;

        if (cur < di->addr) {
            /* unpopulated gap in front of this device */
            numamem = acpi_data_push(table_data, sizeof *numamem);
            build_srat_memory(numamem, cur, di->addr - cur, default_node,
                              MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED);
        }

        if (di->hotpluggable) {
            flags |= MEM_AFFINITY_HOTPLUGGABLE;
        }
        if (is_nvdimm) {
            flags |= MEM_AFFINITY_NON_VOLATILE;
        }

        numamem = acpi_data_push(table_data, sizeof *numamem);
        build_srat_memory(numamem, di->addr, di->size, di->node, flags);

        /*
         * Continue right behind the device just described. Advancing by
         * di->size from the old position would leave the cursor inside
         * (or before) the device whenever a gap preceded it, so the next
         * gap or trailing entry would overlap this device's range.
         */
        cur = di->addr + di->size;
        info = info->next;
    }

    qapi_free_MemoryDeviceInfoList(info_list);
}
|
||||
|
||||
static void
|
||||
build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine)
|
||||
{
|
||||
@ -2434,10 +2410,9 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine)
|
||||
* providing _PXM method if necessary.
|
||||
*/
|
||||
if (hotplugabble_address_space_size) {
|
||||
numamem = acpi_data_push(table_data, sizeof *numamem);
|
||||
build_srat_memory(numamem, pcms->hotplug_memory.base,
|
||||
hotplugabble_address_space_size, pcms->numa_nodes - 1,
|
||||
MEM_AFFINITY_HOTPLUGGABLE | MEM_AFFINITY_ENABLED);
|
||||
build_srat_hotpluggable_memory(table_data, pcms->hotplug_memory.base,
|
||||
hotplugabble_address_space_size,
|
||||
pcms->numa_nodes - 1);
|
||||
}
|
||||
|
||||
build_header(linker, table_data,
|
||||
@ -2700,7 +2675,10 @@ void acpi_build(AcpiBuildTables *tables, MachineState *machine)
|
||||
/* ACPI tables pointed to by RSDT */
|
||||
fadt = tables_blob->len;
|
||||
acpi_add_table(table_offsets, tables_blob);
|
||||
build_fadt(tables_blob, tables->linker, &pm, facs, dsdt,
|
||||
pm.fadt.facs_tbl_offset = &facs;
|
||||
pm.fadt.dsdt_tbl_offset = &dsdt;
|
||||
pm.fadt.xdsdt_tbl_offset = &dsdt;
|
||||
build_fadt(tables_blob, tables->linker, &pm.fadt,
|
||||
slic_oem.id, slic_oem.table_id);
|
||||
aml_len += tables_blob->len - fadt;
|
||||
|
||||
|
@ -34,7 +34,6 @@
|
||||
#endif
|
||||
|
||||
/* fixed I/O location */
|
||||
#define APM_CNT_IOPORT 0xb2
|
||||
#define APM_STS_IOPORT 0xb3
|
||||
|
||||
static void apm_ioport_writeb(void *opaque, hwaddr addr, uint64_t val,
|
||||
|
@ -20,6 +20,7 @@
|
||||
|
||||
#include "qemu/osdep.h"
|
||||
#include "hw/mem/pc-dimm.h"
|
||||
#include "hw/mem/nvdimm.h"
|
||||
#include "qapi/error.h"
|
||||
#include "qemu/config-file.h"
|
||||
#include "qapi/visitor.h"
|
||||
@ -162,45 +163,6 @@ uint64_t get_plugged_memory_size(void)
|
||||
return pc_existing_dimms_capacity(&error_abort);
|
||||
}
|
||||
|
||||
/*
 * Callback-style tree walk that collects realized PC-DIMM devices.
 *
 * @obj:    object to inspect; its children are visited recursively
 * @opaque: a MemoryDeviceInfoList ***, i.e. a pointer to the caller's
 *          "where to link the next element" cursor, which is advanced past
 *          every element appended here
 *
 * Always returns 0.
 */
int qmp_pc_dimm_device_list(Object *obj, void *opaque)
{
    MemoryDeviceInfoList ***tail = opaque;

    if (object_dynamic_cast(obj, TYPE_PC_DIMM) && DEVICE(obj)->realized) {
        DeviceState *dev = DEVICE(obj);
        DeviceClass *dc = DEVICE_GET_CLASS(obj);
        PCDIMMDevice *dimm = PC_DIMM(obj);
        PCDIMMDeviceInfo *di = g_new0(PCDIMMDeviceInfo, 1);
        MemoryDeviceInfo *info = g_new0(MemoryDeviceInfo, 1);
        MemoryDeviceInfoList *node = g_new0(MemoryDeviceInfoList, 1);

        /* the id is optional; has_id tells consumers whether it is set */
        if (dev->id) {
            di->id = g_strdup(dev->id);
            di->has_id = true;
        }
        di->addr = dimm->addr;
        di->slot = dimm->slot;
        di->node = dimm->node;
        di->size = object_property_get_uint(OBJECT(dimm), PC_DIMM_SIZE_PROP,
                                            NULL);
        di->memdev = object_get_canonical_path(OBJECT(dimm->hostmem));
        di->hotplugged = dev->hotplugged;
        di->hotpluggable = dc->hotpluggable;

        info->u.dimm.data = di;
        node->value = info;
        node->next = NULL;

        /* link the node in, then move the cursor to its 'next' slot */
        **tail = node;
        *tail = &node->next;
    }

    object_child_foreach(obj, qmp_pc_dimm_device_list, opaque);
    return 0;
}
|
||||
|
||||
static int pc_dimm_slot2bitmap(Object *obj, void *opaque)
|
||||
{
|
||||
unsigned long *bitmap = opaque;
|
||||
@ -276,6 +238,57 @@ static int pc_dimm_built_list(Object *obj, void *opaque)
|
||||
return 0;
|
||||
}
|
||||
|
||||
MemoryDeviceInfoList *qmp_pc_dimm_device_list(void)
|
||||
{
|
||||
GSList *dimms = NULL, *item;
|
||||
MemoryDeviceInfoList *list = NULL, *prev = NULL;
|
||||
|
||||
object_child_foreach(qdev_get_machine(), pc_dimm_built_list, &dimms);
|
||||
|
||||
for (item = dimms; item; item = g_slist_next(item)) {
|
||||
PCDIMMDevice *dimm = PC_DIMM(item->data);
|
||||
Object *obj = OBJECT(dimm);
|
||||
MemoryDeviceInfoList *elem = g_new0(MemoryDeviceInfoList, 1);
|
||||
MemoryDeviceInfo *info = g_new0(MemoryDeviceInfo, 1);
|
||||
PCDIMMDeviceInfo *di = g_new0(PCDIMMDeviceInfo, 1);
|
||||
bool is_nvdimm = object_dynamic_cast(obj, TYPE_NVDIMM);
|
||||
DeviceClass *dc = DEVICE_GET_CLASS(obj);
|
||||
DeviceState *dev = DEVICE(obj);
|
||||
|
||||
if (dev->id) {
|
||||
di->has_id = true;
|
||||
di->id = g_strdup(dev->id);
|
||||
}
|
||||
di->hotplugged = dev->hotplugged;
|
||||
di->hotpluggable = dc->hotpluggable;
|
||||
di->addr = dimm->addr;
|
||||
di->slot = dimm->slot;
|
||||
di->node = dimm->node;
|
||||
di->size = object_property_get_uint(obj, PC_DIMM_SIZE_PROP, NULL);
|
||||
di->memdev = object_get_canonical_path(OBJECT(dimm->hostmem));
|
||||
|
||||
if (!is_nvdimm) {
|
||||
info->u.dimm.data = di;
|
||||
info->type = MEMORY_DEVICE_INFO_KIND_DIMM;
|
||||
} else {
|
||||
info->u.nvdimm.data = di;
|
||||
info->type = MEMORY_DEVICE_INFO_KIND_NVDIMM;
|
||||
}
|
||||
elem->value = info;
|
||||
elem->next = NULL;
|
||||
if (prev) {
|
||||
prev->next = elem;
|
||||
} else {
|
||||
list = elem;
|
||||
}
|
||||
prev = elem;
|
||||
}
|
||||
|
||||
g_slist_free(dimms);
|
||||
|
||||
return list;
|
||||
}
|
||||
|
||||
uint64_t pc_dimm_get_free_addr(uint64_t address_space_start,
|
||||
uint64_t address_space_size,
|
||||
uint64_t *hint, uint64_t align, uint64_t size,
|
||||
|
@ -26,6 +26,7 @@
|
||||
#include "qapi/qapi-events-net.h"
|
||||
#include "hw/virtio/virtio-access.h"
|
||||
#include "migration/misc.h"
|
||||
#include "standard-headers/linux/ethtool.h"
|
||||
|
||||
#define VIRTIO_NET_VM_VERSION 11
|
||||
|
||||
@ -48,19 +49,21 @@
|
||||
(offsetof(container, field) + sizeof(((container *)0)->field))
|
||||
|
||||
typedef struct VirtIOFeature {
|
||||
uint32_t flags;
|
||||
uint64_t flags;
|
||||
size_t end;
|
||||
} VirtIOFeature;
|
||||
|
||||
static VirtIOFeature feature_sizes[] = {
|
||||
{.flags = 1 << VIRTIO_NET_F_MAC,
|
||||
{.flags = 1ULL << VIRTIO_NET_F_MAC,
|
||||
.end = endof(struct virtio_net_config, mac)},
|
||||
{.flags = 1 << VIRTIO_NET_F_STATUS,
|
||||
{.flags = 1ULL << VIRTIO_NET_F_STATUS,
|
||||
.end = endof(struct virtio_net_config, status)},
|
||||
{.flags = 1 << VIRTIO_NET_F_MQ,
|
||||
{.flags = 1ULL << VIRTIO_NET_F_MQ,
|
||||
.end = endof(struct virtio_net_config, max_virtqueue_pairs)},
|
||||
{.flags = 1 << VIRTIO_NET_F_MTU,
|
||||
{.flags = 1ULL << VIRTIO_NET_F_MTU,
|
||||
.end = endof(struct virtio_net_config, mtu)},
|
||||
{.flags = 1ULL << VIRTIO_NET_F_SPEED_DUPLEX,
|
||||
.end = endof(struct virtio_net_config, duplex)},
|
||||
{}
|
||||
};
|
||||
|
||||
@ -89,6 +92,8 @@ static void virtio_net_get_config(VirtIODevice *vdev, uint8_t *config)
|
||||
virtio_stw_p(vdev, &netcfg.max_virtqueue_pairs, n->max_queues);
|
||||
virtio_stw_p(vdev, &netcfg.mtu, n->net_conf.mtu);
|
||||
memcpy(netcfg.mac, n->mac, ETH_ALEN);
|
||||
virtio_stl_p(vdev, &netcfg.speed, n->net_conf.speed);
|
||||
netcfg.duplex = n->net_conf.duplex;
|
||||
memcpy(config, &netcfg, n->config_size);
|
||||
}
|
||||
|
||||
@ -1938,7 +1943,26 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp)
|
||||
int i;
|
||||
|
||||
if (n->net_conf.mtu) {
|
||||
n->host_features |= (0x1 << VIRTIO_NET_F_MTU);
|
||||
n->host_features |= (1ULL << VIRTIO_NET_F_MTU);
|
||||
}
|
||||
|
||||
if (n->net_conf.duplex_str) {
|
||||
if (strncmp(n->net_conf.duplex_str, "half", 5) == 0) {
|
||||
n->net_conf.duplex = DUPLEX_HALF;
|
||||
} else if (strncmp(n->net_conf.duplex_str, "full", 5) == 0) {
|
||||
n->net_conf.duplex = DUPLEX_FULL;
|
||||
} else {
|
||||
error_setg(errp, "'duplex' must be 'half' or 'full'");
|
||||
}
|
||||
n->host_features |= (1ULL << VIRTIO_NET_F_SPEED_DUPLEX);
|
||||
} else {
|
||||
n->net_conf.duplex = DUPLEX_UNKNOWN;
|
||||
}
|
||||
|
||||
if (n->net_conf.speed < SPEED_UNKNOWN) {
|
||||
error_setg(errp, "'speed' must be between 0 and INT_MAX");
|
||||
} else if (n->net_conf.speed >= 0) {
|
||||
n->host_features |= (1ULL << VIRTIO_NET_F_SPEED_DUPLEX);
|
||||
}
|
||||
|
||||
virtio_net_set_config_size(n, n->host_features);
|
||||
@ -2109,45 +2133,46 @@ static const VMStateDescription vmstate_virtio_net = {
|
||||
};
|
||||
|
||||
static Property virtio_net_properties[] = {
|
||||
DEFINE_PROP_BIT("csum", VirtIONet, host_features, VIRTIO_NET_F_CSUM, true),
|
||||
DEFINE_PROP_BIT("guest_csum", VirtIONet, host_features,
|
||||
DEFINE_PROP_BIT64("csum", VirtIONet, host_features,
|
||||
VIRTIO_NET_F_CSUM, true),
|
||||
DEFINE_PROP_BIT64("guest_csum", VirtIONet, host_features,
|
||||
VIRTIO_NET_F_GUEST_CSUM, true),
|
||||
DEFINE_PROP_BIT("gso", VirtIONet, host_features, VIRTIO_NET_F_GSO, true),
|
||||
DEFINE_PROP_BIT("guest_tso4", VirtIONet, host_features,
|
||||
DEFINE_PROP_BIT64("gso", VirtIONet, host_features, VIRTIO_NET_F_GSO, true),
|
||||
DEFINE_PROP_BIT64("guest_tso4", VirtIONet, host_features,
|
||||
VIRTIO_NET_F_GUEST_TSO4, true),
|
||||
DEFINE_PROP_BIT("guest_tso6", VirtIONet, host_features,
|
||||
DEFINE_PROP_BIT64("guest_tso6", VirtIONet, host_features,
|
||||
VIRTIO_NET_F_GUEST_TSO6, true),
|
||||
DEFINE_PROP_BIT("guest_ecn", VirtIONet, host_features,
|
||||
DEFINE_PROP_BIT64("guest_ecn", VirtIONet, host_features,
|
||||
VIRTIO_NET_F_GUEST_ECN, true),
|
||||
DEFINE_PROP_BIT("guest_ufo", VirtIONet, host_features,
|
||||
DEFINE_PROP_BIT64("guest_ufo", VirtIONet, host_features,
|
||||
VIRTIO_NET_F_GUEST_UFO, true),
|
||||
DEFINE_PROP_BIT("guest_announce", VirtIONet, host_features,
|
||||
DEFINE_PROP_BIT64("guest_announce", VirtIONet, host_features,
|
||||
VIRTIO_NET_F_GUEST_ANNOUNCE, true),
|
||||
DEFINE_PROP_BIT("host_tso4", VirtIONet, host_features,
|
||||
DEFINE_PROP_BIT64("host_tso4", VirtIONet, host_features,
|
||||
VIRTIO_NET_F_HOST_TSO4, true),
|
||||
DEFINE_PROP_BIT("host_tso6", VirtIONet, host_features,
|
||||
DEFINE_PROP_BIT64("host_tso6", VirtIONet, host_features,
|
||||
VIRTIO_NET_F_HOST_TSO6, true),
|
||||
DEFINE_PROP_BIT("host_ecn", VirtIONet, host_features,
|
||||
DEFINE_PROP_BIT64("host_ecn", VirtIONet, host_features,
|
||||
VIRTIO_NET_F_HOST_ECN, true),
|
||||
DEFINE_PROP_BIT("host_ufo", VirtIONet, host_features,
|
||||
DEFINE_PROP_BIT64("host_ufo", VirtIONet, host_features,
|
||||
VIRTIO_NET_F_HOST_UFO, true),
|
||||
DEFINE_PROP_BIT("mrg_rxbuf", VirtIONet, host_features,
|
||||
DEFINE_PROP_BIT64("mrg_rxbuf", VirtIONet, host_features,
|
||||
VIRTIO_NET_F_MRG_RXBUF, true),
|
||||
DEFINE_PROP_BIT("status", VirtIONet, host_features,
|
||||
DEFINE_PROP_BIT64("status", VirtIONet, host_features,
|
||||
VIRTIO_NET_F_STATUS, true),
|
||||
DEFINE_PROP_BIT("ctrl_vq", VirtIONet, host_features,
|
||||
DEFINE_PROP_BIT64("ctrl_vq", VirtIONet, host_features,
|
||||
VIRTIO_NET_F_CTRL_VQ, true),
|
||||
DEFINE_PROP_BIT("ctrl_rx", VirtIONet, host_features,
|
||||
DEFINE_PROP_BIT64("ctrl_rx", VirtIONet, host_features,
|
||||
VIRTIO_NET_F_CTRL_RX, true),
|
||||
DEFINE_PROP_BIT("ctrl_vlan", VirtIONet, host_features,
|
||||
DEFINE_PROP_BIT64("ctrl_vlan", VirtIONet, host_features,
|
||||
VIRTIO_NET_F_CTRL_VLAN, true),
|
||||
DEFINE_PROP_BIT("ctrl_rx_extra", VirtIONet, host_features,
|
||||
DEFINE_PROP_BIT64("ctrl_rx_extra", VirtIONet, host_features,
|
||||
VIRTIO_NET_F_CTRL_RX_EXTRA, true),
|
||||
DEFINE_PROP_BIT("ctrl_mac_addr", VirtIONet, host_features,
|
||||
DEFINE_PROP_BIT64("ctrl_mac_addr", VirtIONet, host_features,
|
||||
VIRTIO_NET_F_CTRL_MAC_ADDR, true),
|
||||
DEFINE_PROP_BIT("ctrl_guest_offloads", VirtIONet, host_features,
|
||||
DEFINE_PROP_BIT64("ctrl_guest_offloads", VirtIONet, host_features,
|
||||
VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, true),
|
||||
DEFINE_PROP_BIT("mq", VirtIONet, host_features, VIRTIO_NET_F_MQ, false),
|
||||
DEFINE_PROP_BIT64("mq", VirtIONet, host_features, VIRTIO_NET_F_MQ, false),
|
||||
DEFINE_NIC_PROPERTIES(VirtIONet, nic_conf),
|
||||
DEFINE_PROP_UINT32("x-txtimer", VirtIONet, net_conf.txtimer,
|
||||
TX_TIMER_INTERVAL),
|
||||
@ -2160,6 +2185,8 @@ static Property virtio_net_properties[] = {
|
||||
DEFINE_PROP_UINT16("host_mtu", VirtIONet, net_conf.mtu, 0),
|
||||
DEFINE_PROP_BOOL("x-mtu-bypass-backend", VirtIONet, mtu_bypass_backend,
|
||||
true),
|
||||
DEFINE_PROP_INT32("speed", VirtIONet, net_conf.speed, SPEED_UNKNOWN),
|
||||
DEFINE_PROP_STRING("duplex", VirtIONet, net_conf.duplex_str),
|
||||
DEFINE_PROP_END_OF_LIST(),
|
||||
};
|
||||
|
||||
|
14
hw/pci/pci.c
14
hw/pci/pci.c
@ -2048,18 +2048,6 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp)
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Default PCIDeviceClass realize hook: forwards to the legacy ->init()
 * callback when the class provides one and turns a negative return value
 * into an Error.
 */
static void pci_default_realize(PCIDevice *dev, Error **errp)
{
    PCIDeviceClass *klass = PCI_DEVICE_GET_CLASS(dev);

    /* Nothing to do for classes without a legacy init callback */
    if (klass->init && klass->init(dev) < 0) {
        error_setg(errp, "Device initialization failed");
    }
}
|
||||
|
||||
PCIDevice *pci_create_multifunction(PCIBus *bus, int devfn, bool multifunction,
|
||||
const char *name)
|
||||
{
|
||||
@ -2532,13 +2520,11 @@ MemoryRegion *pci_address_space_io(PCIDevice *dev)
|
||||
/*
 * Class initialiser for the abstract PCI device type: installs the qdev
 * realize/unrealize hooks, the bus type and property list, and the default
 * PCI-level realize implementation.
 */
static void pci_device_class_init(ObjectClass *klass, void *data)
{
    PCIDeviceClass *pci_class = PCI_DEVICE_CLASS(klass);
    DeviceClass *dev_class = DEVICE_CLASS(klass);

    /* PCI-level hook */
    pci_class->realize = pci_default_realize;

    /* Generic qdev hooks and metadata */
    dev_class->props = pci_props;
    dev_class->bus_type = TYPE_PCI_BUS;
    dev_class->unrealize = pci_qdev_unrealize;
    dev_class->realize = pci_qdev_realize;
}
|
||||
|
||||
static void pci_device_class_base_init(ObjectClass *klass, void *data)
|
||||
|
@ -722,8 +722,7 @@ static int spapr_populate_drconf_memory(sPAPRMachineState *spapr, void *fdt)
|
||||
}
|
||||
|
||||
if (hotplug_lmb_start) {
|
||||
MemoryDeviceInfoList **prev = &dimms;
|
||||
qmp_pc_dimm_device_list(qdev_get_machine(), &prev);
|
||||
dimms = qmp_pc_dimm_device_list();
|
||||
}
|
||||
|
||||
/* ibm,dynamic-memory */
|
||||
|
@ -3,9 +3,23 @@
|
||||
# hw/virtio/vhost.c
|
||||
vhost_commit(bool started, bool changed) "Started: %d Changed: %d"
|
||||
vhost_region_add_section(const char *name, uint64_t gpa, uint64_t size, uint64_t host) "%s: 0x%"PRIx64"+0x%"PRIx64" @ 0x%"PRIx64
|
||||
vhost_region_add_section_abut(const char *name, uint64_t new_size) "%s: 0x%"PRIx64
|
||||
vhost_region_add_section_merge(const char *name, uint64_t new_size, uint64_t gpa, uint64_t owr) "%s: size: 0x%"PRIx64 " gpa: 0x%"PRIx64 " owr: 0x%"PRIx64
|
||||
vhost_region_add_section_aligned(const char *name, uint64_t gpa, uint64_t size, uint64_t host) "%s: 0x%"PRIx64"+0x%"PRIx64" @ 0x%"PRIx64
|
||||
vhost_section(const char *name, int r) "%s:%d"
|
||||
|
||||
# hw/virtio/vhost-user.c
|
||||
vhost_user_postcopy_end_entry(void) ""
|
||||
vhost_user_postcopy_end_exit(void) ""
|
||||
vhost_user_postcopy_fault_handler(const char *name, uint64_t fault_address, int nregions) "%s: @0x%"PRIx64" nregions:%d"
|
||||
vhost_user_postcopy_fault_handler_loop(int i, uint64_t client_base, uint64_t size) "%d: client 0x%"PRIx64" +0x%"PRIx64
|
||||
vhost_user_postcopy_fault_handler_found(int i, uint64_t region_offset, uint64_t rb_offset) "%d: region_offset: 0x%"PRIx64" rb_offset:0x%"PRIx64
|
||||
vhost_user_postcopy_listen(void) ""
|
||||
vhost_user_set_mem_table_postcopy(uint64_t client_addr, uint64_t qhva, int reply_i, int region_i) "client:0x%"PRIx64" for hva: 0x%"PRIx64" reply %d region %d"
|
||||
vhost_user_set_mem_table_withfd(int index, const char *name, uint64_t memory_size, uint64_t guest_phys_addr, uint64_t userspace_addr, uint64_t offset) "%d:%s: size:0x%"PRIx64" GPA:0x%"PRIx64" QVA/userspace:0x%"PRIx64" RB offset:0x%"PRIx64
|
||||
vhost_user_postcopy_waker(const char *rb, uint64_t rb_offset) "%s + 0x%"PRIx64
|
||||
vhost_user_postcopy_waker_found(uint64_t client_addr) "0x%"PRIx64
|
||||
vhost_user_postcopy_waker_nomatch(const char *rb, uint64_t rb_offset) "%s + 0x%"PRIx64
|
||||
|
||||
# hw/virtio/virtio.c
|
||||
virtqueue_alloc_element(void *elem, size_t sz, unsigned in_num, unsigned out_num) "elem %p size %zd in_num %u out_num %u"
|
||||
virtqueue_fill(void *vq, const void *elem, unsigned int len, unsigned int idx) "vq %p elem %p len %u idx %u"
|
||||
|
@ -18,11 +18,15 @@
|
||||
#include "qemu/error-report.h"
|
||||
#include "qemu/sockets.h"
|
||||
#include "sysemu/cryptodev.h"
|
||||
#include "migration/migration.h"
|
||||
#include "migration/postcopy-ram.h"
|
||||
#include "trace.h"
|
||||
|
||||
#include <sys/ioctl.h>
|
||||
#include <sys/socket.h>
|
||||
#include <sys/un.h>
|
||||
#include <linux/vhost.h>
|
||||
#include <linux/userfaultfd.h>
|
||||
|
||||
#define VHOST_MEMORY_MAX_NREGIONS 8
|
||||
#define VHOST_USER_F_PROTOCOL_FEATURES 30
|
||||
@ -41,7 +45,7 @@ enum VhostUserProtocolFeature {
|
||||
VHOST_USER_PROTOCOL_F_SLAVE_REQ = 5,
|
||||
VHOST_USER_PROTOCOL_F_CROSS_ENDIAN = 6,
|
||||
VHOST_USER_PROTOCOL_F_CRYPTO_SESSION = 7,
|
||||
|
||||
VHOST_USER_PROTOCOL_F_PAGEFAULT = 8,
|
||||
VHOST_USER_PROTOCOL_F_MAX
|
||||
};
|
||||
|
||||
@ -76,6 +80,9 @@ typedef enum VhostUserRequest {
|
||||
VHOST_USER_SET_CONFIG = 25,
|
||||
VHOST_USER_CREATE_CRYPTO_SESSION = 26,
|
||||
VHOST_USER_CLOSE_CRYPTO_SESSION = 27,
|
||||
VHOST_USER_POSTCOPY_ADVISE = 28,
|
||||
VHOST_USER_POSTCOPY_LISTEN = 29,
|
||||
VHOST_USER_POSTCOPY_END = 30,
|
||||
VHOST_USER_MAX
|
||||
} VhostUserRequest;
|
||||
|
||||
@ -164,8 +171,23 @@ static VhostUserMsg m __attribute__ ((unused));
|
||||
#define VHOST_USER_VERSION (0x1)
|
||||
|
||||
/* Per-device state of the vhost-user backend; stored in vhost_dev->opaque. */
struct vhost_user {
|
||||
struct vhost_dev *dev; /* Owning device; set in vhost_user_init(), used by the postcopy notifier */
|
||||
CharBackend *chr; /* Char backend handed to vhost_user_init(); also used to fetch fds sent by the backend */
|
||||
int slave_fd; /* -1 while no slave channel fd is open */
|
||||
NotifierWithReturn postcopy_notifier; /* Registered via postcopy_add_notifier() in vhost_user_init() */
|
||||
struct PostCopyFD postcopy_fd; /* Userfault fd and callbacks; filled in on postcopy advise */
|
||||
uint64_t postcopy_client_bases[VHOST_MEMORY_MAX_NREGIONS]; /* Per-region userspace_addr returned in the postcopy SET_MEM_TABLE reply */
|
||||
/* Length of the region_rb and region_rb_offset arrays */
|
||||
size_t region_rb_len;
|
||||
/* RAMBlock associated with a given region */
|
||||
RAMBlock **region_rb;
|
||||
/* The offset from the start of the RAMBlock to the start of the
|
||||
* vhost region.
|
||||
*/
|
||||
ram_addr_t *region_rb_offset;
|
||||
|
||||
/* True once we've entered postcopy_listen */
|
||||
bool postcopy_listen;
|
||||
};
|
||||
|
||||
static bool ioeventfd_enabled(void)
|
||||
@ -330,14 +352,167 @@ static int vhost_user_set_log_base(struct vhost_dev *dev, uint64_t base,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int vhost_user_set_mem_table(struct vhost_dev *dev,
|
||||
struct vhost_memory *mem)
|
||||
static int vhost_user_set_mem_table_postcopy(struct vhost_dev *dev,
|
||||
struct vhost_memory *mem)
|
||||
{
|
||||
struct vhost_user *u = dev->opaque;
|
||||
int fds[VHOST_MEMORY_MAX_NREGIONS];
|
||||
int i, fd;
|
||||
size_t fd_num = 0;
|
||||
bool reply_supported = virtio_has_feature(dev->protocol_features,
|
||||
VHOST_USER_PROTOCOL_F_REPLY_ACK);
|
||||
VhostUserMsg msg_reply;
|
||||
int region_i, msg_i;
|
||||
|
||||
VhostUserMsg msg = {
|
||||
.hdr.request = VHOST_USER_SET_MEM_TABLE,
|
||||
.hdr.flags = VHOST_USER_VERSION,
|
||||
};
|
||||
|
||||
if (reply_supported) {
|
||||
msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK;
|
||||
}
|
||||
|
||||
if (u->region_rb_len < dev->mem->nregions) {
|
||||
u->region_rb = g_renew(RAMBlock*, u->region_rb, dev->mem->nregions);
|
||||
u->region_rb_offset = g_renew(ram_addr_t, u->region_rb_offset,
|
||||
dev->mem->nregions);
|
||||
memset(&(u->region_rb[u->region_rb_len]), '\0',
|
||||
sizeof(RAMBlock *) * (dev->mem->nregions - u->region_rb_len));
|
||||
memset(&(u->region_rb_offset[u->region_rb_len]), '\0',
|
||||
sizeof(ram_addr_t) * (dev->mem->nregions - u->region_rb_len));
|
||||
u->region_rb_len = dev->mem->nregions;
|
||||
}
|
||||
|
||||
for (i = 0; i < dev->mem->nregions; ++i) {
|
||||
struct vhost_memory_region *reg = dev->mem->regions + i;
|
||||
ram_addr_t offset;
|
||||
MemoryRegion *mr;
|
||||
|
||||
assert((uintptr_t)reg->userspace_addr == reg->userspace_addr);
|
||||
mr = memory_region_from_host((void *)(uintptr_t)reg->userspace_addr,
|
||||
&offset);
|
||||
fd = memory_region_get_fd(mr);
|
||||
if (fd > 0) {
|
||||
trace_vhost_user_set_mem_table_withfd(fd_num, mr->name,
|
||||
reg->memory_size,
|
||||
reg->guest_phys_addr,
|
||||
reg->userspace_addr, offset);
|
||||
u->region_rb_offset[i] = offset;
|
||||
u->region_rb[i] = mr->ram_block;
|
||||
msg.payload.memory.regions[fd_num].userspace_addr =
|
||||
reg->userspace_addr;
|
||||
msg.payload.memory.regions[fd_num].memory_size = reg->memory_size;
|
||||
msg.payload.memory.regions[fd_num].guest_phys_addr =
|
||||
reg->guest_phys_addr;
|
||||
msg.payload.memory.regions[fd_num].mmap_offset = offset;
|
||||
assert(fd_num < VHOST_MEMORY_MAX_NREGIONS);
|
||||
fds[fd_num++] = fd;
|
||||
} else {
|
||||
u->region_rb_offset[i] = 0;
|
||||
u->region_rb[i] = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
msg.payload.memory.nregions = fd_num;
|
||||
|
||||
if (!fd_num) {
|
||||
error_report("Failed initializing vhost-user memory map, "
|
||||
"consider using -object memory-backend-file share=on");
|
||||
return -1;
|
||||
}
|
||||
|
||||
msg.hdr.size = sizeof(msg.payload.memory.nregions);
|
||||
msg.hdr.size += sizeof(msg.payload.memory.padding);
|
||||
msg.hdr.size += fd_num * sizeof(VhostUserMemoryRegion);
|
||||
|
||||
if (vhost_user_write(dev, &msg, fds, fd_num) < 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (vhost_user_read(dev, &msg_reply) < 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (msg_reply.hdr.request != VHOST_USER_SET_MEM_TABLE) {
|
||||
error_report("%s: Received unexpected msg type."
|
||||
"Expected %d received %d", __func__,
|
||||
VHOST_USER_SET_MEM_TABLE, msg_reply.hdr.request);
|
||||
return -1;
|
||||
}
|
||||
/* We're using the same structure, just reusing one of the
|
||||
* fields, so it should be the same size.
|
||||
*/
|
||||
if (msg_reply.hdr.size != msg.hdr.size) {
|
||||
error_report("%s: Unexpected size for postcopy reply "
|
||||
"%d vs %d", __func__, msg_reply.hdr.size, msg.hdr.size);
|
||||
return -1;
|
||||
}
|
||||
|
||||
memset(u->postcopy_client_bases, 0,
|
||||
sizeof(uint64_t) * VHOST_MEMORY_MAX_NREGIONS);
|
||||
|
||||
/* They're in the same order as the regions that were sent
|
||||
* but some of the regions were skipped (above) if they
|
||||
* didn't have fd's
|
||||
*/
|
||||
for (msg_i = 0, region_i = 0;
|
||||
region_i < dev->mem->nregions;
|
||||
region_i++) {
|
||||
if (msg_i < fd_num &&
|
||||
msg_reply.payload.memory.regions[msg_i].guest_phys_addr ==
|
||||
dev->mem->regions[region_i].guest_phys_addr) {
|
||||
u->postcopy_client_bases[region_i] =
|
||||
msg_reply.payload.memory.regions[msg_i].userspace_addr;
|
||||
trace_vhost_user_set_mem_table_postcopy(
|
||||
msg_reply.payload.memory.regions[msg_i].userspace_addr,
|
||||
msg.payload.memory.regions[msg_i].userspace_addr,
|
||||
msg_i, region_i);
|
||||
msg_i++;
|
||||
}
|
||||
}
|
||||
if (msg_i != fd_num) {
|
||||
error_report("%s: postcopy reply not fully consumed "
|
||||
"%d vs %zd",
|
||||
__func__, msg_i, fd_num);
|
||||
return -1;
|
||||
}
|
||||
/* Now we've registered this with the postcopy code, we ack to the client,
|
||||
* because now we're in the position to be able to deal with any faults
|
||||
* it generates.
|
||||
*/
|
||||
/* TODO: Use this for failure cases as well with a bad value */
|
||||
msg.hdr.size = sizeof(msg.payload.u64);
|
||||
msg.payload.u64 = 0; /* OK */
|
||||
if (vhost_user_write(dev, &msg, NULL, 0) < 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (reply_supported) {
|
||||
return process_message_reply(dev, &msg);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int vhost_user_set_mem_table(struct vhost_dev *dev,
|
||||
struct vhost_memory *mem)
|
||||
{
|
||||
struct vhost_user *u = dev->opaque;
|
||||
int fds[VHOST_MEMORY_MAX_NREGIONS];
|
||||
int i, fd;
|
||||
size_t fd_num = 0;
|
||||
bool do_postcopy = u->postcopy_listen && u->postcopy_fd.handler;
|
||||
bool reply_supported = virtio_has_feature(dev->protocol_features,
|
||||
VHOST_USER_PROTOCOL_F_REPLY_ACK) &&
|
||||
!do_postcopy;
|
||||
|
||||
if (do_postcopy) {
|
||||
/* Postcopy has enough differences that it's best done in its own
|
||||
* version
|
||||
*/
|
||||
return vhost_user_set_mem_table_postcopy(dev, mem);
|
||||
}
|
||||
|
||||
VhostUserMsg msg = {
|
||||
.hdr.request = VHOST_USER_SET_MEM_TABLE,
|
||||
@ -362,9 +537,11 @@ static int vhost_user_set_mem_table(struct vhost_dev *dev,
|
||||
error_report("Failed preparing vhost-user memory table msg");
|
||||
return -1;
|
||||
}
|
||||
msg.payload.memory.regions[fd_num].userspace_addr = reg->userspace_addr;
|
||||
msg.payload.memory.regions[fd_num].userspace_addr =
|
||||
reg->userspace_addr;
|
||||
msg.payload.memory.regions[fd_num].memory_size = reg->memory_size;
|
||||
msg.payload.memory.regions[fd_num].guest_phys_addr = reg->guest_phys_addr;
|
||||
msg.payload.memory.regions[fd_num].guest_phys_addr =
|
||||
reg->guest_phys_addr;
|
||||
msg.payload.memory.regions[fd_num].mmap_offset = offset;
|
||||
fds[fd_num++] = fd;
|
||||
}
|
||||
@ -791,6 +968,219 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Called back from the postcopy fault thread when a fault is received on our
|
||||
* ufd.
|
||||
* TODO: This is Linux specific
|
||||
*/
|
||||
static int vhost_user_postcopy_fault_handler(struct PostCopyFD *pcfd,
|
||||
void *ufd)
|
||||
{
|
||||
struct vhost_dev *dev = pcfd->data;
|
||||
struct vhost_user *u = dev->opaque;
|
||||
struct uffd_msg *msg = ufd;
|
||||
uint64_t faultaddr = msg->arg.pagefault.address;
|
||||
RAMBlock *rb = NULL;
|
||||
uint64_t rb_offset;
|
||||
int i;
|
||||
|
||||
trace_vhost_user_postcopy_fault_handler(pcfd->idstr, faultaddr,
|
||||
dev->mem->nregions);
|
||||
for (i = 0; i < MIN(dev->mem->nregions, u->region_rb_len); i++) {
|
||||
trace_vhost_user_postcopy_fault_handler_loop(i,
|
||||
u->postcopy_client_bases[i], dev->mem->regions[i].memory_size);
|
||||
if (faultaddr >= u->postcopy_client_bases[i]) {
|
||||
/* Ofset of the fault address in the vhost region */
|
||||
uint64_t region_offset = faultaddr - u->postcopy_client_bases[i];
|
||||
if (region_offset < dev->mem->regions[i].memory_size) {
|
||||
rb_offset = region_offset + u->region_rb_offset[i];
|
||||
trace_vhost_user_postcopy_fault_handler_found(i,
|
||||
region_offset, rb_offset);
|
||||
rb = u->region_rb[i];
|
||||
return postcopy_request_shared_page(pcfd, rb, faultaddr,
|
||||
rb_offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
error_report("%s: Failed to find region for fault %" PRIx64,
|
||||
__func__, faultaddr);
|
||||
return -1;
|
||||
}
|
||||
|
||||
/*
 * Wake callback: given a RAMBlock and an offset within it, find the vhost
 * region covering that offset, translate it into an address in the client's
 * address space and wake it with postcopy_wake_shared().
 *
 * Returns the result of postcopy_wake_shared() on a match; 0 when the
 * device has no vhost-user state or no region covers the offset.
 */
static int vhost_user_postcopy_waker(struct PostCopyFD *pcfd, RAMBlock *rb,
                                     uint64_t offset)
{
    struct vhost_dev *dev = pcfd->data;
    struct vhost_user *u = dev->opaque;
    int i;

    trace_vhost_user_postcopy_waker(qemu_ram_get_idstr(rb), offset);

    if (!u) {
        return 0;
    }

    /* Translate the offset into an address in the clients address space */
    for (i = 0; i < MIN(dev->mem->nregions, u->region_rb_len); i++) {
        ram_addr_t region_start;
        uint64_t client_addr;

        if (u->region_rb[i] != rb) {
            continue;
        }

        region_start = u->region_rb_offset[i];
        if (offset < region_start ||
            offset >= (region_start + dev->mem->regions[i].memory_size)) {
            continue;
        }

        client_addr = (offset - region_start) + u->postcopy_client_bases[i];
        trace_vhost_user_postcopy_waker_found(client_addr);
        return postcopy_wake_shared(pcfd, client_addr, rb);
    }

    trace_vhost_user_postcopy_waker_nomatch(qemu_ram_get_idstr(rb), offset);
    return 0;
}
|
||||
|
||||
/*
|
||||
* Called at the start of an inbound postcopy on reception of the
|
||||
* 'advise' command.
|
||||
*/
|
||||
static int vhost_user_postcopy_advise(struct vhost_dev *dev, Error **errp)
|
||||
{
|
||||
struct vhost_user *u = dev->opaque;
|
||||
CharBackend *chr = u->chr;
|
||||
int ufd;
|
||||
VhostUserMsg msg = {
|
||||
.hdr.request = VHOST_USER_POSTCOPY_ADVISE,
|
||||
.hdr.flags = VHOST_USER_VERSION,
|
||||
};
|
||||
|
||||
if (vhost_user_write(dev, &msg, NULL, 0) < 0) {
|
||||
error_setg(errp, "Failed to send postcopy_advise to vhost");
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (vhost_user_read(dev, &msg) < 0) {
|
||||
error_setg(errp, "Failed to get postcopy_advise reply from vhost");
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (msg.hdr.request != VHOST_USER_POSTCOPY_ADVISE) {
|
||||
error_setg(errp, "Unexpected msg type. Expected %d received %d",
|
||||
VHOST_USER_POSTCOPY_ADVISE, msg.hdr.request);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (msg.hdr.size) {
|
||||
error_setg(errp, "Received bad msg size.");
|
||||
return -1;
|
||||
}
|
||||
ufd = qemu_chr_fe_get_msgfd(chr);
|
||||
if (ufd < 0) {
|
||||
error_setg(errp, "%s: Failed to get ufd", __func__);
|
||||
return -1;
|
||||
}
|
||||
fcntl(ufd, F_SETFL, O_NONBLOCK);
|
||||
|
||||
/* register ufd with userfault thread */
|
||||
u->postcopy_fd.fd = ufd;
|
||||
u->postcopy_fd.data = dev;
|
||||
u->postcopy_fd.handler = vhost_user_postcopy_fault_handler;
|
||||
u->postcopy_fd.waker = vhost_user_postcopy_waker;
|
||||
u->postcopy_fd.idstr = "vhost-user"; /* Need to find unique name */
|
||||
postcopy_register_shared_ufd(&u->postcopy_fd);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Called at the switch to postcopy on reception of the 'listen' command.
|
||||
*/
|
||||
static int vhost_user_postcopy_listen(struct vhost_dev *dev, Error **errp)
|
||||
{
|
||||
struct vhost_user *u = dev->opaque;
|
||||
int ret;
|
||||
VhostUserMsg msg = {
|
||||
.hdr.request = VHOST_USER_POSTCOPY_LISTEN,
|
||||
.hdr.flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY_MASK,
|
||||
};
|
||||
u->postcopy_listen = true;
|
||||
trace_vhost_user_postcopy_listen();
|
||||
if (vhost_user_write(dev, &msg, NULL, 0) < 0) {
|
||||
error_setg(errp, "Failed to send postcopy_listen to vhost");
|
||||
return -1;
|
||||
}
|
||||
|
||||
ret = process_message_reply(dev, &msg);
|
||||
if (ret) {
|
||||
error_setg(errp, "Failed to receive reply to postcopy_listen");
|
||||
return ret;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Called at the end of postcopy
|
||||
*/
|
||||
static int vhost_user_postcopy_end(struct vhost_dev *dev, Error **errp)
|
||||
{
|
||||
VhostUserMsg msg = {
|
||||
.hdr.request = VHOST_USER_POSTCOPY_END,
|
||||
.hdr.flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY_MASK,
|
||||
};
|
||||
int ret;
|
||||
struct vhost_user *u = dev->opaque;
|
||||
|
||||
trace_vhost_user_postcopy_end_entry();
|
||||
if (vhost_user_write(dev, &msg, NULL, 0) < 0) {
|
||||
error_setg(errp, "Failed to send postcopy_end to vhost");
|
||||
return -1;
|
||||
}
|
||||
|
||||
ret = process_message_reply(dev, &msg);
|
||||
if (ret) {
|
||||
error_setg(errp, "Failed to receive reply to postcopy_end");
|
||||
return ret;
|
||||
}
|
||||
postcopy_unregister_shared_ufd(&u->postcopy_fd);
|
||||
u->postcopy_fd.handler = NULL;
|
||||
|
||||
trace_vhost_user_postcopy_end_exit();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Postcopy notifier callback for a vhost-user device.
 *
 * Dispatches on the notification reason: PROBE checks that the backend
 * negotiated the PAGEFAULT protocol feature; INBOUND_ADVISE / LISTEN / END
 * are forwarded to the matching helper.  Unknown reasons are ignored.
 *
 * Returns 0 on success or for ignored reasons, -ENOENT when the backend
 * cannot do postcopy, otherwise the helper's return value.
 */
static int vhost_user_postcopy_notifier(NotifierWithReturn *notifier,
                                        void *opaque)
{
    struct vhost_user *u = container_of(notifier, struct vhost_user,
                                        postcopy_notifier);
    struct vhost_dev *dev = u->dev;
    struct PostcopyNotifyData *pnd = opaque;

    switch (pnd->reason) {
    case POSTCOPY_NOTIFY_PROBE:
        if (virtio_has_feature(dev->protocol_features,
                               VHOST_USER_PROTOCOL_F_PAGEFAULT)) {
            return 0;
        }
        /* TODO: Get the device name into this error somehow */
        error_setg(pnd->errp,
                   "vhost-user backend not capable of postcopy");
        return -ENOENT;

    case POSTCOPY_NOTIFY_INBOUND_ADVISE:
        return vhost_user_postcopy_advise(dev, pnd->errp);

    case POSTCOPY_NOTIFY_INBOUND_LISTEN:
        return vhost_user_postcopy_listen(dev, pnd->errp);

    case POSTCOPY_NOTIFY_INBOUND_END:
        return vhost_user_postcopy_end(dev, pnd->errp);

    default:
        /* We ignore notifications we don't know */
        break;
    }

    return 0;
}
|
||||
|
||||
static int vhost_user_init(struct vhost_dev *dev, void *opaque)
|
||||
{
|
||||
uint64_t features, protocol_features;
|
||||
@ -802,6 +1192,7 @@ static int vhost_user_init(struct vhost_dev *dev, void *opaque)
|
||||
u = g_new0(struct vhost_user, 1);
|
||||
u->chr = opaque;
|
||||
u->slave_fd = -1;
|
||||
u->dev = dev;
|
||||
dev->opaque = u;
|
||||
|
||||
err = vhost_user_get_features(dev, &features);
|
||||
@ -858,6 +1249,9 @@ static int vhost_user_init(struct vhost_dev *dev, void *opaque)
|
||||
return err;
|
||||
}
|
||||
|
||||
u->postcopy_notifier.notify = vhost_user_postcopy_notifier;
|
||||
postcopy_add_notifier(&u->postcopy_notifier);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -868,11 +1262,20 @@ static int vhost_user_cleanup(struct vhost_dev *dev)
|
||||
assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER);
|
||||
|
||||
u = dev->opaque;
|
||||
if (u->postcopy_notifier.notify) {
|
||||
postcopy_remove_notifier(&u->postcopy_notifier);
|
||||
u->postcopy_notifier.notify = NULL;
|
||||
}
|
||||
if (u->slave_fd >= 0) {
|
||||
qemu_set_fd_handler(u->slave_fd, NULL, NULL, NULL);
|
||||
close(u->slave_fd);
|
||||
u->slave_fd = -1;
|
||||
}
|
||||
g_free(u->region_rb);
|
||||
u->region_rb = NULL;
|
||||
g_free(u->region_rb_offset);
|
||||
u->region_rb_offset = NULL;
|
||||
u->region_rb_len = 0;
|
||||
g_free(u);
|
||||
dev->opaque = 0;
|
||||
|
||||
|
@ -522,10 +522,28 @@ static void vhost_region_add_section(struct vhost_dev *dev,
|
||||
uint64_t mrs_gpa = section->offset_within_address_space;
|
||||
uintptr_t mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) +
|
||||
section->offset_within_region;
|
||||
RAMBlock *mrs_rb = section->mr->ram_block;
|
||||
size_t mrs_page = qemu_ram_pagesize(mrs_rb);
|
||||
|
||||
trace_vhost_region_add_section(section->mr->name, mrs_gpa, mrs_size,
|
||||
mrs_host);
|
||||
|
||||
/* Round the section to its page size */
|
||||
/* First align the start down to a page boundary */
|
||||
uint64_t alignage = mrs_host & (mrs_page - 1);
|
||||
if (alignage) {
|
||||
mrs_host -= alignage;
|
||||
mrs_size += alignage;
|
||||
mrs_gpa -= alignage;
|
||||
}
|
||||
/* Now align the size up to a page boundary */
|
||||
alignage = mrs_size & (mrs_page - 1);
|
||||
if (alignage) {
|
||||
mrs_size += mrs_page - alignage;
|
||||
}
|
||||
trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa, mrs_size,
|
||||
mrs_host);
|
||||
|
||||
if (dev->n_tmp_sections) {
|
||||
/* Since we already have at least one section, lets see if
|
||||
* this extends it; since we're scanning in order, we only
|
||||
@ -542,18 +560,46 @@ static void vhost_region_add_section(struct vhost_dev *dev,
|
||||
prev_sec->offset_within_region;
|
||||
uint64_t prev_host_end = range_get_last(prev_host_start, prev_size);
|
||||
|
||||
if (prev_gpa_end + 1 == mrs_gpa &&
|
||||
prev_host_end + 1 == mrs_host &&
|
||||
section->mr == prev_sec->mr &&
|
||||
(!dev->vhost_ops->vhost_backend_can_merge ||
|
||||
dev->vhost_ops->vhost_backend_can_merge(dev,
|
||||
if (mrs_gpa <= (prev_gpa_end + 1)) {
|
||||
/* OK, looks like overlapping/intersecting - it's possible that
|
||||
* the rounding to page sizes has made them overlap, but they should
|
||||
* match up in the same RAMBlock if they do.
|
||||
*/
|
||||
if (mrs_gpa < prev_gpa_start) {
|
||||
error_report("%s:Section rounded to %"PRIx64
|
||||
" prior to previous %"PRIx64,
|
||||
__func__, mrs_gpa, prev_gpa_start);
|
||||
/* A way to cleanly fail here would be better */
|
||||
return;
|
||||
}
|
||||
/* Offset from the start of the previous GPA to this GPA */
|
||||
size_t offset = mrs_gpa - prev_gpa_start;
|
||||
|
||||
if (prev_host_start + offset == mrs_host &&
|
||||
section->mr == prev_sec->mr &&
|
||||
(!dev->vhost_ops->vhost_backend_can_merge ||
|
||||
dev->vhost_ops->vhost_backend_can_merge(dev,
|
||||
mrs_host, mrs_size,
|
||||
prev_host_start, prev_size))) {
|
||||
/* The two sections abut */
|
||||
need_add = false;
|
||||
prev_sec->size = int128_add(prev_sec->size, section->size);
|
||||
trace_vhost_region_add_section_abut(section->mr->name,
|
||||
mrs_size + prev_size);
|
||||
uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size);
|
||||
need_add = false;
|
||||
prev_sec->offset_within_address_space =
|
||||
MIN(prev_gpa_start, mrs_gpa);
|
||||
prev_sec->offset_within_region =
|
||||
MIN(prev_host_start, mrs_host) -
|
||||
(uintptr_t)memory_region_get_ram_ptr(prev_sec->mr);
|
||||
prev_sec->size = int128_make64(max_end - MIN(prev_host_start,
|
||||
mrs_host));
|
||||
trace_vhost_region_add_section_merge(section->mr->name,
|
||||
int128_get64(prev_sec->size),
|
||||
prev_sec->offset_within_address_space,
|
||||
prev_sec->offset_within_region);
|
||||
} else {
|
||||
error_report("%s: Overlapping but not coherent sections "
|
||||
"at %"PRIx64,
|
||||
__func__, mrs_gpa);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -68,10 +68,14 @@ ram_addr_t qemu_ram_addr_from_host(void *ptr);
|
||||
RAMBlock *qemu_ram_block_by_name(const char *name);
|
||||
RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
|
||||
ram_addr_t *offset);
|
||||
ram_addr_t qemu_ram_block_host_offset(RAMBlock *rb, void *host);
|
||||
void qemu_ram_set_idstr(RAMBlock *block, const char *name, DeviceState *dev);
|
||||
void qemu_ram_unset_idstr(RAMBlock *block);
|
||||
const char *qemu_ram_get_idstr(RAMBlock *rb);
|
||||
bool qemu_ram_is_shared(RAMBlock *rb);
|
||||
bool qemu_ram_is_uf_zeroable(RAMBlock *rb);
|
||||
void qemu_ram_set_uf_zeroable(RAMBlock *rb);
|
||||
|
||||
size_t qemu_ram_pagesize(RAMBlock *block);
|
||||
size_t qemu_ram_pagesize_largest(void);
|
||||
|
||||
|
@ -40,18 +40,6 @@ enum {
|
||||
ACPI_FADT_F_LOW_POWER_S0_IDLE_CAPABLE,
|
||||
};
|
||||
|
||||
/*
|
||||
* ACPI 2.0 Generic Address Space definition.
|
||||
*/
|
||||
/* Packed (QEMU_PACKED) layout: four byte-sized fields followed by a 64-bit address. */
struct Acpi20GenericAddress {
|
||||
uint8_t address_space_id; /* presumably the same role as AcpiGenericAddress.space_id - confirm */
|
||||
uint8_t register_bit_width; /* presumably the register size in bits - confirm */
|
||||
uint8_t register_bit_offset; /* presumably the bit offset within the register - confirm */
|
||||
uint8_t reserved;
|
||||
uint64_t address; /* 64-bit address field */
|
||||
} QEMU_PACKED;
|
||||
typedef struct Acpi20GenericAddress Acpi20GenericAddress;
|
||||
|
||||
struct AcpiRsdpDescriptor { /* Root System Descriptor Pointer */
|
||||
uint64_t signature; /* ACPI signature, contains "RSD PTR " */
|
||||
uint8_t checksum; /* To make sum of struct == 0 */
|
||||
@ -87,104 +75,44 @@ struct AcpiTableHeader {
|
||||
} QEMU_PACKED;
|
||||
typedef struct AcpiTableHeader AcpiTableHeader;
|
||||
|
||||
/*
|
||||
* ACPI Fixed ACPI Description Table (FADT)
|
||||
*/
|
||||
#define ACPI_FADT_COMMON_DEF /* FADT common definition */ \
|
||||
ACPI_TABLE_HEADER_DEF /* ACPI common table header */ \
|
||||
uint32_t firmware_ctrl; /* Physical address of FACS */ \
|
||||
uint32_t dsdt; /* Physical address of DSDT */ \
|
||||
uint8_t model; /* System Interrupt Model */ \
|
||||
uint8_t reserved1; /* Reserved */ \
|
||||
uint16_t sci_int; /* System vector of SCI interrupt */ \
|
||||
uint32_t smi_cmd; /* Port address of SMI command port */ \
|
||||
uint8_t acpi_enable; /* Value to write to smi_cmd to enable ACPI */ \
|
||||
uint8_t acpi_disable; /* Value to write to smi_cmd to disable ACPI */ \
|
||||
/* Value to write to SMI CMD to enter S4BIOS state */ \
|
||||
uint8_t S4bios_req; \
|
||||
uint8_t reserved2; /* Reserved - must be zero */ \
|
||||
/* Port address of Power Mgt 1a acpi_event Reg Blk */ \
|
||||
uint32_t pm1a_evt_blk; \
|
||||
/* Port address of Power Mgt 1b acpi_event Reg Blk */ \
|
||||
uint32_t pm1b_evt_blk; \
|
||||
uint32_t pm1a_cnt_blk; /* Port address of Power Mgt 1a Control Reg Blk */ \
|
||||
uint32_t pm1b_cnt_blk; /* Port address of Power Mgt 1b Control Reg Blk */ \
|
||||
uint32_t pm2_cnt_blk; /* Port address of Power Mgt 2 Control Reg Blk */ \
|
||||
uint32_t pm_tmr_blk; /* Port address of Power Mgt Timer Ctrl Reg Blk */ \
|
||||
/* Port addr of General Purpose acpi_event 0 Reg Blk */ \
|
||||
uint32_t gpe0_blk; \
|
||||
/* Port addr of General Purpose acpi_event 1 Reg Blk */ \
|
||||
uint32_t gpe1_blk; \
|
||||
uint8_t pm1_evt_len; /* Byte length of ports at pm1_x_evt_blk */ \
|
||||
uint8_t pm1_cnt_len; /* Byte length of ports at pm1_x_cnt_blk */ \
|
||||
uint8_t pm2_cnt_len; /* Byte Length of ports at pm2_cnt_blk */ \
|
||||
uint8_t pm_tmr_len; /* Byte Length of ports at pm_tm_blk */ \
|
||||
uint8_t gpe0_blk_len; /* Byte Length of ports at gpe0_blk */ \
|
||||
uint8_t gpe1_blk_len; /* Byte Length of ports at gpe1_blk */ \
|
||||
uint8_t gpe1_base; /* Offset in gpe model where gpe1 events start */ \
|
||||
uint8_t reserved3; /* Reserved */ \
|
||||
uint16_t plvl2_lat; /* Worst case HW latency to enter/exit C2 state */ \
|
||||
uint16_t plvl3_lat; /* Worst case HW latency to enter/exit C3 state */ \
|
||||
uint16_t flush_size; /* Size of area read to flush caches */ \
|
||||
uint16_t flush_stride; /* Stride used in flushing caches */ \
|
||||
uint8_t duty_offset; /* Bit location of duty cycle field in p_cnt reg */ \
|
||||
uint8_t duty_width; /* Bit width of duty cycle field in p_cnt reg */ \
|
||||
uint8_t day_alrm; /* Index to day-of-month alarm in RTC CMOS RAM */ \
|
||||
uint8_t mon_alrm; /* Index to month-of-year alarm in RTC CMOS RAM */ \
|
||||
uint8_t century; /* Index to century in RTC CMOS RAM */ \
|
||||
/* IA-PC Boot Architecture Flags (see below for individual flags) */ \
|
||||
uint16_t boot_flags; \
|
||||
uint8_t reserved; /* Reserved, must be zero */ \
|
||||
/* Miscellaneous flag bits (see below for individual flags) */ \
|
||||
uint32_t flags; \
|
||||
/* 64-bit address of the Reset register */ \
|
||||
struct AcpiGenericAddress reset_register; \
|
||||
/* Value to write to the reset_register port to reset the system */ \
|
||||
uint8_t reset_value; \
|
||||
/* ARM-Specific Boot Flags (see below for individual flags) (ACPI 5.1) */ \
|
||||
uint16_t arm_boot_flags; \
|
||||
uint8_t minor_revision; /* FADT Minor Revision (ACPI 5.1) */ \
|
||||
uint64_t x_facs; /* 64-bit physical address of FACS */ \
|
||||
uint64_t x_dsdt; /* 64-bit physical address of DSDT */ \
|
||||
/* 64-bit Extended Power Mgt 1a Event Reg Blk address */ \
|
||||
struct AcpiGenericAddress xpm1a_event_block; \
|
||||
/* 64-bit Extended Power Mgt 1b Event Reg Blk address */ \
|
||||
struct AcpiGenericAddress xpm1b_event_block; \
|
||||
/* 64-bit Extended Power Mgt 1a Control Reg Blk address */ \
|
||||
struct AcpiGenericAddress xpm1a_control_block; \
|
||||
/* 64-bit Extended Power Mgt 1b Control Reg Blk address */ \
|
||||
struct AcpiGenericAddress xpm1b_control_block; \
|
||||
/* 64-bit Extended Power Mgt 2 Control Reg Blk address */ \
|
||||
struct AcpiGenericAddress xpm2_control_block; \
|
||||
/* 64-bit Extended Power Mgt Timer Ctrl Reg Blk address */ \
|
||||
struct AcpiGenericAddress xpm_timer_block; \
|
||||
/* 64-bit Extended General Purpose Event 0 Reg Blk address */ \
|
||||
struct AcpiGenericAddress xgpe0_block; \
|
||||
/* 64-bit Extended General Purpose Event 1 Reg Blk address */ \
|
||||
struct AcpiGenericAddress xgpe1_block; \
|
||||
|
||||
struct AcpiGenericAddress {
|
||||
uint8_t space_id; /* Address space where struct or register exists */
|
||||
uint8_t bit_width; /* Size in bits of given register */
|
||||
uint8_t bit_offset; /* Bit offset within the register */
|
||||
uint8_t access_width; /* Minimum Access size (ACPI 3.0) */
|
||||
uint8_t access_width; /* ACPI 3.0: Minimum Access size (ACPI 3.0),
|
||||
ACPI 2.0: Reserved, Table 5-1 */
|
||||
uint64_t address; /* 64-bit address of struct or register */
|
||||
} QEMU_PACKED;
|
||||
|
||||
struct AcpiFadtDescriptorRev3 {
|
||||
ACPI_FADT_COMMON_DEF
|
||||
} QEMU_PACKED;
|
||||
typedef struct AcpiFadtDescriptorRev3 AcpiFadtDescriptorRev3;
|
||||
typedef struct AcpiFadtData {
|
||||
struct AcpiGenericAddress pm1a_cnt; /* PM1a_CNT_BLK */
|
||||
struct AcpiGenericAddress pm1a_evt; /* PM1a_EVT_BLK */
|
||||
struct AcpiGenericAddress pm_tmr; /* PM_TMR_BLK */
|
||||
struct AcpiGenericAddress gpe0_blk; /* GPE0_BLK */
|
||||
struct AcpiGenericAddress reset_reg; /* RESET_REG */
|
||||
uint8_t reset_val; /* RESET_VALUE */
|
||||
uint8_t rev; /* Revision */
|
||||
uint32_t flags; /* Flags */
|
||||
uint32_t smi_cmd; /* SMI_CMD */
|
||||
uint16_t sci_int; /* SCI_INT */
|
||||
uint8_t int_model; /* INT_MODEL */
|
||||
uint8_t acpi_enable_cmd; /* ACPI_ENABLE */
|
||||
uint8_t acpi_disable_cmd; /* ACPI_DISABLE */
|
||||
uint8_t rtc_century; /* CENTURY */
|
||||
uint16_t plvl2_lat; /* P_LVL2_LAT */
|
||||
uint16_t plvl3_lat; /* P_LVL3_LAT */
|
||||
uint16_t arm_boot_arch; /* ARM_BOOT_ARCH */
|
||||
uint8_t minor_ver; /* FADT Minor Version */
|
||||
|
||||
struct AcpiFadtDescriptorRev5_1 {
|
||||
ACPI_FADT_COMMON_DEF
|
||||
/* 64-bit Sleep Control register (ACPI 5.0) */
|
||||
struct AcpiGenericAddress sleep_control;
|
||||
/* 64-bit Sleep Status register (ACPI 5.0) */
|
||||
struct AcpiGenericAddress sleep_status;
|
||||
} QEMU_PACKED;
|
||||
|
||||
typedef struct AcpiFadtDescriptorRev5_1 AcpiFadtDescriptorRev5_1;
|
||||
/*
|
||||
* respective tables offsets within ACPI_BUILD_TABLE_FILE,
|
||||
* NULL if table doesn't exist (in that case field's value
|
||||
* won't be patched by linker and will be kept set to 0)
|
||||
*/
|
||||
unsigned *facs_tbl_offset; /* FACS offset in */
|
||||
unsigned *dsdt_tbl_offset;
|
||||
unsigned *xdsdt_tbl_offset;
|
||||
} AcpiFadtData;
|
||||
|
||||
#define ACPI_FADT_ARM_PSCI_COMPLIANT (1 << 0)
|
||||
#define ACPI_FADT_ARM_PSCI_USE_HVC (1 << 1)
|
||||
@ -456,7 +384,7 @@ typedef struct AcpiGenericTimerTable AcpiGenericTimerTable;
|
||||
struct Acpi20Hpet {
|
||||
ACPI_TABLE_HEADER_DEF /* ACPI common table header */
|
||||
uint32_t timer_block_id;
|
||||
Acpi20GenericAddress addr;
|
||||
struct AcpiGenericAddress addr;
|
||||
uint8_t hpet_number;
|
||||
uint16_t min_tick;
|
||||
uint8_t page_protect;
|
||||
|
@ -77,6 +77,15 @@ typedef enum {
|
||||
AML_WRITE_AS_ZEROS = 2,
|
||||
} AmlUpdateRule;
|
||||
|
||||
/* Address space identifiers, passed as the 'as' argument of build_append_gas() */
typedef enum {
    AML_AS_SYSTEM_MEMORY = 0x00,
    AML_AS_SYSTEM_IO     = 0x01,
    AML_AS_PCI_CONFIG    = 0x02,
    AML_AS_EMBEDDED_CTRL = 0x03,
    AML_AS_SMBUS         = 0x04,
    AML_AS_FFH           = 0x7f,
} AmlAddressSpace;
|
||||
|
||||
typedef enum {
|
||||
AML_SYSTEM_MEMORY = 0X00,
|
||||
AML_SYSTEM_IO = 0X01,
|
||||
@ -389,8 +398,22 @@ int
|
||||
build_append_named_dword(GArray *array, const char *name_format, ...)
|
||||
GCC_FMT_ATTR(2, 3);
|
||||
|
||||
void build_append_gas(GArray *table, AmlAddressSpace as,
|
||||
uint8_t bit_width, uint8_t bit_offset,
|
||||
uint8_t access_width, uint64_t address);
|
||||
|
||||
static inline void
|
||||
build_append_gas_from_struct(GArray *table, const struct AcpiGenericAddress *s)
|
||||
{
|
||||
build_append_gas(table, s->space_id, s->bit_width, s->bit_offset,
|
||||
s->access_width, s->address);
|
||||
}
|
||||
|
||||
void build_srat_memory(AcpiSratMemoryAffinity *numamem, uint64_t base,
|
||||
uint64_t len, int node, MemoryAffinityFlags flags);
|
||||
|
||||
void build_slit(GArray *table_data, BIOSLinker *linker);
|
||||
|
||||
void build_fadt(GArray *tbl, BIOSLinker *linker, const AcpiFadtData *f,
|
||||
const char *oem_id, const char *oem_table_id);
|
||||
#endif
|
||||
|
@ -5,6 +5,9 @@
|
||||
#include "hw/hw.h"
|
||||
#include "exec/memory.h"
|
||||
|
||||
#define APM_CNT_IOPORT 0xb2
|
||||
#define ACPI_PORT_SMI_CMD APM_CNT_IOPORT
|
||||
|
||||
typedef void (*apm_ctrl_changed_t)(uint32_t val, void *arg);
|
||||
|
||||
typedef struct APMState {
|
||||
|
@ -93,7 +93,7 @@ uint64_t pc_dimm_get_free_addr(uint64_t address_space_start,
|
||||
|
||||
int pc_dimm_get_free_slot(const int *hint, int max_slots, Error **errp);
|
||||
|
||||
int qmp_pc_dimm_device_list(Object *obj, void *opaque);
|
||||
MemoryDeviceInfoList *qmp_pc_dimm_device_list(void);
|
||||
uint64_t pc_existing_dimms_capacity(Error **errp);
|
||||
uint64_t get_plugged_memory_size(void);
|
||||
void pc_dimm_memory_plug(DeviceState *dev, MemoryHotplugState *hpms,
|
||||
|
@ -217,7 +217,6 @@ typedef struct PCIDeviceClass {
|
||||
DeviceClass parent_class;
|
||||
|
||||
void (*realize)(PCIDevice *dev, Error **errp);
|
||||
int (*init)(PCIDevice *dev);/* TODO convert to realize() and remove */
|
||||
PCIUnregisterFunc *exit;
|
||||
PCIConfigReadFunc *config_read;
|
||||
PCIConfigWriteFunc *config_write;
|
||||
|
@ -38,6 +38,9 @@ typedef struct virtio_net_conf
|
||||
uint16_t rx_queue_size;
|
||||
uint16_t tx_queue_size;
|
||||
uint16_t mtu;
|
||||
int32_t speed;
|
||||
char *duplex_str;
|
||||
uint8_t duplex;
|
||||
} virtio_net_conf;
|
||||
|
||||
/* Maximum packet size we can receive from tap device: header + 64k */
|
||||
@ -67,7 +70,7 @@ typedef struct VirtIONet {
|
||||
uint32_t has_vnet_hdr;
|
||||
size_t host_hdr_len;
|
||||
size_t guest_hdr_len;
|
||||
uint32_t host_features;
|
||||
uint64_t host_features;
|
||||
uint8_t has_ufo;
|
||||
uint32_t mergeable_rx_bufs;
|
||||
uint8_t promisc;
|
||||
|
1821
include/standard-headers/linux/ethtool.h
Normal file
1821
include/standard-headers/linux/ethtool.h
Normal file
File diff suppressed because it is too large
Load Diff
15
include/standard-headers/linux/kernel.h
Normal file
15
include/standard-headers/linux/kernel.h
Normal file
@ -0,0 +1,15 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
|
||||
#ifndef _LINUX_KERNEL_H
|
||||
#define _LINUX_KERNEL_H
|
||||
|
||||
#include "standard-headers/linux/sysinfo.h"
|
||||
|
||||
/*
|
||||
* 'kernel.h' contains some often-used function prototypes etc
|
||||
*/
|
||||
/*
 * __ALIGN_KERNEL(x, a): add (a - 1) to x and clear the bits of (a - 1);
 * this rounds x up to a multiple of a when a is a power of two.
 * The cast uses __typeof__ rather than typeof so the macro also expands
 * correctly when the including file is compiled in a strict ISO C mode.
 */
#define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (__typeof__(x))(a) - 1)
/* __ALIGN_KERNEL_MASK(x, mask): add mask to x, then clear the mask bits. */
#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask))

/* __KERNEL_DIV_ROUND_UP(n, d): integer division of n by d, rounded up. */
#define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
|
||||
|
||||
#endif /* _LINUX_KERNEL_H */
|
25
include/standard-headers/linux/sysinfo.h
Normal file
25
include/standard-headers/linux/sysinfo.h
Normal file
@ -0,0 +1,25 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
|
||||
#ifndef _LINUX_SYSINFO_H
|
||||
#define _LINUX_SYSINFO_H
|
||||
|
||||
#include "standard-headers/linux/types.h"
|
||||
|
||||
#define SI_LOAD_SHIFT 16
|
||||
/* System statistics record; member order and types are part of the layout. */
struct sysinfo {
    long uptime;                /* Seconds since boot */
    unsigned long loads[3];     /* 1, 5, and 15 minute load averages */
    unsigned long totalram;     /* Total usable main memory size */
    unsigned long freeram;      /* Available memory size */
    unsigned long sharedram;    /* Amount of shared memory */
    unsigned long bufferram;    /* Memory used by buffers */
    unsigned long totalswap;    /* Total swap space size */
    unsigned long freeswap;     /* swap space still available */
    uint16_t procs;             /* Number of current processes */
    uint16_t pad;               /* Explicit padding for m68k */
    unsigned long totalhigh;    /* Total high memory size */
    unsigned long freehigh;     /* Available high memory size */
    uint32_t mem_unit;          /* Memory unit size in bytes */
    /* Padding: libc5 uses this.. */
    char _f[20 - 2 * sizeof(unsigned long) - sizeof(uint32_t)];
};
|
||||
|
||||
#endif /* _LINUX_SYSINFO_H */
|
@ -155,6 +155,8 @@ MigrationIncomingState *migration_incoming_get_current(void)
|
||||
if (!once) {
|
||||
mis_current.state = MIGRATION_STATUS_NONE;
|
||||
memset(&mis_current, 0, sizeof(MigrationIncomingState));
|
||||
mis_current.postcopy_remote_fds = g_array_new(FALSE, TRUE,
|
||||
sizeof(struct PostCopyFD));
|
||||
qemu_mutex_init(&mis_current.rp_mutex);
|
||||
qemu_event_init(&mis_current.main_thread_load_event, false);
|
||||
|
||||
@ -180,6 +182,10 @@ void migration_incoming_state_destroy(void)
|
||||
qemu_fclose(mis->from_src_file);
|
||||
mis->from_src_file = NULL;
|
||||
}
|
||||
if (mis->postcopy_remote_fds) {
|
||||
g_array_free(mis->postcopy_remote_fds, TRUE);
|
||||
mis->postcopy_remote_fds = NULL;
|
||||
}
|
||||
|
||||
qemu_event_reset(&mis->main_thread_load_event);
|
||||
}
|
||||
|
@ -49,8 +49,12 @@ struct MigrationIncomingState {
|
||||
int userfault_event_fd;
|
||||
QEMUFile *to_src_file;
|
||||
QemuMutex rp_mutex; /* We send replies from multiple threads */
|
||||
/* RAMBlock of last request sent to source */
|
||||
RAMBlock *last_rb;
|
||||
void *postcopy_tmp_page;
|
||||
void *postcopy_tmp_zero_page;
|
||||
/* PostCopyFD's for external userfaultfds & handlers of shared memory */
|
||||
GArray *postcopy_remote_fds;
|
||||
|
||||
QEMUBH *bh;
|
||||
|
||||
|
@ -23,6 +23,8 @@
|
||||
#include "savevm.h"
|
||||
#include "postcopy-ram.h"
|
||||
#include "ram.h"
|
||||
#include "qapi/error.h"
|
||||
#include "qemu/notify.h"
|
||||
#include "sysemu/sysemu.h"
|
||||
#include "sysemu/balloon.h"
|
||||
#include "qemu/error-report.h"
|
||||
@ -45,6 +47,33 @@ struct PostcopyDiscardState {
|
||||
unsigned int nsentcmds;
|
||||
};
|
||||
|
||||
static NotifierWithReturnList postcopy_notifier_list;
|
||||
|
||||
void postcopy_infrastructure_init(void)
|
||||
{
|
||||
notifier_with_return_list_init(&postcopy_notifier_list);
|
||||
}
|
||||
|
||||
void postcopy_add_notifier(NotifierWithReturn *nn)
|
||||
{
|
||||
notifier_with_return_list_add(&postcopy_notifier_list, nn);
|
||||
}
|
||||
|
||||
void postcopy_remove_notifier(NotifierWithReturn *n)
|
||||
{
|
||||
notifier_with_return_remove(n);
|
||||
}
|
||||
|
||||
/*
 * Run the postcopy notifier list for @reason.
 *
 * The notifiers receive a struct PostcopyNotifyData carrying @reason and
 * @errp.  Returns whatever notifier_with_return_list_notify() returns.
 */
int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp)
{
    struct PostcopyNotifyData notify_data = {
        .reason = reason,
        .errp = errp,
    };

    return notifier_with_return_list_notify(&postcopy_notifier_list,
                                            &notify_data);
}
|
||||
|
||||
/* Postcopy needs to detect accesses to pages that haven't yet been copied
|
||||
* across, and efficiently map new pages in, the techniques for doing this
|
||||
* are target OS specific.
|
||||
@ -186,12 +215,6 @@ static int test_ramblock_postcopiable(const char *block_name, void *host_addr,
|
||||
RAMBlock *rb = qemu_ram_block_by_name(block_name);
|
||||
size_t pagesize = qemu_ram_pagesize(rb);
|
||||
|
||||
if (qemu_ram_is_shared(rb)) {
|
||||
error_report("Postcopy on shared RAM (%s) is not yet supported",
|
||||
block_name);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (length % pagesize) {
|
||||
error_report("Postcopy requires RAM blocks to be a page size multiple,"
|
||||
" block %s is 0x" RAM_ADDR_FMT " bytes with a "
|
||||
@ -215,6 +238,7 @@ bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
|
||||
struct uffdio_register reg_struct;
|
||||
struct uffdio_range range_struct;
|
||||
uint64_t feature_mask;
|
||||
Error *local_err = NULL;
|
||||
|
||||
if (qemu_target_page_size() > pagesize) {
|
||||
error_report("Target page size bigger than host page size");
|
||||
@ -228,6 +252,12 @@ bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* Give devices a chance to object */
|
||||
if (postcopy_notify(POSTCOPY_NOTIFY_PROBE, &local_err)) {
|
||||
error_report_err(local_err);
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* Version and features check */
|
||||
if (!ufd_check_and_apply(ufd, mis)) {
|
||||
goto out;
|
||||
@ -377,6 +407,13 @@ int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
|
||||
trace_postcopy_ram_incoming_cleanup_entry();
|
||||
|
||||
if (mis->have_fault_thread) {
|
||||
Error *local_err = NULL;
|
||||
|
||||
if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) {
|
||||
error_report_err(local_err);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (qemu_ram_foreach_block(cleanup_range, mis)) {
|
||||
return -1;
|
||||
}
|
||||
@ -481,10 +518,63 @@ static int ram_block_enable_notify(const char *block_name, void *host_addr,
|
||||
error_report("%s userfault: Region doesn't support COPY", __func__);
|
||||
return -1;
|
||||
}
|
||||
if (reg_struct.ioctls & ((__u64)1 << _UFFDIO_ZEROPAGE)) {
|
||||
RAMBlock *rb = qemu_ram_block_by_name(block_name);
|
||||
qemu_ram_set_uf_zeroable(rb);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int postcopy_wake_shared(struct PostCopyFD *pcfd,
|
||||
uint64_t client_addr,
|
||||
RAMBlock *rb)
|
||||
{
|
||||
size_t pagesize = qemu_ram_pagesize(rb);
|
||||
struct uffdio_range range;
|
||||
int ret;
|
||||
trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
|
||||
range.start = client_addr & ~(pagesize - 1);
|
||||
range.len = pagesize;
|
||||
ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range);
|
||||
if (ret) {
|
||||
error_report("%s: Failed to wake: %zx in %s (%s)",
|
||||
__func__, (size_t)client_addr, qemu_ram_get_idstr(rb),
|
||||
strerror(errno));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Callback from shared fault handlers to ask for a page,
|
||||
* the page must be specified by a RAMBlock and an offset in that rb
|
||||
* Note: Only for use by shared fault handlers (in fault thread)
|
||||
*/
|
||||
int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
|
||||
uint64_t client_addr, uint64_t rb_offset)
|
||||
{
|
||||
size_t pagesize = qemu_ram_pagesize(rb);
|
||||
uint64_t aligned_rbo = rb_offset & ~(pagesize - 1);
|
||||
MigrationIncomingState *mis = migration_incoming_get_current();
|
||||
|
||||
trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
|
||||
rb_offset);
|
||||
if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) {
|
||||
trace_postcopy_request_shared_page_present(pcfd->idstr,
|
||||
qemu_ram_get_idstr(rb), rb_offset);
|
||||
return postcopy_wake_shared(pcfd, client_addr, rb);
|
||||
}
|
||||
if (rb != mis->last_rb) {
|
||||
mis->last_rb = rb;
|
||||
migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
|
||||
aligned_rbo, pagesize);
|
||||
} else {
|
||||
/* Save some space */
|
||||
migrate_send_rp_req_pages(mis, NULL, aligned_rbo, pagesize);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle faults detected by the USERFAULT markings
|
||||
*/
|
||||
@ -493,29 +583,44 @@ static void *postcopy_ram_fault_thread(void *opaque)
|
||||
MigrationIncomingState *mis = opaque;
|
||||
struct uffd_msg msg;
|
||||
int ret;
|
||||
size_t index;
|
||||
RAMBlock *rb = NULL;
|
||||
RAMBlock *last_rb = NULL; /* last RAMBlock we sent part of */
|
||||
|
||||
trace_postcopy_ram_fault_thread_entry();
|
||||
mis->last_rb = NULL; /* last RAMBlock we sent part of */
|
||||
qemu_sem_post(&mis->fault_thread_sem);
|
||||
|
||||
struct pollfd *pfd;
|
||||
size_t pfd_len = 2 + mis->postcopy_remote_fds->len;
|
||||
|
||||
pfd = g_new0(struct pollfd, pfd_len);
|
||||
|
||||
pfd[0].fd = mis->userfault_fd;
|
||||
pfd[0].events = POLLIN;
|
||||
pfd[1].fd = mis->userfault_event_fd;
|
||||
pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
|
||||
trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd);
|
||||
for (index = 0; index < mis->postcopy_remote_fds->len; index++) {
|
||||
struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds,
|
||||
struct PostCopyFD, index);
|
||||
pfd[2 + index].fd = pcfd->fd;
|
||||
pfd[2 + index].events = POLLIN;
|
||||
trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr,
|
||||
pcfd->fd);
|
||||
}
|
||||
|
||||
while (true) {
|
||||
ram_addr_t rb_offset;
|
||||
struct pollfd pfd[2];
|
||||
int poll_result;
|
||||
|
||||
/*
|
||||
* We're mainly waiting for the kernel to give us a faulting HVA,
|
||||
* however we can be told to quit via userfault_quit_fd which is
|
||||
* an eventfd
|
||||
*/
|
||||
pfd[0].fd = mis->userfault_fd;
|
||||
pfd[0].events = POLLIN;
|
||||
pfd[0].revents = 0;
|
||||
pfd[1].fd = mis->userfault_event_fd;
|
||||
pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
|
||||
pfd[1].revents = 0;
|
||||
|
||||
if (poll(pfd, 2, -1 /* Wait forever */) == -1) {
|
||||
poll_result = poll(pfd, pfd_len, -1 /* Wait forever */);
|
||||
if (poll_result == -1) {
|
||||
error_report("%s: userfault poll: %s", __func__, strerror(errno));
|
||||
break;
|
||||
}
|
||||
@ -535,57 +640,117 @@ static void *postcopy_ram_fault_thread(void *opaque)
|
||||
}
|
||||
}
|
||||
|
||||
ret = read(mis->userfault_fd, &msg, sizeof(msg));
|
||||
if (ret != sizeof(msg)) {
|
||||
if (errno == EAGAIN) {
|
||||
/*
|
||||
* if a wake up happens on the other thread just after
|
||||
* the poll, there is nothing to read.
|
||||
*/
|
||||
continue;
|
||||
if (pfd[0].revents) {
|
||||
poll_result--;
|
||||
ret = read(mis->userfault_fd, &msg, sizeof(msg));
|
||||
if (ret != sizeof(msg)) {
|
||||
if (errno == EAGAIN) {
|
||||
/*
|
||||
* if a wake up happens on the other thread just after
|
||||
* the poll, there is nothing to read.
|
||||
*/
|
||||
continue;
|
||||
}
|
||||
if (ret < 0) {
|
||||
error_report("%s: Failed to read full userfault "
|
||||
"message: %s",
|
||||
__func__, strerror(errno));
|
||||
break;
|
||||
} else {
|
||||
error_report("%s: Read %d bytes from userfaultfd "
|
||||
"expected %zd",
|
||||
__func__, ret, sizeof(msg));
|
||||
break; /* Lost alignment, don't know what we'd read next */
|
||||
}
|
||||
}
|
||||
if (ret < 0) {
|
||||
error_report("%s: Failed to read full userfault message: %s",
|
||||
__func__, strerror(errno));
|
||||
if (msg.event != UFFD_EVENT_PAGEFAULT) {
|
||||
error_report("%s: Read unexpected event %ud from userfaultfd",
|
||||
__func__, msg.event);
|
||||
continue; /* It's not a page fault, shouldn't happen */
|
||||
}
|
||||
|
||||
rb = qemu_ram_block_from_host(
|
||||
(void *)(uintptr_t)msg.arg.pagefault.address,
|
||||
true, &rb_offset);
|
||||
if (!rb) {
|
||||
error_report("postcopy_ram_fault_thread: Fault outside guest: %"
|
||||
PRIx64, (uint64_t)msg.arg.pagefault.address);
|
||||
break;
|
||||
} else {
|
||||
error_report("%s: Read %d bytes from userfaultfd expected %zd",
|
||||
__func__, ret, sizeof(msg));
|
||||
break; /* Lost alignment, don't know what we'd read next */
|
||||
}
|
||||
}
|
||||
if (msg.event != UFFD_EVENT_PAGEFAULT) {
|
||||
error_report("%s: Read unexpected event %ud from userfaultfd",
|
||||
__func__, msg.event);
|
||||
continue; /* It's not a page fault, shouldn't happen */
|
||||
}
|
||||
|
||||
rb = qemu_ram_block_from_host(
|
||||
(void *)(uintptr_t)msg.arg.pagefault.address,
|
||||
true, &rb_offset);
|
||||
if (!rb) {
|
||||
error_report("postcopy_ram_fault_thread: Fault outside guest: %"
|
||||
PRIx64, (uint64_t)msg.arg.pagefault.address);
|
||||
break;
|
||||
}
|
||||
|
||||
rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
|
||||
trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
|
||||
rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
|
||||
trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
|
||||
qemu_ram_get_idstr(rb),
|
||||
rb_offset);
|
||||
/*
|
||||
* Send the request to the source - we want to request one
|
||||
* of our host page sizes (which is >= TPS)
|
||||
*/
|
||||
if (rb != mis->last_rb) {
|
||||
mis->last_rb = rb;
|
||||
migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
|
||||
rb_offset, qemu_ram_pagesize(rb));
|
||||
} else {
|
||||
/* Save some space */
|
||||
migrate_send_rp_req_pages(mis, NULL,
|
||||
rb_offset, qemu_ram_pagesize(rb));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Send the request to the source - we want to request one
|
||||
* of our host page sizes (which is >= TPS)
|
||||
*/
|
||||
if (rb != last_rb) {
|
||||
last_rb = rb;
|
||||
migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
|
||||
rb_offset, qemu_ram_pagesize(rb));
|
||||
} else {
|
||||
/* Save some space */
|
||||
migrate_send_rp_req_pages(mis, NULL,
|
||||
rb_offset, qemu_ram_pagesize(rb));
|
||||
/* Now handle any requests from external processes on shared memory */
|
||||
/* TODO: May need to handle devices deregistering during postcopy */
|
||||
for (index = 2; index < pfd_len && poll_result; index++) {
|
||||
if (pfd[index].revents) {
|
||||
struct PostCopyFD *pcfd =
|
||||
&g_array_index(mis->postcopy_remote_fds,
|
||||
struct PostCopyFD, index - 2);
|
||||
|
||||
poll_result--;
|
||||
if (pfd[index].revents & POLLERR) {
|
||||
error_report("%s: POLLERR on poll %zd fd=%d",
|
||||
__func__, index, pcfd->fd);
|
||||
pfd[index].events = 0;
|
||||
continue;
|
||||
}
|
||||
|
||||
ret = read(pcfd->fd, &msg, sizeof(msg));
|
||||
if (ret != sizeof(msg)) {
|
||||
if (errno == EAGAIN) {
|
||||
/*
|
||||
* if a wake up happens on the other thread just after
|
||||
* the poll, there is nothing to read.
|
||||
*/
|
||||
continue;
|
||||
}
|
||||
if (ret < 0) {
|
||||
error_report("%s: Failed to read full userfault "
|
||||
"message: %s (shared) revents=%d",
|
||||
__func__, strerror(errno),
|
||||
pfd[index].revents);
|
||||
/*TODO: Could just disable this sharer */
|
||||
break;
|
||||
} else {
|
||||
error_report("%s: Read %d bytes from userfaultfd "
|
||||
"expected %zd (shared)",
|
||||
__func__, ret, sizeof(msg));
|
||||
/*TODO: Could just disable this sharer */
|
||||
break; /*Lost alignment,don't know what we'd read next*/
|
||||
}
|
||||
}
|
||||
if (msg.event != UFFD_EVENT_PAGEFAULT) {
|
||||
error_report("%s: Read unexpected event %ud "
|
||||
"from userfaultfd (shared)",
|
||||
__func__, msg.event);
|
||||
continue; /* It's not a page fault, shouldn't happen */
|
||||
}
|
||||
/* Call the device handler registered with us */
|
||||
ret = pcfd->handler(pcfd, &msg);
|
||||
if (ret) {
|
||||
error_report("%s: Failed to resolve shared fault on %zd/%s",
|
||||
__func__, index, pcfd->idstr);
|
||||
/* TODO: Fail? Disable this sharer? */
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
trace_postcopy_ram_fault_thread_exit();
|
||||
@ -667,6 +832,22 @@ static int qemu_ufd_copy_ioctl(int userfault_fd, void *host_addr,
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Call the 'waker' of every registered shared PostCopyFD for the page at
 * @offset in @rb.
 *
 * Stops at, and returns, the first non-zero waker result; returns 0 when
 * every waker returned 0 (including when no fds are registered).
 */
int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset)
{
    guint i; /* unsigned, matching the type of GArray's len field */
    MigrationIncomingState *mis = migration_incoming_get_current();
    GArray *pcrfds = mis->postcopy_remote_fds;

    for (i = 0; i < pcrfds->len; i++) {
        struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
        int ret = cur->waker(cur, rb, offset);

        if (ret) {
            return ret;
        }
    }
    return 0;
}
|
||||
|
||||
/*
|
||||
* Place a host page (from) at (host) atomically
|
||||
* returns 0 on success
|
||||
@ -690,7 +871,8 @@ int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
|
||||
}
|
||||
|
||||
trace_postcopy_place_page(host);
|
||||
return 0;
|
||||
return postcopy_notify_shared_wake(rb,
|
||||
qemu_ram_block_host_offset(rb, host));
|
||||
}
|
||||
|
||||
/*
|
||||
@ -700,17 +882,23 @@ int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
|
||||
int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
|
||||
RAMBlock *rb)
|
||||
{
|
||||
size_t pagesize = qemu_ram_pagesize(rb);
|
||||
trace_postcopy_place_page_zero(host);
|
||||
|
||||
if (qemu_ram_pagesize(rb) == getpagesize()) {
|
||||
if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, NULL, getpagesize(),
|
||||
rb)) {
|
||||
/* Normal RAMBlocks can zero a page using UFFDIO_ZEROPAGE
|
||||
* but it's not available for everything (e.g. hugetlbpages)
|
||||
*/
|
||||
if (qemu_ram_is_uf_zeroable(rb)) {
|
||||
if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, NULL, pagesize, rb)) {
|
||||
int e = errno;
|
||||
error_report("%s: %s zero host: %p",
|
||||
__func__, strerror(e), host);
|
||||
|
||||
return -e;
|
||||
}
|
||||
return postcopy_notify_shared_wake(rb,
|
||||
qemu_ram_block_host_offset(rb,
|
||||
host));
|
||||
} else {
|
||||
/* The kernel can't use UFFDIO_ZEROPAGE for hugepages */
|
||||
if (!mis->postcopy_tmp_zero_page) {
|
||||
@ -730,8 +918,6 @@ int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
|
||||
return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page,
|
||||
rb);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -784,6 +970,13 @@ int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
|
||||
return -1;
|
||||
}
|
||||
|
||||
/*
 * Stub: not expected to be called; asserts, and returns -1 if the
 * assertion is compiled out.
 * NOTE(review): presumably the variant built when the real implementation
 * is unavailable -- confirm against the enclosing preprocessor condition.
 */
int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    assert(0);
    return -1;
}
|
||||
|
||||
int postcopy_ram_enable_notify(MigrationIncomingState *mis)
|
||||
{
|
||||
assert(0);
|
||||
@ -810,6 +1003,13 @@ void *postcopy_get_tmp_page(MigrationIncomingState *mis)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
int postcopy_wake_shared(struct PostCopyFD *pcfd,
|
||||
uint64_t client_addr,
|
||||
RAMBlock *rb)
|
||||
{
|
||||
assert(0);
|
||||
return -1;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* ------------------------------------------------------------------------- */
|
||||
@ -927,3 +1127,31 @@ PostcopyState postcopy_state_set(PostcopyState new_state)
|
||||
{
|
||||
return atomic_xchg(&incoming_postcopy_state, new_state);
|
||||
}
|
||||
|
||||
/* Register a handler for external shared memory postcopy
|
||||
* called on the destination.
|
||||
*/
|
||||
void postcopy_register_shared_ufd(struct PostCopyFD *pcfd)
|
||||
{
|
||||
MigrationIncomingState *mis = migration_incoming_get_current();
|
||||
|
||||
mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds,
|
||||
*pcfd);
|
||||
}
|
||||
|
||||
/* Unregister a handler for external shared memory postcopy
|
||||
*/
|
||||
void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
|
||||
{
|
||||
guint i;
|
||||
MigrationIncomingState *mis = migration_incoming_get_current();
|
||||
GArray *pcrfds = mis->postcopy_remote_fds;
|
||||
|
||||
for (i = 0; i < pcrfds->len; i++) {
|
||||
struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
|
||||
if (cur->fd == pcfd->fd) {
|
||||
mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -116,4 +116,77 @@ PostcopyState postcopy_state_set(PostcopyState new_state);
|
||||
|
||||
void postcopy_fault_thread_notify(MigrationIncomingState *mis);
|
||||
|
||||
/*
|
||||
* To be called once at the start before any device initialisation
|
||||
*/
|
||||
void postcopy_infrastructure_init(void);
|
||||
|
||||
/* Add a notifier to a list to be called when checking whether the devices
|
||||
* can support postcopy.
|
||||
* Its data is a *PostcopyNotifyData
|
||||
* It should return 0 if OK, or a negative value on failure.
|
||||
* On failure it must set the data->errp to an error.
|
||||
*
|
||||
*/
|
||||
/* Reason code carried in PostcopyNotifyData and passed to postcopy_notify() */
enum PostcopyNotifyReason {
    POSTCOPY_NOTIFY_PROBE          = 0,
    POSTCOPY_NOTIFY_INBOUND_ADVISE = 1,
    POSTCOPY_NOTIFY_INBOUND_LISTEN = 2,
    POSTCOPY_NOTIFY_INBOUND_END    = 3,
};
|
||||
|
||||
struct PostcopyNotifyData {
|
||||
enum PostcopyNotifyReason reason;
|
||||
Error **errp;
|
||||
};
|
||||
|
||||
void postcopy_add_notifier(NotifierWithReturn *nn);
|
||||
void postcopy_remove_notifier(NotifierWithReturn *n);
|
||||
/* Call the notifier list set by postcopy_add_notifier */
|
||||
int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp);
|
||||
|
||||
struct PostCopyFD;
|
||||
|
||||
/* ufd is a pointer to the struct uffd_msg *TODO: more Portable! */
|
||||
typedef int (*pcfdhandler)(struct PostCopyFD *pcfd, void *ufd);
|
||||
/* Notification to wake, either on place or on reception of
|
||||
* a fault on something that's already arrived (race)
|
||||
*/
|
||||
typedef int (*pcfdwake)(struct PostCopyFD *pcfd, RAMBlock *rb, uint64_t offset);
|
||||
|
||||
struct PostCopyFD {
|
||||
int fd;
|
||||
/* Data to pass to handler */
|
||||
void *data;
|
||||
/* Handler to be called whenever we get a poll event */
|
||||
pcfdhandler handler;
|
||||
/* Notification to wake shared client */
|
||||
pcfdwake waker;
|
||||
/* A string to use in error messages */
|
||||
const char *idstr;
|
||||
};
|
||||
|
||||
/* Register a userfaultfd owned by an external process for
|
||||
* shared memory.
|
||||
*/
|
||||
void postcopy_register_shared_ufd(struct PostCopyFD *pcfd);
|
||||
void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd);
|
||||
/* Call each of the shared 'waker's registered telling them of
|
||||
* availability of a block.
|
||||
*/
|
||||
int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset);
|
||||
/* postcopy_wake_shared: Notify a client ufd that a page is available
|
||||
*
|
||||
* Returns 0 on success
|
||||
*
|
||||
* @pcfd: Structure with fd, handler and name as above
|
||||
* @client_addr: Address in the client program, not QEMU
|
||||
* @rb: The RAMBlock the page is in
|
||||
*/
|
||||
int postcopy_wake_shared(struct PostCopyFD *pcfd, uint64_t client_addr,
|
||||
RAMBlock *rb);
|
||||
/* Callback from shared fault handlers to ask for a page */
|
||||
int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
|
||||
uint64_t client_addr, uint64_t offset);
|
||||
|
||||
#endif
|
||||
|
@ -169,6 +169,11 @@ int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
|
||||
rb->receivedmap);
|
||||
}
|
||||
|
||||
/*
 * Test the received-bitmap bit for the target page containing
 * @byte_offset (a byte offset, shifted right by TARGET_PAGE_BITS to get
 * the bit index) in @rb.
 */
bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
{
    uint64_t page_nr = byte_offset >> TARGET_PAGE_BITS;

    return test_bit(page_nr, rb->receivedmap);
}
|
||||
|
||||
void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
|
||||
{
|
||||
set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
|
||||
|
@ -60,6 +60,7 @@ int ram_postcopy_incoming_init(MigrationIncomingState *mis);
|
||||
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size);
|
||||
|
||||
int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr);
|
||||
bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset);
|
||||
void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr);
|
||||
void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, size_t nr);
|
||||
|
||||
|
@ -1395,6 +1395,7 @@ static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis,
|
||||
{
|
||||
PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
|
||||
uint64_t remote_pagesize_summary, local_pagesize_summary, remote_tps;
|
||||
Error *local_err = NULL;
|
||||
|
||||
trace_loadvm_postcopy_handle_advise();
|
||||
if (ps != POSTCOPY_INCOMING_NONE) {
|
||||
@ -1460,6 +1461,11 @@ static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis,
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_ADVISE, &local_err)) {
|
||||
error_report_err(local_err);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (ram_postcopy_incoming_init(mis)) {
|
||||
return -1;
|
||||
}
|
||||
@ -1621,6 +1627,8 @@ static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
|
||||
{
|
||||
PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_LISTENING);
|
||||
trace_loadvm_postcopy_handle_listen();
|
||||
Error *local_err = NULL;
|
||||
|
||||
if (ps != POSTCOPY_INCOMING_ADVISE && ps != POSTCOPY_INCOMING_DISCARD) {
|
||||
error_report("CMD_POSTCOPY_LISTEN in wrong postcopy state (%d)", ps);
|
||||
return -1;
|
||||
@ -1646,6 +1654,11 @@ static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
|
||||
}
|
||||
}
|
||||
|
||||
if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_LISTEN, &local_err)) {
|
||||
error_report_err(local_err);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (mis->have_listen_thread) {
|
||||
error_report("CMD_POSTCOPY_RAM_LISTEN already has a listen thread");
|
||||
return -1;
|
||||
|
@ -190,12 +190,18 @@ postcopy_place_page_zero(void *host_addr) "host=%p"
|
||||
postcopy_ram_enable_notify(void) ""
|
||||
postcopy_ram_fault_thread_entry(void) ""
|
||||
postcopy_ram_fault_thread_exit(void) ""
|
||||
postcopy_ram_fault_thread_fds_core(int baseufd, int quitfd) "ufd: %d quitfd: %d"
|
||||
postcopy_ram_fault_thread_fds_extra(size_t index, const char *name, int fd) "%zd/%s: %d"
|
||||
postcopy_ram_fault_thread_quit(void) ""
|
||||
postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset) "Request for HVA=0x%" PRIx64 " rb=%s offset=0x%zx"
|
||||
postcopy_ram_incoming_cleanup_closeuf(void) ""
|
||||
postcopy_ram_incoming_cleanup_entry(void) ""
|
||||
postcopy_ram_incoming_cleanup_exit(void) ""
|
||||
postcopy_ram_incoming_cleanup_join(void) ""
|
||||
postcopy_request_shared_page(const char *sharer, const char *rb, uint64_t rb_offset) "for %s in %s offset 0x%"PRIx64
|
||||
postcopy_request_shared_page_present(const char *sharer, const char *rb, uint64_t rb_offset) "%s already %s offset 0x%"PRIx64
|
||||
postcopy_wake_shared(uint64_t client_addr, const char *rb) "at 0x%"PRIx64" in %s"
|
||||
|
||||
save_xbzrle_page_skipping(void) ""
|
||||
save_xbzrle_page_overflow(void) ""
|
||||
ram_save_iterate_big_wait(uint64_t milliconds, int iterations) "big wait: %" PRIu64 " milliseconds, %d iterations"
|
||||
|
23
numa.c
23
numa.c
@ -520,29 +520,34 @@ void memory_region_allocate_system_memory(MemoryRegion *mr, Object *owner,
|
||||
|
||||
static void numa_stat_memory_devices(NumaNodeMem node_mem[])
|
||||
{
|
||||
MemoryDeviceInfoList *info_list = NULL;
|
||||
MemoryDeviceInfoList **prev = &info_list;
|
||||
MemoryDeviceInfoList *info_list = qmp_pc_dimm_device_list();
|
||||
MemoryDeviceInfoList *info;
|
||||
PCDIMMDeviceInfo *pcdimm_info;
|
||||
|
||||
qmp_pc_dimm_device_list(qdev_get_machine(), &prev);
|
||||
for (info = info_list; info; info = info->next) {
|
||||
MemoryDeviceInfo *value = info->value;
|
||||
|
||||
if (value) {
|
||||
switch (value->type) {
|
||||
case MEMORY_DEVICE_INFO_KIND_DIMM: {
|
||||
case MEMORY_DEVICE_INFO_KIND_DIMM:
|
||||
pcdimm_info = value->u.dimm.data;
|
||||
break;
|
||||
|
||||
case MEMORY_DEVICE_INFO_KIND_NVDIMM:
|
||||
pcdimm_info = value->u.nvdimm.data;
|
||||
break;
|
||||
|
||||
default:
|
||||
pcdimm_info = NULL;
|
||||
break;
|
||||
}
|
||||
|
||||
if (pcdimm_info) {
|
||||
node_mem[pcdimm_info->node].node_mem += pcdimm_info->size;
|
||||
if (pcdimm_info->hotpluggable && pcdimm_info->hotplugged) {
|
||||
node_mem[pcdimm_info->node].node_plugged_mem +=
|
||||
pcdimm_info->size;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Binary file not shown.
@ -2878,7 +2878,11 @@
|
||||
#
|
||||
# Since: 2.1
|
||||
##
|
||||
{ 'union': 'MemoryDeviceInfo', 'data': {'dimm': 'PCDIMMDeviceInfo'} }
|
||||
{ 'union': 'MemoryDeviceInfo',
|
||||
'data': { 'dimm': 'PCDIMMDeviceInfo',
|
||||
'nvdimm': 'PCDIMMDeviceInfo'
|
||||
}
|
||||
}
|
||||
|
||||
##
|
||||
# @query-memory-devices:
|
||||
|
7
qmp.c
7
qmp.c
@ -731,12 +731,7 @@ void qmp_object_del(const char *id, Error **errp)
|
||||
|
||||
MemoryDeviceInfoList *qmp_query_memory_devices(Error **errp)
|
||||
{
|
||||
MemoryDeviceInfoList *head = NULL;
|
||||
MemoryDeviceInfoList **prev = &head;
|
||||
|
||||
qmp_pc_dimm_device_list(qdev_get_machine(), &prev);
|
||||
|
||||
return head;
|
||||
return qmp_pc_dimm_device_list();
|
||||
}
|
||||
|
||||
ACPIOSTInfoList *qmp_query_acpi_ospm_status(Error **errp)
|
||||
|
@ -40,6 +40,9 @@ cp_portable() {
|
||||
-e 'sys/' \
|
||||
-e 'pvrdma_verbs' \
|
||||
-e 'drm.h' \
|
||||
-e 'limits' \
|
||||
-e 'linux/kernel' \
|
||||
-e 'linux/sysinfo' \
|
||||
> /dev/null
|
||||
then
|
||||
echo "Unexpected #include in input file $f".
|
||||
@ -62,6 +65,10 @@ cp_portable() {
|
||||
-e '/sys\/ioctl.h/d' \
|
||||
-e 's/SW_MAX/SW_MAX_/' \
|
||||
-e 's/atomic_t/int/' \
|
||||
-e 's/__kernel_long_t/long/' \
|
||||
-e 's/__kernel_ulong_t/unsigned long/' \
|
||||
-e 's/struct ethhdr/struct eth_header/' \
|
||||
-e '/\#define _LINUX_ETHTOOL_H/a \\n\#include "net/eth.h"' \
|
||||
"$f" > "$to/$header";
|
||||
}
|
||||
|
||||
@ -151,7 +158,9 @@ rm -rf "$output/include/standard-headers/linux"
|
||||
mkdir -p "$output/include/standard-headers/linux"
|
||||
for i in "$tmpdir"/include/linux/*virtio*.h "$tmpdir/include/linux/input.h" \
|
||||
"$tmpdir/include/linux/input-event-codes.h" \
|
||||
"$tmpdir/include/linux/pci_regs.h"; do
|
||||
"$tmpdir/include/linux/pci_regs.h" \
|
||||
"$tmpdir/include/linux/ethtool.h" "$tmpdir/include/linux/kernel.h" \
|
||||
"$tmpdir/include/linux/sysinfo.h"; do
|
||||
cp_portable "$i" "$output/include/standard-headers/linux"
|
||||
done
|
||||
mkdir -p "$output/include/standard-headers/drm"
|
||||
|
@ -2,9 +2,9 @@
|
||||
#include "qom/object.h"
|
||||
#include "hw/mem/pc-dimm.h"
|
||||
|
||||
int qmp_pc_dimm_device_list(Object *obj, void *opaque)
|
||||
MemoryDeviceInfoList *qmp_pc_dimm_device_list(void)
|
||||
{
|
||||
return 0;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
uint64_t get_plugged_memory_size(void)
|
||||
|
BIN
tests/acpi-test-data/pc/APIC.dimmpxm
Normal file
BIN
tests/acpi-test-data/pc/APIC.dimmpxm
Normal file
Binary file not shown.
BIN
tests/acpi-test-data/pc/DSDT.dimmpxm
Normal file
BIN
tests/acpi-test-data/pc/DSDT.dimmpxm
Normal file
Binary file not shown.
BIN
tests/acpi-test-data/pc/NFIT.dimmpxm
Normal file
BIN
tests/acpi-test-data/pc/NFIT.dimmpxm
Normal file
Binary file not shown.
BIN
tests/acpi-test-data/pc/SRAT.dimmpxm
Normal file
BIN
tests/acpi-test-data/pc/SRAT.dimmpxm
Normal file
Binary file not shown.
BIN
tests/acpi-test-data/pc/SSDT.dimmpxm
Normal file
BIN
tests/acpi-test-data/pc/SSDT.dimmpxm
Normal file
Binary file not shown.
BIN
tests/acpi-test-data/q35/APIC.dimmpxm
Normal file
BIN
tests/acpi-test-data/q35/APIC.dimmpxm
Normal file
Binary file not shown.
BIN
tests/acpi-test-data/q35/DSDT.dimmpxm
Normal file
BIN
tests/acpi-test-data/q35/DSDT.dimmpxm
Normal file
Binary file not shown.
BIN
tests/acpi-test-data/q35/NFIT.dimmpxm
Normal file
BIN
tests/acpi-test-data/q35/NFIT.dimmpxm
Normal file
Binary file not shown.
BIN
tests/acpi-test-data/q35/SRAT.dimmpxm
Normal file
BIN
tests/acpi-test-data/q35/SRAT.dimmpxm
Normal file
Binary file not shown.
BIN
tests/acpi-test-data/q35/SSDT.dimmpxm
Normal file
BIN
tests/acpi-test-data/q35/SSDT.dimmpxm
Normal file
Binary file not shown.
@ -29,7 +29,8 @@ typedef struct {
|
||||
uint32_t rsdp_addr;
|
||||
AcpiRsdpDescriptor rsdp_table;
|
||||
AcpiRsdtDescriptorRev1 rsdt_table;
|
||||
AcpiFadtDescriptorRev3 fadt_table;
|
||||
uint32_t dsdt_addr;
|
||||
uint32_t facs_addr;
|
||||
AcpiFacsDescriptorRev1 facs_table;
|
||||
uint32_t *rsdt_tables_addr;
|
||||
int rsdt_tables_nr;
|
||||
@ -127,71 +128,18 @@ static void test_acpi_rsdt_table(test_data *data)
|
||||
data->rsdt_tables_nr = tables_nr;
|
||||
}
|
||||
|
||||
static void test_acpi_fadt_table(test_data *data)
|
||||
static void fadt_fetch_facs_and_dsdt_ptrs(test_data *data)
|
||||
{
|
||||
AcpiFadtDescriptorRev3 *fadt_table = &data->fadt_table;
|
||||
uint32_t addr;
|
||||
AcpiTableHeader hdr;
|
||||
|
||||
/* FADT table comes first */
|
||||
addr = le32_to_cpu(data->rsdt_tables_addr[0]);
|
||||
ACPI_READ_TABLE_HEADER(fadt_table, addr);
|
||||
ACPI_READ_TABLE_HEADER(&hdr, addr);
|
||||
ACPI_ASSERT_CMP(hdr.signature, "FACP");
|
||||
|
||||
ACPI_READ_FIELD(fadt_table->firmware_ctrl, addr);
|
||||
ACPI_READ_FIELD(fadt_table->dsdt, addr);
|
||||
ACPI_READ_FIELD(fadt_table->model, addr);
|
||||
ACPI_READ_FIELD(fadt_table->reserved1, addr);
|
||||
ACPI_READ_FIELD(fadt_table->sci_int, addr);
|
||||
ACPI_READ_FIELD(fadt_table->smi_cmd, addr);
|
||||
ACPI_READ_FIELD(fadt_table->acpi_enable, addr);
|
||||
ACPI_READ_FIELD(fadt_table->acpi_disable, addr);
|
||||
ACPI_READ_FIELD(fadt_table->S4bios_req, addr);
|
||||
ACPI_READ_FIELD(fadt_table->reserved2, addr);
|
||||
ACPI_READ_FIELD(fadt_table->pm1a_evt_blk, addr);
|
||||
ACPI_READ_FIELD(fadt_table->pm1b_evt_blk, addr);
|
||||
ACPI_READ_FIELD(fadt_table->pm1a_cnt_blk, addr);
|
||||
ACPI_READ_FIELD(fadt_table->pm1b_cnt_blk, addr);
|
||||
ACPI_READ_FIELD(fadt_table->pm2_cnt_blk, addr);
|
||||
ACPI_READ_FIELD(fadt_table->pm_tmr_blk, addr);
|
||||
ACPI_READ_FIELD(fadt_table->gpe0_blk, addr);
|
||||
ACPI_READ_FIELD(fadt_table->gpe1_blk, addr);
|
||||
ACPI_READ_FIELD(fadt_table->pm1_evt_len, addr);
|
||||
ACPI_READ_FIELD(fadt_table->pm1_cnt_len, addr);
|
||||
ACPI_READ_FIELD(fadt_table->pm2_cnt_len, addr);
|
||||
ACPI_READ_FIELD(fadt_table->pm_tmr_len, addr);
|
||||
ACPI_READ_FIELD(fadt_table->gpe0_blk_len, addr);
|
||||
ACPI_READ_FIELD(fadt_table->gpe1_blk_len, addr);
|
||||
ACPI_READ_FIELD(fadt_table->gpe1_base, addr);
|
||||
ACPI_READ_FIELD(fadt_table->reserved3, addr);
|
||||
ACPI_READ_FIELD(fadt_table->plvl2_lat, addr);
|
||||
ACPI_READ_FIELD(fadt_table->plvl3_lat, addr);
|
||||
ACPI_READ_FIELD(fadt_table->flush_size, addr);
|
||||
ACPI_READ_FIELD(fadt_table->flush_stride, addr);
|
||||
ACPI_READ_FIELD(fadt_table->duty_offset, addr);
|
||||
ACPI_READ_FIELD(fadt_table->duty_width, addr);
|
||||
ACPI_READ_FIELD(fadt_table->day_alrm, addr);
|
||||
ACPI_READ_FIELD(fadt_table->mon_alrm, addr);
|
||||
ACPI_READ_FIELD(fadt_table->century, addr);
|
||||
ACPI_READ_FIELD(fadt_table->boot_flags, addr);
|
||||
ACPI_READ_FIELD(fadt_table->reserved, addr);
|
||||
ACPI_READ_FIELD(fadt_table->flags, addr);
|
||||
ACPI_READ_GENERIC_ADDRESS(fadt_table->reset_register, addr);
|
||||
ACPI_READ_FIELD(fadt_table->reset_value, addr);
|
||||
ACPI_READ_FIELD(fadt_table->arm_boot_flags, addr);
|
||||
ACPI_READ_FIELD(fadt_table->minor_revision, addr);
|
||||
ACPI_READ_FIELD(fadt_table->x_facs, addr);
|
||||
ACPI_READ_FIELD(fadt_table->x_dsdt, addr);
|
||||
ACPI_READ_GENERIC_ADDRESS(fadt_table->xpm1a_event_block, addr);
|
||||
ACPI_READ_GENERIC_ADDRESS(fadt_table->xpm1b_event_block, addr);
|
||||
ACPI_READ_GENERIC_ADDRESS(fadt_table->xpm1a_control_block, addr);
|
||||
ACPI_READ_GENERIC_ADDRESS(fadt_table->xpm1b_control_block, addr);
|
||||
ACPI_READ_GENERIC_ADDRESS(fadt_table->xpm2_control_block, addr);
|
||||
ACPI_READ_GENERIC_ADDRESS(fadt_table->xpm_timer_block, addr);
|
||||
ACPI_READ_GENERIC_ADDRESS(fadt_table->xgpe0_block, addr);
|
||||
ACPI_READ_GENERIC_ADDRESS(fadt_table->xgpe1_block, addr);
|
||||
|
||||
ACPI_ASSERT_CMP(fadt_table->signature, "FACP");
|
||||
g_assert(!acpi_calc_checksum((uint8_t *)fadt_table,
|
||||
le32_to_cpu(fadt_table->length)));
|
||||
ACPI_READ_FIELD(data->facs_addr, addr);
|
||||
ACPI_READ_FIELD(data->dsdt_addr, addr);
|
||||
}
|
||||
|
||||
static void sanitize_fadt_ptrs(test_data *data)
|
||||
@ -206,6 +154,12 @@ static void sanitize_fadt_ptrs(test_data *data)
|
||||
continue;
|
||||
}
|
||||
|
||||
/* check original FADT checksum before sanitizing table */
|
||||
g_assert(!(uint8_t)(
|
||||
acpi_calc_checksum((uint8_t *)sdt, sizeof(AcpiTableHeader)) +
|
||||
acpi_calc_checksum((uint8_t *)sdt->aml, sdt->aml_len)
|
||||
));
|
||||
|
||||
/* sdt->aml field offset := spec offset - header size */
|
||||
memset(sdt->aml + 0, 0, 4); /* sanitize FIRMWARE_CTRL(36) ptr */
|
||||
memset(sdt->aml + 4, 0, 4); /* sanitize DSDT(40) ptr */
|
||||
@ -226,7 +180,7 @@ static void sanitize_fadt_ptrs(test_data *data)
|
||||
static void test_acpi_facs_table(test_data *data)
|
||||
{
|
||||
AcpiFacsDescriptorRev1 *facs_table = &data->facs_table;
|
||||
uint32_t addr = le32_to_cpu(data->fadt_table.firmware_ctrl);
|
||||
uint32_t addr = le32_to_cpu(data->facs_addr);
|
||||
|
||||
ACPI_READ_FIELD(facs_table->signature, addr);
|
||||
ACPI_READ_FIELD(facs_table->length, addr);
|
||||
@ -265,7 +219,7 @@ static void fetch_table(AcpiSdtTable *sdt_table, uint32_t addr)
|
||||
static void test_acpi_dsdt_table(test_data *data)
|
||||
{
|
||||
AcpiSdtTable dsdt_table;
|
||||
uint32_t addr = le32_to_cpu(data->fadt_table.dsdt);
|
||||
uint32_t addr = le32_to_cpu(data->dsdt_addr);
|
||||
|
||||
fetch_table(&dsdt_table, addr);
|
||||
ACPI_ASSERT_CMP(dsdt_table.header.signature, "DSDT");
|
||||
@ -674,7 +628,7 @@ static void test_acpi_one(const char *params, test_data *data)
|
||||
test_acpi_rsdp_address(data);
|
||||
test_acpi_rsdp_table(data);
|
||||
test_acpi_rsdt_table(data);
|
||||
test_acpi_fadt_table(data);
|
||||
fadt_fetch_facs_and_dsdt_ptrs(data);
|
||||
test_acpi_facs_table(data);
|
||||
test_acpi_dsdt_table(data);
|
||||
fetch_rsdt_referenced_tables(data);
|
||||
@ -869,6 +823,42 @@ static void test_acpi_piix4_tcg_numamem(void)
|
||||
free_test_data(&data);
|
||||
}
|
||||
|
||||
/*
 * Run the ACPI table test for @machine using the ".dimmpxm" variant.
 *
 * The command line handed to test_acpi_one() enables NVDIMM support on the
 * machine, configures 4 CPU sockets, 128M of base memory with 3 slots and a
 * 1G maximum, and four NUMA nodes of 32M each with one socket bound to each
 * node.  Two 128M RAM backends are then plugged in as memory devices:
 * a pc-dimm assigned to node 1 and an nvdimm assigned to node 2.
 *
 * @machine: machine type string stored in the test data before the run.
 */
static void test_acpi_tcg_dimm_pxm(const char *machine)
|
||||
{
|
||||
test_data data;
|
||||
|
||||
/* Start from all-zero test data so unset fields have a known value. */
memset(&data, 0, sizeof(data));
|
||||
data.machine = machine;
|
||||
/* NOTE(review): presumably selects the *.dimmpxm expected-table files - confirm */
data.variant = ".dimmpxm";
|
||||
test_acpi_one(" -machine nvdimm=on"
|
||||
" -smp 4,sockets=4"
|
||||
" -m 128M,slots=3,maxmem=1G"
|
||||
" -numa node,mem=32M,nodeid=0"
|
||||
" -numa node,mem=32M,nodeid=1"
|
||||
" -numa node,mem=32M,nodeid=2"
|
||||
" -numa node,mem=32M,nodeid=3"
|
||||
" -numa cpu,node-id=0,socket-id=0"
|
||||
" -numa cpu,node-id=1,socket-id=1"
|
||||
" -numa cpu,node-id=2,socket-id=2"
|
||||
" -numa cpu,node-id=3,socket-id=3"
|
||||
" -object memory-backend-ram,id=ram0,size=128M"
|
||||
" -object memory-backend-ram,id=nvm0,size=128M"
|
||||
" -device pc-dimm,id=dimm0,memdev=ram0,node=1"
|
||||
" -device nvdimm,id=dimm1,memdev=nvm0,node=2",
|
||||
&data);
|
||||
/* Release whatever the run attached to the test data. */
free_test_data(&data);
|
||||
}
|
||||
|
||||
/*
 * Test entry point taking no arguments: runs test_acpi_tcg_dimm_pxm()
 * with the MACHINE_Q35 machine type.
 */
static void test_acpi_q35_tcg_dimm_pxm(void)
|
||||
{
|
||||
test_acpi_tcg_dimm_pxm(MACHINE_Q35);
|
||||
}
|
||||
|
||||
/*
 * Test entry point taking no arguments: runs test_acpi_tcg_dimm_pxm()
 * with the MACHINE_PC machine type.
 */
static void test_acpi_piix4_tcg_dimm_pxm(void)
|
||||
{
|
||||
test_acpi_tcg_dimm_pxm(MACHINE_PC);
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
const char *arch = qtest_get_arch();
|
||||
@ -893,6 +883,8 @@ int main(int argc, char *argv[])
|
||||
qtest_add_func("acpi/q35/memhp", test_acpi_q35_tcg_memhp);
|
||||
qtest_add_func("acpi/piix4/numamem", test_acpi_piix4_tcg_numamem);
|
||||
qtest_add_func("acpi/q35/numamem", test_acpi_q35_tcg_numamem);
|
||||
qtest_add_func("acpi/piix4/dimmpxm", test_acpi_piix4_tcg_dimm_pxm);
|
||||
qtest_add_func("acpi/q35/dimmpxm", test_acpi_q35_tcg_dimm_pxm);
|
||||
}
|
||||
ret = g_test_run();
|
||||
boot_sector_cleanup(disk);
|
||||
|
@ -58,9 +58,10 @@ dma_complete(void *dbs, int ret, void *cb) "dbs=%p ret=%d cb=%p"
|
||||
dma_blk_cb(void *dbs, int ret) "dbs=%p ret=%d"
|
||||
dma_map_wait(void *dbs) "dbs=%p"
|
||||
|
||||
# # exec.c
|
||||
# exec.c
|
||||
find_ram_offset(uint64_t size, uint64_t offset) "size: 0x%" PRIx64 " @ 0x%" PRIx64
|
||||
find_ram_offset_loop(uint64_t size, uint64_t candidate, uint64_t offset, uint64_t next, uint64_t mingap) "trying size: 0x%" PRIx64 " @ 0x%" PRIx64 ", offset: 0x%" PRIx64" next: 0x%" PRIx64 " mingap: 0x%" PRIx64
|
||||
ram_block_discard_range(const char *rbname, void *hva, size_t length, bool need_madvise, bool need_fallocate, int ret) "%s@%p + 0x%zx: madvise: %d fallocate: %d ret: %d"
|
||||
|
||||
# memory.c
|
||||
memory_region_ops_read(int cpu_index, void *mr, uint64_t addr, uint64_t value, unsigned size) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u"
|
||||
|
2
vl.c
2
vl.c
@ -94,6 +94,7 @@ int main(int argc, char **argv)
|
||||
#include "audio/audio.h"
|
||||
#include "sysemu/cpus.h"
|
||||
#include "migration/colo.h"
|
||||
#include "migration/postcopy-ram.h"
|
||||
#include "sysemu/kvm.h"
|
||||
#include "sysemu/hax.h"
|
||||
#include "qapi/qobject-input-visitor.h"
|
||||
@ -3101,6 +3102,7 @@ int main(int argc, char **argv, char **envp)
|
||||
module_call_init(MODULE_INIT_OPTS);
|
||||
|
||||
runstate_init();
|
||||
postcopy_infrastructure_init();
|
||||
|
||||
if (qcrypto_init(&err) < 0) {
|
||||
error_reportf_err(err, "cannot initialize crypto: ");
|
||||
|
Loading…
Reference in New Issue
Block a user