ivshmem: Fixes, cleanups, device model split

-----BEGIN PGP SIGNATURE----- Version: GnuPG v1 iQIcBAABAgAGBQJW8FqyAAoJEDhwtADrkYZTjYcP/R1m2LcFnLTxzDjSK38nxWcw 5t/Do7nBNgXL2ZdRHfJsy7bx/9RR55k16rvzkFgW8LpUa5Ro64onRh2PfMz2p0e8 QvZRBhXTh5/y4TD61y5Y8d9xawA6Hr1oEUtwsfovI9EiXzVaLl3sLI/nleed68Rk eAD2h8+ZcBeJ+lRK3UHEzAvqh0u+IScRMJifCxHyJuoZiylHIHVVq7x40ywg0Ejq 8wHEj/nDJZHUxbuH4sm215Lv4dK6CmIP8UzuhfY6MxAS6Jo7Zdk1zv2SjJO2DzwT rWU4hD0+khwTz3hBR341oWxb84C5MujPwkeP7mibR46HLHCn5imQMz0W+6tj7umb dxnwPpXzON00+56B7e4i21aXTO0IaY3AcL9QuETSAaoy3SD5BdDkt3R9XWM+jqqZ armE5nNAv8WEN8qUYL/YpBxFDYSZ3CFgNv1enoP2pSp4DqeF/H3aP4RWu+dYqLDm MyVhcXUkjHfTCY6NVPPBkNwSvz2vq4ft/b6t7tLN+0ZmIRsEegKxxRrI2vB6O8Ga Gh2iKcJfMp90jwwvywfGO+DNQ8npHvhxMkioyzMHflo0QyS2ZDhlf4ubp7cXlYZ6 tj7iGXJKJQpQyJWA58k8EXR9wc2W+fgRYD/H61QTTyTUgxEo6w10KjBDTsbFwvIY R0poHCfRR0DQ7y3GerZO =XEMm -----END PGP SIGNATURE----- Merge remote-tracking branch 'remotes/armbru/tags/pull-ivshmem-2016-03-18' into staging ivshmem: Fixes, cleanups, device model split # gpg: Signature made Mon 21 Mar 2016 20:33:54 GMT using RSA key ID EB918653 # gpg: Good signature from "Markus Armbruster <armbru@redhat.com>" # gpg: aka "Markus Armbruster <armbru@pond.sub.org>" * remotes/armbru/tags/pull-ivshmem-2016-03-18: (40 commits) contrib/ivshmem-server: Print "not for production" warning ivshmem: Require master to have ID zero ivshmem: Drop ivshmem property x-memdev ivshmem: Clean up after the previous commit ivshmem: Split ivshmem-plain, ivshmem-doorbell off ivshmem ivshmem: Replace int role_val by OnOffAuto master qdev: New DEFINE_PROP_ON_OFF_AUTO ivshmem: Inline check_shm_size() into its only caller ivshmem: Simplify memory regions for BAR 2 (shared memory) ivshmem: Implement shm=... 
with a memory backend ivshmem: Tighten check of property "size" ivshmem: Simplify how we cope with short reads from server ivshmem: Drop the hackish test for UNIX domain chardev ivshmem: Rely on server sending the ID right after the version ivshmem: Propagate errors through ivshmem_recv_setup() ivshmem: Receive shared memory synchronously in realize() ivshmem: Plug leaks on unplug, fix peer disconnect ivshmem: Disentangle ivshmem_read() ivshmem: Simplify rejection of invalid peer ID from server ivshmem: Assert interrupts are set up once ... Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2016-03-23 12:57:44 +00:00 · 2016-03-23 12:57:44 +00:00 · 2538039f2c
commit 2538039f2c
parent ffa6564c9b a335c6f204
15 changed files with 1053 additions and 851 deletions
--- a/contrib/ivshmem-server/ivshmem-server.c
+++ b/contrib/ivshmem-server/ivshmem-server.c
@ -12,9 +12,6 @@
 #include <sys/mman.h>
 #include <sys/socket.h>
 #include <sys/un.h>
 #ifdef CONFIG_LINUX
 #include <sys/vfs.h>
 #endif
 #include "ivshmem-server.h"
@ -257,7 +254,8 @@ ivshmem_server_ftruncate(int fd, unsigned shmsize)
 /* Init a new ivshmem server */
 int
 ivshmem_server_init(IvshmemServer *server, const char *unix_sock_path,
-                    const char *shm_path, size_t shm_size, unsigned n_vectors,
+                    const char *shm_path, bool use_shm_open,
                    size_t shm_size, unsigned n_vectors,
                    bool verbose)
 {
    int ret;
@ -278,6 +276,7 @@ ivshmem_server_init(IvshmemServer *server, const char *unix_sock_path,
        return -1;
    }
    server->use_shm_open = use_shm_open;
    server->shm_size = shm_size;
    server->n_vectors = n_vectors;
@ -286,31 +285,6 @@ ivshmem_server_init(IvshmemServer *server, const char *unix_sock_path,
    return 0;
 }
 #ifdef CONFIG_LINUX
 #define HUGETLBFS_MAGIC       0x958458f6
 static long gethugepagesize(const char *path)
 {
    struct statfs fs;
    int ret;
    do {
        ret = statfs(path, &fs);
    } while (ret != 0 && errno == EINTR);
    if (ret != 0) {
        return -1;
    }
    if (fs.f_type != HUGETLBFS_MAGIC) {
        return -1;
    }
    return fs.f_bsize;
 }
 #endif
 /* open shm, create and bind to the unix socket */
 int
 ivshmem_server_start(IvshmemServer *server)
@ -319,27 +293,17 @@ ivshmem_server_start(IvshmemServer *server)
    int shm_fd, sock_fd, ret;
    /* open shm file */
-#ifdef CONFIG_LINUX
+    if (server->use_shm_open) {
-    long hpagesize;
+        IVSHMEM_SERVER_DEBUG(server, "Using POSIX shared memory: %s\n",
-
+                             server->shm_path);
-    hpagesize = gethugepagesize(server->shm_path);
+        shm_fd = shm_open(server->shm_path, O_CREAT | O_RDWR, S_IRWXU);
-    if (hpagesize < 0 && errno != ENOENT) {
+    } else {
        IVSHMEM_SERVER_DEBUG(server, "cannot stat shm file %s: %s\n",
                             server->shm_path, strerror(errno));
    }
    if (hpagesize > 0) {
        gchar *filename = g_strdup_printf("%s/ivshmem.XXXXXX", server->shm_path);
-        IVSHMEM_SERVER_DEBUG(server, "Using hugepages: %s\n", server->shm_path);
+        IVSHMEM_SERVER_DEBUG(server, "Using file-backed shared memory: %s\n",
                             server->shm_path);
        shm_fd = mkstemp(filename);
        unlink(filename);
        g_free(filename);
    } else
 #endif
    {
        IVSHMEM_SERVER_DEBUG(server, "Using POSIX shared memory: %s\n",
                             server->shm_path);
        shm_fd = shm_open(server->shm_path, O_CREAT|O_RDWR, S_IRWXU);
    }
    if (shm_fd < 0) {
--- a/contrib/ivshmem-server/ivshmem-server.h
+++ b/contrib/ivshmem-server/ivshmem-server.h
@ -66,6 +66,7 @@ typedef struct IvshmemServer {
    char unix_sock_path[PATH_MAX];   /**< path to unix socket */
    int sock_fd;                     /**< unix sock file descriptor */
    char shm_path[PATH_MAX];         /**< path to shm */
    bool use_shm_open;
    size_t shm_size;                 /**< size of shm */
    int shm_fd;                      /**< shm file descriptor */
    unsigned n_vectors;              /**< number of vectors */
@ -89,7 +90,8 @@ typedef struct IvshmemServer {
 */
 int
 ivshmem_server_init(IvshmemServer *server, const char *unix_sock_path,
-                    const char *shm_path, size_t shm_size, unsigned n_vectors,
+                    const char *shm_path, bool use_shm_open,
                    size_t shm_size, unsigned n_vectors,
                    bool verbose);
 /**
--- a/contrib/ivshmem-server/main.c
+++ b/contrib/ivshmem-server/main.c
@ -29,35 +29,38 @@ typedef struct IvshmemServerArgs {
    const char *pid_file;
    const char *unix_socket_path;
    const char *shm_path;
    bool use_shm_open;
    uint64_t shm_size;
    unsigned n_vectors;
 } IvshmemServerArgs;
 /* show ivshmem_server_usage and exit with given error code */
 static void
-ivshmem_server_usage(const char *name, int code)
+ivshmem_server_usage(const char *progname)
 {
-    fprintf(stderr, "%s [opts]\n", name);
+    printf("Usage: %s [OPTION]...\n"
-    fprintf(stderr, "  -h: show this help\n");
+           "  -h: show this help\n"
-    fprintf(stderr, "  -v: verbose mode\n");
+           "  -v: verbose mode\n"
-    fprintf(stderr, "  -F: foreground mode (default is to daemonize)\n");
+           "  -F: foreground mode (default is to daemonize)\n"
-    fprintf(stderr, "  -p <pid_file>: path to the PID file (used in daemon\n"
+           "  -p <pid-file>: path to the PID file (used in daemon mode only)\n"
-                    "     mode only).\n"
+           "     default " IVSHMEM_SERVER_DEFAULT_PID_FILE "\n"
-                    "     Default=%s\n", IVSHMEM_SERVER_DEFAULT_SHM_PATH);
+           "  -S <unix-socket-path>: path to the unix socket to listen to\n"
-    fprintf(stderr, "  -S <unix_socket_path>: path to the unix socket\n"
+           "     default " IVSHMEM_SERVER_DEFAULT_UNIX_SOCK_PATH "\n"
-                    "     to listen to.\n"
+           "  -M <shm-name>: POSIX shared memory object to use\n"
-                    "     Default=%s\n", IVSHMEM_SERVER_DEFAULT_UNIX_SOCK_PATH);
+           "     default " IVSHMEM_SERVER_DEFAULT_SHM_PATH "\n"
-    fprintf(stderr, "  -m <shm_path>: path to the shared memory.\n"
+           "  -m <dir-name>: where to create shared memory\n"
-                    "     The path corresponds to a POSIX shm name or a\n"
+           "  -l <size>: size of shared memory in bytes\n"
-                    "     hugetlbfs mount point.\n"
+           "     suffixes K, M and G can be used, e.g. 1K means 1024\n"
-                    "     default=%s\n", IVSHMEM_SERVER_DEFAULT_SHM_PATH);
+           "     default %u\n"
-    fprintf(stderr, "  -l <size>: size of shared memory in bytes. The suffix\n"
+           "  -n <nvectors>: number of vectors\n"
-                    "     K, M and G can be used (ex: 1K means 1024).\n"
+           "     default %u\n",
-                    "     default=%u\n", IVSHMEM_SERVER_DEFAULT_SHM_SIZE);
+           progname, IVSHMEM_SERVER_DEFAULT_SHM_SIZE,
-    fprintf(stderr, "  -n <n_vects>: number of vectors.\n"
+           IVSHMEM_SERVER_DEFAULT_N_VECTORS);
-                    "     default=%u\n", IVSHMEM_SERVER_DEFAULT_N_VECTORS);
+}
-    exit(code);
+static void
 ivshmem_server_help(const char *progname)
 {
    fprintf(stderr, "Try '%s -h' for more information.\n", progname);
 }
 /* parse the program arguments, exit on error */
@ -68,20 +71,12 @@ ivshmem_server_parse_args(IvshmemServerArgs *args, int argc, char *argv[])
    unsigned long long v;
    Error *err = NULL;
-    while ((c = getopt(argc, argv,
+    while ((c = getopt(argc, argv, "hvFp:S:m:M:l:n:")) != -1) {
                       "h"  /* help */
                       "v"  /* verbose */
                       "F"  /* foreground */
                       "p:" /* pid_file */
                       "S:" /* unix_socket_path */
                       "m:" /* shm_path */
                       "l:" /* shm_size */
                       "n:" /* n_vectors */
                      )) != -1) {
        switch (c) {
        case 'h': /* help */
-            ivshmem_server_usage(argv[0], 0);
+            ivshmem_server_usage(argv[0]);
            exit(0);
            break;
        case 'v': /* verbose */
@ -92,36 +87,41 @@ ivshmem_server_parse_args(IvshmemServerArgs *args, int argc, char *argv[])
            args->foreground = 1;
            break;
-        case 'p': /* pid_file */
+        case 'p': /* pid file */
            args->pid_file = optarg;
            break;
-        case 'S': /* unix_socket_path */
+        case 'S': /* unix socket path */
            args->unix_socket_path = optarg;
            break;
-        case 'm': /* shm_path */
+        case 'M': /* shm name */
        case 'm': /* dir name */
            args->shm_path = optarg;
            args->use_shm_open = c == 'M';
            break;
-        case 'l': /* shm_size */
+        case 'l': /* shm size */
            parse_option_size("shm_size", optarg, &args->shm_size, &err);
            if (err) {
                error_report_err(err);
-                ivshmem_server_usage(argv[0], 1);
+                ivshmem_server_help(argv[0]);
                exit(1);
            }
            break;
-        case 'n': /* n_vectors */
+        case 'n': /* number of vectors */
            if (parse_uint_full(optarg, &v, 0) < 0) {
                fprintf(stderr, "cannot parse n_vectors\n");
-                ivshmem_server_usage(argv[0], 1);
+                ivshmem_server_help(argv[0]);
                exit(1);
            }
            args->n_vectors = v;
            break;
        default:
-            ivshmem_server_usage(argv[0], 1);
+            ivshmem_server_usage(argv[0]);
            exit(1);
            break;
        }
    }
@ -129,12 +129,14 @@ ivshmem_server_parse_args(IvshmemServerArgs *args, int argc, char *argv[])
    if (args->n_vectors > IVSHMEM_SERVER_MAX_VECTORS) {
        fprintf(stderr, "too many requested vectors (max is %d)\n",
                IVSHMEM_SERVER_MAX_VECTORS);
-        ivshmem_server_usage(argv[0], 1);
+        ivshmem_server_help(argv[0]);
        exit(1);
    }
    if (args->verbose == 1 && args->foreground == 0) {
        fprintf(stderr, "cannot use verbose in daemon mode\n");
-        ivshmem_server_usage(argv[0], 1);
+        ivshmem_server_help(argv[0]);
        exit(1);
    }
 }
@ -192,11 +194,18 @@ main(int argc, char *argv[])
        .pid_file = IVSHMEM_SERVER_DEFAULT_PID_FILE,
        .unix_socket_path = IVSHMEM_SERVER_DEFAULT_UNIX_SOCK_PATH,
        .shm_path = IVSHMEM_SERVER_DEFAULT_SHM_PATH,
        .use_shm_open = true,
        .shm_size = IVSHMEM_SERVER_DEFAULT_SHM_SIZE,
        .n_vectors = IVSHMEM_SERVER_DEFAULT_N_VECTORS,
    };
    int ret = 1;
    /*
     * Do not remove this notice without adding proper error handling!
     * Start with handling ivshmem_server_send_one_msg() failure.
     */
    printf("*** Example code, do not use in production ***\n");
    /* parse arguments, will exit on error */
    ivshmem_server_parse_args(&args, argc, argv);
@ -219,7 +228,8 @@ main(int argc, char *argv[])
    }
    /* init the ivshms structure */
-    if (ivshmem_server_init(&server, args.unix_socket_path, args.shm_path,
+    if (ivshmem_server_init(&server, args.unix_socket_path,
                            args.shm_path, args.use_shm_open,
                            args.shm_size, args.n_vectors, args.verbose) < 0) {
        fprintf(stderr, "cannot init server\n");
        goto err;
--- a/default-configs/pci.mak
+++ b/default-configs/pci.mak
@ -36,5 +36,5 @@ CONFIG_SDHCI=y
 CONFIG_EDU=y
 CONFIG_VGA=y
 CONFIG_VGA_PCI=y
-CONFIG_IVSHMEM=$(CONFIG_POSIX)
+CONFIG_IVSHMEM=$(CONFIG_EVENTFD)
 CONFIG_ROCKER=y
--- a/docs/specs/ivshmem-spec.txt
+++ b/docs/specs/ivshmem-spec.txt
@ -0,0 +1,254 @@
 = Device Specification for Inter-VM shared memory device =
 The Inter-VM shared memory device (ivshmem) is designed to share a
 memory region between multiple QEMU processes running different guests
 and the host.  In order for all guests to be able to pick up the
 shared memory area, it is modeled by QEMU as a PCI device exposing
 said memory to the guest as a PCI BAR.
 The device can use a shared memory object on the host directly, or it
 can obtain one from an ivshmem server.
 In the latter case, the device can additionally interrupt its peers, and
 get interrupted by its peers.
 == Configuring the ivshmem PCI device ==
 There are two basic configurations:
 - Just shared memory: -device ivshmem-plain,memdev=HMB,...
  This uses host memory backend HMB.  It should have option "share"
  set.
 - Shared memory plus interrupts: -device ivshmem-doorbell,chardev=CHR,vectors=N,...
  An ivshmem server must already be running on the host.  The device
  connects to the server's UNIX domain socket via character device
  CHR.
  Each peer gets assigned a unique ID by the server.  IDs must be
  between 0 and 65535.
  Interrupts are message-signaled (MSI-X).  vectors=N configures the
  number of vectors to use.
 For more details on ivshmem device properties, see The QEMU Emulator
 User Documentation (qemu-doc.*).
 == The ivshmem PCI device's guest interface ==
 The device has vendor ID 1af4, device ID 1110, revision 1.  Before
 QEMU 2.6.0, it had revision 0.
 === PCI BARs ===
 The ivshmem PCI device has two or three BARs:
 - BAR0 holds device registers (256 Byte MMIO)
 - BAR1 holds MSI-X table and PBA (only ivshmem-doorbell)
 - BAR2 maps the shared memory object
 There are two ways to use this device:
 - If you only need the shared memory part, BAR2 suffices.  This way,
  you have access to the shared memory in the guest and can use it as
  you see fit.  Memnic, for example, uses ivshmem this way from guest
  user space (see http://dpdk.org/browse/memnic).
 - If you additionally need the capability for peers to interrupt each
  other, you need BAR0 and BAR1.  You will most likely want to write a
  kernel driver to handle interrupts.  Requires the device to be
  configured for interrupts, obviously.
 Before QEMU 2.6.0, BAR2 can initially be invalid if the device is
 configured for interrupts.  It becomes safely accessible only after
 the ivshmem server provided the shared memory.  These devices have PCI
 revision 0 rather than 1.  Guest software should wait for the
 IVPosition register (described below) to become non-negative before
 accessing BAR2.
 Revision 0 of the device cannot tell guest software whether it is
 configured for interrupts.
 === PCI device registers ===
 BAR 0 contains the following registers:
    Offset  Size  Access      On reset  Function
        0     4   read/write        0   Interrupt Mask
                                        bit 0: peer interrupt (rev 0)
                                               reserved       (rev 1)
                                        bit 1..31: reserved
        4     4   read/write        0   Interrupt Status
                                        bit 0: peer interrupt (rev 0)
                                               reserved       (rev 1)
                                        bit 1..31: reserved
        8     4   read-only   0 or ID   IVPosition
       12     4   write-only      N/A   Doorbell
                                        bit 0..15: vector
                                        bit 16..31: peer ID
       16   240   none            N/A   reserved
 Software should only access the registers as specified in column
 "Access".  Reserved bits should be ignored on read, and preserved on
 write.
 In revision 0 of the device, Interrupt Status and Mask Register
 together control the legacy INTx interrupt when the device has no
 MSI-X capability: INTx is asserted when the bit-wise AND of Status and
 Mask is non-zero and the device has no MSI-X capability.  Interrupt
 Status Register bit 0 becomes 1 when an interrupt request from a peer
 is received.  Reading the register clears it.
 IVPosition Register: if the device is not configured for interrupts,
 this is zero.  Else, it is the device's ID (between 0 and 65535).
 Before QEMU 2.6.0, the register may read -1 for a short while after
 reset.  These devices have PCI revision 0 rather than 1.
 There is no good way for software to find out whether the device is
 configured for interrupts.  A positive IVPosition means interrupts,
 but zero could be either.
 Doorbell Register: writing this register requests to interrupt a peer.
 The written value's high 16 bits are the ID of the peer to interrupt,
 and its low 16 bits select an interrupt vector.
 If the device is not configured for interrupts, the write is ignored.
 If the interrupt hasn't completed setup, the write is ignored.  The
 device cannot tell guest software whether setup is complete.
 Interrupts can regress to this state on migration.
 If the peer with the requested ID isn't connected, or it has fewer
 interrupt vectors connected, the write is ignored.  The device cannot
 tell guest software what peers are connected, or how many interrupt
 vectors are connected.
 The peer's interrupt for this vector then becomes pending.  There is
 no way for software to clear the pending bit, and a polling mode of
 operation is therefore impossible.
 If the peer is a revision 0 device without MSI-X capability, its
 Interrupt Status register is set to 1.  This asserts INTx unless
 masked by the Interrupt Mask register.  The device cannot
 communicate the interrupt vector to guest software then.
 With multiple MSI-X vectors, different vectors can be used to indicate
 different events have occurred.  The semantics of interrupt vectors
 are left to the application.
 == Interrupt infrastructure ==
 When configured for interrupts, the peers share eventfd objects in
 addition to shared memory.  The shared resources are managed by an
 ivshmem server.
 === The ivshmem server ===
 The server listens on a UNIX domain socket.
 For each new client that connects to the server, the server
 - picks an ID,
 - creates eventfd file descriptors for the interrupt vectors,
 - sends the ID and the file descriptor for the shared memory to the
  new client,
 - sends connect notifications for the new client to the other clients
  (these contain file descriptors for sending interrupts),
 - sends connect notifications for the other clients to the new client,
  and
 - sends interrupt setup messages to the new client (these contain file
  descriptors for receiving interrupts).
 The first client to connect to the server receives ID zero.
 When a client disconnects from the server, the server sends disconnect
 notifications to the other clients.
 The next section describes the protocol in detail.
 If the server terminates without sending disconnect notifications for
 its connected clients, the clients can elect to continue.  They can
 communicate with each other normally, but won't receive disconnect
 notification on disconnect, and no new clients can connect.  There is
 no way for the clients to connect to a restarted server.  The device
 cannot tell guest software whether the server is still up.
 Example server code is in contrib/ivshmem-server/.  Not to be used in
 production.  It assumes all clients use the same number of interrupt
 vectors.
 A standalone client is in contrib/ivshmem-client/.  It can be useful
 for debugging.
 === The ivshmem Client-Server Protocol ===
 An ivshmem device configured for interrupts connects to an ivshmem
 server.  This section details the protocol between the two.
 The connection is one-way: the server sends messages to the client.
 Each message consists of a single 8 byte little-endian signed number,
 and may be accompanied by a file descriptor via SCM_RIGHTS.  Both
 client and server close the connection on error.
 Note: QEMU currently doesn't close the connection right on error, but
 only when the character device is destroyed.
 On connect, the server sends the following messages in order:
 1. The protocol version number, currently zero.  The client should
   close the connection on receipt of versions it can't handle.
 2. The client's ID.  This is unique among all clients of this server.
   IDs must be between 0 and 65535, because the Doorbell register
   provides only 16 bits for them.
 3. The number -1, accompanied by the file descriptor for the shared
   memory.
 4. Connect notifications for existing other clients, if any.  This is
   a peer ID (number between 0 and 65535 other than the client's ID),
   repeated N times.  Each repetition is accompanied by one file
   descriptor.  These are for interrupting the peer with that ID using
   vector 0,..,N-1, in order.  If the client is configured for fewer
   vectors, it closes the extra file descriptors.  If it is configured
   for more, the extra vectors remain unconnected.
 5. Interrupt setup.  This is the client's own ID, repeated N times.
   Each repetition is accompanied by one file descriptor.  These are
   for receiving interrupts from peers using vector 0,..,N-1, in
   order.  If the client is configured for fewer vectors, it closes
   the extra file descriptors.  If it is configured for more, the
   extra vectors remain unconnected.
 From then on, the server sends these kinds of messages:
 6. Connection / disconnection notification.  This is a peer ID.
  - If the number comes with a file descriptor, it's a connection
    notification, exactly like in step 4.
  - Else, it's a disconnection notification for the peer with that ID.
 Known bugs:
 * The protocol changed incompatibly in QEMU 2.5.  Before, messages
  were native endian long, and there was no version number.
 * The protocol is poorly designed.
 === The ivshmem Client-Client Protocol ===
 An ivshmem device configured for interrupts receives eventfd file
 descriptors for interrupting peers and getting interrupted by peers
 from the server, as explained in the previous section.
 To interrupt a peer, the device writes the 8-byte integer 1 in native
 byte order to the respective file descriptor.
 To receive an interrupt, the device reads and discards as many 8-byte
 integers as it can.
--- a/docs/specs/ivshmem_device_spec.txt
+++ b/docs/specs/ivshmem_device_spec.txt
@ -1,161 +0,0 @@
 Device Specification for Inter-VM shared memory device
 ------------------------------------------------------
 The Inter-VM shared memory device is designed to share a memory region (created
 on the host via the POSIX shared memory API) between multiple QEMU processes
 running different guests. In order for all guests to be able to pick up the
 shared memory area, it is modeled by QEMU as a PCI device exposing said memory
 to the guest as a PCI BAR.
 The memory region does not belong to any guest, but is a POSIX memory object on
 the host. The host can access this shared memory if needed.
 The device also provides an optional communication mechanism between guests
 sharing the same memory object. See the 'Guest to guest communication' section
 for details.
 The Inter-VM PCI device
 -----------------------
 From the VM point of view, the ivshmem PCI device supports three BARs.
 - BAR0 is a 1 Kbyte MMIO region to support registers and interrupts when MSI is
  not used.
 - BAR1 is used for MSI-X when it is enabled in the device.
 - BAR2 is used to access the shared memory object.
 It is your choice how to use the device but you must choose between two
 behaviors :
 - basically, if you only need the shared memory part, you will map BAR2.
  This way, you have access to the shared memory in guest and can use it as you
  see fit (memnic, for example, uses it in userland
  http://dpdk.org/browse/memnic).
 - BAR0 and BAR1 are used to implement an optional communication mechanism
  through interrupts in the guests. If you need an event mechanism between the
  guests accessing the shared memory, you will most likely want to write a
  kernel driver that will handle interrupts. See the 'Guest to guest
  communication' section for details.
 The behavior is chosen when starting your QEMU processes:
 - no communication mechanism needed, the first QEMU to start creates the shared
  memory on the host, subsequent QEMU processes will use it.
 - communication mechanism needed, an ivshmem server must be started before any
  QEMU processes, then each QEMU process connects to the server unix socket.
 For more details on the QEMU ivshmem parameters, see qemu-doc documentation.
 Guest to guest communication
 ----------------------------
 This section details the communication mechanism between the guests accessing
 the ivshmem shared memory.
 *ivshmem server*
 This server code is available in qemu.git/contrib/ivshmem-server.
 The server must be started on the host before any guest.
 It creates a shared memory object then waits for clients to connect on a unix
 socket. All the messages are little-endian int64_t integer.
 For each client (QEMU process) that connects to the server:
 - the server sends a protocol version, if client does not support it, the client
  closes the communication,
 - the server assigns an ID for this client and sends this ID to him as the first
  message,
 - the server sends a fd to the shared memory object to this client,
 - the server creates a new set of host eventfds associated to the new client and
  sends this set to all already connected clients,
 - finally, the server sends all the eventfds sets for all clients to the new
  client.
 The server signals all clients when one of them disconnects.
 The client IDs are limited to 16 bits because of the current implementation (see
 Doorbell register in 'PCI device registers' subsection). Hence only 65536
 clients are supported.
 All the file descriptors (fd to the shared memory, eventfds for each client)
 are passed to clients using SCM_RIGHTS over the server unix socket.
 Apart from the current ivshmem implementation in QEMU, an ivshmem client has
 been provided in qemu.git/contrib/ivshmem-client for debug.
 *QEMU as an ivshmem client*
 At initialisation, when creating the ivshmem device, QEMU first receives a
 protocol version and closes communication with server if it does not match.
 Then, QEMU gets its ID from the server then makes it available through BAR0
 IVPosition register for the VM to use (see 'PCI device registers' subsection).
 QEMU then uses the fd to the shared memory to map it to BAR2.
 eventfds for all other clients received from the server are stored to implement
 BAR0 Doorbell register (see 'PCI device registers' subsection).
 Finally, eventfds assigned to this QEMU process are used to send interrupts in
 this VM.
 *PCI device registers*
 From the VM point of view, the ivshmem PCI device supports 4 registers of
 32-bits each.
 enum ivshmem_registers {
    IntrMask = 0,
    IntrStatus = 4,
    IVPosition = 8,
    Doorbell = 12
 };
 The first two registers are the interrupt mask and status registers.  Mask and
 status are only used with pin-based interrupts.  They are unused with MSI
 interrupts.
 Status Register: The status register is set to 1 when an interrupt occurs.
 Mask Register: The mask register is bitwise ANDed with the interrupt status
 and the result will raise an interrupt if it is non-zero.  However, since 1 is
 the only value the status will be set to, it is only the first bit of the mask
 that has any effect.  Therefore interrupts can be masked by setting the first
 bit to 0 and unmasked by setting the first bit to 1.
 IVPosition Register: The IVPosition register is read-only and reports the
 guest's ID number.  The guest IDs are non-negative integers.  When using the
 server, since the server is a separate process, the VM ID will only be set when
 the device is ready (shared memory is received from the server and accessible
 via the device).  If the device is not ready, the IVPosition will return -1.
 Applications should ensure that they have a valid VM ID before accessing the
 shared memory.
 Doorbell Register:  To interrupt another guest, a guest must write to the
 Doorbell register.  The doorbell register is 32-bits, logically divided into
 two 16-bit fields.  The high 16-bits are the guest ID to interrupt and the low
 16-bits are the interrupt vector to trigger.  The semantics of the value
 written to the doorbell depends on whether the device is using MSI or a regular
 pin-based interrupt.  In short, MSI uses vectors while regular interrupts set
 the status register.
 Regular Interrupts
 If regular interrupts are used (due to either a guest not supporting MSI or the
 user specifying not to use them on startup) then the value written to the lower
 16-bits of the Doorbell register is arbitrary and will trigger an
 interrupt in the destination guest.
 Message Signalled Interrupts
 An ivshmem device may support multiple MSI vectors.  If so, the lower 16-bits
 written to the Doorbell register must be between 0 and the maximum number of
 vectors the guest supports.  The lower 16 bits written to the doorbell is the
 MSI vector that will be raised in the destination guest.  The number of MSI
 vectors is configurable but it is set when the VM is started.
 The important thing to remember with MSI is that it is only a signal, no status
 is set (since MSI interrupts are not shared).  All information other than the
 interrupt itself should be communicated via the shared memory region.  Devices
 supporting multiple MSI vectors can use different vectors to indicate different
 events have occurred.  The semantics of interrupt vectors are left to the
 user's discretion.
--- a/hw/core/qdev-properties.c
+++ b/hw/core/qdev-properties.c
@ -516,6 +516,16 @@ PropertyInfo qdev_prop_macaddr = {
    .set   = set_mac,
 };
 /* --- on/off/auto --- */
 PropertyInfo qdev_prop_on_off_auto = {
    .name = "OnOffAuto",
    .description = "on/off/auto",
    .enum_table = OnOffAuto_lookup,
    .get = get_enum,
    .set = set_enum,
 };
 /* --- lost tick policy --- */
 QEMU_BUILD_BUG_ON(sizeof(LostTickPolicy) != sizeof(int));
--- a/hw/misc/ivshmem.c
+++ b/hw/misc/ivshmem.c
--- a/include/hw/qdev-properties.h
+++ b/include/hw/qdev-properties.h
@ -18,6 +18,7 @@ extern PropertyInfo qdev_prop_string;
 extern PropertyInfo qdev_prop_chr;
 extern PropertyInfo qdev_prop_ptr;
 extern PropertyInfo qdev_prop_macaddr;
 extern PropertyInfo qdev_prop_on_off_auto;
 extern PropertyInfo qdev_prop_losttickpolicy;
 extern PropertyInfo qdev_prop_bios_chs_trans;
 extern PropertyInfo qdev_prop_fdc_drive_type;
@ -155,6 +156,8 @@ extern PropertyInfo qdev_prop_arraylen;
    DEFINE_PROP(_n, _s, _f, qdev_prop_drive, BlockBackend *)
 #define DEFINE_PROP_MACADDR(_n, _s, _f)         \
    DEFINE_PROP(_n, _s, _f, qdev_prop_macaddr, MACAddr)
 #define DEFINE_PROP_ON_OFF_AUTO(_n, _s, _f, _d) \
    DEFINE_PROP_DEFAULT(_n, _s, _f, _d, qdev_prop_on_off_auto, OnOffAuto)
 #define DEFINE_PROP_LOSTTICKPOLICY(_n, _s, _f, _d) \
    DEFINE_PROP_DEFAULT(_n, _s, _f, _d, qdev_prop_losttickpolicy, \
                        LostTickPolicy)
--- a/qemu-doc.texi
+++ b/qemu-doc.texi
@ -1262,13 +1262,18 @@ basic example.
@subsection Inter-VM Shared Memory device
-With KVM enabled on a Linux host, a shared memory device is available.  Guests
+On Linux hosts, a shared memory device is available.  The basic syntax
-map a POSIX shared memory region into the guest as a PCI device that enables
+is:
 zero-copy communication to the application level of the guests.  The basic
 syntax is:
@example
-qemu-system-i386 -device ivshmem,size=@var{size},shm=@var{shm-name}
+qemu-system-x86_64 -device ivshmem-plain,memdev=@var{hostmem}
@end example
 where @var{hostmem} names a host memory backend.  For a POSIX shared
 memory backend, use something like
@example
 -object memory-backend-file,size=1M,share,mem-path=/dev/shm/ivshmem,id=@var{hostmem}
@end example
 If desired, interrupts can be sent between guest VMs accessing the same shared
@ -1282,28 +1287,24 @@ memory server is:
 ivshmem-server -p @var{pidfile} -S @var{path} -m @var{shm-name} -l @var{shm-size} -n @var{vectors}
 # Then start your qemu instances with matching arguments
-qemu-system-i386 -device ivshmem,size=@var{shm-size},vectors=@var{vectors},chardev=@var{id}
+qemu-system-x86_64 -device ivshmem-doorbell,vectors=@var{vectors},chardev=@var{id}
                 [,msi=on][,ioeventfd=on][,role=peer|master]
                 -chardev socket,path=@var{path},id=@var{id}
@end example
 When using the server, the guest will be assigned a VM ID (>=0) that allows guests
 using the same server to communicate via interrupts.  Guests can read their
-VM ID from a device register (see example code).  Since receiving the shared
+VM ID from a device register (see ivshmem-spec.txt).
 memory region from the server is asynchronous, there is a (small) chance the
 guest may boot before the shared memory is attached.  To allow an application
 to ensure shared memory is attached, the VM ID register will return -1 (an
 invalid VM ID) until the memory is attached.  Once the shared memory is
 attached, the VM ID will return the guest's valid VM ID.  With these semantics,
 the guest application can check to ensure the shared memory is attached to the
 guest before proceeding.
-The @option{role} argument can be set to either master or peer and will affect
+@subsubsection Migration with ivshmem
-how the shared memory is migrated.  With @option{role=master}, the guest will
+
-copy the shared memory on migration to the destination host.  With
+With device property @option{master=on}, the guest will copy the shared
-@option{role=peer}, the guest will not be able to migrate with the device attached.
+memory on migration to the destination host.  With @option{master=off},
-With the @option{peer} case, the device should be detached and then reattached
+the guest will not be able to migrate with the device attached.  In the
-after migration using the PCI hotplug support.
+latter case, the device should be detached and then reattached after
 migration using the PCI hotplug support.
 At most one of the devices sharing the same memory can be master.  The
 master must complete migration before you plug back the other devices.
@subsubsection ivshmem and hugepages
@ -1311,8 +1312,8 @@ Instead of specifying the <shm size> using POSIX shm, you may specify
 a memory backend that has hugepage support:
@example
-qemu-system-i386 -object memory-backend-file,size=1G,mem-path=/mnt/hugepages/my-shmem-file,id=mb1
+qemu-system-x86_64 -object memory-backend-file,size=1G,mem-path=/dev/hugepages/my-shmem-file,share,id=mb1
-                 -device ivshmem,x-memdev=mb1
+                 -device ivshmem-plain,memdev=mb1
@end example
 ivshmem-server also supports hugepages mount points with the
--- a/target-ppc/kvm.c
+++ b/target-ppc/kvm.c
@ -333,6 +333,12 @@ static long gethugepagesize(const char *mem_path)
    return fs.f_bsize;
 }
 /*
 * FIXME TOCTTOU: this iterates over memory backends' mem-path, which
 * may or may not name the same files / on the same filesystem now as
 * when we actually open and map them.  Iterate over the file
 * descriptors instead, and use qemu_fd_getpagesize().
 */
 static int find_max_supported_pagesize(Object *obj, void *opaque)
 {
    char *mem_path;
--- a/tests/Makefile
+++ b/tests/Makefile
@ -166,7 +166,7 @@ gcov-files-pci-y += hw/display/virtio-gpu-pci.c
 gcov-files-pci-$(CONFIG_VIRTIO_VGA) += hw/display/virtio-vga.c
 check-qtest-pci-y += tests/intel-hda-test$(EXESUF)
 gcov-files-pci-y += hw/audio/intel-hda.c hw/audio/hda-codec.c
-check-qtest-pci-$(CONFIG_POSIX) += tests/ivshmem-test$(EXESUF)
+check-qtest-pci-$(CONFIG_EVENTFD) += tests/ivshmem-test$(EXESUF)
 gcov-files-pci-y += hw/misc/ivshmem.c
 check-qtest-i386-y = tests/endianness-test$(EXESUF)
--- a/tests/ivshmem-test.c
+++ b/tests/ivshmem-test.c
@ -110,25 +110,26 @@ static void setup_vm_cmd(IVState *s, const char *cmd, bool msix)
    s->pcibus = qpci_init_pc();
    s->dev = get_device(s->pcibus);
-    /* FIXME: other bar order fails, mappings changes */
+    s->reg_base = qpci_iomap(s->dev, 0, &barsize);
-    s->mem_base = qpci_iomap(s->dev, 2, &barsize);
+    g_assert_nonnull(s->reg_base);
-    g_assert_nonnull(s->mem_base);
+    g_assert_cmpuint(barsize, ==, 256);
    g_assert_cmpuint(barsize, ==, TMPSHMSIZE);
    if (msix) {
        qpci_msix_enable(s->dev);
    }
-    s->reg_base = qpci_iomap(s->dev, 0, &barsize);
+    s->mem_base = qpci_iomap(s->dev, 2, &barsize);
-    g_assert_nonnull(s->reg_base);
+    g_assert_nonnull(s->mem_base);
-    g_assert_cmpuint(barsize, ==, 256);
+    g_assert_cmpuint(barsize, ==, TMPSHMSIZE);
    qpci_device_enable(s->dev);
 }
 static void setup_vm(IVState *s)
 {
-    char *cmd = g_strdup_printf("-device ivshmem,shm=%s,size=1M", tmpshm);
+    char *cmd = g_strdup_printf("-object memory-backend-file"
                                ",id=mb1,size=1M,share,mem-path=/dev/shm%s"
                                " -device ivshmem-plain,memdev=mb1", tmpshm);
    setup_vm_cmd(s, cmd, false);
@ -144,32 +145,41 @@ static void test_ivshmem_single(void)
    setup_vm(&state);
    s = &state;
-    /* valid io */
+    /* initial state of readable registers */
-    out_reg(s, INTRMASK, 0);
+    g_assert_cmpuint(in_reg(s, INTRMASK), ==, 0);
-    in_reg(s, INTRSTATUS);
+    g_assert_cmpuint(in_reg(s, INTRSTATUS), ==, 0);
-    in_reg(s, IVPOSITION);
+    g_assert_cmpuint(in_reg(s, IVPOSITION), ==, 0);
    /* trigger interrupt via registers */
    out_reg(s, INTRMASK, 0xffffffff);
    g_assert_cmpuint(in_reg(s, INTRMASK), ==, 0xffffffff);
    out_reg(s, INTRSTATUS, 1);
-    /* XXX: intercept IRQ, not seen in resp */
+    /* check interrupt status */
    g_assert_cmpuint(in_reg(s, INTRSTATUS), ==, 1);
    /* reading clears */
    g_assert_cmpuint(in_reg(s, INTRSTATUS), ==, 0);
    /* TODO intercept actual interrupt (needs qtest work) */
-    /* invalid io */
+    /* invalid register access */
    out_reg(s, IVPOSITION, 1);
    in_reg(s, DOORBELL);
    /* ring the (non-functional) doorbell */
    out_reg(s, DOORBELL, 8 << 16);
    /* write shared memory */
    for (i = 0; i < G_N_ELEMENTS(data); i++) {
        data[i] = i;
    }
    qtest_memwrite(s->qtest, (uintptr_t)s->mem_base, data, sizeof(data));
    /* verify write */
    for (i = 0; i < G_N_ELEMENTS(data); i++) {
        g_assert_cmpuint(((uint32_t *)tmpshmem)[i], ==, i);
    }
    /* read it back and verify read */
    memset(data, 0, sizeof(data));
    qtest_memread(s->qtest, (uintptr_t)s->mem_base, data, sizeof(data));
    for (i = 0; i < G_N_ELEMENTS(data); i++) {
        g_assert_cmpuint(data[i], ==, i);
@ -276,8 +286,10 @@ static void *server_thread(void *data)
 static void setup_vm_with_server(IVState *s, int nvectors, bool msi)
 {
    char *cmd = g_strdup_printf("-chardev socket,id=chr0,path=%s,nowait "
-                                "-device ivshmem,size=1M,chardev=chr0,vectors=%d,msi=%s",
+                                "-device ivshmem%s,chardev=chr0,vectors=%d",
-                                tmpserver, nvectors, msi ? "true" : "false");
+                                tmpserver,
                                msi ? "-doorbell" : ",size=1M,msi=off",
                                nvectors);
    setup_vm_cmd(s, cmd, msi);
@ -293,8 +305,7 @@ static void test_ivshmem_server(bool msi)
    int nvectors = 2;
    guint64 end_time = g_get_monotonic_time() + 5 * G_TIME_SPAN_SECOND;
-    memset(tmpshmem, 0x42, TMPSHMSIZE);
+    ret = ivshmem_server_init(&server, tmpserver, tmpshm, true,
    ret = ivshmem_server_init(&server, tmpserver, tmpshm,
                              TMPSHMSIZE, nvectors,
                              g_test_verbose());
    g_assert_cmpint(ret, ==, 0);
@ -302,49 +313,39 @@ static void test_ivshmem_server(bool msi)
    ret = ivshmem_server_start(&server);
    g_assert_cmpint(ret, ==, 0);
    setup_vm_with_server(&state1, nvectors, msi);
    s1 = &state1;
    setup_vm_with_server(&state2, nvectors, msi);
    s2 = &state2;
    g_assert_cmpuint(in_reg(s1, IVPOSITION), ==, 0xffffffff);
    g_assert_cmpuint(in_reg(s2, IVPOSITION), ==, 0xffffffff);
    g_assert_cmpuint(qtest_readb(s1->qtest, (uintptr_t)s1->mem_base), ==, 0x00);
    thread.server = &server;
    ret = pipe(thread.pipe);
    g_assert_cmpint(ret, ==, 0);
    thread.thread = g_thread_new("ivshmem-server", server_thread, &thread);
    g_assert(thread.thread != NULL);
-    /* waiting until mapping is done */
+    setup_vm_with_server(&state1, nvectors, msi);
-    while (g_get_monotonic_time() < end_time) {
+    s1 = &state1;
-        g_usleep(1000);
+    setup_vm_with_server(&state2, nvectors, msi);
-
+    s2 = &state2;
        if (qtest_readb(s1->qtest, (uintptr_t)s1->mem_base) == 0x42 &&
            qtest_readb(s2->qtest, (uintptr_t)s2->mem_base) == 0x42) {
            break;
        }
    }
    /* check got different VM ids */
    vm1 = in_reg(s1, IVPOSITION);
    vm2 = in_reg(s2, IVPOSITION);
-    g_assert_cmpuint(vm1, !=, vm2);
+    g_assert_cmpint(vm1, >=, 0);
    g_assert_cmpint(vm2, >=, 0);
    g_assert_cmpint(vm1, !=, vm2);
    /* check number of MSI-X vectors */
    global_qtest = s1->qtest;
    if (msi) {
        ret = qpci_msix_table_size(s1->dev);
        g_assert_cmpuint(ret, ==, nvectors);
    }
-    /* ping vm2 -> vm1 */
+    /* TODO test behavior before MSI-X is enabled */
    /* ping vm2 -> vm1 on vector 0 */
    if (msi) {
        ret = qpci_msix_pending(s1->dev, 0);
        g_assert_cmpuint(ret, ==, 0);
    } else {
-        out_reg(s1, INTRSTATUS, 0);
+        g_assert_cmpuint(in_reg(s1, INTRSTATUS), ==, 0);
    }
    out_reg(s2, DOORBELL, vm1 << 16);
    do {
@ -353,18 +354,18 @@ static void test_ivshmem_server(bool msi)
    } while (ret == 0 && g_get_monotonic_time() < end_time);
    g_assert_cmpuint(ret, !=, 0);
-    /* ping vm1 -> vm2 */
+    /* ping vm1 -> vm2 on vector 1 */
    global_qtest = s2->qtest;
    if (msi) {
-        ret = qpci_msix_pending(s2->dev, 0);
+        ret = qpci_msix_pending(s2->dev, 1);
        g_assert_cmpuint(ret, ==, 0);
    } else {
-        out_reg(s2, INTRSTATUS, 0);
+        g_assert_cmpuint(in_reg(s2, INTRSTATUS), ==, 0);
    }
-    out_reg(s1, DOORBELL, vm2 << 16);
+    out_reg(s1, DOORBELL, vm2 << 16 | 1);
    do {
        g_usleep(10000);
-        ret = msi ? qpci_msix_pending(s2->dev, 0) : in_reg(s2, INTRSTATUS);
+        ret = msi ? qpci_msix_pending(s2->dev, 1) : in_reg(s2, INTRSTATUS);
    } while (ret == 0 && g_get_monotonic_time() < end_time);
    g_assert_cmpuint(ret, !=, 0);
@ -415,7 +416,7 @@ static void test_ivshmem_memdev(void)
    /* just for the sake of checking memory-backend property */
    setup_vm_cmd(&state, "-object memory-backend-ram,size=1M,id=mb1"
-                 " -device ivshmem,x-memdev=mb1", false);
+                 " -device ivshmem-plain,memdev=mb1", false);
    cleanup_vm(&state);
 }
--- a/tests/libqos/pci-pc.c
+++ b/tests/libqos/pci-pc.c
@ -184,7 +184,9 @@ static void *qpci_pc_iomap(QPCIBus *bus, QPCIDevice *dev, int barno, uint64_t *s
    if (io_type == PCI_BASE_ADDRESS_SPACE_IO) {
        uint16_t loc;
-        g_assert((s->pci_iohole_alloc + size) <= s->pci_iohole_size);
+        g_assert(QEMU_ALIGN_UP(s->pci_iohole_alloc, size) + size
                 <= s->pci_iohole_size);
        s->pci_iohole_alloc = QEMU_ALIGN_UP(s->pci_iohole_alloc, size);
        loc = s->pci_iohole_start + s->pci_iohole_alloc;
        s->pci_iohole_alloc += size;
@ -194,7 +196,9 @@ static void *qpci_pc_iomap(QPCIBus *bus, QPCIDevice *dev, int barno, uint64_t *s
    } else {
        uint64_t loc;
-        g_assert((s->pci_hole_alloc + size) <= s->pci_hole_size);
+        g_assert(QEMU_ALIGN_UP(s->pci_hole_alloc, size) + size
                 <= s->pci_hole_size);
        s->pci_hole_alloc = QEMU_ALIGN_UP(s->pci_hole_alloc, size);
        loc = s->pci_hole_start + s->pci_hole_alloc;
        s->pci_hole_alloc += size;
--- a/util/event_notifier-posix.c
+++ b/util/event_notifier-posix.c
@ -20,11 +20,17 @@
 #include <sys/eventfd.h>
 #endif
 #ifdef CONFIG_EVENTFD
 /*
 * Initialize @e with existing file descriptor @fd.
 * @fd must be a genuine eventfd object, emulation with pipe won't do.
 * The same descriptor is stored as both the read end (rfd) and the
 * write end (wfd) of @e; nothing else in @e is touched here.
 */
 void event_notifier_init_fd(EventNotifier *e, int fd)
 {
    e->rfd = fd;
    e->wfd = fd;
 }
 #endif
 int event_notifier_init(EventNotifier *e, int active)
 {