qemu/include/hw/virtio/vhost.h
Si-Wei Liu c5cd7e5f23 vhost: Perform memory section dirty scans once per iteration
On setups with one or more virtio-net devices with vhost on,
the cost of each dirty tracking iteration increases with the number
of queues that are set up, e.g. when migrating an idle guest the
following is observed with virtio-net with vhost=on:

48 queues -> 78.11%  [.] vhost_dev_sync_region.isra.13
8 queues -> 40.50%   [.] vhost_dev_sync_region.isra.13
1 queue -> 6.89%     [.] vhost_dev_sync_region.isra.13
2 devices, 1 queue -> 18.60%  [.] vhost_dev_sync_region.isra.14

With high memory dirty rates the symptom is lack of convergence as soon
as the guest has a vhost device with a sufficiently high number of queues,
or a sufficient number of vhost devices.

On every migration iteration (every 100msecs) it will redundantly
query the *shared log* once for each of the queues configured with vhost
that exist in the guest. For the virtqueue data, this is necessary,
but not for the memory sections, which are the same for all of them. So
essentially we end up scanning the dirty log too often.

To fix that, select a vhost device responsible for scanning the
log with regards to memory sections dirty tracking. It is selected
when we enable the logger (during migration) and cleared when we
disable the logger. If the vhost logger device goes away for some
reason, the logger will be re-selected from the rest of vhost
devices.

After making the mem-section logger a singleton instance, a constant cost
of 7%-9% (like the 1 queue report) will be seen, no matter how many
queues or how many vhost devices are configured:

48 queues -> 8.71%    [.] vhost_dev_sync_region.isra.13
2 devices, 8 queues -> 7.97%   [.] vhost_dev_sync_region.isra.14

Co-developed-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
Message-Id: <1710448055-11709-2-git-send-email-si-wei.liu@oracle.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
2024-07-01 14:56:23 -04:00

469 lines
16 KiB
C

#ifndef VHOST_H
#define VHOST_H
#include "hw/virtio/vhost-backend.h"
#include "hw/virtio/virtio.h"
#include "exec/memory.h"
/*
* NOTE(review): presumably the bit position, within the 64-bit feature
* masks of struct vhost_dev, that advertises device IOTLB support -
* confirm against the vhost backend definitions.
*/
#define VHOST_F_DEVICE_IOTLB 63
/*
* vhost-user feature bit. Per the @protocol_features documentation in
* struct vhost_dev, protocol features are only negotiated when the
* backend has offered this bit.
*/
#define VHOST_USER_F_PROTOCOL_FEATURES 30
/*
* NOTE(review): presumably the number of connection attempts made while
* realizing a vhost-user device - confirm against the users of this macro.
*/
#define VU_REALIZE_CONN_RETRIES 3
/* Generic structures common for any vhost based device. */
/*
* struct vhost_inflight - descriptor handled by the vhost_dev_*_inflight()
* helpers declared in this header.
*
* NOTE(review): the field meanings below are inferred from names and types
* only; confirm against the implementation.
*/
struct vhost_inflight {
/* file descriptor; presumably backs the region found at @addr */
int fd;
/* presumably the address at which the region is mapped */
void *addr;
/* presumably the size of the region, in bytes */
uint64_t size;
/* presumably the offset of the region within @fd */
uint64_t offset;
/* queue size; vhost_dev_get_inflight() takes a uint16_t of the same name */
uint16_t queue_size;
};
/*
* struct vhost_virtqueue - per-virtqueue state of a vhost device.
*
* struct vhost_dev holds a pointer to an array of these in its vqs field,
* and each instance holds a pointer to a struct vhost_dev in @dev.
*
* NOTE(review): descriptions marked "presumably" are inferred from names
* and types only; confirm against the implementation.
*/
struct vhost_virtqueue {
/* presumably the file descriptor used to kick (notify) the backend */
int kick;
/* presumably the file descriptor the backend uses to signal the guest */
int call;
/* presumably host pointers to the descriptor table, avail and used rings */
void *desc;
void *avail;
void *used;
/* presumably the number of entries in the ring */
int num;
/* presumably the guest-physical address and size of each ring area */
unsigned long long desc_phys;
unsigned desc_size;
unsigned long long avail_phys;
unsigned avail_size;
unsigned long long used_phys;
unsigned used_size;
/* event notifiers embedded in the virtqueue, one per purpose named below */
EventNotifier masked_notifier;
EventNotifier error_notifier;
EventNotifier masked_config_notifier;
/* vhost device associated with this virtqueue */
struct vhost_dev *dev;
};
/* Element type of the log array in struct vhost_log. */
typedef unsigned long vhost_log_chunk_t;
/* Log page size constant: 0x1000 (4096) bytes. */
#define VHOST_LOG_PAGE 0x1000
/* Number of bits in one vhost_log_chunk_t (with 8-bit bytes). */
#define VHOST_LOG_BITS (8 * sizeof(vhost_log_chunk_t))
/*
* VHOST_LOG_PAGE times VHOST_LOG_BITS, i.e. the number of bytes one chunk
* spans if each of its bits stands for one VHOST_LOG_PAGE-sized page.
* NOTE(review): the one-bit-per-page layout is assumed; confirm.
*/
#define VHOST_LOG_CHUNK (VHOST_LOG_PAGE * VHOST_LOG_BITS)
/*
* NOTE(review): presumably the sentinel that ends the "terminated table of
* feature bits" passed to vhost_get_features()/vhost_ack_features() -
* confirm.
*/
#define VHOST_INVALID_FEATURE_BIT (0xff)
/*
* NOTE(review): presumably the queue index reserved for the config
* interrupt - confirm against the users of this macro.
*/
#define VHOST_QUEUE_NUM_CONFIG_INR 0
/*
* struct vhost_log - log buffer made of vhost_log_chunk_t elements,
* referenced from struct vhost_dev through its log field.
*/
struct vhost_log {
/* NOTE(review): presumably the length of @log; units (chunks or bytes) not visible here */
unsigned long long size;
/* reference count */
int refcnt;
/* NOTE(review): presumably a file descriptor backing @log when it is shared - confirm */
int fd;
/* the log storage itself */
vhost_log_chunk_t *log;
};
/* Forward declaration; struct vhost_dev is defined later in this header. */
struct vhost_dev;
/*
* struct vhost_iommu - element type of the iommu_list kept in
* struct vhost_dev.
*/
struct vhost_iommu {
/* vhost device this entry refers to */
struct vhost_dev *hdev;
/* memory region this entry refers to (presumably an IOMMU region) */
MemoryRegion *mr;
/* NOTE(review): presumably the offset of @mr within its address space - confirm */
hwaddr iommu_offset;
/* IOMMU notifier embedded in this entry */
IOMMUNotifier n;
/* list linkage between struct vhost_iommu entries */
QLIST_ENTRY(vhost_iommu) iommu_next;
};
/*
* VhostDevConfigOps - callbacks for device config changes. A table of this
* type is registered with vhost_dev_set_config_notifier(); struct vhost_dev
* has a config_ops pointer of this type.
*/
typedef struct VhostDevConfigOps {
/* Called when the vhost device config space has changed. */
int (*vhost_dev_config_notifier)(struct vhost_dev *dev);
} VhostDevConfigOps;
/* Forward declaration; struct vhost_memory is only used by pointer here. */
struct vhost_memory;
/**
* struct vhost_dev - common vhost_dev structure
* @vhost_ops: backend specific ops
* @config_ops: ops for config changes (see @vhost_dev_set_config_notifier)
*
* Fields not listed above are commented inline. Comments marked
* NOTE(review) or "presumably" are inferred from names and types only and
* should be confirmed against the implementation.
*/
struct vhost_dev {
/* VirtIO device associated with this vhost device */
VirtIODevice *vdev;
/* memory listeners embedded in the device (one named for IOMMU use) */
MemoryListener memory_listener;
MemoryListener iommu_listener;
/* memory table; struct vhost_memory is opaque in this header */
struct vhost_memory *mem;
/* array of memory region sections and its element count */
int n_mem_sections;
MemoryRegionSection *mem_sections;
/* second section array; presumably scratch space while sections are rebuilt */
int n_tmp_sections;
MemoryRegionSection *tmp_sections;
/* virtqueue array and, presumably, its element count */
struct vhost_virtqueue *vqs;
unsigned int nvqs;
/* the first virtqueue which would be used by this vhost dev */
int vq_index;
/* one past the last vq index for the virtio device (not vhost) */
int vq_index_end;
/* if non-zero, minimum required value for max_queues */
int num_queues;
/**
* vhost feature handling requires matching the feature set
* offered by a backend which may be a subset of the total
* features eventually offered to the guest.
*
* @features: available features provided by the backend
* @acked_features: final negotiated features with front-end driver
*
* @backend_features: this is used in a couple of places to either
* store VHOST_USER_F_PROTOCOL_FEATURES to apply to
* VHOST_USER_SET_FEATURES or VHOST_NET_F_VIRTIO_NET_HDR. Its
* future use should be discouraged and the variable retired as
* it's easy to confuse with the VirtIO backend_features.
*/
uint64_t features;
uint64_t acked_features;
uint64_t backend_features;
/**
* @protocol_features: is the vhost-user only feature set by
* VHOST_USER_SET_PROTOCOL_FEATURES. Protocol features are only
* negotiated if VHOST_USER_F_PROTOCOL_FEATURES has been offered
* by the backend (see @features).
*/
uint64_t protocol_features;
/* NOTE(review): presumably the queue limit of the backend; compared against num_queues */
uint64_t max_queues;
/* NOTE(review): presumably backend capability bits - confirm */
uint64_t backend_cap;
/* @started: is the vhost device started? */
bool started;
/* NOTE(review): presumably whether logging is enabled, and the current log size */
bool log_enabled;
uint64_t log_size;
/* NOTE(review): presumably the error registered to block migration, if any */
Error *migration_blocker;
const VhostOps *vhost_ops;
/* NOTE(review): presumably the opaque pointer given to vhost_dev_init() */
void *opaque;
/* log currently referenced by this device */
struct vhost_log *log;
/* NOTE(review): presumably linkage in a list of vhost devices */
QLIST_ENTRY(vhost_dev) entry;
/* NOTE(review): presumably linkage in a list of devices used for log scanning - confirm */
QLIST_ENTRY(vhost_dev) logdev_entry;
/* list of struct vhost_iommu entries */
QLIST_HEAD(, vhost_iommu) iommu_list;
/* IOMMU notifier embedded in the device */
IOMMUNotifier n;
const VhostDevConfigOps *config_ops;
};
/*
* VhostOps tables defined elsewhere.
* NOTE(review): presumably one per backend type (kernel vhost, vhost-user
* and vDPA), selected through the backend_type given to vhost_dev_init().
*/
extern const VhostOps kernel_ops;
extern const VhostOps user_ops;
extern const VhostOps vdpa_ops;
/*
* struct vhost_net - vhost state for a network device: an embedded
* struct vhost_dev plus storage for two virtqueues.
*/
struct vhost_net {
/* the common vhost device */
struct vhost_dev dev;
/* storage for two virtqueues (presumably receive and transmit) */
struct vhost_virtqueue vqs[2];
/* NOTE(review): presumably a backend file descriptor - confirm */
int backend;
/* NOTE(review): presumably the network client this instance serves */
NetClientState *nc;
};
/**
* vhost_dev_init() - initialise the vhost interface
* @hdev: the common vhost_dev structure
* @opaque: opaque ptr passed to backend (vhost/vhost-user/vdpa)
* @backend_type: type of backend
* @busyloop_timeout: timeout for polling virtqueue
* @errp: error handle
*
* The initialisation of the vhost device will trigger the
* initialisation of the backend and potentially capability
* negotiation of the backend interface. Configuration of the VirtIO
* device itself won't happen until the interface is started
* (see vhost_dev_start()).
*
* Return: 0 on success, non-zero on error while setting errp.
*/
int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
VhostBackendType backend_type,
uint32_t busyloop_timeout, Error **errp);
/**
* vhost_dev_cleanup() - tear down and cleanup vhost interface
* @hdev: the common vhost_dev structure
*
* Per the vhost_dev_stop() documentation, the device can be torn down
* with this function after it has been stopped.
*/
void vhost_dev_cleanup(struct vhost_dev *hdev);
/**
* vhost_dev_enable_notifiers() - enable event notifiers
* @hdev: common vhost_dev structure
* @vdev: the VirtIODevice structure
*
* Enable notifications directly to the vhost device rather than being
* triggered by QEMU itself. Notifications should be enabled before
* the vhost device is started via @vhost_dev_start.
*
* Return: 0 on success, < 0 on error.
*/
int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev);
/**
* vhost_dev_disable_notifiers() - disable event notifications
* @hdev: common vhost_dev structure
* @vdev: the VirtIODevice structure
*
* Disable direct notifications to vhost device.
*/
void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev);
/*
* NOTE(review): undocumented in this header. Judging by the names and by
* vhost_virtqueue_pending()/vhost_virtqueue_mask(), presumably
* vhost_config_pending() tests (and clears) the pending status of the
* masked config notifier and vhost_config_mask() masks/unmasks config
* change events - confirm against the implementation.
*/
bool vhost_config_pending(struct vhost_dev *hdev);
void vhost_config_mask(struct vhost_dev *hdev, VirtIODevice *vdev, bool mask);
/**
 * vhost_dev_is_started() - query whether the vhost device is started
 * @hdev: common vhost_dev structure
 *
 * Return: the current value of the started flag of @hdev.
 */
static inline bool vhost_dev_is_started(struct vhost_dev *hdev)
{
    const bool is_started = hdev->started;

    return is_started;
}
/**
* vhost_dev_start() - start the vhost device
* @hdev: common vhost_dev structure
* @vdev: the VirtIODevice structure
* @vrings: true to have vrings enabled in this call
*
* Starts the vhost device. From this point VirtIO feature negotiation
* can start and the device can start processing VirtIO transactions.
* Per the vhost_dev_enable_notifiers() documentation, notifications
* should be enabled before this function is called.
*
* Return: 0 on success, < 0 on error.
*/
int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings);
/**
* vhost_dev_stop() - stop the vhost device
* @hdev: common vhost_dev structure
* @vdev: the VirtIODevice structure
* @vrings: true to have vrings disabled in this call
*
* Stop the vhost device. After the device is stopped the notifiers
* can be disabled (@vhost_dev_disable_notifiers) and the device can
* be torn down (@vhost_dev_cleanup).
*/
void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings);
/**
* DOC: vhost device configuration handling
*
* The VirtIO device configuration space is used for rarely changing
* or initialisation time parameters. The configuration can be updated
* by either the guest driver or the device itself. If the device can
* change the configuration over time the vhost handler should
* register a @VhostDevConfigOps structure with
* @vhost_dev_set_config_notifier so the guest can be notified. Some
* devices register a handler anyway and will signal an error if an
* unexpected config change happens.
*/
/**
* vhost_dev_get_config() - fetch device configuration
* @hdev: common vhost_dev structure
* @config: pointer to device appropriate config structure
* @config_len: size of device appropriate config structure
* @errp: error handle
*
* Return: 0 on success, < 0 on error while setting errp
*/
int vhost_dev_get_config(struct vhost_dev *hdev, uint8_t *config,
uint32_t config_len, Error **errp);
/**
* vhost_dev_set_config() - set device configuration
* @dev: common vhost_dev structure
* @data: pointer to data to set
* @offset: offset into configuration space
* @size: length of set
* @flags: @VhostSetConfigType flags
*
* By use of @offset/@size a subset of the configuration space can be
* written to. The @flags are used to indicate if it is a normal
* transaction or related to migration.
*
* Return: 0 on success, non-zero on error
*/
int vhost_dev_set_config(struct vhost_dev *dev, const uint8_t *data,
uint32_t offset, uint32_t size, uint32_t flags);
/**
* vhost_dev_set_config_notifier() - register VhostDevConfigOps
* @dev: common vhost_dev structure
* @ops: notifier ops
*
* If the device is expected to change configuration a notifier can be
* set up to handle the case.
*/
void vhost_dev_set_config_notifier(struct vhost_dev *dev,
const VhostDevConfigOps *ops);
/* Test and clear masked event pending status.
* Should be called after unmask to avoid losing events.
*
* @hdev: common vhost_dev structure
* @n: virtqueue index - NOTE(review): whether it is relative to
*     hdev->vq_index or absolute is not visible in this header; confirm.
*/
bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n);
/* Mask/unmask events from this vq.
*
* @hdev: common vhost_dev structure
* @vdev: the VirtIODevice structure
* @n: virtqueue index (same caveat as for vhost_virtqueue_pending())
* @mask: presumably true to mask and false to unmask - confirm
*/
void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
bool mask);
/**
* vhost_get_features() - return a sanitised set of feature bits
* @hdev: common vhost_dev structure
* @feature_bits: pointer to terminated table of feature bits
* @features: original feature set
*
* This returns a set of feature bits that is an intersection of what
* is supported by the vhost backend (hdev->features), the supported
* feature_bits and the requested feature set.
*
* Return: the resulting feature set.
*/
uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
uint64_t features);
/**
* vhost_ack_features() - set vhost acked_features
* @hdev: common vhost_dev structure
* @feature_bits: pointer to terminated table of feature bits
* @features: requested feature set
*
* This sets the internal hdev->acked_features to the intersection of
* the backend's advertised features and the supported feature_bits.
*/
void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
uint64_t features);
/*
* NOTE(review): the declarations below carry no documentation in this
* header. The one-line summaries are inferred from names and signatures
* only and must be confirmed against the implementation.
*/
/* presumably the maximum and the currently free number of memory slots */
unsigned int vhost_get_max_memslots(void);
unsigned int vhost_get_free_memslots(void);
/* presumably hands the backend described by @file to the vhost-net device */
int vhost_net_set_backend(struct vhost_dev *hdev,
struct vhost_vring_file *file);
/* presumably device IOTLB handling: toggle support / service a miss at @iova */
void vhost_toggle_device_iotlb(VirtIODevice *vdev);
int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write);
/* presumably start/stop the single virtqueue @vq with index @idx */
int vhost_virtqueue_start(struct vhost_dev *dev, struct VirtIODevice *vdev,
struct vhost_virtqueue *vq, unsigned idx);
void vhost_virtqueue_stop(struct vhost_dev *dev, struct VirtIODevice *vdev,
struct vhost_virtqueue *vq, unsigned idx);
/* helpers operating on struct vhost_inflight; save/load take a QEMUFile */
void vhost_dev_reset_inflight(struct vhost_inflight *inflight);
void vhost_dev_free_inflight(struct vhost_inflight *inflight);
void vhost_dev_save_inflight(struct vhost_inflight *inflight, QEMUFile *f);
int vhost_dev_load_inflight(struct vhost_inflight *inflight, QEMUFile *f);
int vhost_dev_prepare_inflight(struct vhost_dev *hdev, VirtIODevice *vdev);
int vhost_dev_set_inflight(struct vhost_dev *dev,
struct vhost_inflight *inflight);
int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size,
struct vhost_inflight *inflight);
/* presumably reports whether @dev operates behind an IOMMU */
bool vhost_dev_has_iommu(struct vhost_dev *dev);
/**
 * vhost_reset_device() - reset entry point for a vhost device
 * @hdev: common vhost_dev structure
 *
 * Without CONFIG_VHOST only the inline stub below exists: it ignores
 * @hdev and always fails with -ENOSYS. With CONFIG_VHOST the function is
 * provided elsewhere.
 *
 * Return: -ENOSYS from the stub. NOTE(review): the return convention of
 * the real implementation is not visible in this header.
 */
#ifndef CONFIG_VHOST
static inline int vhost_reset_device(struct vhost_dev *hdev)
{
    /* vhost support is compiled out, so report "not implemented" */
    return -ENOSYS;
}
#else
int vhost_reset_device(struct vhost_dev *hdev);
#endif /* !CONFIG_VHOST */
/**
* vhost_supports_device_state(): Checks whether the back-end supports
* transferring internal device state for the purpose of migration.
* Support for this feature is required for vhost_set_device_state_fd()
* and vhost_check_device_state(), and therefore also for
* vhost_save_backend_state() and vhost_load_backend_state(), which are
* documented as using vhost_set_device_state_fd().
*
* @dev: The vhost device
*
* Returns true if the device supports these commands, and false if it
* does not.
*/
bool vhost_supports_device_state(struct vhost_dev *dev);
/**
* vhost_set_device_state_fd(): Begin transfer of internal state from/to
* the back-end for the purpose of migration. Data is to be transferred
* over a pipe according to @direction and @phase. The sending end must
* only write to the pipe, and the receiving end must only read from it.
* Once the sending end is done, it closes its FD. The receiving end
* must take this as the end-of-transfer signal and close its FD, too.
*
* @fd is the back-end's end of the pipe: The write FD for SAVE, and the
* read FD for LOAD. This function transfers ownership of @fd to the
* back-end, i.e. closes it in the front-end.
*
* The back-end may optionally reply with an FD of its own, if this
* improves efficiency on its end. In this case, the returned FD is
* stored in *reply_fd. The back-end will discard the FD sent to it,
* and the front-end must use *reply_fd for transferring state to/from
* the back-end.
*
* Once the sending end has closed the pipe, vhost_check_device_state()
* can be used to ask the back-end whether the transfer succeeded.
*
* @dev: The vhost device
* @direction: The direction in which the state is to be transferred.
* For outgoing migrations, this is SAVE, and data is read
* from the back-end and stored by the front-end in the
* migration stream.
* For incoming migrations, this is LOAD, and data is read
* by the front-end from the migration stream and sent to
* the back-end to restore the saved state.
* @phase: Which migration phase we are in. Currently, there is only
* STOPPED (device and all vrings are stopped), in the future,
* more phases such as PRE_COPY or POST_COPY may be added.
* @fd: Back-end's end of the pipe through which to transfer state; note
* that ownership is transferred to the back-end, so this function
* closes @fd in the front-end.
* @reply_fd: If the back-end wishes to use a different pipe for state
* transfer, this will contain an FD for the front-end to
* use. Otherwise, -1 is stored here.
* @errp: Potential error description
*
* Returns 0 on success, and -errno on failure.
*/
int vhost_set_device_state_fd(struct vhost_dev *dev,
VhostDeviceStateDirection direction,
VhostDeviceStatePhase phase,
int fd,
int *reply_fd,
Error **errp);
/**
* vhost_check_device_state(): After transferring state from/to the
* back-end via vhost_set_device_state_fd(), i.e. once the sending end
* has closed the pipe, inquire the back-end to report any potential
* errors that have occurred on its side. This allows sensing errors
* like:
* - During outgoing migration, when the source side had already started
* to produce its state, something went wrong and it failed to finish
* - During incoming migration, when the received state is somehow
* invalid and cannot be processed by the back-end
*
* @dev: The vhost device
* @errp: Potential error description
*
* Returns 0 when the back-end reports successful state transfer and
* processing, and -errno when an error occurred somewhere.
*/
int vhost_check_device_state(struct vhost_dev *dev, Error **errp);
/**
* vhost_save_backend_state(): High-level function to receive a vhost
* back-end's state, and save it in @f. Uses
* `vhost_set_device_state_fd()` to get the data from the back-end, and
* stores it in consecutive chunks that are each prefixed by their
* respective length (be32). The end is marked by a 0-length chunk.
* vhost_load_backend_state() is the counterpart that reads this format.
*
* Must only be called while the device and all its vrings are stopped
* (`VHOST_TRANSFER_STATE_PHASE_STOPPED`).
*
* @dev: The vhost device from which to save the state
* @f: Migration stream in which to save the state
* @errp: Potential error message
*
* Returns 0 on success, and -errno otherwise.
*/
int vhost_save_backend_state(struct vhost_dev *dev, QEMUFile *f, Error **errp);
/**
* vhost_load_backend_state(): High-level function to load a vhost
* back-end's state from @f, and send it over to the back-end. Reads
* the data from @f in the format used by `vhost_save_backend_state()`,
* and uses `vhost_set_device_state_fd()` to transfer it to the back-end.
*
* Must only be called while the device and all its vrings are stopped
* (`VHOST_TRANSFER_STATE_PHASE_STOPPED`).
*
* @dev: The vhost device to which to send the state
* @f: Migration stream from which to load the state
* @errp: Potential error message
*
* Returns 0 on success, and -errno otherwise.
*/
int vhost_load_backend_state(struct vhost_dev *dev, QEMUFile *f, Error **errp);
#endif