# -*- Mode: Python -*-
# vim: filetype=python
#
##
# = Net devices
##
{ 'include': 'sockets.json' }
##
# @set_link:
#
# Sets the link status of a virtual network adapter.
#
# @name: the device name of the virtual network adapter
#
# @up: true to set the link status to be up
#
# Returns:
# - Nothing on success
# - If @name is not a valid network device, DeviceNotFound
#
# Since: 0.14
#
# Notes: Not all network adapters support setting link status. This
# command will succeed even if the network adapter does not
# support link status notification.
#
# Example:
#
# -> { "execute": "set_link",
# "arguments": { "name": "e1000.0", "up": false } }
# <- { "return": {} }
##
{ 'command': 'set_link', 'data': {'name': 'str', 'up': 'bool'} }
##
# @netdev_add:
#
# Add a network backend.
#
# Additional arguments depend on the type.
#
# Since: 0.14
#
# Returns:
# - Nothing on success
# - If @type is not a valid network backend, DeviceNotFound
#
# Example:
#
# -> { "execute": "netdev_add",
# "arguments": { "type": "user", "id": "netdev1",
# "dnssearch": [ { "str": "example.org" } ] } }
# <- { "return": {} }
##
{ 'command': 'netdev_add', 'data': 'Netdev', 'boxed': true,
'allow-preconfig': true }
##
# @netdev_del:
#
# Remove a network backend.
#
# @id: the name of the network backend to remove
#
# Returns:
# - Nothing on success
# - If @id is not a valid network backend, DeviceNotFound
#
# Since: 0.14
#
# Example:
#
# -> { "execute": "netdev_del", "arguments": { "id": "netdev1" } }
# <- { "return": {} }
##
{ 'command': 'netdev_del', 'data': {'id': 'str'},
'allow-preconfig': true }
##
# @NetLegacyNicOptions:
#
# Create a new Network Interface Card.
#
# @netdev: id of -netdev to connect to
#
# @macaddr: MAC address
#
# @model: device model (e1000, rtl8139, virtio etc.)
#
# @addr: PCI device address
#
# @vectors: number of MSI-x vectors, 0 to disable MSI-X
#
# Since: 1.2
##
{ 'struct': 'NetLegacyNicOptions',
'data': {
'*netdev': 'str',
'*macaddr': 'str',
'*model': 'str',
'*addr': 'str',
'*vectors': 'uint32' } }
##
# @String:
#
# A fat type wrapping 'str', to be embedded in lists.
#
# Since: 1.2
##
{ 'struct': 'String',
'data': {
'str': 'str' } }
##
# @NetdevUserOptions:
#
# Use the user mode network stack, which requires no administrator
# privilege to run.
#
# @hostname: client hostname reported by the builtin DHCP server
#
# @restrict: isolate the guest from the host
#
# @ipv4: whether to support IPv4 (default: true) (since 2.6)
#
# @ipv6: whether to support IPv6 (default: true) (since 2.6)
#
# @ip: legacy parameter, use net= instead
#
# @net: IP network address that the guest will see, in the form
# addr[/netmask]. The netmask is optional, and can be either in the
# form a.b.c.d or as a number of valid top-most bits. Default is
# 10.0.2.0/24.
#
# @host: guest-visible address of the host
#
# @tftp: root directory of the built-in TFTP server
#
# @bootfile: BOOTP filename, for use with tftp=
#
# @dhcpstart: the first of the 16 IPs the built-in DHCP server can
# assign
#
# @dns: guest-visible address of the virtual nameserver
#
# @dnssearch: list of DNS suffixes to search, passed as DHCP option to
# the guest
#
# @domainname: guest-visible domain name of the virtual nameserver
# (since 3.0)
#
# @ipv6-prefix: IPv6 network prefix (default is fec0::) (since 2.6).
# The network prefix is given in the usual hexadecimal IPv6
# address notation.
#
# @ipv6-prefixlen: IPv6 network prefix length (default is 64) (since
# 2.6)
#
# @ipv6-host: guest-visible IPv6 address of the host (since 2.6)
#
# @ipv6-dns: guest-visible IPv6 address of the virtual nameserver
# (since 2.6)
#
# @smb: root directory of the built-in SMB server
#
# @smbserver: IP address of the built-in SMB server
#
# @hostfwd: redirect incoming TCP or UDP host connections to guest
# endpoints
#
# @guestfwd: forward guest TCP connections
#
# @tftp-server-name: RFC2132 "TFTP server name" string (Since 3.1)
#
# Since: 1.2
##
{ 'struct': 'NetdevUserOptions',
'data': {
'*hostname': 'str',
'*restrict': 'bool',
'*ipv4': 'bool',
'*ipv6': 'bool',
'*ip': 'str',
'*net': 'str',
'*host': 'str',
'*tftp': 'str',
'*bootfile': 'str',
'*dhcpstart': 'str',
'*dns': 'str',
'*dnssearch': ['String'],
'*domainname': 'str',
'*ipv6-prefix': 'str',
'*ipv6-prefixlen': 'int',
'*ipv6-host': 'str',
'*ipv6-dns': 'str',
'*smb': 'str',
'*smbserver': 'str',
'*hostfwd': ['String'],
'*guestfwd': ['String'],
'*tftp-server-name': 'str' } }
##
# @NetdevTapOptions:
#
# Used to configure a host TAP network interface backend.
#
# @ifname: interface name
#
# @fd: file descriptor of an already opened tap
#
# @fds: multiple file descriptors of already opened multiqueue capable
# tap
#
# @script: script to initialize the interface
#
# @downscript: script to shut down the interface
#
# @br: bridge name (since 2.8)
#
# @helper: command to execute to configure bridge
#
# @sndbuf: send buffer limit. Understands [TGMKkb] suffixes.
#
# @vnet_hdr: enable the IFF_VNET_HDR flag on the tap interface
#
# @vhost: enable vhost-net network accelerator
#
# @vhostfd: file descriptor of an already opened vhost net device
#
# @vhostfds: file descriptors of multiple already opened vhost net
# devices
#
# @vhostforce: vhost on for non-MSIX virtio guests
#
# @queues: number of queues to be created for multiqueue capable tap
#
# @poll-us: maximum number of microseconds that could be spent on busy
# polling for tap (since 2.7)
#
# Since: 1.2
##
{ 'struct': 'NetdevTapOptions',
'data': {
'*ifname': 'str',
'*fd': 'str',
'*fds': 'str',
'*script': 'str',
'*downscript': 'str',
'*br': 'str',
'*helper': 'str',
'*sndbuf': 'size',
'*vnet_hdr': 'bool',
'*vhost': 'bool',
'*vhostfd': 'str',
'*vhostfds': 'str',
'*vhostforce': 'bool',
'*queues': 'uint32',
'*poll-us': 'uint32'} }
##
# @NetdevSocketOptions:
#
# Socket netdevs are used to establish a network connection to another
# QEMU virtual machine via a TCP socket.
#
# @fd: file descriptor of an already opened socket
#
# @listen: port number, and optional hostname, to listen on
#
# @connect: port number, and optional hostname, to connect to
#
# @mcast: UDP multicast address and port number
#
# @localaddr: source address and port for multicast and udp packets
#
# @udp: UDP unicast address and port number
#
# Since: 1.2
##
{ 'struct': 'NetdevSocketOptions',
'data': {
'*fd': 'str',
'*listen': 'str',
'*connect': 'str',
'*mcast': 'str',
'*localaddr': 'str',
'*udp': 'str' } }
##
# @NetdevL2TPv3Options:
#
# Configure an Ethernet over L2TPv3 tunnel.
#
# @src: source address
#
# @dst: destination address
#
# @srcport: source port - mandatory for udp, optional for ip
#
# @dstport: destination port - mandatory for udp, optional for ip
#
# @ipv6: force the use of ipv6
#
# @udp: use the udp version of l2tpv3 encapsulation
#
# @cookie64: use 64 bit cookies
#
# @counter: have sequence counter
#
# @pincounter: pin sequence counter to zero - workaround for buggy
# implementations or networks with packet reorder
#
# @txcookie: 32 or 64 bit transmit cookie
#
# @rxcookie: 32 or 64 bit receive cookie
#
# @txsession: 32 bit transmit session
#
# @rxsession: 32 bit receive session - if not specified, set to the
# same value as transmit
#
# @offset: additional offset - allows the insertion of additional
# application-specific data before the packet payload
#
# Since: 2.1
##
{ 'struct': 'NetdevL2TPv3Options',
'data': {
'src': 'str',
'dst': 'str',
'*srcport': 'str',
'*dstport': 'str',
'*ipv6': 'bool',
'*udp': 'bool',
'*cookie64': 'bool',
'*counter': 'bool',
'*pincounter': 'bool',
'*txcookie': 'uint64',
'*rxcookie': 'uint64',
'txsession': 'uint32',
'*rxsession': 'uint32',
'*offset': 'uint32' } }
##
# @NetdevVdeOptions:
#
# Connect to a vde switch running on the host.
#
# @sock: socket path
#
# @port: port number
#
# @group: group owner of socket
#
# @mode: permissions for socket
#
# Since: 1.2
##
{ 'struct': 'NetdevVdeOptions',
'data': {
'*sock': 'str',
'*port': 'uint16',
'*group': 'str',
'*mode': 'uint16' } }
##
# @NetdevBridgeOptions:
#
# Connect a host TAP network interface to a host bridge device.
#
# @br: bridge name
#
# @helper: command to execute to configure bridge
#
# Since: 1.2
##
{ 'struct': 'NetdevBridgeOptions',
'data': {
'*br': 'str',
'*helper': 'str' } }
##
# @NetdevHubPortOptions:
#
# Connect two or more net clients through a software hub.
#
# @hubid: hub identifier number
#
# @netdev: used to connect hub to a netdev instead of a device (since
# 2.12)
#
# Since: 1.2
##
{ 'struct': 'NetdevHubPortOptions',
'data': {
'hubid': 'int32',
'*netdev': 'str' } }
##
# @NetdevNetmapOptions:
#
# Connect a client to a netmap-enabled NIC or to a VALE switch port.
#
# @ifname: Either the name of an existing network interface supported
# by netmap, or the name of a VALE port (created on the fly). A
# VALE port name is in the form 'valeXXX:YYY', where XXX and YYY
# are non-negative integers. XXX identifies a switch and YYY
# identifies a port of the switch. VALE ports having the same XXX
# are therefore connected to the same switch.
#
# @devname: path of the netmap device (default: '/dev/netmap').
#
# Since: 2.0
##
{ 'struct': 'NetdevNetmapOptions',
'data': {
'ifname': 'str',
'*devname': 'str' } }
##
# @AFXDPMode:
#
# Attach mode for a default XDP program
#
# @skb: generic mode, no driver support necessary
#
# @native: DRV mode, the program is attached to the driver; packets
# are passed to the socket without skb allocation.
#
# Since: 8.2
##
{ 'enum': 'AFXDPMode',
'data': [ 'native', 'skb' ],
'if': 'CONFIG_AF_XDP' }
##
# @NetdevAFXDPOptions:
#
# AF_XDP network backend
#
# @ifname: The name of an existing network interface.
#
# @mode: Attach mode for a default XDP program. If not specified, then
# 'native' will be tried first, then 'skb'.
#
# @force-copy: Force XDP copy mode even if device supports zero-copy.
# (default: false)
#
# @queues: number of queues to be used for multiqueue interfaces (default: 1).
#
# @start-queue: Use @queues starting from this queue number (default: 0).
#
# @inhibit: Don't load a default XDP program, use one already loaded to
# the interface (default: false). Requires @sock-fds.
#
# @sock-fds: A colon (:) separated list of file descriptors for already open
# but not bound AF_XDP sockets in the queue order. One fd per queue.
# These descriptors should already be added into the XDP socket map
# for the corresponding queues. Requires @inhibit.
#
# Since: 8.2
##
{ 'struct': 'NetdevAFXDPOptions',
'data': {
'ifname': 'str',
'*mode': 'AFXDPMode',
'*force-copy': 'bool',
'*queues': 'int',
'*start-queue': 'int',
'*inhibit': 'bool',
'*sock-fds': 'str' },
'if': 'CONFIG_AF_XDP' }
##
# @NetdevVhostUserOptions:
#
# Vhost-user network backend
#
# @chardev: name of a unix socket chardev
#
# @vhostforce: vhost on for non-MSIX virtio guests (default: false).
#
# @queues: number of queues to be created for multiqueue vhost-user
# (default: 1) (Since 2.5)
#
# Since: 2.1
##
{ 'struct': 'NetdevVhostUserOptions',
'data': {
'chardev': 'str',
'*vhostforce': 'bool',
'*queues': 'int' } }
##
# @NetdevVhostVDPAOptions:
#
# Vhost-vdpa network backend
#
# A vDPA device is a device that uses a datapath which complies with
# the virtio specification, together with a vendor-specific control
# path.
#
# @vhostdev: path of vhost-vdpa device (default: '/dev/vhost-vdpa-0')
#
# @vhostfd: file descriptor of an already opened vhost vdpa device
#
# @queues: number of queues to be created for multiqueue vhost-vdpa
# (default: 1)
#
# @x-svq: Start device with (experimental) shadow virtqueue. (Since
# 7.1) (default: false)
#
# Features:
#
# @unstable: Member @x-svq is experimental.
#
# Since: 5.1
##
{ 'struct': 'NetdevVhostVDPAOptions',
'data': {
'*vhostdev': 'str',
'*vhostfd': 'str',
'*queues': 'int',
'*x-svq': {'type': 'bool', 'features' : [ 'unstable'] } } }
##
# @NetdevVmnetHostOptions:
#
# vmnet (host mode) network backend.
#
# Allows the vmnet interface to communicate with other vmnet
# interfaces that are in host mode and also with the host.
#
# @start-address: The starting IPv4 address to use for the interface.
# Must be in the private IP range (RFC 1918). Must be specified
# along with @end-address and @subnet-mask. This address is used
# as the gateway address. The subsequent addresses up to and
# including @end-address are placed in the DHCP pool.
#
# @end-address: The DHCP IPv4 range end address to use for the
# interface. Must be in the private IP range (RFC 1918). Must be
# specified along with @start-address and @subnet-mask.
#
# @subnet-mask: The IPv4 subnet mask to use on the interface. Must be
# specified along with @start-address and @end-address.
#
# @isolated: Enable isolation for this interface. Interface isolation
# ensures that the vmnet interface is not able to communicate with
# any other vmnet interface. Only communication with the host is
# allowed. Requires at least macOS Big Sur 11.0.
#
# @net-uuid: The identifier (UUID) of the isolated network that the
# vmnet interface should be added to. If set, no DHCP service is
# provided for this interface and network communication is allowed
# only with other interfaces added to the network identified by this
# UUID. Requires at least macOS Big Sur 11.0.
#
# Since: 7.1
##
{ 'struct': 'NetdevVmnetHostOptions',
'data': {
'*start-address': 'str',
'*end-address': 'str',
'*subnet-mask': 'str',
'*isolated': 'bool',
'*net-uuid': 'str' },
'if': 'CONFIG_VMNET' }
##
# @NetdevVmnetSharedOptions:
#
# vmnet (shared mode) network backend.
#
# Allows traffic originating from the vmnet interface to reach the
# Internet through a network address translator (NAT). The vmnet
# interface can communicate with the host and with other shared mode
# interfaces on the same subnet. If no DHCP settings, subnet mask or
# IPv6 prefix is specified, the interface can communicate with any
# other interface in shared mode.
#
# @start-address: The starting IPv4 address to use for the interface.
# Must be in the private IP range (RFC 1918). Must be specified
# along with @end-address and @subnet-mask. This address is used
# as the gateway address. The subsequent addresses up to and
# including @end-address are placed in the DHCP pool.
#
# @end-address: The DHCP IPv4 range end address to use for the
# interface. Must be in the private IP range (RFC 1918). Must be
# specified along with @start-address and @subnet-mask.
#
# @subnet-mask: The IPv4 subnet mask to use on the interface. Must be
# specified along with @start-address and @end-address.
#
# @isolated: Enable isolation for this interface. Interface isolation
# ensures that the vmnet interface is not able to communicate with
# any other vmnet interface. Only communication with the host is
# allowed. Requires at least macOS Big Sur 11.0.
#
# @nat66-prefix: The IPv6 prefix to use in the guest network. Must be
# a unique local address, i.e. start with fd00::/8, and have a prefix
# length of 64.
#
# Since: 7.1
##
{ 'struct': 'NetdevVmnetSharedOptions',
'data': {
'*start-address': 'str',
'*end-address': 'str',
'*subnet-mask': 'str',
'*isolated': 'bool',
'*nat66-prefix': 'str' },
'if': 'CONFIG_VMNET' }
##
# @NetdevVmnetBridgedOptions:
#
# vmnet (bridged mode) network backend.
#
# Bridges the vmnet interface with a physical network interface.
#
# @ifname: The name of the physical interface to be bridged.
#
# @isolated: Enable isolation for this interface. Interface isolation
# ensures that the vmnet interface is not able to communicate with
# any other vmnet interface. Only communication with the host is
# allowed. Requires at least macOS Big Sur 11.0.
#
# Since: 7.1
##
{ 'struct': 'NetdevVmnetBridgedOptions',
'data': {
'ifname': 'str',
'*isolated': 'bool' },
'if': 'CONFIG_VMNET' }
##
# @NetdevStreamOptions:
#
# Configuration info for stream socket netdev.
#
# @addr: socket address to listen on (server=true) or connect to
# (server=false)
#
# @server: create server socket (default: false)
#
# @reconnect: For a client socket, if a socket is disconnected, then
# attempt a reconnect after the given number of seconds. Setting
# this to zero disables this function. (default: 0) (since 8.0)
#
# Only SocketAddress types 'unix', 'inet' and 'fd' are supported.
#
# Since: 7.2
##
{ 'struct': 'NetdevStreamOptions',
'data': {
'addr': 'SocketAddress',
'*server': 'bool',
'*reconnect': 'uint32' } }
##
# @NetdevDgramOptions:
#
# Configuration info for datagram socket netdev.
#
# @remote: remote address
#
# @local: local address
#
# Only SocketAddress types 'unix', 'inet' and 'fd' are supported.
#
# If the remote address is present and is a multicast address, the
# local address is optional. Otherwise, the local address is
# required and the remote address is optional.
#
# .. table:: Valid parameter combinations
# :widths: auto
#
# ============= ======== =====
# remote local okay?
# ============= ======== =====
# absent absent no
# absent not fd no
# absent fd yes
# multicast absent yes
# multicast present yes
# not multicast absent no
# not multicast present yes
# ============= ======== =====
#
# Since: 7.2
##
{ 'struct': 'NetdevDgramOptions',
'data': {
'*local': 'SocketAddress',
'*remote': 'SocketAddress' } }
##
# @NetClientDriver:
#
# Available netdev drivers.
#
# @l2tpv3: since 2.1
# @vhost-vdpa: since 5.1
# @vmnet-host: since 7.1
# @vmnet-shared: since 7.1
# @vmnet-bridged: since 7.1
# @stream: since 7.2
# @dgram: since 7.2
# @af-xdp: since 8.2
#
# Since: 2.7
##
{ 'enum': 'NetClientDriver',
'data': [ 'none', 'nic', 'user', 'tap', 'l2tpv3', 'socket', 'stream',
'dgram', 'vde', 'bridge', 'hubport', 'netmap', 'vhost-user',
'vhost-vdpa',
{ 'name': 'af-xdp', 'if': 'CONFIG_AF_XDP' },
{ 'name': 'vmnet-host', 'if': 'CONFIG_VMNET' },
{ 'name': 'vmnet-shared', 'if': 'CONFIG_VMNET' },
{ 'name': 'vmnet-bridged', 'if': 'CONFIG_VMNET' }] }
##
# @Netdev:
#
# Captures the configuration of a network device.
#
# @id: identifier for monitor commands.
#
# @type: Specify the driver used for interpreting remaining arguments.
#
# Since: 1.2
##
{ 'union': 'Netdev',
'base': { 'id': 'str', 'type': 'NetClientDriver' },
'discriminator': 'type',
'data': {
'nic': 'NetLegacyNicOptions',
'user': 'NetdevUserOptions',
'tap': 'NetdevTapOptions',
'l2tpv3': 'NetdevL2TPv3Options',
'socket': 'NetdevSocketOptions',
'stream': 'NetdevStreamOptions',
'dgram': 'NetdevDgramOptions',
'vde': 'NetdevVdeOptions',
'bridge': 'NetdevBridgeOptions',
'hubport': 'NetdevHubPortOptions',
'netmap': 'NetdevNetmapOptions',
'af-xdp': { 'type': 'NetdevAFXDPOptions',
'if': 'CONFIG_AF_XDP' },
'vhost-user': 'NetdevVhostUserOptions',
'vhost-vdpa': 'NetdevVhostVDPAOptions',
'vmnet-host': { 'type': 'NetdevVmnetHostOptions',
'if': 'CONFIG_VMNET' },
'vmnet-shared': { 'type': 'NetdevVmnetSharedOptions',
'if': 'CONFIG_VMNET' },
'vmnet-bridged': { 'type': 'NetdevVmnetBridgedOptions',
'if': 'CONFIG_VMNET' } } }
##
# @RxState:
#
# Packet receive state
#
# @normal: filter assigned packets according to the mac-table
#
# @none: don't receive any assigned packet
#
# @all: receive all assigned packets
#
# Since: 1.6
##
{ 'enum': 'RxState', 'data': [ 'normal', 'none', 'all' ] }
##
# @RxFilterInfo:
#
# Rx-filter information for a NIC.
#
# @name: net client name
#
# @promiscuous: whether promiscuous mode is enabled
#
# @multicast: multicast receive state
#
# @unicast: unicast receive state
#
# @vlan: VLAN receive state (Since 2.0)
#
# @broadcast-allowed: whether to receive broadcast
#
# @multicast-overflow: whether the multicast table has overflowed
#
# @unicast-overflow: whether the unicast table has overflowed
#
# @main-mac: the main MAC address string
#
# @vlan-table: a list of active VLAN IDs
#
# @unicast-table: a list of unicast MAC address strings
#
# @multicast-table: a list of multicast MAC address strings
#
# Since: 1.6
##
{ 'struct': 'RxFilterInfo',
'data': {
'name': 'str',
'promiscuous': 'bool',
'multicast': 'RxState',
'unicast': 'RxState',
'vlan': 'RxState',
'broadcast-allowed': 'bool',
'multicast-overflow': 'bool',
'unicast-overflow': 'bool',
'main-mac': 'str',
'vlan-table': ['int'],
'unicast-table': ['str'],
'multicast-table': ['str'] }}
##
# @query-rx-filter:
#
# Return rx-filter information for all NICs (or for the given NIC).
#
# @name: net client name
#
# Returns: list of @RxFilterInfo for all NICs (or for the given NIC).
# Returns an error if the given @name doesn't exist, or the given
# NIC doesn't support rx-filter querying, or the given net client
# isn't a NIC.
#
# Since: 1.6
#
# Example:
#
# -> { "execute": "query-rx-filter", "arguments": { "name": "vnet0" } }
# <- { "return": [
# {
# "promiscuous": true,
# "name": "vnet0",
# "main-mac": "52:54:00:12:34:56",
# "unicast": "normal",
# "vlan": "normal",
# "vlan-table": [
# 4,
# 0
# ],
# "unicast-table": [
# ],
# "multicast": "normal",
# "multicast-overflow": false,
# "unicast-overflow": false,
# "multicast-table": [
# "01:00:5e:00:00:01",
# "33:33:00:00:00:01",
# "33:33:ff:12:34:56"
# ],
# "broadcast-allowed": false
# }
# ]
# }
##
{ 'command': 'query-rx-filter',
'data': { '*name': 'str' },
'returns': ['RxFilterInfo'] }
##
# @NIC_RX_FILTER_CHANGED:
#
# Emitted when a NIC's rx-filter configuration changes. The event
# is emitted at most once until the 'query-rx-filter' command is
# executed; the first change always emits the event.
#
# @name: net client name
#
# @path: device path
#
# Since: 1.6
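#
# Note: a management application is expected to react by executing
# 'query-rx-filter', which both returns the current configuration
# and re-arms the event.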
#
# Example:
#
# <- { "event": "NIC_RX_FILTER_CHANGED",
# "data": { "name": "vnet0",
# "path": "/machine/peripheral/vnet0/virtio-backend" },
# "timestamp": { "seconds": 1368697518, "microseconds": 326866 } }
##
{ 'event': 'NIC_RX_FILTER_CHANGED',
'data': { '*name': 'str', 'path': 'str' } }
##
# @AnnounceParameters:
#
# Parameters for self-announce timers
#
# @initial: Initial delay (in ms) before sending the first GARP/RARP
# announcement
#
# @max: Maximum delay (in ms) between GARP/RARP announcement packets
#
# @rounds: Number of self-announcement attempts
#
# @step: Delay increase (in ms) after each self-announcement attempt
#
# @interfaces: An optional list of interface names, which restricts
# the announcement to the listed interfaces. (Since 4.1)
#
# @id: A name to be used to identify an instance of announce-timers
# and to allow it to be modified later. Not for use as part of the
# migration parameters. (Since 4.1)
#
# Since: 4.0
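#
# Note: taken together, these parameters describe the announcement
# schedule: the first packet is sent after roughly @initial ms, each
# following delay grows by @step ms up to a cap of @max ms, and
# @rounds packets are sent in total.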
##
{ 'struct': 'AnnounceParameters',
'data': { 'initial': 'int',
'max': 'int',
'rounds': 'int',
'step': 'int',
'*interfaces': ['str'],
'*id' : 'str' } }
##
# @announce-self:
#
# Trigger generation of broadcast RARP frames to update network
# switches. This can be useful when a network bond fails over to a
# new active slave.
#
# Example:
#
# -> { "execute": "announce-self",
# "arguments": {
# "initial": 50, "max": 550, "rounds": 10, "step": 50,
# "interfaces": ["vn2", "vn3"], "id": "bob" } }
# <- { "return": {} }
#
# Since: 4.0
##
{ 'command': 'announce-self', 'boxed': true,
'data' : 'AnnounceParameters'}
##
# @FAILOVER_NEGOTIATED:
#
# Emitted when VIRTIO_NET_F_STANDBY is enabled during feature
# negotiation. Failover primary devices that were previously hidden
# (not hotplugged when requested) will now be hotplugged by the
# virtio-net standby device.
#
# @device-id: QEMU device id of the unplugged device
#
# Since: 4.2
#
# Example:
#
# <- { "event": "FAILOVER_NEGOTIATED",
# "data": { "device-id": "net1" },
# "timestamp": { "seconds": 1368697518, "microseconds": 326866 } }
##
{ 'event': 'FAILOVER_NEGOTIATED',
'data': {'device-id': 'str'} }
##
# @NETDEV_STREAM_CONNECTED:
#
# Emitted when the netdev stream backend is connected
#
# @netdev-id: QEMU netdev id that is connected
#
# @addr: The destination address
#
# Since: 7.2
#
# Examples:
#
# <- { "event": "NETDEV_STREAM_CONNECTED",
# "data": { "netdev-id": "netdev0",
# "addr": { "port": "47666", "ipv6": true,
# "host": "::1", "type": "inet" } },
# "timestamp": { "seconds": 1666269863, "microseconds": 311222 } }
#
# <- { "event": "NETDEV_STREAM_CONNECTED",
# "data": { "netdev-id": "netdev0",
# "addr": { "path": "/tmp/qemu0", "type": "unix" } },
# "timestamp": { "seconds": 1666269706, "microseconds": 413651 } }
##
{ 'event': 'NETDEV_STREAM_CONNECTED',
'data': { 'netdev-id': 'str',
'addr': 'SocketAddress' } }
##
# @NETDEV_STREAM_DISCONNECTED:
#
# Emitted when the netdev stream backend is disconnected
#
# @netdev-id: QEMU netdev id that is disconnected
#
# Since: 7.2
#
# Example:
#
# <- { "event": "NETDEV_STREAM_DISCONNECTED",
#      "data": { "netdev-id": "netdev0" },
#      "timestamp": { "seconds": 1663330937, "microseconds": 526695 } }
##
{ 'event': 'NETDEV_STREAM_DISCONNECTED',
'data': { 'netdev-id': 'str' } }