-----BEGIN PGP SIGNATURE-----

Version: GnuPG v1
 
 iQEcBAABAgAGBQJlB/SLAAoJEO8Ells5jWIR7EQH/1kAbxHcSGJXDOgQAXJ/rOZi
 UKn3ugJzD0Hxd4Xz8cvdVLM+9/JoEEOK1uB+NIG7Ask/gA5D7eUYzaLtp1OJ8VNO
 mamfKmn3EIBWJoLSHH19TKzfW2tGMJHQ0Nj+sbDQRkK5f2c7hwLTRXa1EmlJd4dB
 VoVzX4OiJtrQyv4OVmpP/PSETXJDvYYX/DNcRl9/3ccKtQW/wVDI3YzrMzXrsgyc
 w9ItJi8k+19mVH6RgQwciqRvTbVMdzkOxqvU//LY0TxnjsHfbyHr+KlNAa2WTY2N
 QgpAlMZhHqUG6/XXAs0o2VEtA66zmw932Xfy/CZUEcdGWfkG/9CEVfbuT4CKGY4=
 =tF7K
 -----END PGP SIGNATURE-----

Merge tag 'net-pull-request' of https://github.com/jasowang/qemu into staging

# -----BEGIN PGP SIGNATURE-----
# Version: GnuPG v1
#
# iQEcBAABAgAGBQJlB/SLAAoJEO8Ells5jWIR7EQH/1kAbxHcSGJXDOgQAXJ/rOZi
# UKn3ugJzD0Hxd4Xz8cvdVLM+9/JoEEOK1uB+NIG7Ask/gA5D7eUYzaLtp1OJ8VNO
# mamfKmn3EIBWJoLSHH19TKzfW2tGMJHQ0Nj+sbDQRkK5f2c7hwLTRXa1EmlJd4dB
# VoVzX4OiJtrQyv4OVmpP/PSETXJDvYYX/DNcRl9/3ccKtQW/wVDI3YzrMzXrsgyc
# w9ItJi8k+19mVH6RgQwciqRvTbVMdzkOxqvU//LY0TxnjsHfbyHr+KlNAa2WTY2N
# QgpAlMZhHqUG6/XXAs0o2VEtA66zmw932Xfy/CZUEcdGWfkG/9CEVfbuT4CKGY4=
# =tF7K
# -----END PGP SIGNATURE-----
# gpg: Signature made Mon 18 Sep 2023 02:56:11 EDT
# gpg:                using RSA key EF04965B398D6211
# gpg: Good signature from "Jason Wang (Jason Wang on RedHat) <jasowang@redhat.com>" [full]
# Primary key fingerprint: 215D 46F4 8246 689E C77F  3562 EF04 965B 398D 6211

* tag 'net-pull-request' of https://github.com/jasowang/qemu:
  net/tap: Avoid variable-length array
  net/dump: Avoid variable length array
  hw/net/rocker: Avoid variable length array
  hw/net/fsl_etsec/rings.c: Avoid variable length array
  net: add initial support for AF_XDP network backend
  tests: bump libvirt-ci for libasan and libxdp
  e1000e: rename e1000e_ba_state and e1000e_write_hdr_to_rx_buffers
  igb: packet-split descriptors support
  igb: add IPv6 extended headers traffic detection
  igb: RX payload guest writting refactoring
  igb: RX descriptors guest writting refactoring
  igb: rename E1000E_RingInfo_st
  igb: remove TCP ACK detection
  virtio-net: Add support for USO features
  virtio-net: Add USO flags to vhost support.
  tap: Add check for USO features
  tap: Add USO support to tap device.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
This commit is contained in:
Stefan Hajnoczi 2023-09-19 13:21:49 -04:00
commit dd0c84983d
50 changed files with 1435 additions and 284 deletions

View File

@ -2957,6 +2957,10 @@ W: http://info.iet.unipi.it/~luigi/netmap/
S: Maintained
F: net/netmap.c
AF_XDP network backend
R: Ilya Maximets <i.maximets@ovn.org>
F: net/af-xdp.c
Host Memory Backends
M: David Hildenbrand <david@redhat.com>
M: Igor Mammedov <imammedo@redhat.com>

View File

@ -1296,6 +1296,9 @@ ERST
.name = "netdev_add",
.args_type = "netdev:O",
.params = "[user|tap|socket|stream|dgram|vde|bridge|hubport|netmap|vhost-user"
#ifdef CONFIG_AF_XDP
"|af-xdp"
#endif
#ifdef CONFIG_VMNET
"|vmnet-host|vmnet-shared|vmnet-bridged"
#endif

View File

@ -38,6 +38,7 @@
#include "exec/confidential-guest-support.h"
#include "hw/virtio/virtio.h"
#include "hw/virtio/virtio-pci.h"
#include "hw/virtio/virtio-net.h"
GlobalProperty hw_compat_8_1[] = {};
const size_t hw_compat_8_1_len = G_N_ELEMENTS(hw_compat_8_1);
@ -45,6 +46,9 @@ const size_t hw_compat_8_1_len = G_N_ELEMENTS(hw_compat_8_1);
GlobalProperty hw_compat_8_0[] = {
{ "migration", "multifd-flush-after-each-section", "on"},
{ TYPE_PCI_DEVICE, "x-pcie-ari-nextfn-1", "on" },
{ TYPE_VIRTIO_NET, "host_uso", "off"},
{ TYPE_VIRTIO_NET, "guest_uso4", "off"},
{ TYPE_VIRTIO_NET, "guest_uso6", "off"},
};
const size_t hw_compat_8_0_len = G_N_ELEMENTS(hw_compat_8_0);

View File

@ -810,24 +810,24 @@ e1000e_txdesc_writeback(E1000ECore *core, dma_addr_t base,
return e1000e_tx_wb_interrupt_cause(core, queue_idx);
}
typedef struct E1000E_RingInfo_st {
typedef struct E1000ERingInfo {
int dbah;
int dbal;
int dlen;
int dh;
int dt;
int idx;
} E1000E_RingInfo;
} E1000ERingInfo;
static inline bool
e1000e_ring_empty(E1000ECore *core, const E1000E_RingInfo *r)
e1000e_ring_empty(E1000ECore *core, const E1000ERingInfo *r)
{
return core->mac[r->dh] == core->mac[r->dt] ||
core->mac[r->dt] >= core->mac[r->dlen] / E1000_RING_DESC_LEN;
}
static inline uint64_t
e1000e_ring_base(E1000ECore *core, const E1000E_RingInfo *r)
e1000e_ring_base(E1000ECore *core, const E1000ERingInfo *r)
{
uint64_t bah = core->mac[r->dbah];
uint64_t bal = core->mac[r->dbal];
@ -836,13 +836,13 @@ e1000e_ring_base(E1000ECore *core, const E1000E_RingInfo *r)
}
static inline uint64_t
e1000e_ring_head_descr(E1000ECore *core, const E1000E_RingInfo *r)
e1000e_ring_head_descr(E1000ECore *core, const E1000ERingInfo *r)
{
return e1000e_ring_base(core, r) + E1000_RING_DESC_LEN * core->mac[r->dh];
}
static inline void
e1000e_ring_advance(E1000ECore *core, const E1000E_RingInfo *r, uint32_t count)
e1000e_ring_advance(E1000ECore *core, const E1000ERingInfo *r, uint32_t count)
{
core->mac[r->dh] += count;
@ -852,7 +852,7 @@ e1000e_ring_advance(E1000ECore *core, const E1000E_RingInfo *r, uint32_t count)
}
static inline uint32_t
e1000e_ring_free_descr_num(E1000ECore *core, const E1000E_RingInfo *r)
e1000e_ring_free_descr_num(E1000ECore *core, const E1000ERingInfo *r)
{
trace_e1000e_ring_free_space(r->idx, core->mac[r->dlen],
core->mac[r->dh], core->mac[r->dt]);
@ -871,19 +871,19 @@ e1000e_ring_free_descr_num(E1000ECore *core, const E1000E_RingInfo *r)
}
static inline bool
e1000e_ring_enabled(E1000ECore *core, const E1000E_RingInfo *r)
e1000e_ring_enabled(E1000ECore *core, const E1000ERingInfo *r)
{
return core->mac[r->dlen] > 0;
}
static inline uint32_t
e1000e_ring_len(E1000ECore *core, const E1000E_RingInfo *r)
e1000e_ring_len(E1000ECore *core, const E1000ERingInfo *r)
{
return core->mac[r->dlen];
}
typedef struct E1000E_TxRing_st {
const E1000E_RingInfo *i;
const E1000ERingInfo *i;
struct e1000e_tx *tx;
} E1000E_TxRing;
@ -896,7 +896,7 @@ e1000e_mq_queue_idx(int base_reg_idx, int reg_idx)
static inline void
e1000e_tx_ring_init(E1000ECore *core, E1000E_TxRing *txr, int idx)
{
static const E1000E_RingInfo i[E1000E_NUM_QUEUES] = {
static const E1000ERingInfo i[E1000E_NUM_QUEUES] = {
{ TDBAH, TDBAL, TDLEN, TDH, TDT, 0 },
{ TDBAH1, TDBAL1, TDLEN1, TDH1, TDT1, 1 }
};
@ -908,13 +908,13 @@ e1000e_tx_ring_init(E1000ECore *core, E1000E_TxRing *txr, int idx)
}
typedef struct E1000E_RxRing_st {
const E1000E_RingInfo *i;
const E1000ERingInfo *i;
} E1000E_RxRing;
static inline void
e1000e_rx_ring_init(E1000ECore *core, E1000E_RxRing *rxr, int idx)
{
static const E1000E_RingInfo i[E1000E_NUM_QUEUES] = {
static const E1000ERingInfo i[E1000E_NUM_QUEUES] = {
{ RDBAH0, RDBAL0, RDLEN0, RDH0, RDT0, 0 },
{ RDBAH1, RDBAL1, RDLEN1, RDH1, RDT1, 1 }
};
@ -930,7 +930,7 @@ e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
dma_addr_t base;
struct e1000_tx_desc desc;
bool ide = false;
const E1000E_RingInfo *txi = txr->i;
const E1000ERingInfo *txi = txr->i;
uint32_t cause = E1000_ICS_TXQE;
if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
@ -960,7 +960,7 @@ e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
}
static bool
e1000e_has_rxbufs(E1000ECore *core, const E1000E_RingInfo *r,
e1000e_has_rxbufs(E1000ECore *core, const E1000ERingInfo *r,
size_t total_size)
{
uint32_t bufs = e1000e_ring_free_descr_num(core, r);
@ -1397,17 +1397,17 @@ e1000e_pci_dma_write_rx_desc(E1000ECore *core, dma_addr_t addr,
}
}
typedef struct e1000e_ba_state_st {
typedef struct E1000EBAState {
uint16_t written[MAX_PS_BUFFERS];
uint8_t cur_idx;
} e1000e_ba_state;
} E1000EBAState;
static inline void
e1000e_write_hdr_to_rx_buffers(E1000ECore *core,
hwaddr ba[MAX_PS_BUFFERS],
e1000e_ba_state *bastate,
const char *data,
dma_addr_t data_len)
e1000e_write_hdr_frag_to_rx_buffers(E1000ECore *core,
hwaddr ba[MAX_PS_BUFFERS],
E1000EBAState *bastate,
const char *data,
dma_addr_t data_len)
{
assert(data_len <= core->rxbuf_sizes[0] - bastate->written[0]);
@ -1418,11 +1418,11 @@ e1000e_write_hdr_to_rx_buffers(E1000ECore *core,
}
static void
e1000e_write_to_rx_buffers(E1000ECore *core,
hwaddr ba[MAX_PS_BUFFERS],
e1000e_ba_state *bastate,
const char *data,
dma_addr_t data_len)
e1000e_write_payload_frag_to_rx_buffers(E1000ECore *core,
hwaddr ba[MAX_PS_BUFFERS],
E1000EBAState *bastate,
const char *data,
dma_addr_t data_len)
{
while (data_len > 0) {
uint32_t cur_buf_len = core->rxbuf_sizes[bastate->cur_idx];
@ -1460,7 +1460,7 @@ e1000e_update_rx_stats(E1000ECore *core, size_t pkt_size, size_t pkt_fcs_size)
}
static inline bool
e1000e_rx_descr_threshold_hit(E1000ECore *core, const E1000E_RingInfo *rxi)
e1000e_rx_descr_threshold_hit(E1000ECore *core, const E1000ERingInfo *rxi)
{
return e1000e_ring_free_descr_num(core, rxi) ==
e1000e_ring_len(core, rxi) >> core->rxbuf_min_shift;
@ -1521,7 +1521,7 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
struct iovec *iov = net_rx_pkt_get_iovec(pkt);
size_t size = net_rx_pkt_get_total_len(pkt);
size_t total_size = size + e1000x_fcs_len(core->mac);
const E1000E_RingInfo *rxi;
const E1000ERingInfo *rxi;
size_t ps_hdr_len = 0;
bool do_ps = e1000e_do_ps(core, pkt, &ps_hdr_len);
bool is_first = true;
@ -1530,7 +1530,7 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
do {
hwaddr ba[MAX_PS_BUFFERS];
e1000e_ba_state bastate = { { 0 } };
E1000EBAState bastate = { { 0 } };
bool is_last = false;
desc_size = total_size - desc_offset;
@ -1568,8 +1568,10 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
iov_copy = MIN(ps_hdr_len - ps_hdr_copied,
iov->iov_len - iov_ofs);
e1000e_write_hdr_to_rx_buffers(core, ba, &bastate,
iov->iov_base, iov_copy);
e1000e_write_hdr_frag_to_rx_buffers(core, ba,
&bastate,
iov->iov_base,
iov_copy);
copy_size -= iov_copy;
ps_hdr_copied += iov_copy;
@ -1585,8 +1587,8 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
} else {
/* Leave buffer 0 of each descriptor except first */
/* empty as per spec 7.1.5.1 */
e1000e_write_hdr_to_rx_buffers(core, ba, &bastate,
NULL, 0);
e1000e_write_hdr_frag_to_rx_buffers(core, ba, &bastate,
NULL, 0);
}
}
@ -1594,8 +1596,10 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
while (copy_size) {
iov_copy = MIN(copy_size, iov->iov_len - iov_ofs);
e1000e_write_to_rx_buffers(core, ba, &bastate,
iov->iov_base + iov_ofs, iov_copy);
e1000e_write_payload_frag_to_rx_buffers(core, ba, &bastate,
iov->iov_base +
iov_ofs,
iov_copy);
copy_size -= iov_copy;
iov_ofs += iov_copy;
@ -1607,7 +1611,7 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
if (desc_offset + desc_size >= total_size) {
/* Simulate FCS checksum presence in the last descriptor */
e1000e_write_to_rx_buffers(core, ba, &bastate,
e1000e_write_payload_frag_to_rx_buffers(core, ba, &bastate,
(const char *) &fcs_pad, e1000x_fcs_len(core->mac));
}
}
@ -2852,7 +2856,7 @@ e1000e_update_rx_offloads(E1000ECore *core)
if (core->has_vnet) {
qemu_set_offload(qemu_get_queue(core->owner_nic)->peer,
cso_state, 0, 0, 0, 0);
cso_state, 0, 0, 0, 0, 0, 0);
}
}

View File

@ -372,6 +372,12 @@ void etsec_walk_tx_ring(eTSEC *etsec, int ring_nbr)
etsec->regs[TSTAT].value |= 1 << (31 - ring_nbr);
}
/*
* rx_init_frame() ensures we never do more padding than this
* (checksum plus minimum data packet size)
*/
#define MAX_RX_PADDING 64
static void fill_rx_bd(eTSEC *etsec,
eTSEC_rxtx_bd *bd,
const uint8_t **buf,
@ -380,9 +386,11 @@ static void fill_rx_bd(eTSEC *etsec,
uint16_t to_write;
hwaddr bufptr = bd->bufptr +
((hwaddr)(etsec->regs[TBDBPH].value & 0xF) << 32);
uint8_t padd[etsec->rx_padding];
uint8_t padd[MAX_RX_PADDING];
uint8_t rem;
assert(etsec->rx_padding <= MAX_RX_PADDING);
RING_DEBUG("eTSEC fill Rx buffer @ 0x%016" HWADDR_PRIx
" size:%zu(padding + crc:%u) + fcb:%u\n",
bufptr, *size, etsec->rx_padding, etsec->rx_fcb_size);
@ -426,7 +434,7 @@ static void fill_rx_bd(eTSEC *etsec,
rem = MIN(etsec->regs[MRBLR].value - bd->length, etsec->rx_padding);
if (rem > 0) {
memset(padd, 0x0, sizeof(padd));
memset(padd, 0x0, rem);
etsec->rx_padding -= rem;
*size -= rem;
bd->length += rem;

View File

@ -267,6 +267,29 @@ igb_rx_use_legacy_descriptor(IGBCore *core)
return false;
}
/*
 * Describes one TX/RX descriptor ring by naming the core->mac[] register
 * indices that hold its state (ring-accessor helpers below read
 * core->mac[r->dbah] etc.).
 */
typedef struct E1000ERingInfo {
int dbah; /* descriptor base address high */
int dbal; /* descriptor base address low */
int dlen; /* descriptor ring length register */
int dh; /* descriptor head */
int dt; /* descriptor tail */
int idx; /* queue index, e.g. used to select SRRCTL(idx) */
} E1000ERingInfo;
/*
 * Return the DESCTYPE field of this RX queue's SRRCTL register, i.e. which
 * receive descriptor format (one-buffer, header-split, ...) the queue uses.
 */
static uint32_t
igb_rx_queue_desctyp_get(IGBCore *core, const E1000ERingInfo *r)
{
return core->mac[E1000_SRRCTL(r->idx) >> 2] & E1000_SRRCTL_DESCTYPE_MASK;
}
/*
 * True when this RX queue is configured for a packet-split descriptor type
 * (header split on demand, or header split always).
 */
static bool
igb_rx_use_ps_descriptor(IGBCore *core, const E1000ERingInfo *r)
{
uint32_t desctyp = igb_rx_queue_desctyp_get(core, r);
return desctyp == E1000_SRRCTL_DESCTYPE_HDR_SPLIT ||
desctyp == E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS;
}
static inline bool
igb_rss_enabled(IGBCore *core)
{
@ -694,24 +717,15 @@ static uint32_t igb_rx_wb_eic(IGBCore *core, int queue_idx)
return (ent & E1000_IVAR_VALID) ? BIT(ent & 0x1f) : 0;
}
typedef struct E1000E_RingInfo_st {
int dbah;
int dbal;
int dlen;
int dh;
int dt;
int idx;
} E1000E_RingInfo;
static inline bool
igb_ring_empty(IGBCore *core, const E1000E_RingInfo *r)
igb_ring_empty(IGBCore *core, const E1000ERingInfo *r)
{
return core->mac[r->dh] == core->mac[r->dt] ||
core->mac[r->dt] >= core->mac[r->dlen] / E1000_RING_DESC_LEN;
}
static inline uint64_t
igb_ring_base(IGBCore *core, const E1000E_RingInfo *r)
igb_ring_base(IGBCore *core, const E1000ERingInfo *r)
{
uint64_t bah = core->mac[r->dbah];
uint64_t bal = core->mac[r->dbal];
@ -720,13 +734,13 @@ igb_ring_base(IGBCore *core, const E1000E_RingInfo *r)
}
static inline uint64_t
igb_ring_head_descr(IGBCore *core, const E1000E_RingInfo *r)
igb_ring_head_descr(IGBCore *core, const E1000ERingInfo *r)
{
return igb_ring_base(core, r) + E1000_RING_DESC_LEN * core->mac[r->dh];
}
static inline void
igb_ring_advance(IGBCore *core, const E1000E_RingInfo *r, uint32_t count)
igb_ring_advance(IGBCore *core, const E1000ERingInfo *r, uint32_t count)
{
core->mac[r->dh] += count;
@ -736,7 +750,7 @@ igb_ring_advance(IGBCore *core, const E1000E_RingInfo *r, uint32_t count)
}
static inline uint32_t
igb_ring_free_descr_num(IGBCore *core, const E1000E_RingInfo *r)
igb_ring_free_descr_num(IGBCore *core, const E1000ERingInfo *r)
{
trace_e1000e_ring_free_space(r->idx, core->mac[r->dlen],
core->mac[r->dh], core->mac[r->dt]);
@ -755,13 +769,13 @@ igb_ring_free_descr_num(IGBCore *core, const E1000E_RingInfo *r)
}
static inline bool
igb_ring_enabled(IGBCore *core, const E1000E_RingInfo *r)
igb_ring_enabled(IGBCore *core, const E1000ERingInfo *r)
{
return core->mac[r->dlen] > 0;
}
typedef struct IGB_TxRing_st {
const E1000E_RingInfo *i;
const E1000ERingInfo *i;
struct igb_tx *tx;
} IGB_TxRing;
@ -774,7 +788,7 @@ igb_mq_queue_idx(int base_reg_idx, int reg_idx)
static inline void
igb_tx_ring_init(IGBCore *core, IGB_TxRing *txr, int idx)
{
static const E1000E_RingInfo i[IGB_NUM_QUEUES] = {
static const E1000ERingInfo i[IGB_NUM_QUEUES] = {
{ TDBAH0, TDBAL0, TDLEN0, TDH0, TDT0, 0 },
{ TDBAH1, TDBAL1, TDLEN1, TDH1, TDT1, 1 },
{ TDBAH2, TDBAL2, TDLEN2, TDH2, TDT2, 2 },
@ -800,13 +814,13 @@ igb_tx_ring_init(IGBCore *core, IGB_TxRing *txr, int idx)
}
typedef struct E1000E_RxRing_st {
const E1000E_RingInfo *i;
const E1000ERingInfo *i;
} E1000E_RxRing;
static inline void
igb_rx_ring_init(IGBCore *core, E1000E_RxRing *rxr, int idx)
{
static const E1000E_RingInfo i[IGB_NUM_QUEUES] = {
static const E1000ERingInfo i[IGB_NUM_QUEUES] = {
{ RDBAH0, RDBAL0, RDLEN0, RDH0, RDT0, 0 },
{ RDBAH1, RDBAL1, RDLEN1, RDH1, RDT1, 1 },
{ RDBAH2, RDBAL2, RDLEN2, RDH2, RDT2, 2 },
@ -833,7 +847,7 @@ igb_rx_ring_init(IGBCore *core, E1000E_RxRing *rxr, int idx)
static uint32_t
igb_txdesc_writeback(IGBCore *core, dma_addr_t base,
union e1000_adv_tx_desc *tx_desc,
const E1000E_RingInfo *txi)
const E1000ERingInfo *txi)
{
PCIDevice *d;
uint32_t cmd_type_len = le32_to_cpu(tx_desc->read.cmd_type_len);
@ -866,7 +880,7 @@ igb_txdesc_writeback(IGBCore *core, dma_addr_t base,
}
static inline bool
igb_tx_enabled(IGBCore *core, const E1000E_RingInfo *txi)
igb_tx_enabled(IGBCore *core, const E1000ERingInfo *txi)
{
bool vmdq = core->mac[MRQC] & 1;
uint16_t qn = txi->idx;
@ -883,7 +897,7 @@ igb_start_xmit(IGBCore *core, const IGB_TxRing *txr)
PCIDevice *d;
dma_addr_t base;
union e1000_adv_tx_desc desc;
const E1000E_RingInfo *txi = txr->i;
const E1000ERingInfo *txi = txr->i;
uint32_t eic = 0;
if (!igb_tx_enabled(core, txi)) {
@ -918,7 +932,7 @@ igb_start_xmit(IGBCore *core, const IGB_TxRing *txr)
}
static uint32_t
igb_rxbufsize(IGBCore *core, const E1000E_RingInfo *r)
igb_rxbufsize(IGBCore *core, const E1000ERingInfo *r)
{
uint32_t srrctl = core->mac[E1000_SRRCTL(r->idx) >> 2];
uint32_t bsizepkt = srrctl & E1000_SRRCTL_BSIZEPKT_MASK;
@ -930,7 +944,7 @@ igb_rxbufsize(IGBCore *core, const E1000E_RingInfo *r)
}
static bool
igb_has_rxbufs(IGBCore *core, const E1000E_RingInfo *r, size_t total_size)
igb_has_rxbufs(IGBCore *core, const E1000ERingInfo *r, size_t total_size)
{
uint32_t bufs = igb_ring_free_descr_num(core, r);
uint32_t bufsize = igb_rxbufsize(core, r);
@ -941,6 +955,14 @@ igb_has_rxbufs(IGBCore *core, const E1000E_RingInfo *r, size_t total_size)
bufsize;
}
/*
 * Per-descriptor header buffer size for this RX queue, extracted from the
 * BSIZEHDRSIZE field of SRRCTL.
 * NOTE(review): the field is returned after the shift with no further
 * scaling — confirm against the datasheet whether hardware expresses this
 * value in larger (e.g. 64-byte) units.
 */
static uint32_t
igb_rxhdrbufsize(IGBCore *core, const E1000ERingInfo *r)
{
uint32_t srrctl = core->mac[E1000_SRRCTL(r->idx) >> 2];
return (srrctl & E1000_SRRCTL_BSIZEHDRSIZE_MASK) >>
E1000_SRRCTL_BSIZEHDRSIZE_SHIFT;
}
void
igb_start_recv(IGBCore *core)
{
@ -1225,21 +1247,77 @@ igb_read_lgcy_rx_descr(IGBCore *core, struct e1000_rx_desc *desc,
}
static inline void
igb_read_adv_rx_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
hwaddr *buff_addr)
igb_read_adv_rx_single_buf_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
hwaddr *buff_addr)
{
*buff_addr = le64_to_cpu(desc->read.pkt_addr);
}
static inline void
igb_read_rx_descr(IGBCore *core, union e1000_rx_desc_union *desc,
hwaddr *buff_addr)
igb_read_adv_rx_split_buf_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
hwaddr *buff_addr)
{
buff_addr[0] = le64_to_cpu(desc->read.hdr_addr);
buff_addr[1] = le64_to_cpu(desc->read.pkt_addr);
}
/*
 * Write progress within the current descriptor's buffers: bytes already
 * written to each buffer, and which buffer is being filled now.
 */
typedef struct IGBBAState {
uint16_t written[IGB_MAX_PS_BUFFERS]; /* bytes written per buffer */
uint8_t cur_idx; /* buffer currently being filled */
} IGBBAState;
/*
 * Header-split results to be reported in the descriptor write-back
 * (consumed by igb_write_adv_ps_rx_descr()).
 */
typedef struct IGBSplitDescriptorData {
bool sph; /* set when the header was actually split off (SPH bit) */
bool hbo; /* header larger than header buffer (HBO error bit) — presumably "header buffer overflow"; confirm against datasheet */
size_t hdr_len; /* header length to report in the descriptor */
} IGBSplitDescriptorData;
/*
 * All mutable state for DMA-ing one received packet into guest RX
 * descriptors, threaded through the igb_write_*_to_rx_buffers() helpers.
 */
typedef struct IGBPacketRxDMAState {
size_t size; /* packet length without FCS */
size_t total_size; /* packet length plus simulated FCS */
size_t ps_hdr_len; /* length of the split-off header (PS mode) */
size_t desc_size; /* bytes accounted to the current descriptor */
size_t desc_offset; /* bytes of the packet consumed so far */
uint32_t rx_desc_packet_buf_size; /* packet buffer size per descriptor */
uint32_t rx_desc_header_buf_size; /* header buffer size per descriptor */
struct iovec *iov; /* current fragment of the source packet */
size_t iov_ofs; /* offset into *iov */
bool do_ps; /* header split active for this packet */
bool is_first; /* processing the packet's first descriptor */
IGBBAState bastate; /* per-descriptor buffer write progress */
hwaddr ba[IGB_MAX_PS_BUFFERS]; /* guest buffer addresses; [0] = header, [1] = packet */
IGBSplitDescriptorData ps_desc_data; /* split results for write-back */
} IGBPacketRxDMAState;
static inline void
igb_read_rx_descr(IGBCore *core,
union e1000_rx_desc_union *desc,
IGBPacketRxDMAState *pdma_st,
const E1000ERingInfo *r)
{
uint32_t desc_type;
if (igb_rx_use_legacy_descriptor(core)) {
igb_read_lgcy_rx_descr(core, &desc->legacy, buff_addr);
} else {
igb_read_adv_rx_descr(core, &desc->adv, buff_addr);
igb_read_lgcy_rx_descr(core, &desc->legacy, &pdma_st->ba[1]);
pdma_st->ba[0] = 0;
return;
}
/* advanced header split descriptor */
if (igb_rx_use_ps_descriptor(core, r)) {
igb_read_adv_rx_split_buf_descr(core, &desc->adv, &pdma_st->ba[0]);
return;
}
/* descriptor replication modes not supported */
desc_type = igb_rx_queue_desctyp_get(core, r);
if (desc_type != E1000_SRRCTL_DESCTYPE_ADV_ONEBUF) {
trace_igb_wrn_rx_desc_modes_not_supp(desc_type);
}
/* advanced single buffer descriptor */
igb_read_adv_rx_single_buf_descr(core, &desc->adv, &pdma_st->ba[1]);
pdma_st->ba[0] = 0;
}
static void
@ -1281,15 +1359,11 @@ igb_verify_csum_in_sw(IGBCore *core,
}
static void
igb_build_rx_metadata(IGBCore *core,
struct NetRxPkt *pkt,
bool is_eop,
const E1000E_RSSInfo *rss_info, uint16_t etqf, bool ts,
uint16_t *pkt_info, uint16_t *hdr_info,
uint32_t *rss,
uint32_t *status_flags,
uint16_t *ip_id,
uint16_t *vlan_tag)
igb_build_rx_metadata_common(IGBCore *core,
struct NetRxPkt *pkt,
bool is_eop,
uint32_t *status_flags,
uint16_t *vlan_tag)
{
struct virtio_net_hdr *vhdr;
bool hasip4, hasip6, csum_valid;
@ -1298,7 +1372,6 @@ igb_build_rx_metadata(IGBCore *core,
*status_flags = E1000_RXD_STAT_DD;
/* No additional metadata needed for non-EOP descriptors */
/* TODO: EOP apply only to status so don't skip whole function. */
if (!is_eop) {
goto func_exit;
}
@ -1315,64 +1388,6 @@ igb_build_rx_metadata(IGBCore *core,
trace_e1000e_rx_metadata_vlan(*vlan_tag);
}
/* Packet parsing results */
if ((core->mac[RXCSUM] & E1000_RXCSUM_PCSD) != 0) {
if (rss_info->enabled) {
*rss = cpu_to_le32(rss_info->hash);
trace_igb_rx_metadata_rss(*rss);
}
} else if (hasip4) {
*status_flags |= E1000_RXD_STAT_IPIDV;
*ip_id = cpu_to_le16(net_rx_pkt_get_ip_id(pkt));
trace_e1000e_rx_metadata_ip_id(*ip_id);
}
if (l4hdr_proto == ETH_L4_HDR_PROTO_TCP && net_rx_pkt_is_tcp_ack(pkt)) {
*status_flags |= E1000_RXD_STAT_ACK;
trace_e1000e_rx_metadata_ack();
}
if (pkt_info) {
*pkt_info = rss_info->enabled ? rss_info->type : 0;
if (etqf < 8) {
*pkt_info |= (BIT(11) | etqf) << 4;
} else {
if (hasip4) {
*pkt_info |= E1000_ADVRXD_PKT_IP4;
}
if (hasip6) {
*pkt_info |= E1000_ADVRXD_PKT_IP6;
}
switch (l4hdr_proto) {
case ETH_L4_HDR_PROTO_TCP:
*pkt_info |= E1000_ADVRXD_PKT_TCP;
break;
case ETH_L4_HDR_PROTO_UDP:
*pkt_info |= E1000_ADVRXD_PKT_UDP;
break;
case ETH_L4_HDR_PROTO_SCTP:
*pkt_info |= E1000_ADVRXD_PKT_SCTP;
break;
default:
break;
}
}
}
if (hdr_info) {
*hdr_info = 0;
}
if (ts) {
*status_flags |= BIT(16);
}
/* RX CSO information */
if (hasip6 && (core->mac[RFCTL] & E1000_RFCTL_IPV6_XSUM_DIS)) {
trace_e1000e_rx_metadata_ipv6_sum_disabled();
@ -1428,56 +1443,168 @@ func_exit:
static inline void
igb_write_lgcy_rx_descr(IGBCore *core, struct e1000_rx_desc *desc,
struct NetRxPkt *pkt,
const E1000E_RSSInfo *rss_info, uint16_t etqf, bool ts,
const E1000E_RSSInfo *rss_info,
uint16_t length)
{
uint32_t status_flags, rss;
uint16_t ip_id;
uint32_t status_flags;
assert(!rss_info->enabled);
desc->length = cpu_to_le16(length);
desc->csum = 0;
igb_build_rx_metadata(core, pkt, pkt != NULL,
rss_info, etqf, ts,
NULL, NULL, &rss,
&status_flags, &ip_id,
&desc->special);
memset(desc, 0, sizeof(*desc));
desc->length = cpu_to_le16(length);
igb_build_rx_metadata_common(core, pkt, pkt != NULL,
&status_flags,
&desc->special);
desc->errors = (uint8_t) (le32_to_cpu(status_flags) >> 24);
desc->status = (uint8_t) le32_to_cpu(status_flags);
}
/*
 * True when the queue's SRRCTL DESCTYPE requests header split unconditionally
 * (split even when the protocol header cannot be parsed out).
 */
static bool
igb_rx_ps_descriptor_split_always(IGBCore *core, const E1000ERingInfo *r)
{
uint32_t desctyp = igb_rx_queue_desctyp_get(core, r);
return desctyp == E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS;
}
/*
 * Compute the Packet Type field for the advanced RX descriptor write-back.
 *
 * If the packet matched an ETQF filter (etqf < 8), the type encodes the
 * filter index with bit 11 set and nothing else.  Otherwise the type is
 * built from the parsed L3 protocol (IPv4, IPv6, or IPv6 with extension
 * headers unless RFCTL.IPV6_DIS is set) OR-ed with the L4 protocol
 * (TCP/UDP/SCTP).
 */
static uint16_t
igb_rx_desc_get_packet_type(IGBCore *core, struct NetRxPkt *pkt, uint16_t etqf)
{
uint16_t pkt_type;
bool hasip4, hasip6;
EthL4HdrProto l4hdr_proto;
if (etqf < 8) {
pkt_type = BIT(11) | etqf;
return pkt_type;
}
net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
if (hasip6 && !(core->mac[RFCTL] & E1000_RFCTL_IPV6_DIS)) {
/* distinguish IPv6 with extension headers from plain IPv6 */
eth_ip6_hdr_info *ip6hdr_info = net_rx_pkt_get_ip6_info(pkt);
pkt_type = ip6hdr_info->has_ext_hdrs ? E1000_ADVRXD_PKT_IP6E :
E1000_ADVRXD_PKT_IP6;
} else if (hasip4) {
pkt_type = E1000_ADVRXD_PKT_IP4;
} else {
pkt_type = 0;
}
switch (l4hdr_proto) {
case ETH_L4_HDR_PROTO_TCP:
pkt_type |= E1000_ADVRXD_PKT_TCP;
break;
case ETH_L4_HDR_PROTO_UDP:
pkt_type |= E1000_ADVRXD_PKT_UDP;
break;
case ETH_L4_HDR_PROTO_SCTP:
pkt_type |= E1000_ADVRXD_PKT_SCTP;
break;
default:
break;
}
return pkt_type;
}
static inline void
igb_write_adv_rx_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
struct NetRxPkt *pkt,
const E1000E_RSSInfo *rss_info, uint16_t etqf, bool ts,
uint16_t length)
{
bool hasip4, hasip6;
EthL4HdrProto l4hdr_proto;
uint16_t rss_type = 0, pkt_type;
bool eop = (pkt != NULL);
uint32_t adv_desc_status_error = 0;
memset(&desc->wb, 0, sizeof(desc->wb));
desc->wb.upper.length = cpu_to_le16(length);
igb_build_rx_metadata_common(core, pkt, eop,
&desc->wb.upper.status_error,
&desc->wb.upper.vlan);
igb_build_rx_metadata(core, pkt, pkt != NULL,
rss_info, etqf, ts,
&desc->wb.lower.lo_dword.pkt_info,
&desc->wb.lower.lo_dword.hdr_info,
&desc->wb.lower.hi_dword.rss,
&desc->wb.upper.status_error,
&desc->wb.lower.hi_dword.csum_ip.ip_id,
&desc->wb.upper.vlan);
if (!eop) {
return;
}
net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
if ((core->mac[RXCSUM] & E1000_RXCSUM_PCSD) != 0) {
if (rss_info->enabled) {
desc->wb.lower.hi_dword.rss = cpu_to_le32(rss_info->hash);
rss_type = rss_info->type;
trace_igb_rx_metadata_rss(desc->wb.lower.hi_dword.rss, rss_type);
}
} else if (hasip4) {
adv_desc_status_error |= E1000_RXD_STAT_IPIDV;
desc->wb.lower.hi_dword.csum_ip.ip_id =
cpu_to_le16(net_rx_pkt_get_ip_id(pkt));
trace_e1000e_rx_metadata_ip_id(
desc->wb.lower.hi_dword.csum_ip.ip_id);
}
if (ts) {
adv_desc_status_error |= BIT(16);
}
pkt_type = igb_rx_desc_get_packet_type(core, pkt, etqf);
trace_e1000e_rx_metadata_pkt_type(pkt_type);
desc->wb.lower.lo_dword.pkt_info = cpu_to_le16(rss_type | (pkt_type << 4));
desc->wb.upper.status_error |= cpu_to_le32(adv_desc_status_error);
}
static inline void
igb_write_rx_descr(IGBCore *core, union e1000_rx_desc_union *desc,
struct NetRxPkt *pkt, const E1000E_RSSInfo *rss_info,
uint16_t etqf, bool ts, uint16_t length)
igb_write_adv_ps_rx_descr(IGBCore *core,
union e1000_adv_rx_desc *desc,
struct NetRxPkt *pkt,
const E1000E_RSSInfo *rss_info,
const E1000ERingInfo *r,
uint16_t etqf,
bool ts,
IGBPacketRxDMAState *pdma_st)
{
size_t pkt_len;
uint16_t hdr_info = 0;
if (pdma_st->do_ps) {
pkt_len = pdma_st->bastate.written[1];
} else {
pkt_len = pdma_st->bastate.written[0] + pdma_st->bastate.written[1];
}
igb_write_adv_rx_descr(core, desc, pkt, rss_info, etqf, ts, pkt_len);
hdr_info = (pdma_st->ps_desc_data.hdr_len << E1000_ADVRXD_HDR_LEN_OFFSET) &
E1000_ADVRXD_ADV_HDR_LEN_MASK;
hdr_info |= pdma_st->ps_desc_data.sph ? E1000_ADVRXD_HDR_SPH : 0;
desc->wb.lower.lo_dword.hdr_info = cpu_to_le16(hdr_info);
desc->wb.upper.status_error |= cpu_to_le32(
pdma_st->ps_desc_data.hbo ? E1000_ADVRXD_ST_ERR_HBO_OFFSET : 0);
}
static inline void
igb_write_rx_descr(IGBCore *core,
union e1000_rx_desc_union *desc,
struct NetRxPkt *pkt,
const E1000E_RSSInfo *rss_info,
uint16_t etqf,
bool ts,
IGBPacketRxDMAState *pdma_st,
const E1000ERingInfo *r)
{
if (igb_rx_use_legacy_descriptor(core)) {
igb_write_lgcy_rx_descr(core, &desc->legacy, pkt, rss_info,
etqf, ts, length);
pdma_st->bastate.written[1]);
} else if (igb_rx_use_ps_descriptor(core, r)) {
igb_write_adv_ps_rx_descr(core, &desc->adv, pkt, rss_info, r, etqf, ts,
pdma_st);
} else {
igb_write_adv_rx_descr(core, &desc->adv, pkt, rss_info,
etqf, ts, length);
etqf, ts, pdma_st->bastate.written[1]);
}
}
@ -1514,20 +1641,7 @@ igb_pci_dma_write_rx_desc(IGBCore *core, PCIDevice *dev, dma_addr_t addr,
}
static void
igb_write_to_rx_buffers(IGBCore *core,
PCIDevice *d,
hwaddr ba,
uint16_t *written,
const char *data,
dma_addr_t data_len)
{
trace_igb_rx_desc_buff_write(ba, *written, data, data_len);
pci_dma_write(d, ba + *written, data, data_len);
*written += data_len;
}
static void
igb_update_rx_stats(IGBCore *core, const E1000E_RingInfo *rxi,
igb_update_rx_stats(IGBCore *core, const E1000ERingInfo *rxi,
size_t pkt_size, size_t pkt_fcs_size)
{
eth_pkt_types_e pkt_type = net_rx_pkt_get_packet_type(core->rx_pkt);
@ -1545,12 +1659,256 @@ igb_update_rx_stats(IGBCore *core, const E1000E_RingInfo *rxi,
}
static inline bool
igb_rx_descr_threshold_hit(IGBCore *core, const E1000E_RingInfo *rxi)
igb_rx_descr_threshold_hit(IGBCore *core, const E1000ERingInfo *rxi)
{
return igb_ring_free_descr_num(core, rxi) ==
((core->mac[E1000_SRRCTL(rxi->idx) >> 2] >> 20) & 31) * 16;
}
/*
 * Decide whether header split (packet split) applies to this packet on this
 * RX queue, and if so fill in pdma_st->ps_hdr_len and ps_desc_data.
 *
 * Returns true when the packet will be written split (header into buffer 0,
 * payload into buffer 1), false to fall back to single-buffer placement.
 * In "split always" mode a packet that cannot be split by protocol is still
 * split mechanically at the header buffer size.
 */
static bool
igb_do_ps(IGBCore *core,
const E1000ERingInfo *r,
struct NetRxPkt *pkt,
IGBPacketRxDMAState *pdma_st)
{
bool hasip4, hasip6;
EthL4HdrProto l4hdr_proto;
bool fragment;
bool split_always;
size_t bheader_size;
size_t total_pkt_len;
if (!igb_rx_use_ps_descriptor(core, r)) {
return false;
}
total_pkt_len = net_rx_pkt_get_total_len(pkt);
bheader_size = igb_rxhdrbufsize(core, r);
split_always = igb_rx_ps_descriptor_split_always(core, r);
/* a packet that fits entirely in the header buffer goes there whole */
if (split_always && total_pkt_len <= bheader_size) {
pdma_st->ps_hdr_len = total_pkt_len;
pdma_st->ps_desc_data.hdr_len = total_pkt_len;
return true;
}
net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
if (hasip4) {
fragment = net_rx_pkt_get_ip4_info(pkt)->fragment;
} else if (hasip6) {
fragment = net_rx_pkt_get_ip6_info(pkt)->fragment;
} else {
/* non-IP: no protocol header to split on */
pdma_st->ps_desc_data.hdr_len = bheader_size;
goto header_not_handled;
}
/* IP fragments are not split when RFCTL.IPFRSP_DIS is set */
if (fragment && (core->mac[RFCTL] & E1000_RFCTL_IPFRSP_DIS)) {
pdma_st->ps_desc_data.hdr_len = bheader_size;
goto header_not_handled;
}
/* no header splitting for SCTP */
if (!fragment && (l4hdr_proto == ETH_L4_HDR_PROTO_UDP ||
l4hdr_proto == ETH_L4_HDR_PROTO_TCP)) {
/* split after the L4 header for unfragmented TCP/UDP */
pdma_st->ps_hdr_len = net_rx_pkt_get_l5_hdr_offset(pkt);
} else {
/* otherwise split after the L3 header */
pdma_st->ps_hdr_len = net_rx_pkt_get_l4_hdr_offset(pkt);
}
pdma_st->ps_desc_data.sph = true;
pdma_st->ps_desc_data.hdr_len = pdma_st->ps_hdr_len;
/* header larger than the header buffer: report HBO, don't split */
if (pdma_st->ps_hdr_len > bheader_size) {
pdma_st->ps_desc_data.hbo = true;
goto header_not_handled;
}
return true;
header_not_handled:
if (split_always) {
/* forced split: cut mechanically at the header buffer size */
pdma_st->ps_hdr_len = bheader_size;
return true;
}
return false;
}
/*
 * Clamp *size to what the current descriptor can hold: the packet buffer,
 * plus the split-off header length on the first descriptor in PS mode
 * (the header goes into the separate header buffer).
 */
static void
igb_truncate_to_descriptor_size(IGBPacketRxDMAState *pdma_st, size_t *size)
{
if (pdma_st->do_ps && pdma_st->is_first) {
if (*size > pdma_st->rx_desc_packet_buf_size + pdma_st->ps_hdr_len) {
*size = pdma_st->rx_desc_packet_buf_size + pdma_st->ps_hdr_len;
}
} else {
if (*size > pdma_st->rx_desc_packet_buf_size) {
*size = pdma_st->rx_desc_packet_buf_size;
}
}
}
/*
 * DMA one fragment of the split-off header into the descriptor's header
 * buffer (ba[0]), advancing the written[0] count.  Afterwards cur_idx is
 * set to 1 so subsequent payload writes target the packet buffer.
 * The caller must keep data_len within the header buffer (asserted).
 */
static inline void
igb_write_hdr_frag_to_rx_buffers(IGBCore *core,
PCIDevice *d,
IGBPacketRxDMAState *pdma_st,
const char *data,
dma_addr_t data_len)
{
assert(data_len <= pdma_st->rx_desc_header_buf_size -
pdma_st->bastate.written[0]);
pci_dma_write(d,
pdma_st->ba[0] + pdma_st->bastate.written[0],
data, data_len);
pdma_st->bastate.written[0] += data_len;
pdma_st->bastate.cur_idx = 1;
}
/*
 * Copy the packet's split-off header (ps_hdr_len bytes) into the first
 * descriptor's header buffer, consuming from *copy_size and advancing the
 * source iovec cursor.  For every descriptor after the first, buffer 0 is
 * left empty and the write position moves straight to the packet buffer.
 *
 * NOTE(review): the fragment write passes iov->iov_base without adding
 * iov_ofs — this assumes the header starts at offset 0 of the first
 * fragment; confirm (the pre-existing e1000e code has the same shape).
 */
static void
igb_write_header_to_rx_buffers(IGBCore *core,
struct NetRxPkt *pkt,
PCIDevice *d,
IGBPacketRxDMAState *pdma_st,
size_t *copy_size)
{
size_t iov_copy;
size_t ps_hdr_copied = 0;
if (!pdma_st->is_first) {
/* Leave buffer 0 of each descriptor except first */
/* empty */
pdma_st->bastate.cur_idx = 1;
return;
}
do {
iov_copy = MIN(pdma_st->ps_hdr_len - ps_hdr_copied,
pdma_st->iov->iov_len - pdma_st->iov_ofs);
igb_write_hdr_frag_to_rx_buffers(core, d, pdma_st,
pdma_st->iov->iov_base,
iov_copy);
*copy_size -= iov_copy;
ps_hdr_copied += iov_copy;
pdma_st->iov_ofs += iov_copy;
if (pdma_st->iov_ofs == pdma_st->iov->iov_len) {
/* current fragment exhausted; move to the next one */
pdma_st->iov++;
pdma_st->iov_ofs = 0;
}
} while (ps_hdr_copied < pdma_st->ps_hdr_len);
pdma_st->is_first = false;
}
/*
 * DMA a chunk of payload into the current descriptor's packet buffer(s),
 * filling the buffer at bastate.cur_idx and moving to the next buffer
 * index whenever one fills up completely.
 */
static void
igb_write_payload_frag_to_rx_buffers(IGBCore *core,
PCIDevice *d,
IGBPacketRxDMAState *pdma_st,
const char *data,
dma_addr_t data_len)
{
while (data_len > 0) {
assert(pdma_st->bastate.cur_idx < IGB_MAX_PS_BUFFERS);
uint32_t cur_buf_bytes_left =
pdma_st->rx_desc_packet_buf_size -
pdma_st->bastate.written[pdma_st->bastate.cur_idx];
uint32_t bytes_to_write = MIN(data_len, cur_buf_bytes_left);
trace_igb_rx_desc_buff_write(
pdma_st->bastate.cur_idx,
pdma_st->ba[pdma_st->bastate.cur_idx],
pdma_st->bastate.written[pdma_st->bastate.cur_idx],
data,
bytes_to_write);
pci_dma_write(d,
pdma_st->ba[pdma_st->bastate.cur_idx] +
pdma_st->bastate.written[pdma_st->bastate.cur_idx],
data, bytes_to_write);
pdma_st->bastate.written[pdma_st->bastate.cur_idx] += bytes_to_write;
data += bytes_to_write;
data_len -= bytes_to_write;
/* current buffer full: continue in the next one */
if (pdma_st->bastate.written[pdma_st->bastate.cur_idx] ==
pdma_st->rx_desc_packet_buf_size) {
pdma_st->bastate.cur_idx++;
}
}
}
/*
 * Copy *copy_size bytes of packet payload into the current descriptor's
 * buffers, walking the source iovec, then append a simulated FCS (zero
 * padding of e1000x_fcs_len() bytes) when this descriptor is the packet's
 * last one.
 */
static void
igb_write_payload_to_rx_buffers(IGBCore *core,
struct NetRxPkt *pkt,
PCIDevice *d,
IGBPacketRxDMAState *pdma_st,
size_t *copy_size)
{
static const uint32_t fcs_pad; /* zero-initialized FCS stand-in */
size_t iov_copy;
/* Copy packet payload */
while (*copy_size) {
iov_copy = MIN(*copy_size, pdma_st->iov->iov_len - pdma_st->iov_ofs);
igb_write_payload_frag_to_rx_buffers(core, d,
pdma_st,
pdma_st->iov->iov_base +
pdma_st->iov_ofs,
iov_copy);
*copy_size -= iov_copy;
pdma_st->iov_ofs += iov_copy;
if (pdma_st->iov_ofs == pdma_st->iov->iov_len) {
/* current fragment exhausted; move to the next one */
pdma_st->iov++;
pdma_st->iov_ofs = 0;
}
}
if (pdma_st->desc_offset + pdma_st->desc_size >= pdma_st->total_size) {
/* Simulate FCS checksum presence in the last descriptor */
igb_write_payload_frag_to_rx_buffers(core, d,
pdma_st,
(const char *) &fcs_pad,
e1000x_fcs_len(core->mac));
}
}
/*
 * Write (part of) the packet described by pdma_st into the guest buffers
 * of the current RX descriptor: skip null-address descriptors, clamp the
 * copy to the descriptor's capacity, and in packet-split (PS) mode write
 * the header into buffer 0 before the payload.
 */
static void
igb_write_to_rx_buffers(IGBCore *core,
                        struct NetRxPkt *pkt,
                        PCIDevice *d,
                        IGBPacketRxDMAState *pdma_st)
{
    size_t copy_size;

    if (!(pdma_st->ba)[1] || (pdma_st->do_ps && !(pdma_st->ba[0]))) {
        /* as per intel docs; skip descriptors with null buf addr */
        trace_e1000e_rx_null_descriptor();
        return;
    }

    /* Nothing of the packet is left for this descriptor. */
    if (pdma_st->desc_offset >= pdma_st->size) {
        return;
    }

    /* Clamp both the descriptor-consumed size and the copy size. */
    pdma_st->desc_size = pdma_st->total_size - pdma_st->desc_offset;
    igb_truncate_to_descriptor_size(pdma_st, &pdma_st->desc_size);
    copy_size = pdma_st->size - pdma_st->desc_offset;
    igb_truncate_to_descriptor_size(pdma_st, &copy_size);

    /* For PS mode copy the packet header first */
    if (pdma_st->do_ps) {
        igb_write_header_to_rx_buffers(core, pkt, d, pdma_st, &copy_size);
    } else {
        /* No header split: payload goes straight into buffer 1. */
        pdma_st->bastate.cur_idx = 1;
    }

    igb_write_payload_to_rx_buffers(core, pkt, d, pdma_st, &copy_size);
}
static void
igb_write_packet_to_guest(IGBCore *core, struct NetRxPkt *pkt,
const E1000E_RxRing *rxr,
@ -1560,95 +1918,61 @@ igb_write_packet_to_guest(IGBCore *core, struct NetRxPkt *pkt,
PCIDevice *d;
dma_addr_t base;
union e1000_rx_desc_union desc;
size_t desc_size;
size_t desc_offset = 0;
size_t iov_ofs = 0;
const E1000ERingInfo *rxi;
size_t rx_desc_len;
struct iovec *iov = net_rx_pkt_get_iovec(pkt);
size_t size = net_rx_pkt_get_total_len(pkt);
size_t total_size = size + e1000x_fcs_len(core->mac);
const E1000E_RingInfo *rxi = rxr->i;
size_t bufsize = igb_rxbufsize(core, rxi);
IGBPacketRxDMAState pdma_st = {0};
pdma_st.is_first = true;
pdma_st.size = net_rx_pkt_get_total_len(pkt);
pdma_st.total_size = pdma_st.size + e1000x_fcs_len(core->mac);
rxi = rxr->i;
rx_desc_len = core->rx_desc_len;
pdma_st.rx_desc_packet_buf_size = igb_rxbufsize(core, rxi);
pdma_st.rx_desc_header_buf_size = igb_rxhdrbufsize(core, rxi);
pdma_st.iov = net_rx_pkt_get_iovec(pkt);
d = pcie_sriov_get_vf_at_index(core->owner, rxi->idx % 8);
if (!d) {
d = core->owner;
}
pdma_st.do_ps = igb_do_ps(core, rxi, pkt, &pdma_st);
do {
hwaddr ba;
uint16_t written = 0;
memset(&pdma_st.bastate, 0, sizeof(IGBBAState));
bool is_last = false;
desc_size = total_size - desc_offset;
if (desc_size > bufsize) {
desc_size = bufsize;
}
if (igb_ring_empty(core, rxi)) {
return;
}
base = igb_ring_head_descr(core, rxi);
pci_dma_read(d, base, &desc, rx_desc_len);
trace_e1000e_rx_descr(rxi->idx, base, rx_desc_len);
pci_dma_read(d, base, &desc, core->rx_desc_len);
igb_read_rx_descr(core, &desc, &pdma_st, rxi);
trace_e1000e_rx_descr(rxi->idx, base, core->rx_desc_len);
igb_read_rx_descr(core, &desc, &ba);
if (ba) {
if (desc_offset < size) {
static const uint32_t fcs_pad;
size_t iov_copy;
size_t copy_size = size - desc_offset;
if (copy_size > bufsize) {
copy_size = bufsize;
}
/* Copy packet payload */
while (copy_size) {
iov_copy = MIN(copy_size, iov->iov_len - iov_ofs);
igb_write_to_rx_buffers(core, d, ba, &written,
iov->iov_base + iov_ofs, iov_copy);
copy_size -= iov_copy;
iov_ofs += iov_copy;
if (iov_ofs == iov->iov_len) {
iov++;
iov_ofs = 0;
}
}
if (desc_offset + desc_size >= total_size) {
/* Simulate FCS checksum presence in the last descriptor */
igb_write_to_rx_buffers(core, d, ba, &written,
(const char *) &fcs_pad, e1000x_fcs_len(core->mac));
}
}
} else { /* as per intel docs; skip descriptors with null buf addr */
trace_e1000e_rx_null_descriptor();
}
desc_offset += desc_size;
if (desc_offset >= total_size) {
igb_write_to_rx_buffers(core, pkt, d, &pdma_st);
pdma_st.desc_offset += pdma_st.desc_size;
if (pdma_st.desc_offset >= pdma_st.total_size) {
is_last = true;
}
igb_write_rx_descr(core, &desc, is_last ? core->rx_pkt : NULL,
rss_info, etqf, ts, written);
igb_pci_dma_write_rx_desc(core, d, base, &desc, core->rx_desc_len);
igb_write_rx_descr(core, &desc,
is_last ? pkt : NULL,
rss_info,
etqf, ts,
&pdma_st,
rxi);
igb_pci_dma_write_rx_desc(core, d, base, &desc, rx_desc_len);
igb_ring_advance(core, rxi, rx_desc_len / E1000_MIN_RX_DESC_LEN);
} while (pdma_st.desc_offset < pdma_st.total_size);
igb_ring_advance(core, rxi, core->rx_desc_len / E1000_MIN_RX_DESC_LEN);
} while (desc_offset < total_size);
igb_update_rx_stats(core, rxi, size, total_size);
igb_update_rx_stats(core, rxi, pdma_st.size, pdma_st.total_size);
}
static bool
igb_rx_strip_vlan(IGBCore *core, const E1000E_RingInfo *rxi)
igb_rx_strip_vlan(IGBCore *core, const E1000ERingInfo *rxi)
{
if (core->mac[MRQC] & 1) {
uint16_t pool = rxi->idx % IGB_NUM_VM_POOLS;
@ -2753,7 +3077,7 @@ igb_update_rx_offloads(IGBCore *core)
if (core->has_vnet) {
qemu_set_offload(qemu_get_queue(core->owner_nic)->peer,
cso_state, 0, 0, 0, 0);
cso_state, 0, 0, 0, 0, 0, 0);
}
}

View File

@ -452,6 +452,7 @@ union e1000_adv_rx_desc {
#define E1000_SRRCTL_BSIZEHDRSIZE_MASK 0x00000F00
#define E1000_SRRCTL_BSIZEHDRSIZE_SHIFT 2 /* Shift _left_ */
#define E1000_SRRCTL_DESCTYPE_ADV_ONEBUF 0x02000000
#define E1000_SRRCTL_DESCTYPE_HDR_SPLIT 0x04000000
#define E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS 0x0A000000
#define E1000_SRRCTL_DESCTYPE_MASK 0x0E000000
#define E1000_SRRCTL_DROP_EN 0x80000000
@ -692,11 +693,20 @@ union e1000_adv_rx_desc {
#define E1000_STATUS_NUM_VFS_SHIFT 14
#define E1000_ADVRXD_PKT_IP4 BIT(4)
#define E1000_ADVRXD_PKT_IP6 BIT(6)
#define E1000_ADVRXD_PKT_TCP BIT(8)
#define E1000_ADVRXD_PKT_UDP BIT(9)
#define E1000_ADVRXD_PKT_SCTP BIT(10)
#define E1000_ADVRXD_PKT_IP4 BIT(0)
#define E1000_ADVRXD_PKT_IP6 BIT(2)
#define E1000_ADVRXD_PKT_IP6E BIT(3)
#define E1000_ADVRXD_PKT_TCP BIT(4)
#define E1000_ADVRXD_PKT_UDP BIT(5)
#define E1000_ADVRXD_PKT_SCTP BIT(6)
#define IGB_MAX_PS_BUFFERS 2
#define E1000_ADVRXD_HDR_LEN_OFFSET (21 - 16)
#define E1000_ADVRXD_ADV_HDR_LEN_MASK ((BIT(10) - 1) << \
E1000_ADVRXD_HDR_LEN_OFFSET)
#define E1000_ADVRXD_HDR_SPH BIT(15)
#define E1000_ADVRXD_ST_ERR_HBO_OFFSET BIT(3 + 20)
static inline uint8_t igb_ivar_entry_rx(uint8_t i)
{

View File

@ -1043,7 +1043,7 @@ static void of_dpa_flow_ig_tbl(OfDpaFlowContext *fc, uint32_t tbl_id)
static ssize_t of_dpa_ig(World *world, uint32_t pport,
const struct iovec *iov, int iovcnt)
{
struct iovec iov_copy[iovcnt + 2];
g_autofree struct iovec *iov_copy = g_new(struct iovec, iovcnt + 2);
OfDpaFlowContext fc = {
.of_dpa = world_private(world),
.in_pport = pport,

View File

@ -278,9 +278,9 @@ igb_core_mdic_write_unhandled(uint32_t addr) "MDIC WRITE: PHY[%u] UNHANDLED"
igb_link_set_ext_params(bool asd_check, bool speed_select_bypass, bool pfrstd) "Set extended link params: ASD check: %d, Speed select bypass: %d, PF reset done: %d"
igb_rx_desc_buff_size(uint32_t b) "buffer size: %u"
igb_rx_desc_buff_write(uint64_t addr, uint16_t offset, const void* source, uint32_t len) "addr: 0x%"PRIx64", offset: %u, from: %p, length: %u"
igb_rx_desc_buff_write(uint8_t idx, uint64_t addr, uint16_t offset, const void* source, uint32_t len) "buffer %u, addr: 0x%"PRIx64", offset: %u, from: %p, length: %u"
igb_rx_metadata_rss(uint32_t rss) "RSS data: 0x%X"
igb_rx_metadata_rss(uint32_t rss, uint16_t rss_pkt_type) "RSS data: rss: 0x%X, rss_pkt_type: 0x%X"
igb_irq_icr_clear_gpie_nsicr(void) "Clearing ICR on read due to GPIE.NSICR enabled"
igb_irq_set_iam(uint32_t icr) "Update IAM: 0x%x"
@ -295,6 +295,8 @@ igb_irq_eitr_set(uint32_t eitr_num, uint32_t val) "EITR[%u] = 0x%x"
igb_set_pfmailbox(uint32_t vf_num, uint32_t val) "PFMailbox[%d]: 0x%x"
igb_set_vfmailbox(uint32_t vf_num, uint32_t val) "VFMailbox[%d]: 0x%x"
igb_wrn_rx_desc_modes_not_supp(int desc_type) "Not supported descriptor type: %d"
# igbvf.c
igbvf_wrn_io_addr_unknown(uint64_t addr) "IO unknown register 0x%"PRIx64

View File

@ -78,6 +78,9 @@ static const int user_feature_bits[] = {
VIRTIO_F_RING_RESET,
VIRTIO_NET_F_RSS,
VIRTIO_NET_F_HASH_REPORT,
VIRTIO_NET_F_GUEST_USO4,
VIRTIO_NET_F_GUEST_USO6,
VIRTIO_NET_F_HOST_USO,
/* This bit implies RARP isn't sent by QEMU out of band */
VIRTIO_NET_F_GUEST_ANNOUNCE,

View File

@ -659,6 +659,15 @@ static int peer_has_ufo(VirtIONet *n)
return n->has_ufo;
}
/* Whether the peer backend can do USO; requires a vnet header first. */
static int peer_has_uso(VirtIONet *n)
{
    return peer_has_vnet_hdr(n)
           ? qemu_has_uso(qemu_get_queue(n->nic)->peer)
           : 0;
}
static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs,
int version_1, int hash_report)
{
@ -796,6 +805,10 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features,
virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO6);
virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_ECN);
virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO);
virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4);
virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6);
virtio_clear_feature(&features, VIRTIO_NET_F_HASH_REPORT);
}
@ -804,6 +817,12 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features,
virtio_clear_feature(&features, VIRTIO_NET_F_HOST_UFO);
}
if (!peer_has_uso(n)) {
virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO);
virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4);
virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6);
}
if (!get_vhost_net(nc->peer)) {
return features;
}
@ -859,17 +878,21 @@ static void virtio_net_apply_guest_offloads(VirtIONet *n)
!!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO4)),
!!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO6)),
!!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_ECN)),
!!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_UFO)));
!!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_UFO)),
!!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO4)),
!!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO6)));
}
static uint64_t virtio_net_guest_offloads_by_features(uint32_t features)
static uint64_t virtio_net_guest_offloads_by_features(uint64_t features)
{
static const uint64_t guest_offloads_mask =
(1ULL << VIRTIO_NET_F_GUEST_CSUM) |
(1ULL << VIRTIO_NET_F_GUEST_TSO4) |
(1ULL << VIRTIO_NET_F_GUEST_TSO6) |
(1ULL << VIRTIO_NET_F_GUEST_ECN) |
(1ULL << VIRTIO_NET_F_GUEST_UFO);
(1ULL << VIRTIO_NET_F_GUEST_UFO) |
(1ULL << VIRTIO_NET_F_GUEST_USO4) |
(1ULL << VIRTIO_NET_F_GUEST_USO6);
return guest_offloads_mask & features;
}
@ -3922,6 +3945,12 @@ static Property virtio_net_properties[] = {
DEFINE_PROP_INT32("speed", VirtIONet, net_conf.speed, SPEED_UNKNOWN),
DEFINE_PROP_STRING("duplex", VirtIONet, net_conf.duplex_str),
DEFINE_PROP_BOOL("failover", VirtIONet, failover, false),
DEFINE_PROP_BIT64("guest_uso4", VirtIONet, host_features,
VIRTIO_NET_F_GUEST_USO4, true),
DEFINE_PROP_BIT64("guest_uso6", VirtIONet, host_features,
VIRTIO_NET_F_GUEST_USO6, true),
DEFINE_PROP_BIT64("host_uso", VirtIONet, host_features,
VIRTIO_NET_F_HOST_USO, true),
DEFINE_PROP_END_OF_LIST(),
};

View File

@ -1341,6 +1341,8 @@ static void vmxnet3_update_features(VMXNET3State *s)
s->lro_supported,
s->lro_supported,
0,
0,
0,
0);
}
}

View File

@ -54,11 +54,12 @@ typedef void (LinkStatusChanged)(NetClientState *);
typedef void (NetClientDestructor)(NetClientState *);
typedef RxFilterInfo *(QueryRxFilter)(NetClientState *);
typedef bool (HasUfo)(NetClientState *);
typedef bool (HasUso)(NetClientState *);
typedef bool (HasVnetHdr)(NetClientState *);
typedef bool (HasVnetHdrLen)(NetClientState *, int);
typedef bool (GetUsingVnetHdr)(NetClientState *);
typedef void (UsingVnetHdr)(NetClientState *, bool);
typedef void (SetOffload)(NetClientState *, int, int, int, int, int);
typedef void (SetOffload)(NetClientState *, int, int, int, int, int, int, int);
typedef int (GetVnetHdrLen)(NetClientState *);
typedef void (SetVnetHdrLen)(NetClientState *, int);
typedef int (SetVnetLE)(NetClientState *, bool);
@ -84,6 +85,7 @@ typedef struct NetClientInfo {
QueryRxFilter *query_rx_filter;
NetPoll *poll;
HasUfo *has_ufo;
HasUso *has_uso;
HasVnetHdr *has_vnet_hdr;
HasVnetHdrLen *has_vnet_hdr_len;
GetUsingVnetHdr *get_using_vnet_hdr;
@ -187,12 +189,13 @@ void qemu_set_info_str(NetClientState *nc,
const char *fmt, ...) G_GNUC_PRINTF(2, 3);
void qemu_format_nic_info_str(NetClientState *nc, uint8_t macaddr[6]);
bool qemu_has_ufo(NetClientState *nc);
bool qemu_has_uso(NetClientState *nc);
bool qemu_has_vnet_hdr(NetClientState *nc);
bool qemu_has_vnet_hdr_len(NetClientState *nc, int len);
bool qemu_get_using_vnet_hdr(NetClientState *nc);
void qemu_using_vnet_hdr(NetClientState *nc, bool enable);
void qemu_set_offload(NetClientState *nc, int csum, int tso4, int tso6,
int ecn, int ufo);
int ecn, int ufo, int uso4, int uso6);
int qemu_get_vnet_hdr_len(NetClientState *nc);
void qemu_set_vnet_hdr_len(NetClientState *nc, int len);
int qemu_set_vnet_le(NetClientState *nc, bool is_le);

View File

@ -1873,6 +1873,13 @@ if libbpf.found() and not cc.links('''
endif
endif
# libxdp
libxdp = not_found
if not get_option('af_xdp').auto() or have_system
libxdp = dependency('libxdp', required: get_option('af_xdp'),
version: '>=1.4.0', method: 'pkg-config')
endif
# libdw
libdw = not_found
if not get_option('libdw').auto() or \
@ -2099,6 +2106,7 @@ config_host_data.set('CONFIG_HEXAGON_IDEF_PARSER', get_option('hexagon_idef_pars
config_host_data.set('CONFIG_LIBATTR', have_old_libattr)
config_host_data.set('CONFIG_LIBCAP_NG', libcap_ng.found())
config_host_data.set('CONFIG_EBPF', libbpf.found())
config_host_data.set('CONFIG_AF_XDP', libxdp.found())
config_host_data.set('CONFIG_LIBDAXCTL', libdaxctl.found())
config_host_data.set('CONFIG_LIBISCSI', libiscsi.found())
config_host_data.set('CONFIG_LIBNFS', libnfs.found())
@ -4270,6 +4278,7 @@ summary_info = {}
if targetos == 'darwin'
summary_info += {'vmnet.framework support': vmnet}
endif
summary_info += {'AF_XDP support': libxdp}
summary_info += {'slirp support': slirp}
summary_info += {'vde support': vde}
summary_info += {'netmap support': have_netmap}

View File

@ -122,6 +122,8 @@ option('avx512bw', type: 'feature', value: 'auto',
option('keyring', type: 'feature', value: 'auto',
description: 'Linux keyring support')
option('af_xdp', type : 'feature', value : 'auto',
description: 'AF_XDP network backend support')
option('attr', type : 'feature', value : 'auto',
description: 'attr/xattr support')
option('auth_pam', type : 'feature', value : 'auto',

526
net/af-xdp.c Normal file
View File

@ -0,0 +1,526 @@
/*
* AF_XDP network backend.
*
* Copyright (c) 2023 Red Hat, Inc.
*
* Authors:
* Ilya Maximets <i.maximets@ovn.org>
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*/
#include "qemu/osdep.h"
#include <bpf/bpf.h>
#include <inttypes.h>
#include <linux/if_link.h>
#include <linux/if_xdp.h>
#include <net/if.h>
#include <xdp/xsk.h>
#include "clients.h"
#include "monitor/monitor.h"
#include "net/net.h"
#include "qapi/error.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"
#include "qemu/iov.h"
#include "qemu/main-loop.h"
#include "qemu/memalign.h"
typedef struct AFXDPState {
NetClientState nc;
struct xsk_socket *xsk;
struct xsk_ring_cons rx;
struct xsk_ring_prod tx;
struct xsk_ring_cons cq;
struct xsk_ring_prod fq;
char ifname[IFNAMSIZ];
int ifindex;
bool read_poll;
bool write_poll;
uint32_t outstanding_tx;
uint64_t *pool;
uint32_t n_pool;
char *buffer;
struct xsk_umem *umem;
uint32_t n_queues;
uint32_t xdp_flags;
bool inhibit;
} AFXDPState;
#define AF_XDP_BATCH_SIZE 64
static void af_xdp_send(void *opaque);
static void af_xdp_writable(void *opaque);
/*
 * (Re)install the event-loop callbacks for this queue's XSK socket fd,
 * according to the current read_poll/write_poll flags.
 */
static void af_xdp_update_fd_handler(AFXDPState *s)
{
    void (*on_read)(void *) = s->read_poll ? af_xdp_send : NULL;
    void (*on_write)(void *) = s->write_poll ? af_xdp_writable : NULL;

    qemu_set_fd_handler(xsk_socket__fd(s->xsk), on_read, on_write, s);
}
/* Enable or disable the read handler; no-op if already in that state. */
static void af_xdp_read_poll(AFXDPState *s, bool enable)
{
    if (s->read_poll == enable) {
        return;
    }
    s->read_poll = enable;
    af_xdp_update_fd_handler(s);
}
/* Enable or disable the write handler; no-op if already in that state. */
static void af_xdp_write_poll(AFXDPState *s, bool enable)
{
    if (s->write_poll == enable) {
        return;
    }
    s->write_poll = enable;
    af_xdp_update_fd_handler(s);
}
/* NetClientInfo.poll: switch both rx and tx polling on or off together. */
static void af_xdp_poll(NetClientState *nc, bool enable)
{
    AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);

    if (s->read_poll == enable && s->write_poll == enable) {
        return;
    }
    s->write_poll = enable;
    s->read_poll = enable;
    af_xdp_update_fd_handler(s);
}
/*
 * Reclaim transmitted frames from the completion queue, returning their
 * umem addresses to the free pool.
 */
static void af_xdp_complete_tx(AFXDPState *s)
{
    uint32_t idx = 0;
    uint32_t n_done;

    n_done = xsk_ring_cons__peek(&s->cq, XSK_RING_CONS__DEFAULT_NUM_DESCS,
                                 &idx);
    if (!n_done) {
        return;
    }

    for (uint32_t i = 0; i < n_done; i++) {
        const uint64_t *addr =
            (const uint64_t *) xsk_ring_cons__comp_addr(&s->cq, idx++);

        s->pool[s->n_pool++] = *addr;
        s->outstanding_tx--;
    }

    xsk_ring_cons__release(&s->cq, n_done);
}
/*
 * The fd_write() callback, invoked if the fd is marked as writable
 * after a poll.
 */
static void af_xdp_writable(void *opaque)
{
    AFXDPState *s = opaque;
    bool keep_waiting;

    /* Reclaim tx buffers the kernel has finished with. */
    af_xdp_complete_tx(s);

    /*
     * Keep the write handler registered only while packets are still in
     * flight and the kernel asks for a wakeup; otherwise remove it.
     */
    keep_waiting = s->outstanding_tx && xsk_ring_prod__needs_wakeup(&s->tx);
    if (!keep_waiting) {
        af_xdp_write_poll(s, false);
    }

    /* Flush any buffered packets. */
    qemu_flush_queued_packets(&s->nc);
}
/*
 * Backend receive callback (peer --> network): transmit one packet through
 * the AF_XDP tx ring.
 *
 * Returns 'size' when the packet was queued for transmit (or silently
 * dropped because it exceeds the umem frame size), and 0 to make the
 * caller queue the packet until the socket becomes writable again.
 */
static ssize_t af_xdp_receive(NetClientState *nc,
                              const uint8_t *buf, size_t size)
{
    AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
    struct xdp_desc *desc;
    uint32_t idx;
    void *data;

    /* Try to recover buffers that are already sent. */
    af_xdp_complete_tx(s);

    if (size > XSK_UMEM__DEFAULT_FRAME_SIZE) {
        /* We can't transmit packet this size... */
        return size;
    }

    if (!s->n_pool || !xsk_ring_prod__reserve(&s->tx, 1, &idx)) {
        /*
         * Out of buffers or space in tx ring. Poll until we can write.
         * This will also kick the Tx, if it was waiting on CQ.
         */
        af_xdp_write_poll(s, true);
        return 0;
    }

    /* Take a free umem frame from the pool and copy the packet into it. */
    desc = xsk_ring_prod__tx_desc(&s->tx, idx);
    desc->addr = s->pool[--s->n_pool];
    desc->len = size;
    data = xsk_umem__get_data(s->buffer, desc->addr);
    memcpy(data, buf, size);

    xsk_ring_prod__submit(&s->tx, 1);
    s->outstanding_tx++;

    /* The kernel may need a syscall kick to start transmitting. */
    if (xsk_ring_prod__needs_wakeup(&s->tx)) {
        af_xdp_write_poll(s, true);
    }

    return size;
}
/*
 * Completion callback for qemu_sendv_packet_async(): the peer has consumed
 * a previously queued packet (backend --> peer), so it is safe to resume
 * reading from the rx ring.
 */
static void af_xdp_send_completed(NetClientState *nc, ssize_t len)
{
    AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);

    af_xdp_read_poll(s, true);
}
/*
 * Post up to 'n' free umem frames from the pool onto the fill queue so
 * the kernel has buffers to receive into.
 */
static void af_xdp_fq_refill(AFXDPState *s, uint32_t n)
{
    uint32_t i, idx = 0;

    /* Leave one packet for Tx, just in case. */
    if (s->n_pool < n + 1) {
        n = s->n_pool;
    }

    /* Nothing to post, or no room in the fill ring right now. */
    if (!n || !xsk_ring_prod__reserve(&s->fq, n, &idx)) {
        return;
    }

    for (i = 0; i < n; i++) {
        *xsk_ring_prod__fill_addr(&s->fq, idx++) = s->pool[--s->n_pool];
    }
    xsk_ring_prod__submit(&s->fq, n);

    if (xsk_ring_prod__needs_wakeup(&s->fq)) {
        /* Receive was blocked by not having enough buffers. Wake it up. */
        af_xdp_read_poll(s, true);
    }
}
/*
 * The fd_read() callback: drain up to AF_XDP_BATCH_SIZE packets from the
 * rx ring and hand them to the peer.  Stops early (and pauses reading)
 * if the peer queues a packet instead of consuming it immediately.
 */
static void af_xdp_send(void *opaque)
{
    uint32_t i, n_rx, idx = 0;
    AFXDPState *s = opaque;

    n_rx = xsk_ring_cons__peek(&s->rx, AF_XDP_BATCH_SIZE, &idx);
    if (!n_rx) {
        return;
    }

    for (i = 0; i < n_rx; i++) {
        const struct xdp_desc *desc;
        struct iovec iov;

        desc = xsk_ring_cons__rx_desc(&s->rx, idx++);

        iov.iov_base = xsk_umem__get_data(s->buffer, desc->addr);
        iov.iov_len = desc->len;

        /* Return the frame address to the free pool. */
        s->pool[s->n_pool++] = desc->addr;

        if (!qemu_sendv_packet_async(&s->nc, &iov, 1,
                                     af_xdp_send_completed)) {
            /*
             * The peer does not receive anymore. Packet is queued, stop
             * reading from the backend until af_xdp_send_completed().
             */
            af_xdp_read_poll(s, false);

            /* Return unused descriptors to not break the ring cache. */
            xsk_ring_cons__cancel(&s->rx, n_rx - i - 1);
            n_rx = i + 1;
            break;
        }
    }

    /* Release actually sent descriptors and try to re-fill. */
    xsk_ring_cons__release(&s->rx, n_rx);
    af_xdp_fq_refill(s, AF_XDP_BATCH_SIZE);
}
/*
 * NetClientInfo.cleanup: flush queued packets, stop polling, release the
 * socket, pool, umem and buffer (pointers are cleared after freeing),
 * and detach the XDP program when this is the last queue to close.
 */
static void af_xdp_cleanup(NetClientState *nc)
{
    AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);

    qemu_purge_queued_packets(nc);

    af_xdp_poll(nc, false);

    xsk_socket__delete(s->xsk);
    s->xsk = NULL;
    g_free(s->pool);
    s->pool = NULL;
    xsk_umem__delete(s->umem);
    s->umem = NULL;
    qemu_vfree(s->buffer);
    s->buffer = NULL;

    /* Remove the program if it's the last open queue. */
    if (!s->inhibit && nc->queue_index == s->n_queues - 1 && s->xdp_flags
        && bpf_xdp_detach(s->ifindex, s->xdp_flags, NULL) != 0) {
        fprintf(stderr,
                "af-xdp: unable to remove XDP program from '%s', ifindex: %d\n",
                s->ifname, s->ifindex);
    }
}
/*
 * Allocate the shared umem area for this queue, register it with the
 * kernel (optionally sharing an existing socket fd), and initialise the
 * LIFO pool of free frame addresses.
 *
 * Returns 0 on success, -1 on error (errp set).
 */
static int af_xdp_umem_create(AFXDPState *s, int sock_fd, Error **errp)
{
    struct xsk_umem_config config = {
        .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
        .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
        .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
        .frame_headroom = 0,
    };
    uint64_t n_descs;
    uint64_t size;
    int64_t i;
    int ret;

    /* Number of descriptors if all 4 queues (rx, tx, cq, fq) are full. */
    n_descs = (XSK_RING_PROD__DEFAULT_NUM_DESCS
               + XSK_RING_CONS__DEFAULT_NUM_DESCS) * 2;
    size = n_descs * XSK_UMEM__DEFAULT_FRAME_SIZE;

    s->buffer = qemu_memalign(qemu_real_host_page_size(), size);
    memset(s->buffer, 0, size);

    if (sock_fd < 0) {
        ret = xsk_umem__create(&s->umem, s->buffer, size,
                               &s->fq, &s->cq, &config);
    } else {
        ret = xsk_umem__create_with_fd(&s->umem, sock_fd, s->buffer, size,
                                       &s->fq, &s->cq, &config);
    }

    if (ret) {
        qemu_vfree(s->buffer);
        /* Clear the pointer so a later cleanup cannot double-free it. */
        s->buffer = NULL;
        error_setg_errno(errp, errno,
                         "failed to create umem for %s queue_index: %d",
                         s->ifname, s->nc.queue_index);
        return -1;
    }

    s->pool = g_new(uint64_t, n_descs);
    /*
     * Fill the pool in the opposite order, because it's a LIFO queue.
     * Valid indices are 0..n_descs-1; starting at i == n_descs (as the
     * previous code did) wrote one element past the end of 'pool'.
     */
    for (i = n_descs - 1; i >= 0; i--) {
        s->pool[i] = i * XSK_UMEM__DEFAULT_FRAME_SIZE;
    }
    s->n_pool = n_descs;

    af_xdp_fq_refill(s, XSK_RING_PROD__DEFAULT_NUM_DESCS);

    return 0;
}
/*
 * Create the AF_XDP socket for this queue's interface/queue-id pair,
 * honouring the 'inhibit', 'force-copy', 'start-queue' and 'mode'
 * options.  When no mode is requested, native (driver) mode is tried
 * first with a fallback to skb (generic) mode.
 *
 * Returns 0 on success, -1 on error (errp set).
 */
static int af_xdp_socket_create(AFXDPState *s,
                                const NetdevAFXDPOptions *opts, Error **errp)
{
    struct xsk_socket_config cfg = {
        .rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
        .tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
        .libxdp_flags = 0,
        .bind_flags = XDP_USE_NEED_WAKEUP,
        .xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST,
    };
    int queue_id, error = 0;

    s->inhibit = opts->has_inhibit && opts->inhibit;
    if (s->inhibit) {
        /* Don't load an XDP program; presumably one is attached
         * externally -- see the prog_id check in net_init_af_xdp(). */
        cfg.libxdp_flags |= XSK_LIBXDP_FLAGS__INHIBIT_PROG_LOAD;
    }

    if (opts->has_force_copy && opts->force_copy) {
        cfg.bind_flags |= XDP_COPY;
    }

    /* Map this client's queue index onto the device queue range. */
    queue_id = s->nc.queue_index;
    if (opts->has_start_queue && opts->start_queue > 0) {
        queue_id += opts->start_queue;
    }

    if (opts->has_mode) {
        /* Specific mode requested. */
        cfg.xdp_flags |= (opts->mode == AFXDP_MODE_NATIVE)
                         ? XDP_FLAGS_DRV_MODE : XDP_FLAGS_SKB_MODE;
        if (xsk_socket__create(&s->xsk, s->ifname, queue_id,
                               s->umem, &s->rx, &s->tx, &cfg)) {
            error = errno;
        }
    } else {
        /* No mode requested, try native first. */
        cfg.xdp_flags |= XDP_FLAGS_DRV_MODE;

        if (xsk_socket__create(&s->xsk, s->ifname, queue_id,
                               s->umem, &s->rx, &s->tx, &cfg)) {
            /* Can't use native mode, try skb. */
            cfg.xdp_flags &= ~XDP_FLAGS_DRV_MODE;
            cfg.xdp_flags |= XDP_FLAGS_SKB_MODE;

            if (xsk_socket__create(&s->xsk, s->ifname, queue_id,
                                   s->umem, &s->rx, &s->tx, &cfg)) {
                error = errno;
            }
        }
    }

    if (error) {
        error_setg_errno(errp, error,
                         "failed to create AF_XDP socket for %s queue_id: %d",
                         s->ifname, queue_id);
        return -1;
    }

    /* Remember the flags so cleanup can detach the program later. */
    s->xdp_flags = cfg.xdp_flags;

    return 0;
}
/* NetClientInfo methods. */
static NetClientInfo net_af_xdp_info = {
    .type = NET_CLIENT_DRIVER_AF_XDP,
    .size = sizeof(AFXDPState),
    .receive = af_xdp_receive,   /* peer -> network */
    .poll = af_xdp_poll,
    .cleanup = af_xdp_cleanup,
};
/*
 * Parse a colon-separated list of exactly n_expected fd names into a
 * freshly allocated int array.  Returns NULL (errp set) on count mismatch
 * or when any fd fails to resolve; caller owns the returned array.
 */
static int *parse_socket_fds(const char *sock_fds_str,
                             int64_t n_expected, Error **errp)
{
    gchar **tokens = g_strsplit(sock_fds_str, ":", -1);
    int64_t n_tokens = g_strv_length(tokens);
    int *fds = NULL;
    int64_t i;

    if (n_tokens != n_expected) {
        error_setg(errp, "expected %"PRIi64" socket fds, got %"PRIi64,
                   n_expected, n_tokens);
        goto out;
    }

    fds = g_new(int, n_tokens);
    for (i = 0; i < n_tokens; i++) {
        fds[i] = monitor_fd_param(monitor_cur(), tokens[i], errp);
        if (fds[i] < 0) {
            g_free(fds);
            fds = NULL;
            goto out;
        }
    }

out:
    g_strfreev(tokens);
    return fds;
}
/*
 * The exported init function.
 *
 * ... -netdev af-xdp,ifname="..."
 *
 * Creates one net client per queue.  On any failure all already-created
 * clients are torn down via qemu_del_net_client(nc0).
 * Returns 0 on success, -1 on error (errp set).
 */
int net_init_af_xdp(const Netdev *netdev,
                    const char *name, NetClientState *peer, Error **errp)
{
    const NetdevAFXDPOptions *opts = &netdev->u.af_xdp;
    NetClientState *nc, *nc0 = NULL;
    unsigned int ifindex;
    uint32_t prog_id = 0;
    int *sock_fds = NULL;
    int64_t i, queues;
    AFXDPState *s;

    ifindex = if_nametoindex(opts->ifname);
    if (!ifindex) {
        error_setg_errno(errp, errno, "failed to get ifindex for '%s'",
                         opts->ifname);
        return -1;
    }

    queues = opts->has_queues ? opts->queues : 1;
    if (queues < 1) {
        error_setg(errp, "invalid number of queues (%" PRIi64 ") for '%s'",
                   queues, opts->ifname);
        return -1;
    }

    /* inhibit=on and sock-fds must be given together or not at all. */
    if ((opts->has_inhibit && opts->inhibit) != !!opts->sock_fds) {
        error_setg(errp, "'inhibit=on' requires 'sock-fds' and vice versa");
        return -1;
    }

    if (opts->sock_fds) {
        sock_fds = parse_socket_fds(opts->sock_fds, queues, errp);
        if (!sock_fds) {
            return -1;
        }
    }

    for (i = 0; i < queues; i++) {
        nc = qemu_new_net_client(&net_af_xdp_info, peer, "af-xdp", name);
        qemu_set_info_str(nc, "af-xdp%"PRIi64" to %s", i, opts->ifname);
        nc->queue_index = i;

        if (!nc0) {
            nc0 = nc;
        }

        s = DO_UPCAST(AFXDPState, nc, nc);

        pstrcpy(s->ifname, sizeof(s->ifname), opts->ifname);
        s->ifindex = ifindex;
        s->n_queues = queues;

        /*
         * Errors from the helpers land in errp directly; the previous
         * error_propagate() of an always-NULL local Error was a no-op
         * and has been removed.
         */
        if (af_xdp_umem_create(s, sock_fds ? sock_fds[i] : -1, errp)
            || af_xdp_socket_create(s, opts, errp)) {
            /* Make sure the XDP program will be removed. */
            s->n_queues = i;
            goto err;
        }
    }

    if (nc0) {
        s = DO_UPCAST(AFXDPState, nc, nc0);
        if (bpf_xdp_query_id(s->ifindex, s->xdp_flags, &prog_id) || !prog_id) {
            error_setg_errno(errp, errno,
                             "no XDP program loaded on '%s', ifindex: %d",
                             s->ifname, s->ifindex);
            goto err;
        }
    }

    /* The fds are consumed above; previously this array leaked on success. */
    g_free(sock_fds);

    af_xdp_read_poll(s, true); /* Initially only poll for reads. */

    return 0;

err:
    g_free(sock_fds);
    if (nc0) {
        qemu_del_net_client(nc0);
    }

    return -1;
}

View File

@ -64,6 +64,11 @@ int net_init_netmap(const Netdev *netdev, const char *name,
NetClientState *peer, Error **errp);
#endif
#ifdef CONFIG_AF_XDP
int net_init_af_xdp(const Netdev *netdev, const char *name,
NetClientState *peer, Error **errp);
#endif
int net_init_vhost_user(const Netdev *netdev, const char *name,
NetClientState *peer, Error **errp);

View File

@ -68,7 +68,7 @@ static ssize_t dump_receive_iov(DumpState *s, const struct iovec *iov, int cnt,
int64_t ts;
int caplen;
size_t size = iov_size(iov, cnt) - offset;
struct iovec dumpiov[cnt + 1];
g_autofree struct iovec *dumpiov = g_new(struct iovec, cnt + 1);
/* Early return in case of previous error. */
if (s->fd < 0) {

View File

@ -36,6 +36,9 @@ system_ss.add(when: vde, if_true: files('vde.c'))
if have_netmap
system_ss.add(files('netmap.c'))
endif
system_ss.add(when: libxdp, if_true: files('af-xdp.c'))
if have_vhost_net_user
system_ss.add(when: 'CONFIG_VIRTIO_NET', if_true: files('vhost-user.c'), if_false: files('vhost-user-stub.c'))
system_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-user-stub.c'))

View File

@ -495,6 +495,15 @@ bool qemu_has_ufo(NetClientState *nc)
return nc->info->has_ufo(nc);
}
/* True when the client exists, implements has_uso, and reports support. */
bool qemu_has_uso(NetClientState *nc)
{
    return nc && nc->info->has_uso && nc->info->has_uso(nc);
}
bool qemu_has_vnet_hdr(NetClientState *nc)
{
if (!nc || !nc->info->has_vnet_hdr) {
@ -532,13 +541,13 @@ void qemu_using_vnet_hdr(NetClientState *nc, bool enable)
}
void qemu_set_offload(NetClientState *nc, int csum, int tso4, int tso6,
int ecn, int ufo)
int ecn, int ufo, int uso4, int uso6)
{
if (!nc || !nc->info->set_offload) {
return;
}
nc->info->set_offload(nc, csum, tso4, tso6, ecn, ufo);
nc->info->set_offload(nc, csum, tso4, tso6, ecn, ufo, uso4, uso6);
}
int qemu_get_vnet_hdr_len(NetClientState *nc)
@ -1082,6 +1091,9 @@ static int (* const net_client_init_fun[NET_CLIENT_DRIVER__MAX])(
#ifdef CONFIG_NETMAP
[NET_CLIENT_DRIVER_NETMAP] = net_init_netmap,
#endif
#ifdef CONFIG_AF_XDP
[NET_CLIENT_DRIVER_AF_XDP] = net_init_af_xdp,
#endif
#ifdef CONFIG_NET_BRIDGE
[NET_CLIENT_DRIVER_BRIDGE] = net_init_bridge,
#endif
@ -1186,6 +1198,9 @@ void show_netdevs(void)
#ifdef CONFIG_NETMAP
"netmap",
#endif
#ifdef CONFIG_AF_XDP
"af-xdp",
#endif
#ifdef CONFIG_POSIX
"vhost-user",
#endif

View File

@ -371,7 +371,7 @@ static void netmap_set_vnet_hdr_len(NetClientState *nc, int len)
}
static void netmap_set_offload(NetClientState *nc, int csum, int tso4, int tso6,
int ecn, int ufo)
int ecn, int ufo, int uso4, int uso6)
{
NetmapState *s = DO_UPCAST(NetmapState, nc, nc);

View File

@ -212,6 +212,11 @@ int tap_probe_has_ufo(int fd)
return 0;
}
/* USO probing is not implemented for this tap backend; report unsupported. */
int tap_probe_has_uso(int fd)
{
    return 0;
}
int tap_probe_vnet_hdr_len(int fd, int len)
{
return 0;
@ -232,7 +237,7 @@ int tap_fd_set_vnet_be(int fd, int is_be)
}
void tap_fd_set_offload(int fd, int csum, int tso4,
int tso6, int ecn, int ufo)
int tso6, int ecn, int ufo, int uso4, int uso6)
{
}

View File

@ -173,6 +173,18 @@ int tap_probe_has_ufo(int fd)
return 1;
}
/*
 * Probe USO support by attempting to enable checksum + USO4/USO6 offload
 * on the tap fd; returns 1 when the kernel accepts it, 0 otherwise.
 */
int tap_probe_has_uso(int fd)
{
    unsigned offload = TUN_F_CSUM | TUN_F_USO4 | TUN_F_USO6;

    return ioctl(fd, TUNSETOFFLOAD, offload) < 0 ? 0 : 1;
}
/* Verify that we can assign given length */
int tap_probe_vnet_hdr_len(int fd, int len)
{
@ -237,7 +249,7 @@ int tap_fd_set_vnet_be(int fd, int is_be)
}
void tap_fd_set_offload(int fd, int csum, int tso4,
int tso6, int ecn, int ufo)
int tso6, int ecn, int ufo, int uso4, int uso6)
{
unsigned int offload = 0;
@ -256,13 +268,22 @@ void tap_fd_set_offload(int fd, int csum, int tso4,
offload |= TUN_F_TSO_ECN;
if (ufo)
offload |= TUN_F_UFO;
if (uso4) {
offload |= TUN_F_USO4;
}
if (uso6) {
offload |= TUN_F_USO6;
}
}
if (ioctl(fd, TUNSETOFFLOAD, offload) != 0) {
offload &= ~TUN_F_UFO;
offload &= ~(TUN_F_USO4 | TUN_F_USO6);
if (ioctl(fd, TUNSETOFFLOAD, offload) != 0) {
fprintf(stderr, "TUNSETOFFLOAD ioctl() failed: %s\n",
offload &= ~TUN_F_UFO;
if (ioctl(fd, TUNSETOFFLOAD, offload) != 0) {
fprintf(stderr, "TUNSETOFFLOAD ioctl() failed: %s\n",
strerror(errno));
}
}
}
}

View File

@ -50,5 +50,7 @@
#define TUN_F_TSO6 0x04 /* I can handle TSO for IPv6 packets */
#define TUN_F_TSO_ECN 0x08 /* I can handle TSO with ECN bits. */
#define TUN_F_UFO 0x10 /* I can handle UFO packets */
#define TUN_F_USO4 0x20 /* I can handle USO for IPv4 packets */
#define TUN_F_USO6 0x40 /* I can handle USO for IPv6 packets */
#endif /* QEMU_TAP_LINUX_H */

View File

@ -216,6 +216,11 @@ int tap_probe_has_ufo(int fd)
return 0;
}
/* USO probing is not implemented for this tap backend; report unsupported. */
int tap_probe_has_uso(int fd)
{
    return 0;
}
int tap_probe_vnet_hdr_len(int fd, int len)
{
return 0;
@ -236,7 +241,7 @@ int tap_fd_set_vnet_be(int fd, int is_be)
}
void tap_fd_set_offload(int fd, int csum, int tso4,
int tso6, int ecn, int ufo)
int tso6, int ecn, int ufo, int uso4, int uso6)
{
}

View File

@ -47,6 +47,11 @@ int tap_probe_has_ufo(int fd)
return 0;
}
/* USO probing is not implemented for this tap backend; report unsupported. */
int tap_probe_has_uso(int fd)
{
    return 0;
}
int tap_probe_vnet_hdr_len(int fd, int len)
{
return 0;
@ -67,7 +72,7 @@ int tap_fd_set_vnet_be(int fd, int is_be)
}
void tap_fd_set_offload(int fd, int csum, int tso4,
int tso6, int ecn, int ufo)
int tso6, int ecn, int ufo, int uso4, int uso6)
{
}

View File

@ -741,7 +741,7 @@ static void tap_using_vnet_hdr(NetClientState *nc, bool using_vnet_hdr)
}
static void tap_set_offload(NetClientState *nc, int csum, int tso4,
int tso6, int ecn, int ufo)
int tso6, int ecn, int ufo, int uso4, int uso6)
{
}

View File

@ -57,6 +57,7 @@ typedef struct TAPState {
bool write_poll;
bool using_vnet_hdr;
bool has_ufo;
bool has_uso;
bool enabled;
VHostNetState *vhost_net;
unsigned host_vnet_hdr_len;
@ -117,10 +118,11 @@ static ssize_t tap_receive_iov(NetClientState *nc, const struct iovec *iov,
{
TAPState *s = DO_UPCAST(TAPState, nc, nc);
const struct iovec *iovp = iov;
struct iovec iov_copy[iovcnt + 1];
g_autofree struct iovec *iov_copy = NULL;
struct virtio_net_hdr_mrg_rxbuf hdr = { };
if (s->host_vnet_hdr_len && !s->using_vnet_hdr) {
iov_copy = g_new(struct iovec, iovcnt + 1);
iov_copy[0].iov_base = &hdr;
iov_copy[0].iov_len = s->host_vnet_hdr_len;
memcpy(&iov_copy[1], iov, iovcnt * sizeof(*iov));
@ -237,6 +239,15 @@ static bool tap_has_ufo(NetClientState *nc)
return s->has_ufo;
}
/* Report the USO capability probed when the tap fd was initialised. */
static bool tap_has_uso(NetClientState *nc)
{
    assert(nc->info->type == NET_CLIENT_DRIVER_TAP);

    TAPState *tap = DO_UPCAST(TAPState, nc, nc);
    return tap->has_uso;
}
static bool tap_has_vnet_hdr(NetClientState *nc)
{
TAPState *s = DO_UPCAST(TAPState, nc, nc);
@ -307,14 +318,14 @@ static int tap_set_vnet_be(NetClientState *nc, bool is_be)
}
static void tap_set_offload(NetClientState *nc, int csum, int tso4,
int tso6, int ecn, int ufo)
int tso6, int ecn, int ufo, int uso4, int uso6)
{
TAPState *s = DO_UPCAST(TAPState, nc, nc);
if (s->fd < 0) {
return;
}
tap_fd_set_offload(s->fd, csum, tso4, tso6, ecn, ufo);
tap_fd_set_offload(s->fd, csum, tso4, tso6, ecn, ufo, uso4, uso6);
}
static void tap_exit_notify(Notifier *notifier, void *data)
@ -384,6 +395,7 @@ static NetClientInfo net_tap_info = {
.poll = tap_poll,
.cleanup = tap_cleanup,
.has_ufo = tap_has_ufo,
.has_uso = tap_has_uso,
.has_vnet_hdr = tap_has_vnet_hdr,
.has_vnet_hdr_len = tap_has_vnet_hdr_len,
.get_using_vnet_hdr = tap_get_using_vnet_hdr,
@ -413,8 +425,9 @@ static TAPState *net_tap_fd_init(NetClientState *peer,
s->host_vnet_hdr_len = vnet_hdr ? sizeof(struct virtio_net_hdr) : 0;
s->using_vnet_hdr = false;
s->has_ufo = tap_probe_has_ufo(s->fd);
s->has_uso = tap_probe_has_uso(s->fd);
s->enabled = true;
tap_set_offload(&s->nc, 0, 0, 0, 0, 0);
tap_set_offload(&s->nc, 0, 0, 0, 0, 0, 0, 0);
/*
* Make sure host header length is set correctly in tap:
* it might have been modified by another instance of qemu.

View File

@ -37,7 +37,9 @@ void tap_set_sndbuf(int fd, const NetdevTapOptions *tap, Error **errp);
int tap_probe_vnet_hdr(int fd, Error **errp);
int tap_probe_vnet_hdr_len(int fd, int len);
int tap_probe_has_ufo(int fd);
void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo);
int tap_probe_has_uso(int fd);
void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo,
int uso4, int uso6);
void tap_fd_set_vnet_hdr_len(int fd, int len);
int tap_fd_set_vnet_le(int fd, int vnet_is_le);
int tap_fd_set_vnet_be(int fd, int vnet_is_be);

View File

@ -75,11 +75,14 @@ const int vdpa_feature_bits[] = {
VIRTIO_NET_F_GUEST_TSO4,
VIRTIO_NET_F_GUEST_TSO6,
VIRTIO_NET_F_GUEST_UFO,
VIRTIO_NET_F_GUEST_USO4,
VIRTIO_NET_F_GUEST_USO6,
VIRTIO_NET_F_HASH_REPORT,
VIRTIO_NET_F_HOST_ECN,
VIRTIO_NET_F_HOST_TSO4,
VIRTIO_NET_F_HOST_TSO6,
VIRTIO_NET_F_HOST_UFO,
VIRTIO_NET_F_HOST_USO,
VIRTIO_NET_F_MQ,
VIRTIO_NET_F_MRG_RXBUF,
VIRTIO_NET_F_MTU,

View File

@ -408,6 +408,60 @@
'ifname': 'str',
'*devname': 'str' } }
##
# @AFXDPMode:
#
# Attach mode for a default XDP program
#
# @skb: generic mode, no driver support necessary
#
# @native: DRV mode, program is attached to a driver, packets are passed to
# the socket without allocation of skb.
#
# Since: 8.2
##
{ 'enum': 'AFXDPMode',
'data': [ 'native', 'skb' ],
'if': 'CONFIG_AF_XDP' }
##
# @NetdevAFXDPOptions:
#
# AF_XDP network backend
#
# @ifname: The name of an existing network interface.
#
# @mode: Attach mode for a default XDP program. If not specified, then
# 'native' will be tried first, then 'skb'.
#
# @force-copy: Force XDP copy mode even if device supports zero-copy.
# (default: false)
#
# @queues: number of queues to be used for multiqueue interfaces (default: 1).
#
# @start-queue: Use @queues starting from this queue number (default: 0).
#
# @inhibit: Don't load a default XDP program, use one already loaded to
# the interface (default: false). Requires @sock-fds.
#
# @sock-fds: A colon (:) separated list of file descriptors for already open
# but not bound AF_XDP sockets in the queue order. One fd per queue.
# These descriptors should already be added into XDP socket map for
# corresponding queues. Requires @inhibit.
#
# Since: 8.2
##
{ 'struct': 'NetdevAFXDPOptions',
'data': {
'ifname': 'str',
'*mode': 'AFXDPMode',
'*force-copy': 'bool',
'*queues': 'int',
'*start-queue': 'int',
'*inhibit': 'bool',
'*sock-fds': 'str' },
'if': 'CONFIG_AF_XDP' }
##
# @NetdevVhostUserOptions:
#
@ -642,6 +696,7 @@
# @vmnet-bridged: since 7.1
# @stream: since 7.2
# @dgram: since 7.2
# @af-xdp: since 8.2
#
# Since: 2.7
##
@ -649,6 +704,7 @@
'data': [ 'none', 'nic', 'user', 'tap', 'l2tpv3', 'socket', 'stream',
'dgram', 'vde', 'bridge', 'hubport', 'netmap', 'vhost-user',
'vhost-vdpa',
{ 'name': 'af-xdp', 'if': 'CONFIG_AF_XDP' },
{ 'name': 'vmnet-host', 'if': 'CONFIG_VMNET' },
{ 'name': 'vmnet-shared', 'if': 'CONFIG_VMNET' },
{ 'name': 'vmnet-bridged', 'if': 'CONFIG_VMNET' }] }
@ -679,6 +735,8 @@
'bridge': 'NetdevBridgeOptions',
'hubport': 'NetdevHubPortOptions',
'netmap': 'NetdevNetmapOptions',
'af-xdp': { 'type': 'NetdevAFXDPOptions',
'if': 'CONFIG_AF_XDP' },
'vhost-user': 'NetdevVhostUserOptions',
'vhost-vdpa': 'NetdevVhostVDPAOptions',
'vmnet-host': { 'type': 'NetdevVmnetHostOptions',

View File

@ -2882,6 +2882,19 @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev,
" VALE port (created on the fly) called 'name' ('nmname' is name of the \n"
" netmap device, defaults to '/dev/netmap')\n"
#endif
#ifdef CONFIG_AF_XDP
"-netdev af-xdp,id=str,ifname=name[,mode=native|skb][,force-copy=on|off]\n"
" [,queues=n][,start-queue=m][,inhibit=on|off][,sock-fds=x:y:...:z]\n"
" attach to the existing network interface 'name' with AF_XDP socket\n"
" use 'mode=MODE' to specify an XDP program attach mode\n"
" use 'force-copy=on|off' to force XDP copy mode even if device supports zero-copy (default: off)\n"
" use 'inhibit=on|off' to inhibit loading of a default XDP program (default: off)\n"
" with inhibit=on,\n"
" use 'sock-fds' to provide file descriptors for already open AF_XDP sockets\n"
" added to a socket map in XDP program. One socket per queue.\n"
" use 'queues=n' to specify how many queues of a multiqueue interface should be used\n"
" use 'start-queue=m' to specify the first queue that should be used\n"
#endif
#ifdef CONFIG_POSIX
"-netdev vhost-user,id=str,chardev=dev[,vhostforce=on|off]\n"
" configure a vhost-user network, backed by a chardev 'dev'\n"
@ -2927,6 +2940,9 @@ DEF("nic", HAS_ARG, QEMU_OPTION_nic,
#ifdef CONFIG_NETMAP
"netmap|"
#endif
#ifdef CONFIG_AF_XDP
"af-xdp|"
#endif
#ifdef CONFIG_POSIX
"vhost-user|"
#endif
@ -2955,6 +2971,9 @@ DEF("net", HAS_ARG, QEMU_OPTION_net,
#ifdef CONFIG_NETMAP
"netmap|"
#endif
#ifdef CONFIG_AF_XDP
"af-xdp|"
#endif
#ifdef CONFIG_VMNET
"vmnet-host|vmnet-shared|vmnet-bridged|"
#endif
@ -2962,7 +2981,7 @@ DEF("net", HAS_ARG, QEMU_OPTION_net,
" old way to initialize a host network interface\n"
" (use the -netdev option if possible instead)\n", QEMU_ARCH_ALL)
SRST
``-nic [tap|bridge|user|l2tpv3|vde|netmap|vhost-user|socket][,...][,mac=macaddr][,model=mn]``
``-nic [tap|bridge|user|l2tpv3|vde|netmap|af-xdp|vhost-user|socket][,...][,mac=macaddr][,model=mn]``
This option is a shortcut for configuring both the on-board
(default) guest NIC hardware and the host network backend in one go.
The host backend options are the same as with the corresponding
@ -3376,6 +3395,55 @@ SRST
# launch QEMU instance
|qemu_system| linux.img -nic vde,sock=/tmp/myswitch
``-netdev af-xdp,id=str,ifname=name[,mode=native|skb][,force-copy=on|off][,queues=n][,start-queue=m][,inhibit=on|off][,sock-fds=x:y:...:z]``
Configure AF_XDP backend to connect to a network interface 'name'
using AF_XDP socket. A specific program attach mode for a default
XDP program can be forced with 'mode', defaults to best-effort,
where the likely most performant mode will be in use. Number of queues
'n' should generally match the number or queues in the interface,
defaults to 1. Traffic arriving on non-configured device queues will
not be delivered to the network backend.
.. parsed-literal::
# set number of queues to 4
ethtool -L eth0 combined 4
# launch QEMU instance
|qemu_system| linux.img -device virtio-net-pci,netdev=n1 \\
-netdev af-xdp,id=n1,ifname=eth0,queues=4
'start-queue' option can be specified if a particular range of queues
[m, m + n] should be in use. For example, this is may be necessary in
order to use certain NICs in native mode. Kernel allows the driver to
create a separate set of XDP queues on top of regular ones, and only
these queues can be used for AF_XDP sockets. NICs that work this way
may also require an additional traffic redirection with ethtool to these
special queues.
.. parsed-literal::
# set number of queues to 1
ethtool -L eth0 combined 1
# redirect all the traffic to the second queue (id: 1)
# note: drivers may require non-empty key/mask pair.
ethtool -N eth0 flow-type ether \\
dst 00:00:00:00:00:00 m FF:FF:FF:FF:FF:FE action 1
ethtool -N eth0 flow-type ether \\
dst 00:00:00:00:00:01 m FF:FF:FF:FF:FF:FE action 1
# launch QEMU instance
|qemu_system| linux.img -device virtio-net-pci,netdev=n1 \\
-netdev af-xdp,id=n1,ifname=eth0,queues=1,start-queue=1
XDP program can also be loaded externally. In this case 'inhibit' option
should be set to 'on' and 'sock-fds' provided with file descriptors for
already open but not bound XDP sockets already added to a socket map for
corresponding queues. One socket per queue.
.. parsed-literal::
|qemu_system| linux.img -device virtio-net-pci,netdev=n1 \\
-netdev af-xdp,id=n1,ifname=eth0,queues=3,inhibit=on,sock-fds=15:16:17
``-netdev vhost-user,chardev=id[,vhostforce=on|off][,queues=n]``
Establish a vhost-user netdev, backed by a chardev id. The chardev
should be a unix domain socket backed one. The vhost-user uses a

View File

@ -35,6 +35,7 @@
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
--with-coroutine=ucontext \
--tls-priority=@QEMU,SYSTEM \
--disable-af-xdp \
--disable-attr \
--disable-auth-pam \
--disable-avx2 \

View File

@ -76,6 +76,7 @@ meson_options_help() {
printf "%s\n" 'disabled with --disable-FEATURE, default is enabled if available'
printf "%s\n" '(unless built with --without-default-features):'
printf "%s\n" ''
printf "%s\n" ' af-xdp AF_XDP network backend support'
printf "%s\n" ' alsa ALSA sound support'
printf "%s\n" ' attr attr/xattr support'
printf "%s\n" ' auth-pam PAM access control'
@ -208,6 +209,8 @@ meson_options_help() {
}
_meson_option_parse() {
case $1 in
--enable-af-xdp) printf "%s" -Daf_xdp=enabled ;;
--disable-af-xdp) printf "%s" -Daf_xdp=disabled ;;
--enable-alsa) printf "%s" -Dalsa=enabled ;;
--disable-alsa) printf "%s" -Dalsa=disabled ;;
--enable-attr) printf "%s" -Dattr=enabled ;;

View File

@ -59,6 +59,7 @@ RUN apk update && \
libtasn1-dev \
liburing-dev \
libusb-dev \
libxdp-dev \
linux-pam-dev \
llvm \
lttng-ust-dev \

View File

@ -75,6 +75,7 @@ RUN dnf distro-sync -y && \
libubsan \
liburing-devel \
libusbx-devel \
libxdp-devel \
libzstd-devel \
llvm \
lttng-ust-devel \

View File

@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
g++-x86-64-linux-gnu \
gcc-x86-64-linux-gnu \
libaio-dev:amd64 \
libasan5:amd64 \
libasan6:amd64 \
libasound2-dev:amd64 \
libattr1-dev:amd64 \
libbpf-dev:amd64 \

View File

@ -32,7 +32,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
git \
hostname \
libaio-dev \
libasan5 \
libasan6 \
libasound2-dev \
libattr1-dev \
libbpf-dev \

View File

@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
g++-aarch64-linux-gnu \
gcc-aarch64-linux-gnu \
libaio-dev:arm64 \
libasan5:arm64 \
libasan6:arm64 \
libasound2-dev:arm64 \
libattr1-dev:arm64 \
libbpf-dev:arm64 \

View File

@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
g++-arm-linux-gnueabi \
gcc-arm-linux-gnueabi \
libaio-dev:armel \
libasan5:armel \
libasan6:armel \
libasound2-dev:armel \
libattr1-dev:armel \
libbpf-dev:armel \

View File

@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
g++-arm-linux-gnueabihf \
gcc-arm-linux-gnueabihf \
libaio-dev:armhf \
libasan5:armhf \
libasan6:armhf \
libasound2-dev:armhf \
libattr1-dev:armhf \
libbpf-dev:armhf \

View File

@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
g++-powerpc64le-linux-gnu \
gcc-powerpc64le-linux-gnu \
libaio-dev:ppc64el \
libasan5:ppc64el \
libasan6:ppc64el \
libasound2-dev:ppc64el \
libattr1-dev:ppc64el \
libbpf-dev:ppc64el \

View File

@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
g++-s390x-linux-gnu \
gcc-s390x-linux-gnu \
libaio-dev:s390x \
libasan5:s390x \
libasan6:s390x \
libasound2-dev:s390x \
libattr1-dev:s390x \
libbpf-dev:s390x \

View File

@ -82,6 +82,7 @@ exec "$@"\n' > /usr/bin/nosync && \
libubsan \
liburing-devel \
libusbx-devel \
libxdp-devel \
libzstd-devel \
llvm \
lttng-ust-devel \

View File

@ -40,7 +40,7 @@ RUN zypper update -y && \
libSDL2-devel \
libSDL2_image-devel \
libaio-devel \
libasan6 \
libasan8 \
libattr-devel \
libbpf-devel \
libbz2-devel \

View File

@ -32,7 +32,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
git \
hostname \
libaio-dev \
libasan5 \
libasan6 \
libasound2-dev \
libattr1-dev \
libbrlapi-dev \

View File

@ -32,7 +32,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
git \
hostname \
libaio-dev \
libasan5 \
libasan6 \
libasound2-dev \
libattr1-dev \
libbpf-dev \

@ -1 +1 @@
Subproject commit bbd55b4d18cce8f89b5167675e434a6941315634
Subproject commit 5f84a21881577a5fb56cc956f6fe4e2abd6fcff0

View File

@ -69,6 +69,7 @@ packages:
- liburing
- libusbx
- libvdeplug
- libxdp
- libzstd
- llvm
- lttng-ust

View File

@ -109,6 +109,11 @@ static void igb_pci_start_hw(QOSGraphObject *obj)
E1000_RAH_AV | E1000_RAH_POOL_1 |
le16_to_cpu(*(uint16_t *)(address + 4)));
/* Set supported receive descriptor mode */
e1000e_macreg_write(&d->e1000e,
E1000_SRRCTL(0),
E1000_SRRCTL_DESCTYPE_ADV_ONEBUF);
/* Enable receive */
e1000e_macreg_write(&d->e1000e, E1000_RFCTL, E1000_RFCTL_EXTEN);
e1000e_macreg_write(&d->e1000e, E1000_RCTL, E1000_RCTL_EN);