-----BEGIN PGP SIGNATURE-----

Version: GnuPG v1
 
 iQEcBAABAgAGBQJlB/SLAAoJEO8Ells5jWIR7EQH/1kAbxHcSGJXDOgQAXJ/rOZi
 UKn3ugJzD0Hxd4Xz8cvdVLM+9/JoEEOK1uB+NIG7Ask/gA5D7eUYzaLtp1OJ8VNO
 mamfKmn3EIBWJoLSHH19TKzfW2tGMJHQ0Nj+sbDQRkK5f2c7hwLTRXa1EmlJd4dB
 VoVzX4OiJtrQyv4OVmpP/PSETXJDvYYX/DNcRl9/3ccKtQW/wVDI3YzrMzXrsgyc
 w9ItJi8k+19mVH6RgQwciqRvTbVMdzkOxqvU//LY0TxnjsHfbyHr+KlNAa2WTY2N
 QgpAlMZhHqUG6/XXAs0o2VEtA66zmw932Xfy/CZUEcdGWfkG/9CEVfbuT4CKGY4=
 =tF7K
 -----END PGP SIGNATURE-----

Merge tag 'net-pull-request' of https://github.com/jasowang/qemu into staging

# -----BEGIN PGP SIGNATURE-----
# Version: GnuPG v1
#
# iQEcBAABAgAGBQJlB/SLAAoJEO8Ells5jWIR7EQH/1kAbxHcSGJXDOgQAXJ/rOZi
# UKn3ugJzD0Hxd4Xz8cvdVLM+9/JoEEOK1uB+NIG7Ask/gA5D7eUYzaLtp1OJ8VNO
# mamfKmn3EIBWJoLSHH19TKzfW2tGMJHQ0Nj+sbDQRkK5f2c7hwLTRXa1EmlJd4dB
# VoVzX4OiJtrQyv4OVmpP/PSETXJDvYYX/DNcRl9/3ccKtQW/wVDI3YzrMzXrsgyc
# w9ItJi8k+19mVH6RgQwciqRvTbVMdzkOxqvU//LY0TxnjsHfbyHr+KlNAa2WTY2N
# QgpAlMZhHqUG6/XXAs0o2VEtA66zmw932Xfy/CZUEcdGWfkG/9CEVfbuT4CKGY4=
# =tF7K
# -----END PGP SIGNATURE-----
# gpg: Signature made Mon 18 Sep 2023 02:56:11 EDT
# gpg:                using RSA key EF04965B398D6211
# gpg: Good signature from "Jason Wang (Jason Wang on RedHat) <jasowang@redhat.com>" [full]
# Primary key fingerprint: 215D 46F4 8246 689E C77F  3562 EF04 965B 398D 6211

* tag 'net-pull-request' of https://github.com/jasowang/qemu:
  net/tap: Avoid variable-length array
  net/dump: Avoid variable length array
  hw/net/rocker: Avoid variable length array
  hw/net/fsl_etsec/rings.c: Avoid variable length array
  net: add initial support for AF_XDP network backend
  tests: bump libvirt-ci for libasan and libxdp
  e1000e: rename e1000e_ba_state and e1000e_write_hdr_to_rx_buffers
  igb: packet-split descriptors support
  igb: add IPv6 extended headers traffic detection
  igb: RX payload guest writting refactoring
  igb: RX descriptors guest writting refactoring
  igb: rename E1000E_RingInfo_st
  igb: remove TCP ACK detection
  virtio-net: Add support for USO features
  virtio-net: Add USO flags to vhost support.
  tap: Add check for USO features
  tap: Add USO support to tap device.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
This commit is contained in:
Stefan Hajnoczi 2023-09-19 13:21:49 -04:00
commit dd0c84983d
50 changed files with 1435 additions and 284 deletions

View File

@ -2957,6 +2957,10 @@ W: http://info.iet.unipi.it/~luigi/netmap/
S: Maintained
F: net/netmap.c
AF_XDP network backend
R: Ilya Maximets <i.maximets@ovn.org>
F: net/af-xdp.c
Host Memory Backends
M: David Hildenbrand <david@redhat.com>
M: Igor Mammedov <imammedo@redhat.com>

View File

@ -1296,6 +1296,9 @@ ERST
.name = "netdev_add",
.args_type = "netdev:O",
.params = "[user|tap|socket|stream|dgram|vde|bridge|hubport|netmap|vhost-user"
#ifdef CONFIG_AF_XDP
"|af-xdp"
#endif
#ifdef CONFIG_VMNET
"|vmnet-host|vmnet-shared|vmnet-bridged"
#endif

View File

@ -38,6 +38,7 @@
#include "exec/confidential-guest-support.h"
#include "hw/virtio/virtio.h"
#include "hw/virtio/virtio-pci.h"
#include "hw/virtio/virtio-net.h"
GlobalProperty hw_compat_8_1[] = {};
const size_t hw_compat_8_1_len = G_N_ELEMENTS(hw_compat_8_1);
@ -45,6 +46,9 @@ const size_t hw_compat_8_1_len = G_N_ELEMENTS(hw_compat_8_1);
GlobalProperty hw_compat_8_0[] = {
{ "migration", "multifd-flush-after-each-section", "on"},
{ TYPE_PCI_DEVICE, "x-pcie-ari-nextfn-1", "on" },
{ TYPE_VIRTIO_NET, "host_uso", "off"},
{ TYPE_VIRTIO_NET, "guest_uso4", "off"},
{ TYPE_VIRTIO_NET, "guest_uso6", "off"},
};
const size_t hw_compat_8_0_len = G_N_ELEMENTS(hw_compat_8_0);

View File

@ -810,24 +810,24 @@ e1000e_txdesc_writeback(E1000ECore *core, dma_addr_t base,
return e1000e_tx_wb_interrupt_cause(core, queue_idx);
}
typedef struct E1000E_RingInfo_st {
typedef struct E1000ERingInfo {
int dbah;
int dbal;
int dlen;
int dh;
int dt;
int idx;
} E1000E_RingInfo;
} E1000ERingInfo;
static inline bool
e1000e_ring_empty(E1000ECore *core, const E1000E_RingInfo *r)
e1000e_ring_empty(E1000ECore *core, const E1000ERingInfo *r)
{
return core->mac[r->dh] == core->mac[r->dt] ||
core->mac[r->dt] >= core->mac[r->dlen] / E1000_RING_DESC_LEN;
}
static inline uint64_t
e1000e_ring_base(E1000ECore *core, const E1000E_RingInfo *r)
e1000e_ring_base(E1000ECore *core, const E1000ERingInfo *r)
{
uint64_t bah = core->mac[r->dbah];
uint64_t bal = core->mac[r->dbal];
@ -836,13 +836,13 @@ e1000e_ring_base(E1000ECore *core, const E1000E_RingInfo *r)
}
static inline uint64_t
e1000e_ring_head_descr(E1000ECore *core, const E1000E_RingInfo *r)
e1000e_ring_head_descr(E1000ECore *core, const E1000ERingInfo *r)
{
return e1000e_ring_base(core, r) + E1000_RING_DESC_LEN * core->mac[r->dh];
}
static inline void
e1000e_ring_advance(E1000ECore *core, const E1000E_RingInfo *r, uint32_t count)
e1000e_ring_advance(E1000ECore *core, const E1000ERingInfo *r, uint32_t count)
{
core->mac[r->dh] += count;
@ -852,7 +852,7 @@ e1000e_ring_advance(E1000ECore *core, const E1000E_RingInfo *r, uint32_t count)
}
static inline uint32_t
e1000e_ring_free_descr_num(E1000ECore *core, const E1000E_RingInfo *r)
e1000e_ring_free_descr_num(E1000ECore *core, const E1000ERingInfo *r)
{
trace_e1000e_ring_free_space(r->idx, core->mac[r->dlen],
core->mac[r->dh], core->mac[r->dt]);
@ -871,19 +871,19 @@ e1000e_ring_free_descr_num(E1000ECore *core, const E1000E_RingInfo *r)
}
static inline bool
e1000e_ring_enabled(E1000ECore *core, const E1000E_RingInfo *r)
e1000e_ring_enabled(E1000ECore *core, const E1000ERingInfo *r)
{
return core->mac[r->dlen] > 0;
}
static inline uint32_t
e1000e_ring_len(E1000ECore *core, const E1000E_RingInfo *r)
e1000e_ring_len(E1000ECore *core, const E1000ERingInfo *r)
{
return core->mac[r->dlen];
}
typedef struct E1000E_TxRing_st {
const E1000E_RingInfo *i;
const E1000ERingInfo *i;
struct e1000e_tx *tx;
} E1000E_TxRing;
@ -896,7 +896,7 @@ e1000e_mq_queue_idx(int base_reg_idx, int reg_idx)
static inline void
e1000e_tx_ring_init(E1000ECore *core, E1000E_TxRing *txr, int idx)
{
static const E1000E_RingInfo i[E1000E_NUM_QUEUES] = {
static const E1000ERingInfo i[E1000E_NUM_QUEUES] = {
{ TDBAH, TDBAL, TDLEN, TDH, TDT, 0 },
{ TDBAH1, TDBAL1, TDLEN1, TDH1, TDT1, 1 }
};
@ -908,13 +908,13 @@ e1000e_tx_ring_init(E1000ECore *core, E1000E_TxRing *txr, int idx)
}
typedef struct E1000E_RxRing_st {
const E1000E_RingInfo *i;
const E1000ERingInfo *i;
} E1000E_RxRing;
static inline void
e1000e_rx_ring_init(E1000ECore *core, E1000E_RxRing *rxr, int idx)
{
static const E1000E_RingInfo i[E1000E_NUM_QUEUES] = {
static const E1000ERingInfo i[E1000E_NUM_QUEUES] = {
{ RDBAH0, RDBAL0, RDLEN0, RDH0, RDT0, 0 },
{ RDBAH1, RDBAL1, RDLEN1, RDH1, RDT1, 1 }
};
@ -930,7 +930,7 @@ e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
dma_addr_t base;
struct e1000_tx_desc desc;
bool ide = false;
const E1000E_RingInfo *txi = txr->i;
const E1000ERingInfo *txi = txr->i;
uint32_t cause = E1000_ICS_TXQE;
if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
@ -960,7 +960,7 @@ e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
}
static bool
e1000e_has_rxbufs(E1000ECore *core, const E1000E_RingInfo *r,
e1000e_has_rxbufs(E1000ECore *core, const E1000ERingInfo *r,
size_t total_size)
{
uint32_t bufs = e1000e_ring_free_descr_num(core, r);
@ -1397,17 +1397,17 @@ e1000e_pci_dma_write_rx_desc(E1000ECore *core, dma_addr_t addr,
}
}
typedef struct e1000e_ba_state_st {
typedef struct E1000EBAState {
uint16_t written[MAX_PS_BUFFERS];
uint8_t cur_idx;
} e1000e_ba_state;
} E1000EBAState;
static inline void
e1000e_write_hdr_to_rx_buffers(E1000ECore *core,
hwaddr ba[MAX_PS_BUFFERS],
e1000e_ba_state *bastate,
const char *data,
dma_addr_t data_len)
e1000e_write_hdr_frag_to_rx_buffers(E1000ECore *core,
hwaddr ba[MAX_PS_BUFFERS],
E1000EBAState *bastate,
const char *data,
dma_addr_t data_len)
{
assert(data_len <= core->rxbuf_sizes[0] - bastate->written[0]);
@ -1418,11 +1418,11 @@ e1000e_write_hdr_to_rx_buffers(E1000ECore *core,
}
static void
e1000e_write_to_rx_buffers(E1000ECore *core,
hwaddr ba[MAX_PS_BUFFERS],
e1000e_ba_state *bastate,
const char *data,
dma_addr_t data_len)
e1000e_write_payload_frag_to_rx_buffers(E1000ECore *core,
hwaddr ba[MAX_PS_BUFFERS],
E1000EBAState *bastate,
const char *data,
dma_addr_t data_len)
{
while (data_len > 0) {
uint32_t cur_buf_len = core->rxbuf_sizes[bastate->cur_idx];
@ -1460,7 +1460,7 @@ e1000e_update_rx_stats(E1000ECore *core, size_t pkt_size, size_t pkt_fcs_size)
}
static inline bool
e1000e_rx_descr_threshold_hit(E1000ECore *core, const E1000E_RingInfo *rxi)
e1000e_rx_descr_threshold_hit(E1000ECore *core, const E1000ERingInfo *rxi)
{
return e1000e_ring_free_descr_num(core, rxi) ==
e1000e_ring_len(core, rxi) >> core->rxbuf_min_shift;
@ -1521,7 +1521,7 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
struct iovec *iov = net_rx_pkt_get_iovec(pkt);
size_t size = net_rx_pkt_get_total_len(pkt);
size_t total_size = size + e1000x_fcs_len(core->mac);
const E1000E_RingInfo *rxi;
const E1000ERingInfo *rxi;
size_t ps_hdr_len = 0;
bool do_ps = e1000e_do_ps(core, pkt, &ps_hdr_len);
bool is_first = true;
@ -1530,7 +1530,7 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
do {
hwaddr ba[MAX_PS_BUFFERS];
e1000e_ba_state bastate = { { 0 } };
E1000EBAState bastate = { { 0 } };
bool is_last = false;
desc_size = total_size - desc_offset;
@ -1568,8 +1568,10 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
iov_copy = MIN(ps_hdr_len - ps_hdr_copied,
iov->iov_len - iov_ofs);
e1000e_write_hdr_to_rx_buffers(core, ba, &bastate,
iov->iov_base, iov_copy);
e1000e_write_hdr_frag_to_rx_buffers(core, ba,
&bastate,
iov->iov_base,
iov_copy);
copy_size -= iov_copy;
ps_hdr_copied += iov_copy;
@ -1585,8 +1587,8 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
} else {
/* Leave buffer 0 of each descriptor except first */
/* empty as per spec 7.1.5.1 */
e1000e_write_hdr_to_rx_buffers(core, ba, &bastate,
NULL, 0);
e1000e_write_hdr_frag_to_rx_buffers(core, ba, &bastate,
NULL, 0);
}
}
@ -1594,8 +1596,10 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
while (copy_size) {
iov_copy = MIN(copy_size, iov->iov_len - iov_ofs);
e1000e_write_to_rx_buffers(core, ba, &bastate,
iov->iov_base + iov_ofs, iov_copy);
e1000e_write_payload_frag_to_rx_buffers(core, ba, &bastate,
iov->iov_base +
iov_ofs,
iov_copy);
copy_size -= iov_copy;
iov_ofs += iov_copy;
@ -1607,7 +1611,7 @@ e1000e_write_packet_to_guest(E1000ECore *core, struct NetRxPkt *pkt,
if (desc_offset + desc_size >= total_size) {
/* Simulate FCS checksum presence in the last descriptor */
e1000e_write_to_rx_buffers(core, ba, &bastate,
e1000e_write_payload_frag_to_rx_buffers(core, ba, &bastate,
(const char *) &fcs_pad, e1000x_fcs_len(core->mac));
}
}
@ -2852,7 +2856,7 @@ e1000e_update_rx_offloads(E1000ECore *core)
if (core->has_vnet) {
qemu_set_offload(qemu_get_queue(core->owner_nic)->peer,
cso_state, 0, 0, 0, 0);
cso_state, 0, 0, 0, 0, 0, 0);
}
}

View File

@ -372,6 +372,12 @@ void etsec_walk_tx_ring(eTSEC *etsec, int ring_nbr)
etsec->regs[TSTAT].value |= 1 << (31 - ring_nbr);
}
/*
* rx_init_frame() ensures we never do more padding than this
* (checksum plus minimum data packet size)
*/
#define MAX_RX_PADDING 64
static void fill_rx_bd(eTSEC *etsec,
eTSEC_rxtx_bd *bd,
const uint8_t **buf,
@ -380,9 +386,11 @@ static void fill_rx_bd(eTSEC *etsec,
uint16_t to_write;
hwaddr bufptr = bd->bufptr +
((hwaddr)(etsec->regs[TBDBPH].value & 0xF) << 32);
uint8_t padd[etsec->rx_padding];
uint8_t padd[MAX_RX_PADDING];
uint8_t rem;
assert(etsec->rx_padding <= MAX_RX_PADDING);
RING_DEBUG("eTSEC fill Rx buffer @ 0x%016" HWADDR_PRIx
" size:%zu(padding + crc:%u) + fcb:%u\n",
bufptr, *size, etsec->rx_padding, etsec->rx_fcb_size);
@ -426,7 +434,7 @@ static void fill_rx_bd(eTSEC *etsec,
rem = MIN(etsec->regs[MRBLR].value - bd->length, etsec->rx_padding);
if (rem > 0) {
memset(padd, 0x0, sizeof(padd));
memset(padd, 0x0, rem);
etsec->rx_padding -= rem;
*size -= rem;
bd->length += rem;

View File

@ -267,6 +267,29 @@ igb_rx_use_legacy_descriptor(IGBCore *core)
return false;
}
/*
 * Describes one TX/RX descriptor ring by naming the core->mac[] register
 * indices that hold its state (ring-accessor helpers below read
 * core->mac[r->dbah] etc.).
 */
typedef struct E1000ERingInfo {
int dbah; /* descriptor base address high */
int dbal; /* descriptor base address low */
int dlen; /* descriptor ring length register */
int dh; /* descriptor head */
int dt; /* descriptor tail */
int idx; /* queue index, e.g. used to select SRRCTL(idx) */
} E1000ERingInfo;
/*
 * Return the DESCTYPE field of this RX queue's SRRCTL register, i.e. which
 * receive descriptor format (one-buffer, header-split, ...) the queue uses.
 */
static uint32_t
igb_rx_queue_desctyp_get(IGBCore *core, const E1000ERingInfo *r)
{
return core->mac[E1000_SRRCTL(r->idx) >> 2] & E1000_SRRCTL_DESCTYPE_MASK;
}
/*
 * True when this RX queue is configured for a packet-split descriptor type
 * (header split on demand, or header split always).
 */
static bool
igb_rx_use_ps_descriptor(IGBCore *core, const E1000ERingInfo *r)
{
uint32_t desctyp = igb_rx_queue_desctyp_get(core, r);
return desctyp == E1000_SRRCTL_DESCTYPE_HDR_SPLIT ||
desctyp == E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS;
}
static inline bool
igb_rss_enabled(IGBCore *core)
{
@ -694,24 +717,15 @@ static uint32_t igb_rx_wb_eic(IGBCore *core, int queue_idx)
return (ent & E1000_IVAR_VALID) ? BIT(ent & 0x1f) : 0;
}
typedef struct E1000E_RingInfo_st {
int dbah;
int dbal;
int dlen;
int dh;
int dt;
int idx;
} E1000E_RingInfo;
static inline bool
igb_ring_empty(IGBCore *core, const E1000E_RingInfo *r)
igb_ring_empty(IGBCore *core, const E1000ERingInfo *r)
{
return core->mac[r->dh] == core->mac[r->dt] ||
core->mac[r->dt] >= core->mac[r->dlen] / E1000_RING_DESC_LEN;
}
static inline uint64_t
igb_ring_base(IGBCore *core, const E1000E_RingInfo *r)
igb_ring_base(IGBCore *core, const E1000ERingInfo *r)
{
uint64_t bah = core->mac[r->dbah];
uint64_t bal = core->mac[r->dbal];
@ -720,13 +734,13 @@ igb_ring_base(IGBCore *core, const E1000E_RingInfo *r)
}
static inline uint64_t
igb_ring_head_descr(IGBCore *core, const E1000E_RingInfo *r)
igb_ring_head_descr(IGBCore *core, const E1000ERingInfo *r)
{
return igb_ring_base(core, r) + E1000_RING_DESC_LEN * core->mac[r->dh];
}
static inline void
igb_ring_advance(IGBCore *core, const E1000E_RingInfo *r, uint32_t count)
igb_ring_advance(IGBCore *core, const E1000ERingInfo *r, uint32_t count)
{
core->mac[r->dh] += count;
@ -736,7 +750,7 @@ igb_ring_advance(IGBCore *core, const E1000E_RingInfo *r, uint32_t count)
}
static inline uint32_t
igb_ring_free_descr_num(IGBCore *core, const E1000E_RingInfo *r)
igb_ring_free_descr_num(IGBCore *core, const E1000ERingInfo *r)
{
trace_e1000e_ring_free_space(r->idx, core->mac[r->dlen],
core->mac[r->dh], core->mac[r->dt]);
@ -755,13 +769,13 @@ igb_ring_free_descr_num(IGBCore *core, const E1000E_RingInfo *r)
}
static inline bool
igb_ring_enabled(IGBCore *core, const E1000E_RingInfo *r)
igb_ring_enabled(IGBCore *core, const E1000ERingInfo *r)
{
return core->mac[r->dlen] > 0;
}
typedef struct IGB_TxRing_st {
const E1000E_RingInfo *i;
const E1000ERingInfo *i;
struct igb_tx *tx;
} IGB_TxRing;
@ -774,7 +788,7 @@ igb_mq_queue_idx(int base_reg_idx, int reg_idx)
static inline void
igb_tx_ring_init(IGBCore *core, IGB_TxRing *txr, int idx)
{
static const E1000E_RingInfo i[IGB_NUM_QUEUES] = {
static const E1000ERingInfo i[IGB_NUM_QUEUES] = {
{ TDBAH0, TDBAL0, TDLEN0, TDH0, TDT0, 0 },
{ TDBAH1, TDBAL1, TDLEN1, TDH1, TDT1, 1 },
{ TDBAH2, TDBAL2, TDLEN2, TDH2, TDT2, 2 },
@ -800,13 +814,13 @@ igb_tx_ring_init(IGBCore *core, IGB_TxRing *txr, int idx)
}
typedef struct E1000E_RxRing_st {
const E1000E_RingInfo *i;
const E1000ERingInfo *i;
} E1000E_RxRing;
static inline void
igb_rx_ring_init(IGBCore *core, E1000E_RxRing *rxr, int idx)
{
static const E1000E_RingInfo i[IGB_NUM_QUEUES] = {
static const E1000ERingInfo i[IGB_NUM_QUEUES] = {
{ RDBAH0, RDBAL0, RDLEN0, RDH0, RDT0, 0 },
{ RDBAH1, RDBAL1, RDLEN1, RDH1, RDT1, 1 },
{ RDBAH2, RDBAL2, RDLEN2, RDH2, RDT2, 2 },
@ -833,7 +847,7 @@ igb_rx_ring_init(IGBCore *core, E1000E_RxRing *rxr, int idx)
static uint32_t
igb_txdesc_writeback(IGBCore *core, dma_addr_t base,
union e1000_adv_tx_desc *tx_desc,
const E1000E_RingInfo *txi)
const E1000ERingInfo *txi)
{
PCIDevice *d;
uint32_t cmd_type_len = le32_to_cpu(tx_desc->read.cmd_type_len);
@ -866,7 +880,7 @@ igb_txdesc_writeback(IGBCore *core, dma_addr_t base,
}
static inline bool
igb_tx_enabled(IGBCore *core, const E1000E_RingInfo *txi)
igb_tx_enabled(IGBCore *core, const E1000ERingInfo *txi)
{
bool vmdq = core->mac[MRQC] & 1;
uint16_t qn = txi->idx;
@ -883,7 +897,7 @@ igb_start_xmit(IGBCore *core, const IGB_TxRing *txr)
PCIDevice *d;
dma_addr_t base;
union e1000_adv_tx_desc desc;
const E1000E_RingInfo *txi = txr->i;
const E1000ERingInfo *txi = txr->i;
uint32_t eic = 0;
if (!igb_tx_enabled(core, txi)) {
@ -918,7 +932,7 @@ igb_start_xmit(IGBCore *core, const IGB_TxRing *txr)
}
static uint32_t
igb_rxbufsize(IGBCore *core, const E1000E_RingInfo *r)
igb_rxbufsize(IGBCore *core, const E1000ERingInfo *r)
{
uint32_t srrctl = core->mac[E1000_SRRCTL(r->idx) >> 2];
uint32_t bsizepkt = srrctl & E1000_SRRCTL_BSIZEPKT_MASK;
@ -930,7 +944,7 @@ igb_rxbufsize(IGBCore *core, const E1000E_RingInfo *r)
}
static bool
igb_has_rxbufs(IGBCore *core, const E1000E_RingInfo *r, size_t total_size)
igb_has_rxbufs(IGBCore *core, const E1000ERingInfo *r, size_t total_size)
{
uint32_t bufs = igb_ring_free_descr_num(core, r);
uint32_t bufsize = igb_rxbufsize(core, r);
@ -941,6 +955,14 @@ igb_has_rxbufs(IGBCore *core, const E1000E_RingInfo *r, size_t total_size)
bufsize;
}
/*
 * Per-descriptor header buffer size for this RX queue, extracted from the
 * BSIZEHDRSIZE field of SRRCTL.
 * NOTE(review): the field is returned after the shift with no further
 * scaling — confirm against the datasheet whether hardware expresses this
 * value in larger (e.g. 64-byte) units.
 */
static uint32_t
igb_rxhdrbufsize(IGBCore *core, const E1000ERingInfo *r)
{
uint32_t srrctl = core->mac[E1000_SRRCTL(r->idx) >> 2];
return (srrctl & E1000_SRRCTL_BSIZEHDRSIZE_MASK) >>
E1000_SRRCTL_BSIZEHDRSIZE_SHIFT;
}
void
igb_start_recv(IGBCore *core)
{
@ -1225,21 +1247,77 @@ igb_read_lgcy_rx_descr(IGBCore *core, struct e1000_rx_desc *desc,
}
static inline void
igb_read_adv_rx_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
hwaddr *buff_addr)
igb_read_adv_rx_single_buf_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
hwaddr *buff_addr)
{
*buff_addr = le64_to_cpu(desc->read.pkt_addr);
}
static inline void
igb_read_rx_descr(IGBCore *core, union e1000_rx_desc_union *desc,
hwaddr *buff_addr)
igb_read_adv_rx_split_buf_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
hwaddr *buff_addr)
{
buff_addr[0] = le64_to_cpu(desc->read.hdr_addr);
buff_addr[1] = le64_to_cpu(desc->read.pkt_addr);
}
/*
 * Write progress within the current descriptor's buffers: bytes already
 * written to each buffer, and which buffer is being filled now.
 */
typedef struct IGBBAState {
uint16_t written[IGB_MAX_PS_BUFFERS]; /* bytes written per buffer */
uint8_t cur_idx; /* buffer currently being filled */
} IGBBAState;
/*
 * Header-split results to be reported in the descriptor write-back
 * (consumed by igb_write_adv_ps_rx_descr()).
 */
typedef struct IGBSplitDescriptorData {
bool sph; /* set when the header was actually split off (SPH bit) */
bool hbo; /* header larger than header buffer (HBO error bit) — presumably "header buffer overflow"; confirm against datasheet */
size_t hdr_len; /* header length to report in the descriptor */
} IGBSplitDescriptorData;
/*
 * All mutable state for DMA-ing one received packet into guest RX
 * descriptors, threaded through the igb_write_*_to_rx_buffers() helpers.
 */
typedef struct IGBPacketRxDMAState {
size_t size; /* packet length without FCS */
size_t total_size; /* packet length plus simulated FCS */
size_t ps_hdr_len; /* length of the split-off header (PS mode) */
size_t desc_size; /* bytes accounted to the current descriptor */
size_t desc_offset; /* bytes of the packet consumed so far */
uint32_t rx_desc_packet_buf_size; /* packet buffer size per descriptor */
uint32_t rx_desc_header_buf_size; /* header buffer size per descriptor */
struct iovec *iov; /* current fragment of the source packet */
size_t iov_ofs; /* offset into *iov */
bool do_ps; /* header split active for this packet */
bool is_first; /* processing the packet's first descriptor */
IGBBAState bastate; /* per-descriptor buffer write progress */
hwaddr ba[IGB_MAX_PS_BUFFERS]; /* guest buffer addresses; [0] = header, [1] = packet */
IGBSplitDescriptorData ps_desc_data; /* split results for write-back */
} IGBPacketRxDMAState;
static inline void
igb_read_rx_descr(IGBCore *core,
union e1000_rx_desc_union *desc,
IGBPacketRxDMAState *pdma_st,
const E1000ERingInfo *r)
{
uint32_t desc_type;
if (igb_rx_use_legacy_descriptor(core)) {
igb_read_lgcy_rx_descr(core, &desc->legacy, buff_addr);
} else {
igb_read_adv_rx_descr(core, &desc->adv, buff_addr);
igb_read_lgcy_rx_descr(core, &desc->legacy, &pdma_st->ba[1]);
pdma_st->ba[0] = 0;
return;
}
/* advanced header split descriptor */
if (igb_rx_use_ps_descriptor(core, r)) {
igb_read_adv_rx_split_buf_descr(core, &desc->adv, &pdma_st->ba[0]);
return;
}
/* descriptor replication modes not supported */
desc_type = igb_rx_queue_desctyp_get(core, r);
if (desc_type != E1000_SRRCTL_DESCTYPE_ADV_ONEBUF) {
trace_igb_wrn_rx_desc_modes_not_supp(desc_type);
}
/* advanced single buffer descriptor */
igb_read_adv_rx_single_buf_descr(core, &desc->adv, &pdma_st->ba[1]);
pdma_st->ba[0] = 0;
}
static void
@ -1281,15 +1359,11 @@ igb_verify_csum_in_sw(IGBCore *core,
}
static void
igb_build_rx_metadata(IGBCore *core,
struct NetRxPkt *pkt,
bool is_eop,
const E1000E_RSSInfo *rss_info, uint16_t etqf, bool ts,
uint16_t *pkt_info, uint16_t *hdr_info,
uint32_t *rss,
uint32_t *status_flags,
uint16_t *ip_id,
uint16_t *vlan_tag)
igb_build_rx_metadata_common(IGBCore *core,
struct NetRxPkt *pkt,
bool is_eop,
uint32_t *status_flags,
uint16_t *vlan_tag)
{
struct virtio_net_hdr *vhdr;
bool hasip4, hasip6, csum_valid;
@ -1298,7 +1372,6 @@ igb_build_rx_metadata(IGBCore *core,
*status_flags = E1000_RXD_STAT_DD;
/* No additional metadata needed for non-EOP descriptors */
/* TODO: EOP apply only to status so don't skip whole function. */
if (!is_eop) {
goto func_exit;
}
@ -1315,64 +1388,6 @@ igb_build_rx_metadata(IGBCore *core,
trace_e1000e_rx_metadata_vlan(*vlan_tag);
}
/* Packet parsing results */
if ((core->mac[RXCSUM] & E1000_RXCSUM_PCSD) != 0) {
if (rss_info->enabled) {
*rss = cpu_to_le32(rss_info->hash);
trace_igb_rx_metadata_rss(*rss);
}
} else if (hasip4) {
*status_flags |= E1000_RXD_STAT_IPIDV;
*ip_id = cpu_to_le16(net_rx_pkt_get_ip_id(pkt));
trace_e1000e_rx_metadata_ip_id(*ip_id);
}
if (l4hdr_proto == ETH_L4_HDR_PROTO_TCP && net_rx_pkt_is_tcp_ack(pkt)) {
*status_flags |= E1000_RXD_STAT_ACK;
trace_e1000e_rx_metadata_ack();
}
if (pkt_info) {
*pkt_info = rss_info->enabled ? rss_info->type : 0;
if (etqf < 8) {
*pkt_info |= (BIT(11) | etqf) << 4;
} else {
if (hasip4) {
*pkt_info |= E1000_ADVRXD_PKT_IP4;
}
if (hasip6) {
*pkt_info |= E1000_ADVRXD_PKT_IP6;
}
switch (l4hdr_proto) {
case ETH_L4_HDR_PROTO_TCP:
*pkt_info |= E1000_ADVRXD_PKT_TCP;
break;
case ETH_L4_HDR_PROTO_UDP:
*pkt_info |= E1000_ADVRXD_PKT_UDP;
break;
case ETH_L4_HDR_PROTO_SCTP:
*pkt_info |= E1000_ADVRXD_PKT_SCTP;
break;
default:
break;
}
}
}
if (hdr_info) {
*hdr_info = 0;
}
if (ts) {
*status_flags |= BIT(16);
}
/* RX CSO information */
if (hasip6 && (core->mac[RFCTL] & E1000_RFCTL_IPV6_XSUM_DIS)) {
trace_e1000e_rx_metadata_ipv6_sum_disabled();
@ -1428,56 +1443,168 @@ func_exit:
static inline void
igb_write_lgcy_rx_descr(IGBCore *core, struct e1000_rx_desc *desc,
struct NetRxPkt *pkt,
const E1000E_RSSInfo *rss_info, uint16_t etqf, bool ts,
const E1000E_RSSInfo *rss_info,
uint16_t length)
{
uint32_t status_flags, rss;
uint16_t ip_id;
uint32_t status_flags;
assert(!rss_info->enabled);
desc->length = cpu_to_le16(length);
desc->csum = 0;
igb_build_rx_metadata(core, pkt, pkt != NULL,
rss_info, etqf, ts,
NULL, NULL, &rss,
&status_flags, &ip_id,
&desc->special);
memset(desc, 0, sizeof(*desc));
desc->length = cpu_to_le16(length);
igb_build_rx_metadata_common(core, pkt, pkt != NULL,
&status_flags,
&desc->special);
desc->errors = (uint8_t) (le32_to_cpu(status_flags) >> 24);
desc->status = (uint8_t) le32_to_cpu(status_flags);
}
/*
 * True when the queue's SRRCTL DESCTYPE requests header split unconditionally
 * (split even when the protocol header cannot be parsed out).
 */
static bool
igb_rx_ps_descriptor_split_always(IGBCore *core, const E1000ERingInfo *r)
{
uint32_t desctyp = igb_rx_queue_desctyp_get(core, r);
return desctyp == E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS;
}
/*
 * Compute the Packet Type field for the advanced RX descriptor write-back.
 *
 * If the packet matched an ETQF filter (etqf < 8), the type encodes the
 * filter index with bit 11 set and nothing else.  Otherwise the type is
 * built from the parsed L3 protocol (IPv4, IPv6, or IPv6 with extension
 * headers unless RFCTL.IPV6_DIS is set) OR-ed with the L4 protocol
 * (TCP/UDP/SCTP).
 */
static uint16_t
igb_rx_desc_get_packet_type(IGBCore *core, struct NetRxPkt *pkt, uint16_t etqf)
{
uint16_t pkt_type;
bool hasip4, hasip6;
EthL4HdrProto l4hdr_proto;
if (etqf < 8) {
pkt_type = BIT(11) | etqf;
return pkt_type;
}
net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
if (hasip6 && !(core->mac[RFCTL] & E1000_RFCTL_IPV6_DIS)) {
/* distinguish IPv6 with extension headers from plain IPv6 */
eth_ip6_hdr_info *ip6hdr_info = net_rx_pkt_get_ip6_info(pkt);
pkt_type = ip6hdr_info->has_ext_hdrs ? E1000_ADVRXD_PKT_IP6E :
E1000_ADVRXD_PKT_IP6;
} else if (hasip4) {
pkt_type = E1000_ADVRXD_PKT_IP4;
} else {
pkt_type = 0;
}
switch (l4hdr_proto) {
case ETH_L4_HDR_PROTO_TCP:
pkt_type |= E1000_ADVRXD_PKT_TCP;
break;
case ETH_L4_HDR_PROTO_UDP:
pkt_type |= E1000_ADVRXD_PKT_UDP;
break;
case ETH_L4_HDR_PROTO_SCTP:
pkt_type |= E1000_ADVRXD_PKT_SCTP;
break;
default:
break;
}
return pkt_type;
}
static inline void
igb_write_adv_rx_descr(IGBCore *core, union e1000_adv_rx_desc *desc,
struct NetRxPkt *pkt,
const E1000E_RSSInfo *rss_info, uint16_t etqf, bool ts,
uint16_t length)
{
bool hasip4, hasip6;
EthL4HdrProto l4hdr_proto;
uint16_t rss_type = 0, pkt_type;
bool eop = (pkt != NULL);
uint32_t adv_desc_status_error = 0;
memset(&desc->wb, 0, sizeof(desc->wb));
desc->wb.upper.length = cpu_to_le16(length);
igb_build_rx_metadata_common(core, pkt, eop,
&desc->wb.upper.status_error,
&desc->wb.upper.vlan);
igb_build_rx_metadata(core, pkt, pkt != NULL,
rss_info, etqf, ts,
&desc->wb.lower.lo_dword.pkt_info,
&desc->wb.lower.lo_dword.hdr_info,
&desc->wb.lower.hi_dword.rss,
&desc->wb.upper.status_error,
&desc->wb.lower.hi_dword.csum_ip.ip_id,
&desc->wb.upper.vlan);
if (!eop) {
return;
}
net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
if ((core->mac[RXCSUM] & E1000_RXCSUM_PCSD) != 0) {
if (rss_info->enabled) {
desc->wb.lower.hi_dword.rss = cpu_to_le32(rss_info->hash);
rss_type = rss_info->type;
trace_igb_rx_metadata_rss(desc->wb.lower.hi_dword.rss, rss_type);
}
} else if (hasip4) {
adv_desc_status_error |= E1000_RXD_STAT_IPIDV;
desc->wb.lower.hi_dword.csum_ip.ip_id =
cpu_to_le16(net_rx_pkt_get_ip_id(pkt));
trace_e1000e_rx_metadata_ip_id(
desc->wb.lower.hi_dword.csum_ip.ip_id);
}
if (ts) {
adv_desc_status_error |= BIT(16);
}
pkt_type = igb_rx_desc_get_packet_type(core, pkt, etqf);
trace_e1000e_rx_metadata_pkt_type(pkt_type);
desc->wb.lower.lo_dword.pkt_info = cpu_to_le16(rss_type | (pkt_type << 4));
desc->wb.upper.status_error |= cpu_to_le32(adv_desc_status_error);
}
static inline void
igb_write_rx_descr(IGBCore *core, union e1000_rx_desc_union *desc,
struct NetRxPkt *pkt, const E1000E_RSSInfo *rss_info,
uint16_t etqf, bool ts, uint16_t length)
igb_write_adv_ps_rx_descr(IGBCore *core,
union e1000_adv_rx_desc *desc,
struct NetRxPkt *pkt,
const E1000E_RSSInfo *rss_info,
const E1000ERingInfo *r,
uint16_t etqf,
bool ts,
IGBPacketRxDMAState *pdma_st)
{
size_t pkt_len;
uint16_t hdr_info = 0;
if (pdma_st->do_ps) {
pkt_len = pdma_st->bastate.written[1];
} else {
pkt_len = pdma_st->bastate.written[0] + pdma_st->bastate.written[1];
}
igb_write_adv_rx_descr(core, desc, pkt, rss_info, etqf, ts, pkt_len);
hdr_info = (pdma_st->ps_desc_data.hdr_len << E1000_ADVRXD_HDR_LEN_OFFSET) &
E1000_ADVRXD_ADV_HDR_LEN_MASK;
hdr_info |= pdma_st->ps_desc_data.sph ? E1000_ADVRXD_HDR_SPH : 0;
desc->wb.lower.lo_dword.hdr_info = cpu_to_le16(hdr_info);
desc->wb.upper.status_error |= cpu_to_le32(
pdma_st->ps_desc_data.hbo ? E1000_ADVRXD_ST_ERR_HBO_OFFSET : 0);
}
static inline void
igb_write_rx_descr(IGBCore *core,
union e1000_rx_desc_union *desc,
struct NetRxPkt *pkt,
const E1000E_RSSInfo *rss_info,
uint16_t etqf,
bool ts,
IGBPacketRxDMAState *pdma_st,
const E1000ERingInfo *r)
{
if (igb_rx_use_legacy_descriptor(core)) {
igb_write_lgcy_rx_descr(core, &desc->legacy, pkt, rss_info,
etqf, ts, length);
pdma_st->bastate.written[1]);
} else if (igb_rx_use_ps_descriptor(core, r)) {
igb_write_adv_ps_rx_descr(core, &desc->adv, pkt, rss_info, r, etqf, ts,
pdma_st);
} else {
igb_write_adv_rx_descr(core, &desc->adv, pkt, rss_info,
etqf, ts, length);
etqf, ts, pdma_st->bastate.written[1]);
}
}
@ -1514,20 +1641,7 @@ igb_pci_dma_write_rx_desc(IGBCore *core, PCIDevice *dev, dma_addr_t addr,
}
static void
igb_write_to_rx_buffers(IGBCore *core,
PCIDevice *d,
hwaddr ba,
uint16_t *written,
const char *data,
dma_addr_t data_len)
{
trace_igb_rx_desc_buff_write(ba, *written, data, data_len);
pci_dma_write(d, ba + *written, data, data_len);
*written += data_len;
}
static void
igb_update_rx_stats(IGBCore *core, const E1000E_RingInfo *rxi,
igb_update_rx_stats(IGBCore *core, const E1000ERingInfo *rxi,
size_t pkt_size, size_t pkt_fcs_size)
{
eth_pkt_types_e pkt_type = net_rx_pkt_get_packet_type(core->rx_pkt);
@ -1545,12 +1659,256 @@ igb_update_rx_stats(IGBCore *core, const E1000E_RingInfo *rxi,
}
static inline bool
igb_rx_descr_threshold_hit(IGBCore *core, const E1000E_RingInfo *rxi)
igb_rx_descr_threshold_hit(IGBCore *core, const E1000ERingInfo *rxi)
{
return igb_ring_free_descr_num(core, rxi) ==
((core->mac[E1000_SRRCTL(rxi->idx) >> 2] >> 20) & 31) * 16;
}
/*
 * Decide whether header split (packet split) applies to this packet on this
 * RX queue, and if so fill in pdma_st->ps_hdr_len and ps_desc_data.
 *
 * Returns true when the packet will be written split (header into buffer 0,
 * payload into buffer 1), false to fall back to single-buffer placement.
 * In "split always" mode a packet that cannot be split by protocol is still
 * split mechanically at the header buffer size.
 */
static bool
igb_do_ps(IGBCore *core,
const E1000ERingInfo *r,
struct NetRxPkt *pkt,
IGBPacketRxDMAState *pdma_st)
{
bool hasip4, hasip6;
EthL4HdrProto l4hdr_proto;
bool fragment;
bool split_always;
size_t bheader_size;
size_t total_pkt_len;
if (!igb_rx_use_ps_descriptor(core, r)) {
return false;
}
total_pkt_len = net_rx_pkt_get_total_len(pkt);
bheader_size = igb_rxhdrbufsize(core, r);
split_always = igb_rx_ps_descriptor_split_always(core, r);
/* a packet that fits entirely in the header buffer goes there whole */
if (split_always && total_pkt_len <= bheader_size) {
pdma_st->ps_hdr_len = total_pkt_len;
pdma_st->ps_desc_data.hdr_len = total_pkt_len;
return true;
}
net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
if (hasip4) {
fragment = net_rx_pkt_get_ip4_info(pkt)->fragment;
} else if (hasip6) {
fragment = net_rx_pkt_get_ip6_info(pkt)->fragment;
} else {
/* non-IP: no protocol header to split on */
pdma_st->ps_desc_data.hdr_len = bheader_size;
goto header_not_handled;
}
/* IP fragments are not split when RFCTL.IPFRSP_DIS is set */
if (fragment && (core->mac[RFCTL] & E1000_RFCTL_IPFRSP_DIS)) {
pdma_st->ps_desc_data.hdr_len = bheader_size;
goto header_not_handled;
}
/* no header splitting for SCTP */
if (!fragment && (l4hdr_proto == ETH_L4_HDR_PROTO_UDP ||
l4hdr_proto == ETH_L4_HDR_PROTO_TCP)) {
/* split after the L4 header for unfragmented TCP/UDP */
pdma_st->ps_hdr_len = net_rx_pkt_get_l5_hdr_offset(pkt);
} else {
/* otherwise split after the L3 header */
pdma_st->ps_hdr_len = net_rx_pkt_get_l4_hdr_offset(pkt);
}
pdma_st->ps_desc_data.sph = true;
pdma_st->ps_desc_data.hdr_len = pdma_st->ps_hdr_len;
/* header larger than the header buffer: report HBO, don't split */
if (pdma_st->ps_hdr_len > bheader_size) {
pdma_st->ps_desc_data.hbo = true;
goto header_not_handled;
}
return true;
header_not_handled:
if (split_always) {
/* forced split: cut mechanically at the header buffer size */
pdma_st->ps_hdr_len = bheader_size;
return true;
}
return false;
}
/*
 * Clamp *size to what the current descriptor can hold: the packet buffer,
 * plus the split-off header length on the first descriptor in PS mode
 * (the header goes into the separate header buffer).
 */
static void
igb_truncate_to_descriptor_size(IGBPacketRxDMAState *pdma_st, size_t *size)
{
if (pdma_st->do_ps && pdma_st->is_first) {
if (*size > pdma_st->rx_desc_packet_buf_size + pdma_st->ps_hdr_len) {
*size = pdma_st->rx_desc_packet_buf_size + pdma_st->ps_hdr_len;
}
} else {
if (*size > pdma_st->rx_desc_packet_buf_size) {
*size = pdma_st->rx_desc_packet_buf_size;
}
}
}
/*
 * DMA one fragment of the split-off header into the descriptor's header
 * buffer (ba[0]), advancing the written[0] count.  Afterwards cur_idx is
 * set to 1 so subsequent payload writes target the packet buffer.
 * The caller must keep data_len within the header buffer (asserted).
 */
static inline void
igb_write_hdr_frag_to_rx_buffers(IGBCore *core,
PCIDevice *d,
IGBPacketRxDMAState *pdma_st,
const char *data,
dma_addr_t data_len)
{
assert(data_len <= pdma_st->rx_desc_header_buf_size -
pdma_st->bastate.written[0]);
pci_dma_write(d,
pdma_st->ba[0] + pdma_st->bastate.written[0],
data, data_len);
pdma_st->bastate.written[0] += data_len;
pdma_st->bastate.cur_idx = 1;
}
/*
 * Copy the packet's split-off header (ps_hdr_len bytes) into the first
 * descriptor's header buffer, consuming from *copy_size and advancing the
 * source iovec cursor.  For every descriptor after the first, buffer 0 is
 * left empty and the write position moves straight to the packet buffer.
 *
 * NOTE(review): the fragment write passes iov->iov_base without adding
 * iov_ofs — this assumes the header starts at offset 0 of the first
 * fragment; confirm (the pre-existing e1000e code has the same shape).
 */
static void
igb_write_header_to_rx_buffers(IGBCore *core,
struct NetRxPkt *pkt,
PCIDevice *d,
IGBPacketRxDMAState *pdma_st,
size_t *copy_size)
{
size_t iov_copy;
size_t ps_hdr_copied = 0;
if (!pdma_st->is_first) {
/* Leave buffer 0 of each descriptor except first */
/* empty */
pdma_st->bastate.cur_idx = 1;
return;
}
do {
iov_copy = MIN(pdma_st->ps_hdr_len - ps_hdr_copied,
pdma_st->iov->iov_len - pdma_st->iov_ofs);
igb_write_hdr_frag_to_rx_buffers(core, d, pdma_st,
pdma_st->iov->iov_base,
iov_copy);
*copy_size -= iov_copy;
ps_hdr_copied += iov_copy;
pdma_st->iov_ofs += iov_copy;
if (pdma_st->iov_ofs == pdma_st->iov->iov_len) {
/* current fragment exhausted; move to the next one */
pdma_st->iov++;
pdma_st->iov_ofs = 0;
}
} while (ps_hdr_copied < pdma_st->ps_hdr_len);
pdma_st->is_first = false;
}
/*
 * DMA a chunk of payload into the current descriptor's packet buffer(s),
 * filling the buffer at bastate.cur_idx and moving to the next buffer
 * index whenever one fills up completely.
 */
static void
igb_write_payload_frag_to_rx_buffers(IGBCore *core,
PCIDevice *d,
IGBPacketRxDMAState *pdma_st,
const char *data,
dma_addr_t data_len)
{
while (data_len > 0) {
assert(pdma_st->bastate.cur_idx < IGB_MAX_PS_BUFFERS);
uint32_t cur_buf_bytes_left =
pdma_st->rx_desc_packet_buf_size -
pdma_st->bastate.written[pdma_st->bastate.cur_idx];
uint32_t bytes_to_write = MIN(data_len, cur_buf_bytes_left);
trace_igb_rx_desc_buff_write(
pdma_st->bastate.cur_idx,
pdma_st->ba[pdma_st->bastate.cur_idx],
pdma_st->bastate.written[pdma_st->bastate.cur_idx],
data,
bytes_to_write);
pci_dma_write(d,
pdma_st->ba[pdma_st->bastate.cur_idx] +
pdma_st->bastate.written[pdma_st->bastate.cur_idx],
data, bytes_to_write);
pdma_st->bastate.written[pdma_st->bastate.cur_idx] += bytes_to_write;
data += bytes_to_write;
data_len -= bytes_to_write;
/* current buffer full: continue in the next one */
if (pdma_st->bastate.written[pdma_st->bastate.cur_idx] ==
pdma_st->rx_desc_packet_buf_size) {
pdma_st->bastate.cur_idx++;
}
}
}
/*
 * Copy *copy_size bytes of packet payload into the current descriptor's
 * buffers, walking the source iovec, then append a simulated FCS (zero
 * padding of e1000x_fcs_len() bytes) when this descriptor is the packet's
 * last one.
 */
static void
igb_write_payload_to_rx_buffers(IGBCore *core,
struct NetRxPkt *pkt,
PCIDevice *d,
IGBPacketRxDMAState *pdma_st,
size_t *copy_size)
{
static const uint32_t fcs_pad; /* zero-initialized FCS stand-in */
size_t iov_copy;
/* Copy packet payload */
while (*copy_size) {
iov_copy = MIN(*copy_size, pdma_st->iov->iov_len - pdma_st->iov_ofs);
igb_write_payload_frag_to_rx_buffers(core, d,
pdma_st,
pdma_st->iov->iov_base +
pdma_st->iov_ofs,
iov_copy);
*copy_size -= iov_copy;
pdma_st->iov_ofs += iov_copy;
if (pdma_st->iov_ofs == pdma_st->iov->iov_len) {
/* current fragment exhausted; move to the next one */
pdma_st->iov++;
pdma_st->iov_ofs = 0;
}
}
if (pdma_st->desc_offset + pdma_st->desc_size >= pdma_st->total_size) {
/* Simulate FCS checksum presence in the last descriptor */
igb_write_payload_frag_to_rx_buffers(core, d,
pdma_st,
(const char *) &fcs_pad,
e1000x_fcs_len(core->mac));
}
}
/*
 * Write (part of) the packet described by pdma_st into the guest buffers
 * of the current RX descriptor: skip null-address descriptors, clamp the
 * copy to the descriptor's capacity, and in packet-split (PS) mode write
 * the header into buffer 0 before the payload.
 */
static void
igb_write_to_rx_buffers(IGBCore *core,
                        struct NetRxPkt *pkt,
                        PCIDevice *d,
                        IGBPacketRxDMAState *pdma_st)
{
    size_t copy_size;

    if (!(pdma_st->ba)[1] || (pdma_st->do_ps && !(pdma_st->ba[0]))) {
        /* as per intel docs; skip descriptors with null buf addr */
        trace_e1000e_rx_null_descriptor();
        return;
    }

    /* Nothing of the packet is left for this descriptor. */
    if (pdma_st->desc_offset >= pdma_st->size) {
        return;
    }

    /* Clamp both the descriptor-consumed size and the copy size. */
    pdma_st->desc_size = pdma_st->total_size - pdma_st->desc_offset;
    igb_truncate_to_descriptor_size(pdma_st, &pdma_st->desc_size);
    copy_size = pdma_st->size - pdma_st->desc_offset;
    igb_truncate_to_descriptor_size(pdma_st, &copy_size);

    /* For PS mode copy the packet header first */
    if (pdma_st->do_ps) {
        igb_write_header_to_rx_buffers(core, pkt, d, pdma_st, &copy_size);
    } else {
        /* No header split: payload goes straight into buffer 1. */
        pdma_st->bastate.cur_idx = 1;
    }

    igb_write_payload_to_rx_buffers(core, pkt, d, pdma_st, &copy_size);
}
static void
igb_write_packet_to_guest(IGBCore *core, struct NetRxPkt *pkt,
const E1000E_RxRing *rxr,
@ -1560,95 +1918,61 @@ igb_write_packet_to_guest(IGBCore *core, struct NetRxPkt *pkt,
PCIDevice *d;
dma_addr_t base;
union e1000_rx_desc_union desc;
size_t desc_size;
size_t desc_offset = 0;
size_t iov_ofs = 0;
const E1000ERingInfo *rxi;
size_t rx_desc_len;
struct iovec *iov = net_rx_pkt_get_iovec(pkt);
size_t size = net_rx_pkt_get_total_len(pkt);
size_t total_size = size + e1000x_fcs_len(core->mac);
const E1000E_RingInfo *rxi = rxr->i;
size_t bufsize = igb_rxbufsize(core, rxi);
IGBPacketRxDMAState pdma_st = {0};
pdma_st.is_first = true;
pdma_st.size = net_rx_pkt_get_total_len(pkt);
pdma_st.total_size = pdma_st.size + e1000x_fcs_len(core->mac);
rxi = rxr->i;
rx_desc_len = core->rx_desc_len;
pdma_st.rx_desc_packet_buf_size = igb_rxbufsize(core, rxi);
pdma_st.rx_desc_header_buf_size = igb_rxhdrbufsize(core, rxi);
pdma_st.iov = net_rx_pkt_get_iovec(pkt);
d = pcie_sriov_get_vf_at_index(core->owner, rxi->idx % 8);
if (!d) {
d = core->owner;
}
pdma_st.do_ps = igb_do_ps(core, rxi, pkt, &pdma_st);
do {
hwaddr ba;
uint16_t written = 0;
memset(&pdma_st.bastate, 0, sizeof(IGBBAState));
bool is_last = false;
desc_size = total_size - desc_offset;
if (desc_size > bufsize) {
desc_size = bufsize;
}
if (igb_ring_empty(core, rxi)) {
return;
}
base = igb_ring_head_descr(core, rxi);
pci_dma_read(d, base, &desc, rx_desc_len);
trace_e1000e_rx_descr(rxi->idx, base, rx_desc_len);
pci_dma_read(d, base, &desc, core->rx_desc_len);
igb_read_rx_descr(core, &desc, &pdma_st, rxi);
trace_e1000e_rx_descr(rxi->idx, base, core->rx_desc_len);
igb_read_rx_descr(core, &desc, &ba);
if (ba) {
if (desc_offset < size) {
static const uint32_t fcs_pad;
size_t iov_copy;
size_t copy_size = size - desc_offset;
if (copy_size > bufsize) {
copy_size = bufsize;
}
/* Copy packet payload */
while (copy_size) {
iov_copy = MIN(copy_size, iov->iov_len - iov_ofs);
igb_write_to_rx_buffers(core, d, ba, &written,
iov->iov_base + iov_ofs, iov_copy);
copy_size -= iov_copy;
iov_ofs += iov_copy;
if (iov_ofs == iov->iov_len) {
iov++;
iov_ofs = 0;
}
}
if (desc_offset + desc_size >= total_size) {
/* Simulate FCS checksum presence in the last descriptor */
igb_write_to_rx_buffers(core, d, ba, &written,
(const char *) &fcs_pad, e1000x_fcs_len(core->mac));
}
}
} else { /* as per intel docs; skip descriptors with null buf addr */
trace_e1000e_rx_null_descriptor();
}
desc_offset += desc_size;
if (desc_offset >= total_size) {
igb_write_to_rx_buffers(core, pkt, d, &pdma_st);
pdma_st.desc_offset += pdma_st.desc_size;
if (pdma_st.desc_offset >= pdma_st.total_size) {
is_last = true;
}
igb_write_rx_descr(core, &desc, is_last ? core->rx_pkt : NULL,
rss_info, etqf, ts, written);
igb_pci_dma_write_rx_desc(core, d, base, &desc, core->rx_desc_len);
igb_write_rx_descr(core, &desc,
is_last ? pkt : NULL,
rss_info,
etqf, ts,
&pdma_st,
rxi);
igb_pci_dma_write_rx_desc(core, d, base, &desc, rx_desc_len);
igb_ring_advance(core, rxi, rx_desc_len / E1000_MIN_RX_DESC_LEN);
} while (pdma_st.desc_offset < pdma_st.total_size);
igb_ring_advance(core, rxi, core->rx_desc_len / E1000_MIN_RX_DESC_LEN);
} while (desc_offset < total_size);
igb_update_rx_stats(core, rxi, size, total_size);
igb_update_rx_stats(core, rxi, pdma_st.size, pdma_st.total_size);
}
static bool
igb_rx_strip_vlan(IGBCore *core, const E1000E_RingInfo *rxi)
igb_rx_strip_vlan(IGBCore *core, const E1000ERingInfo *rxi)
{
if (core->mac[MRQC] & 1) {
uint16_t pool = rxi->idx % IGB_NUM_VM_POOLS;
@ -2753,7 +3077,7 @@ igb_update_rx_offloads(IGBCore *core)
if (core->has_vnet) {
qemu_set_offload(qemu_get_queue(core->owner_nic)->peer,
cso_state, 0, 0, 0, 0);
cso_state, 0, 0, 0, 0, 0, 0);
}
}

View File

@ -452,6 +452,7 @@ union e1000_adv_rx_desc {
#define E1000_SRRCTL_BSIZEHDRSIZE_MASK 0x00000F00
#define E1000_SRRCTL_BSIZEHDRSIZE_SHIFT 2 /* Shift _left_ */
#define E1000_SRRCTL_DESCTYPE_ADV_ONEBUF 0x02000000
#define E1000_SRRCTL_DESCTYPE_HDR_SPLIT 0x04000000
#define E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS 0x0A000000
#define E1000_SRRCTL_DESCTYPE_MASK 0x0E000000
#define E1000_SRRCTL_DROP_EN 0x80000000
@ -692,11 +693,20 @@ union e1000_adv_rx_desc {
#define E1000_STATUS_NUM_VFS_SHIFT 14
#define E1000_ADVRXD_PKT_IP4 BIT(4)
#define E1000_ADVRXD_PKT_IP6 BIT(6)
#define E1000_ADVRXD_PKT_TCP BIT(8)
#define E1000_ADVRXD_PKT_UDP BIT(9)
#define E1000_ADVRXD_PKT_SCTP BIT(10)
#define E1000_ADVRXD_PKT_IP4 BIT(0)
#define E1000_ADVRXD_PKT_IP6 BIT(2)
#define E1000_ADVRXD_PKT_IP6E BIT(3)
#define E1000_ADVRXD_PKT_TCP BIT(4)
#define E1000_ADVRXD_PKT_UDP BIT(5)
#define E1000_ADVRXD_PKT_SCTP BIT(6)
#define IGB_MAX_PS_BUFFERS 2
#define E1000_ADVRXD_HDR_LEN_OFFSET (21 - 16)
#define E1000_ADVRXD_ADV_HDR_LEN_MASK ((BIT(10) - 1) << \
E1000_ADVRXD_HDR_LEN_OFFSET)
#define E1000_ADVRXD_HDR_SPH BIT(15)
#define E1000_ADVRXD_ST_ERR_HBO_OFFSET BIT(3 + 20)
static inline uint8_t igb_ivar_entry_rx(uint8_t i)
{

View File

@ -1043,7 +1043,7 @@ static void of_dpa_flow_ig_tbl(OfDpaFlowContext *fc, uint32_t tbl_id)
static ssize_t of_dpa_ig(World *world, uint32_t pport,
const struct iovec *iov, int iovcnt)
{
struct iovec iov_copy[iovcnt + 2];
g_autofree struct iovec *iov_copy = g_new(struct iovec, iovcnt + 2);
OfDpaFlowContext fc = {
.of_dpa = world_private(world),
.in_pport = pport,

View File

@ -278,9 +278,9 @@ igb_core_mdic_write_unhandled(uint32_t addr) "MDIC WRITE: PHY[%u] UNHANDLED"
igb_link_set_ext_params(bool asd_check, bool speed_select_bypass, bool pfrstd) "Set extended link params: ASD check: %d, Speed select bypass: %d, PF reset done: %d"
igb_rx_desc_buff_size(uint32_t b) "buffer size: %u"
igb_rx_desc_buff_write(uint64_t addr, uint16_t offset, const void* source, uint32_t len) "addr: 0x%"PRIx64", offset: %u, from: %p, length: %u"
igb_rx_desc_buff_write(uint8_t idx, uint64_t addr, uint16_t offset, const void* source, uint32_t len) "buffer %u, addr: 0x%"PRIx64", offset: %u, from: %p, length: %u"
igb_rx_metadata_rss(uint32_t rss) "RSS data: 0x%X"
igb_rx_metadata_rss(uint32_t rss, uint16_t rss_pkt_type) "RSS data: rss: 0x%X, rss_pkt_type: 0x%X"
igb_irq_icr_clear_gpie_nsicr(void) "Clearing ICR on read due to GPIE.NSICR enabled"
igb_irq_set_iam(uint32_t icr) "Update IAM: 0x%x"
@ -295,6 +295,8 @@ igb_irq_eitr_set(uint32_t eitr_num, uint32_t val) "EITR[%u] = 0x%x"
igb_set_pfmailbox(uint32_t vf_num, uint32_t val) "PFMailbox[%d]: 0x%x"
igb_set_vfmailbox(uint32_t vf_num, uint32_t val) "VFMailbox[%d]: 0x%x"
igb_wrn_rx_desc_modes_not_supp(int desc_type) "Not supported descriptor type: %d"
# igbvf.c
igbvf_wrn_io_addr_unknown(uint64_t addr) "IO unknown register 0x%"PRIx64

View File

@ -78,6 +78,9 @@ static const int user_feature_bits[] = {
VIRTIO_F_RING_RESET,
VIRTIO_NET_F_RSS,
VIRTIO_NET_F_HASH_REPORT,
VIRTIO_NET_F_GUEST_USO4,
VIRTIO_NET_F_GUEST_USO6,
VIRTIO_NET_F_HOST_USO,
/* This bit implies RARP isn't sent by QEMU out of band */
VIRTIO_NET_F_GUEST_ANNOUNCE,

View File

@ -659,6 +659,15 @@ static int peer_has_ufo(VirtIONet *n)
return n->has_ufo;
}
/* Whether the peer backend can do USO; requires a vnet header first. */
static int peer_has_uso(VirtIONet *n)
{
    return peer_has_vnet_hdr(n)
           ? qemu_has_uso(qemu_get_queue(n->nic)->peer)
           : 0;
}
static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs,
int version_1, int hash_report)
{
@ -796,6 +805,10 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features,
virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO6);
virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_ECN);
virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO);
virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4);
virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6);
virtio_clear_feature(&features, VIRTIO_NET_F_HASH_REPORT);
}
@ -804,6 +817,12 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features,
virtio_clear_feature(&features, VIRTIO_NET_F_HOST_UFO);
}
if (!peer_has_uso(n)) {
virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO);
virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4);
virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6);
}
if (!get_vhost_net(nc->peer)) {
return features;
}
@ -859,17 +878,21 @@ static void virtio_net_apply_guest_offloads(VirtIONet *n)
!!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO4)),
!!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO6)),
!!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_ECN)),
!!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_UFO)));
!!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_UFO)),
!!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO4)),
!!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO6)));
}
static uint64_t virtio_net_guest_offloads_by_features(uint32_t features)
static uint64_t virtio_net_guest_offloads_by_features(uint64_t features)
{
static const uint64_t guest_offloads_mask =
(1ULL << VIRTIO_NET_F_GUEST_CSUM) |
(1ULL << VIRTIO_NET_F_GUEST_TSO4) |
(1ULL << VIRTIO_NET_F_GUEST_TSO6) |
(1ULL << VIRTIO_NET_F_GUEST_ECN) |
(1ULL << VIRTIO_NET_F_GUEST_UFO);
(1ULL << VIRTIO_NET_F_GUEST_UFO) |
(1ULL << VIRTIO_NET_F_GUEST_USO4) |
(1ULL << VIRTIO_NET_F_GUEST_USO6);
return guest_offloads_mask & features;
}
@ -3922,6 +3945,12 @@ static Property virtio_net_properties[] = {
DEFINE_PROP_INT32("speed", VirtIONet, net_conf.speed, SPEED_UNKNOWN),
DEFINE_PROP_STRING("duplex", VirtIONet, net_conf.duplex_str),
DEFINE_PROP_BOOL("failover", VirtIONet, failover, false),
DEFINE_PROP_BIT64("guest_uso4", VirtIONet, host_features,
VIRTIO_NET_F_GUEST_USO4, true),
DEFINE_PROP_BIT64("guest_uso6", VirtIONet, host_features,
VIRTIO_NET_F_GUEST_USO6, true),
DEFINE_PROP_BIT64("host_uso", VirtIONet, host_features,
VIRTIO_NET_F_HOST_USO, true),
DEFINE_PROP_END_OF_LIST(),
};

View File

@ -1341,6 +1341,8 @@ static void vmxnet3_update_features(VMXNET3State *s)
s->lro_supported,
s->lro_supported,
0,
0,
0,
0);
}
}

View File

@ -54,11 +54,12 @@ typedef void (LinkStatusChanged)(NetClientState *);
typedef void (NetClientDestructor)(NetClientState *);
typedef RxFilterInfo *(QueryRxFilter)(NetClientState *);
typedef bool (HasUfo)(NetClientState *);
typedef bool (HasUso)(NetClientState *);
typedef bool (HasVnetHdr)(NetClientState *);
typedef bool (HasVnetHdrLen)(NetClientState *, int);
typedef bool (GetUsingVnetHdr)(NetClientState *);
typedef void (UsingVnetHdr)(NetClientState *, bool);
typedef void (SetOffload)(NetClientState *, int, int, int, int, int);
typedef void (SetOffload)(NetClientState *, int, int, int, int, int, int, int);
typedef int (GetVnetHdrLen)(NetClientState *);
typedef void (SetVnetHdrLen)(NetClientState *, int);
typedef int (SetVnetLE)(NetClientState *, bool);
@ -84,6 +85,7 @@ typedef struct NetClientInfo {
QueryRxFilter *query_rx_filter;
NetPoll *poll;
HasUfo *has_ufo;
HasUso *has_uso;
HasVnetHdr *has_vnet_hdr;
HasVnetHdrLen *has_vnet_hdr_len;
GetUsingVnetHdr *get_using_vnet_hdr;
@ -187,12 +189,13 @@ void qemu_set_info_str(NetClientState *nc,
const char *fmt, ...) G_GNUC_PRINTF(2, 3);
void qemu_format_nic_info_str(NetClientState *nc, uint8_t macaddr[6]);
bool qemu_has_ufo(NetClientState *nc);
bool qemu_has_uso(NetClientState *nc);
bool qemu_has_vnet_hdr(NetClientState *nc);
bool qemu_has_vnet_hdr_len(NetClientState *nc, int len);
bool qemu_get_using_vnet_hdr(NetClientState *nc);
void qemu_using_vnet_hdr(NetClientState *nc, bool enable);
void qemu_set_offload(NetClientState *nc, int csum, int tso4, int tso6,
int ecn, int ufo);
int ecn, int ufo, int uso4, int uso6);
int qemu_get_vnet_hdr_len(NetClientState *nc);
void qemu_set_vnet_hdr_len(NetClientState *nc, int len);
int qemu_set_vnet_le(NetClientState *nc, bool is_le);

View File

@ -1873,6 +1873,13 @@ if libbpf.found() and not cc.links('''
endif
endif
# libxdp
libxdp = not_found
if not get_option('af_xdp').auto() or have_system
libxdp = dependency('libxdp', required: get_option('af_xdp'),
version: '>=1.4.0', method: 'pkg-config')
endif
# libdw
libdw = not_found
if not get_option('libdw').auto() or \
@ -2099,6 +2106,7 @@ config_host_data.set('CONFIG_HEXAGON_IDEF_PARSER', get_option('hexagon_idef_pars
config_host_data.set('CONFIG_LIBATTR', have_old_libattr)
config_host_data.set('CONFIG_LIBCAP_NG', libcap_ng.found())
config_host_data.set('CONFIG_EBPF', libbpf.found())
config_host_data.set('CONFIG_AF_XDP', libxdp.found())
config_host_data.set('CONFIG_LIBDAXCTL', libdaxctl.found())
config_host_data.set('CONFIG_LIBISCSI', libiscsi.found())
config_host_data.set('CONFIG_LIBNFS', libnfs.found())
@ -4270,6 +4278,7 @@ summary_info = {}
if targetos == 'darwin'
summary_info += {'vmnet.framework support': vmnet}
endif
summary_info += {'AF_XDP support': libxdp}
summary_info += {'slirp support': slirp}
summary_info += {'vde support': vde}
summary_info += {'netmap support': have_netmap}

View File

@ -122,6 +122,8 @@ option('avx512bw', type: 'feature', value: 'auto',
option('keyring', type: 'feature', value: 'auto',
description: 'Linux keyring support')
option('af_xdp', type : 'feature', value : 'auto',
description: 'AF_XDP network backend support')
option('attr', type : 'feature', value : 'auto',
description: 'attr/xattr support')
option('auth_pam', type : 'feature', value : 'auto',

526
net/af-xdp.c Normal file
View File

@ -0,0 +1,526 @@
/*
* AF_XDP network backend.
*
* Copyright (c) 2023 Red Hat, Inc.
*
* Authors:
* Ilya Maximets <i.maximets@ovn.org>
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*/
#include "qemu/osdep.h"
#include <bpf/bpf.h>
#include <inttypes.h>
#include <linux/if_link.h>
#include <linux/if_xdp.h>
#include <net/if.h>
#include <xdp/xsk.h>
#include "clients.h"
#include "monitor/monitor.h"
#include "net/net.h"
#include "qapi/error.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"
#include "qemu/iov.h"
#include "qemu/main-loop.h"
#include "qemu/memalign.h"
typedef struct AFXDPState {
NetClientState nc;
struct xsk_socket *xsk;
struct xsk_ring_cons rx;
struct xsk_ring_prod tx;
struct xsk_ring_cons cq;
struct xsk_ring_prod fq;
char ifname[IFNAMSIZ];
int ifindex;
bool read_poll;
bool write_poll;
uint32_t outstanding_tx;
uint64_t *pool;
uint32_t n_pool;
char *buffer;
struct xsk_umem *umem;
uint32_t n_queues;
uint32_t xdp_flags;
bool inhibit;
} AFXDPState;
#define AF_XDP_BATCH_SIZE 64
static void af_xdp_send(void *opaque);
static void af_xdp_writable(void *opaque);
/*
 * (Re)install the event-loop callbacks for this queue's XSK socket fd,
 * according to the current read_poll/write_poll flags.
 */
static void af_xdp_update_fd_handler(AFXDPState *s)
{
    void (*on_read)(void *) = s->read_poll ? af_xdp_send : NULL;
    void (*on_write)(void *) = s->write_poll ? af_xdp_writable : NULL;

    qemu_set_fd_handler(xsk_socket__fd(s->xsk), on_read, on_write, s);
}
/* Enable or disable the read handler; no-op if already in that state. */
static void af_xdp_read_poll(AFXDPState *s, bool enable)
{
    if (s->read_poll == enable) {
        return;
    }
    s->read_poll = enable;
    af_xdp_update_fd_handler(s);
}
/* Enable or disable the write handler; no-op if already in that state. */
static void af_xdp_write_poll(AFXDPState *s, bool enable)
{
    if (s->write_poll == enable) {
        return;
    }
    s->write_poll = enable;
    af_xdp_update_fd_handler(s);
}
/* NetClientInfo.poll: switch both rx and tx polling on or off together. */
static void af_xdp_poll(NetClientState *nc, bool enable)
{
    AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);

    if (s->read_poll == enable && s->write_poll == enable) {
        return;
    }
    s->write_poll = enable;
    s->read_poll = enable;
    af_xdp_update_fd_handler(s);
}
/*
 * Reclaim transmitted frames from the completion queue, returning their
 * umem addresses to the free pool.
 */
static void af_xdp_complete_tx(AFXDPState *s)
{
    uint32_t idx = 0;
    uint32_t n_done;

    n_done = xsk_ring_cons__peek(&s->cq, XSK_RING_CONS__DEFAULT_NUM_DESCS,
                                 &idx);
    if (!n_done) {
        return;
    }

    for (uint32_t i = 0; i < n_done; i++) {
        const uint64_t *addr =
            (const uint64_t *) xsk_ring_cons__comp_addr(&s->cq, idx++);

        s->pool[s->n_pool++] = *addr;
        s->outstanding_tx--;
    }

    xsk_ring_cons__release(&s->cq, n_done);
}
/*
 * The fd_write() callback, invoked if the fd is marked as writable
 * after a poll.
 */
static void af_xdp_writable(void *opaque)
{
    AFXDPState *s = opaque;
    bool keep_waiting;

    /* Reclaim tx buffers the kernel has finished with. */
    af_xdp_complete_tx(s);

    /*
     * Keep the write handler registered only while packets are still in
     * flight and the kernel asks for a wakeup; otherwise remove it.
     */
    keep_waiting = s->outstanding_tx && xsk_ring_prod__needs_wakeup(&s->tx);
    if (!keep_waiting) {
        af_xdp_write_poll(s, false);
    }

    /* Flush any buffered packets. */
    qemu_flush_queued_packets(&s->nc);
}
/*
 * Backend receive callback (peer --> network): transmit one packet through
 * the AF_XDP tx ring.
 *
 * Returns 'size' when the packet was queued for transmit (or silently
 * dropped because it exceeds the umem frame size), and 0 to make the
 * caller queue the packet until the socket becomes writable again.
 */
static ssize_t af_xdp_receive(NetClientState *nc,
                              const uint8_t *buf, size_t size)
{
    AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);
    struct xdp_desc *desc;
    uint32_t idx;
    void *data;

    /* Try to recover buffers that are already sent. */
    af_xdp_complete_tx(s);

    if (size > XSK_UMEM__DEFAULT_FRAME_SIZE) {
        /* We can't transmit packet this size... */
        return size;
    }

    if (!s->n_pool || !xsk_ring_prod__reserve(&s->tx, 1, &idx)) {
        /*
         * Out of buffers or space in tx ring. Poll until we can write.
         * This will also kick the Tx, if it was waiting on CQ.
         */
        af_xdp_write_poll(s, true);
        return 0;
    }

    /* Take a free umem frame from the pool and copy the packet into it. */
    desc = xsk_ring_prod__tx_desc(&s->tx, idx);
    desc->addr = s->pool[--s->n_pool];
    desc->len = size;
    data = xsk_umem__get_data(s->buffer, desc->addr);
    memcpy(data, buf, size);

    xsk_ring_prod__submit(&s->tx, 1);
    s->outstanding_tx++;

    /* The kernel may need a syscall kick to start transmitting. */
    if (xsk_ring_prod__needs_wakeup(&s->tx)) {
        af_xdp_write_poll(s, true);
    }

    return size;
}
/*
 * Completion callback for qemu_sendv_packet_async(): the peer has consumed
 * a previously queued packet (backend --> peer), so it is safe to resume
 * reading from the rx ring.
 */
static void af_xdp_send_completed(NetClientState *nc, ssize_t len)
{
    AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);

    af_xdp_read_poll(s, true);
}
/*
 * Post up to 'n' free umem frames from the pool onto the fill queue so
 * the kernel has buffers to receive into.
 */
static void af_xdp_fq_refill(AFXDPState *s, uint32_t n)
{
    uint32_t i, idx = 0;

    /* Leave one packet for Tx, just in case. */
    if (s->n_pool < n + 1) {
        n = s->n_pool;
    }

    /* Nothing to post, or no room in the fill ring right now. */
    if (!n || !xsk_ring_prod__reserve(&s->fq, n, &idx)) {
        return;
    }

    for (i = 0; i < n; i++) {
        *xsk_ring_prod__fill_addr(&s->fq, idx++) = s->pool[--s->n_pool];
    }
    xsk_ring_prod__submit(&s->fq, n);

    if (xsk_ring_prod__needs_wakeup(&s->fq)) {
        /* Receive was blocked by not having enough buffers. Wake it up. */
        af_xdp_read_poll(s, true);
    }
}
/*
 * The fd_read() callback: drain up to AF_XDP_BATCH_SIZE packets from the
 * rx ring and hand them to the peer.  Stops early (and pauses reading)
 * if the peer queues a packet instead of consuming it immediately.
 */
static void af_xdp_send(void *opaque)
{
    uint32_t i, n_rx, idx = 0;
    AFXDPState *s = opaque;

    n_rx = xsk_ring_cons__peek(&s->rx, AF_XDP_BATCH_SIZE, &idx);
    if (!n_rx) {
        return;
    }

    for (i = 0; i < n_rx; i++) {
        const struct xdp_desc *desc;
        struct iovec iov;

        desc = xsk_ring_cons__rx_desc(&s->rx, idx++);

        iov.iov_base = xsk_umem__get_data(s->buffer, desc->addr);
        iov.iov_len = desc->len;

        /* Return the frame address to the free pool. */
        s->pool[s->n_pool++] = desc->addr;

        if (!qemu_sendv_packet_async(&s->nc, &iov, 1,
                                     af_xdp_send_completed)) {
            /*
             * The peer does not receive anymore. Packet is queued, stop
             * reading from the backend until af_xdp_send_completed().
             */
            af_xdp_read_poll(s, false);

            /* Return unused descriptors to not break the ring cache. */
            xsk_ring_cons__cancel(&s->rx, n_rx - i - 1);
            n_rx = i + 1;
            break;
        }
    }

    /* Release actually sent descriptors and try to re-fill. */
    xsk_ring_cons__release(&s->rx, n_rx);
    af_xdp_fq_refill(s, AF_XDP_BATCH_SIZE);
}
/*
 * NetClientInfo.cleanup: flush queued packets, stop polling, release the
 * socket, pool, umem and buffer (pointers are cleared after freeing),
 * and detach the XDP program when this is the last queue to close.
 */
static void af_xdp_cleanup(NetClientState *nc)
{
    AFXDPState *s = DO_UPCAST(AFXDPState, nc, nc);

    qemu_purge_queued_packets(nc);

    af_xdp_poll(nc, false);

    xsk_socket__delete(s->xsk);
    s->xsk = NULL;
    g_free(s->pool);
    s->pool = NULL;
    xsk_umem__delete(s->umem);
    s->umem = NULL;
    qemu_vfree(s->buffer);
    s->buffer = NULL;

    /* Remove the program if it's the last open queue. */
    if (!s->inhibit && nc->queue_index == s->n_queues - 1 && s->xdp_flags
        && bpf_xdp_detach(s->ifindex, s->xdp_flags, NULL) != 0) {
        fprintf(stderr,
                "af-xdp: unable to remove XDP program from '%s', ifindex: %d\n",
                s->ifname, s->ifindex);
    }
}
/*
 * Allocate the shared umem area for this queue, register it with the
 * kernel (optionally sharing an existing socket fd), and initialise the
 * LIFO pool of free frame addresses.
 *
 * Returns 0 on success, -1 on error (errp set).
 */
static int af_xdp_umem_create(AFXDPState *s, int sock_fd, Error **errp)
{
    struct xsk_umem_config config = {
        .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
        .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
        .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
        .frame_headroom = 0,
    };
    uint64_t n_descs;
    uint64_t size;
    int64_t i;
    int ret;

    /* Number of descriptors if all 4 queues (rx, tx, cq, fq) are full. */
    n_descs = (XSK_RING_PROD__DEFAULT_NUM_DESCS
               + XSK_RING_CONS__DEFAULT_NUM_DESCS) * 2;
    size = n_descs * XSK_UMEM__DEFAULT_FRAME_SIZE;

    s->buffer = qemu_memalign(qemu_real_host_page_size(), size);
    memset(s->buffer, 0, size);

    if (sock_fd < 0) {
        ret = xsk_umem__create(&s->umem, s->buffer, size,
                               &s->fq, &s->cq, &config);
    } else {
        ret = xsk_umem__create_with_fd(&s->umem, sock_fd, s->buffer, size,
                                       &s->fq, &s->cq, &config);
    }

    if (ret) {
        qemu_vfree(s->buffer);
        /* Clear the pointer so a later cleanup cannot double-free it. */
        s->buffer = NULL;
        error_setg_errno(errp, errno,
                         "failed to create umem for %s queue_index: %d",
                         s->ifname, s->nc.queue_index);
        return -1;
    }

    s->pool = g_new(uint64_t, n_descs);
    /*
     * Fill the pool in the opposite order, because it's a LIFO queue.
     * Valid indices are 0..n_descs-1; starting at i == n_descs (as the
     * previous code did) wrote one element past the end of 'pool'.
     */
    for (i = n_descs - 1; i >= 0; i--) {
        s->pool[i] = i * XSK_UMEM__DEFAULT_FRAME_SIZE;
    }
    s->n_pool = n_descs;

    af_xdp_fq_refill(s, XSK_RING_PROD__DEFAULT_NUM_DESCS);

    return 0;
}
/*
 * Create the AF_XDP socket for this queue's interface/queue-id pair,
 * honouring the 'inhibit', 'force-copy', 'start-queue' and 'mode'
 * options.  When no mode is requested, native (driver) mode is tried
 * first with a fallback to skb (generic) mode.
 *
 * Returns 0 on success, -1 on error (errp set).
 */
static int af_xdp_socket_create(AFXDPState *s,
                                const NetdevAFXDPOptions *opts, Error **errp)
{
    struct xsk_socket_config cfg = {
        .rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
        .tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
        .libxdp_flags = 0,
        .bind_flags = XDP_USE_NEED_WAKEUP,
        .xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST,
    };
    int queue_id, error = 0;

    s->inhibit = opts->has_inhibit && opts->inhibit;
    if (s->inhibit) {
        /* Don't load an XDP program; presumably one is attached
         * externally -- see the prog_id check in net_init_af_xdp(). */
        cfg.libxdp_flags |= XSK_LIBXDP_FLAGS__INHIBIT_PROG_LOAD;
    }

    if (opts->has_force_copy && opts->force_copy) {
        cfg.bind_flags |= XDP_COPY;
    }

    /* Map this client's queue index onto the device queue range. */
    queue_id = s->nc.queue_index;
    if (opts->has_start_queue && opts->start_queue > 0) {
        queue_id += opts->start_queue;
    }

    if (opts->has_mode) {
        /* Specific mode requested. */
        cfg.xdp_flags |= (opts->mode == AFXDP_MODE_NATIVE)
                         ? XDP_FLAGS_DRV_MODE : XDP_FLAGS_SKB_MODE;
        if (xsk_socket__create(&s->xsk, s->ifname, queue_id,
                               s->umem, &s->rx, &s->tx, &cfg)) {
            error = errno;
        }
    } else {
        /* No mode requested, try native first. */
        cfg.xdp_flags |= XDP_FLAGS_DRV_MODE;

        if (xsk_socket__create(&s->xsk, s->ifname, queue_id,
                               s->umem, &s->rx, &s->tx, &cfg)) {
            /* Can't use native mode, try skb. */
            cfg.xdp_flags &= ~XDP_FLAGS_DRV_MODE;
            cfg.xdp_flags |= XDP_FLAGS_SKB_MODE;

            if (xsk_socket__create(&s->xsk, s->ifname, queue_id,
                                   s->umem, &s->rx, &s->tx, &cfg)) {
                error = errno;
            }
        }
    }

    if (error) {
        error_setg_errno(errp, error,
                         "failed to create AF_XDP socket for %s queue_id: %d",
                         s->ifname, queue_id);
        return -1;
    }

    /* Remember the flags so cleanup can detach the program later. */
    s->xdp_flags = cfg.xdp_flags;

    return 0;
}
/* NetClientInfo methods. */
static NetClientInfo net_af_xdp_info = {
    .type = NET_CLIENT_DRIVER_AF_XDP,
    .size = sizeof(AFXDPState),
    .receive = af_xdp_receive,   /* peer -> network */
    .poll = af_xdp_poll,
    .cleanup = af_xdp_cleanup,
};
/*
 * Parse a colon-separated list of exactly n_expected fd names into a
 * freshly allocated int array.  Returns NULL (errp set) on count mismatch
 * or when any fd fails to resolve; caller owns the returned array.
 */
static int *parse_socket_fds(const char *sock_fds_str,
                             int64_t n_expected, Error **errp)
{
    gchar **tokens = g_strsplit(sock_fds_str, ":", -1);
    int64_t n_tokens = g_strv_length(tokens);
    int *fds = NULL;
    int64_t i;

    if (n_tokens != n_expected) {
        error_setg(errp, "expected %"PRIi64" socket fds, got %"PRIi64,
                   n_expected, n_tokens);
        goto out;
    }

    fds = g_new(int, n_tokens);
    for (i = 0; i < n_tokens; i++) {
        fds[i] = monitor_fd_param(monitor_cur(), tokens[i], errp);
        if (fds[i] < 0) {
            g_free(fds);
            fds = NULL;
            goto out;
        }
    }

out:
    g_strfreev(tokens);
    return fds;
}
/*
 * The exported init function.
 *
 * ... -netdev af-xdp,ifname="..."
 *
 * Creates one net client per queue.  On any failure all already-created
 * clients are torn down via qemu_del_net_client(nc0).
 * Returns 0 on success, -1 on error (errp set).
 */
int net_init_af_xdp(const Netdev *netdev,
                    const char *name, NetClientState *peer, Error **errp)
{
    const NetdevAFXDPOptions *opts = &netdev->u.af_xdp;
    NetClientState *nc, *nc0 = NULL;
    unsigned int ifindex;
    uint32_t prog_id = 0;
    int *sock_fds = NULL;
    int64_t i, queues;
    AFXDPState *s;

    ifindex = if_nametoindex(opts->ifname);
    if (!ifindex) {
        error_setg_errno(errp, errno, "failed to get ifindex for '%s'",
                         opts->ifname);
        return -1;
    }

    queues = opts->has_queues ? opts->queues : 1;
    if (queues < 1) {
        error_setg(errp, "invalid number of queues (%" PRIi64 ") for '%s'",
                   queues, opts->ifname);
        return -1;
    }

    /* inhibit=on and sock-fds must be given together or not at all. */
    if ((opts->has_inhibit && opts->inhibit) != !!opts->sock_fds) {
        error_setg(errp, "'inhibit=on' requires 'sock-fds' and vice versa");
        return -1;
    }

    if (opts->sock_fds) {
        sock_fds = parse_socket_fds(opts->sock_fds, queues, errp);
        if (!sock_fds) {
            return -1;
        }
    }

    for (i = 0; i < queues; i++) {
        nc = qemu_new_net_client(&net_af_xdp_info, peer, "af-xdp", name);
        qemu_set_info_str(nc, "af-xdp%"PRIi64" to %s", i, opts->ifname);
        nc->queue_index = i;

        if (!nc0) {
            nc0 = nc;
        }

        s = DO_UPCAST(AFXDPState, nc, nc);

        pstrcpy(s->ifname, sizeof(s->ifname), opts->ifname);
        s->ifindex = ifindex;
        s->n_queues = queues;

        /*
         * Errors from the helpers land in errp directly; the previous
         * error_propagate() of an always-NULL local Error was a no-op
         * and has been removed.
         */
        if (af_xdp_umem_create(s, sock_fds ? sock_fds[i] : -1, errp)
            || af_xdp_socket_create(s, opts, errp)) {
            /* Make sure the XDP program will be removed. */
            s->n_queues = i;
            goto err;
        }
    }

    if (nc0) {
        s = DO_UPCAST(AFXDPState, nc, nc0);
        if (bpf_xdp_query_id(s->ifindex, s->xdp_flags, &prog_id) || !prog_id) {
            error_setg_errno(errp, errno,
                             "no XDP program loaded on '%s', ifindex: %d",
                             s->ifname, s->ifindex);
            goto err;
        }
    }

    /* The fds are consumed above; previously this array leaked on success. */
    g_free(sock_fds);

    af_xdp_read_poll(s, true); /* Initially only poll for reads. */

    return 0;

err:
    g_free(sock_fds);
    if (nc0) {
        qemu_del_net_client(nc0);
    }

    return -1;
}

View File

@ -64,6 +64,11 @@ int net_init_netmap(const Netdev *netdev, const char *name,
NetClientState *peer, Error **errp);
#endif
#ifdef CONFIG_AF_XDP
int net_init_af_xdp(const Netdev *netdev, const char *name,
NetClientState *peer, Error **errp);
#endif
int net_init_vhost_user(const Netdev *netdev, const char *name,
NetClientState *peer, Error **errp);

View File

@ -68,7 +68,7 @@ static ssize_t dump_receive_iov(DumpState *s, const struct iovec *iov, int cnt,
int64_t ts;
int caplen;
size_t size = iov_size(iov, cnt) - offset;
struct iovec dumpiov[cnt + 1];
g_autofree struct iovec *dumpiov = g_new(struct iovec, cnt + 1);
/* Early return in case of previous error. */
if (s->fd < 0) {

View File

@ -36,6 +36,9 @@ system_ss.add(when: vde, if_true: files('vde.c'))
if have_netmap
system_ss.add(files('netmap.c'))
endif
system_ss.add(when: libxdp, if_true: files('af-xdp.c'))
if have_vhost_net_user
system_ss.add(when: 'CONFIG_VIRTIO_NET', if_true: files('vhost-user.c'), if_false: files('vhost-user-stub.c'))
system_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-user-stub.c'))

View File

@ -495,6 +495,15 @@ bool qemu_has_ufo(NetClientState *nc)
return nc->info->has_ufo(nc);
}
/* True when the client exists, implements has_uso, and reports support. */
bool qemu_has_uso(NetClientState *nc)
{
    return nc && nc->info->has_uso && nc->info->has_uso(nc);
}
bool qemu_has_vnet_hdr(NetClientState *nc)
{
if (!nc || !nc->info->has_vnet_hdr) {
@ -532,13 +541,13 @@ void qemu_using_vnet_hdr(NetClientState *nc, bool enable)
}
void qemu_set_offload(NetClientState *nc, int csum, int tso4, int tso6,
int ecn, int ufo)
int ecn, int ufo, int uso4, int uso6)
{
if (!nc || !nc->info->set_offload) {
return;
}
nc->info->set_offload(nc, csum, tso4, tso6, ecn, ufo);
nc->info->set_offload(nc, csum, tso4, tso6, ecn, ufo, uso4, uso6);
}
int qemu_get_vnet_hdr_len(NetClientState *nc)
@ -1082,6 +1091,9 @@ static int (* const net_client_init_fun[NET_CLIENT_DRIVER__MAX])(
#ifdef CONFIG_NETMAP
[NET_CLIENT_DRIVER_NETMAP] = net_init_netmap,
#endif
#ifdef CONFIG_AF_XDP
[NET_CLIENT_DRIVER_AF_XDP] = net_init_af_xdp,
#endif
#ifdef CONFIG_NET_BRIDGE
[NET_CLIENT_DRIVER_BRIDGE] = net_init_bridge,
#endif
@ -1186,6 +1198,9 @@ void show_netdevs(void)
#ifdef CONFIG_NETMAP
"netmap",
#endif
#ifdef CONFIG_AF_XDP
"af-xdp",
#endif
#ifdef CONFIG_POSIX
"vhost-user",
#endif

View File

@ -371,7 +371,7 @@ static void netmap_set_vnet_hdr_len(NetClientState *nc, int len)
}
static void netmap_set_offload(NetClientState *nc, int csum, int tso4, int tso6,
int ecn, int ufo)
int ecn, int ufo, int uso4, int uso6)
{
NetmapState *s = DO_UPCAST(NetmapState, nc, nc);

View File

@ -212,6 +212,11 @@ int tap_probe_has_ufo(int fd)
return 0;
}
/* USO probing is not implemented for this tap backend; report unsupported. */
int tap_probe_has_uso(int fd)
{
    return 0;
}
int tap_probe_vnet_hdr_len(int fd, int len)
{
return 0;
@ -232,7 +237,7 @@ int tap_fd_set_vnet_be(int fd, int is_be)
}
void tap_fd_set_offload(int fd, int csum, int tso4,
int tso6, int ecn, int ufo)
int tso6, int ecn, int ufo, int uso4, int uso6)
{
}

View File

@ -173,6 +173,18 @@ int tap_probe_has_ufo(int fd)
return 1;
}
/*
 * Probe USO support by attempting to enable checksum + USO4/USO6 offload
 * on the tap fd; returns 1 when the kernel accepts it, 0 otherwise.
 */
int tap_probe_has_uso(int fd)
{
    unsigned offload = TUN_F_CSUM | TUN_F_USO4 | TUN_F_USO6;

    return ioctl(fd, TUNSETOFFLOAD, offload) < 0 ? 0 : 1;
}
/* Verify that we can assign given length */
int tap_probe_vnet_hdr_len(int fd, int len)
{
@ -237,7 +249,7 @@ int tap_fd_set_vnet_be(int fd, int is_be)
}
void tap_fd_set_offload(int fd, int csum, int tso4,
int tso6, int ecn, int ufo)
int tso6, int ecn, int ufo, int uso4, int uso6)
{
unsigned int offload = 0;
@ -256,13 +268,22 @@ void tap_fd_set_offload(int fd, int csum, int tso4,
offload |= TUN_F_TSO_ECN;
if (ufo)
offload |= TUN_F_UFO;
if (uso4) {
offload |= TUN_F_USO4;
}
if (uso6) {
offload |= TUN_F_USO6;
}
}
if (ioctl(fd, TUNSETOFFLOAD, offload) != 0) {
offload &= ~TUN_F_UFO;
offload &= ~(TUN_F_USO4 | TUN_F_USO6);
if (ioctl(fd, TUNSETOFFLOAD, offload) != 0) {
fprintf(stderr, "TUNSETOFFLOAD ioctl() failed: %s\n",
offload &= ~TUN_F_UFO;
if (ioctl(fd, TUNSETOFFLOAD, offload) != 0) {
fprintf(stderr, "TUNSETOFFLOAD ioctl() failed: %s\n",
strerror(errno));
}
}
}
}

View File

@ -50,5 +50,7 @@
#define TUN_F_TSO6 0x04 /* I can handle TSO for IPv6 packets */
#define TUN_F_TSO_ECN 0x08 /* I can handle TSO with ECN bits. */
#define TUN_F_UFO 0x10 /* I can handle UFO packets */
#define TUN_F_USO4 0x20 /* I can handle USO for IPv4 packets */
#define TUN_F_USO6 0x40 /* I can handle USO for IPv6 packets */
#endif /* QEMU_TAP_LINUX_H */

View File

@ -216,6 +216,11 @@ int tap_probe_has_ufo(int fd)
return 0;
}
/* USO probing is not implemented for this tap backend; report unsupported. */
int tap_probe_has_uso(int fd)
{
    return 0;
}
int tap_probe_vnet_hdr_len(int fd, int len)
{
return 0;
@ -236,7 +241,7 @@ int tap_fd_set_vnet_be(int fd, int is_be)
}
void tap_fd_set_offload(int fd, int csum, int tso4,
int tso6, int ecn, int ufo)
int tso6, int ecn, int ufo, int uso4, int uso6)
{
}

View File

@ -47,6 +47,11 @@ int tap_probe_has_ufo(int fd)
return 0;
}
/* USO probing is not implemented for this tap backend; report unsupported. */
int tap_probe_has_uso(int fd)
{
    return 0;
}
int tap_probe_vnet_hdr_len(int fd, int len)
{
return 0;
@ -67,7 +72,7 @@ int tap_fd_set_vnet_be(int fd, int is_be)
}
void tap_fd_set_offload(int fd, int csum, int tso4,
int tso6, int ecn, int ufo)
int tso6, int ecn, int ufo, int uso4, int uso6)
{
}

View File

@ -741,7 +741,7 @@ static void tap_using_vnet_hdr(NetClientState *nc, bool using_vnet_hdr)
}
static void tap_set_offload(NetClientState *nc, int csum, int tso4,
int tso6, int ecn, int ufo)
int tso6, int ecn, int ufo, int uso4, int uso6)
{
}

View File

@ -57,6 +57,7 @@ typedef struct TAPState {
bool write_poll;
bool using_vnet_hdr;
bool has_ufo;
bool has_uso;
bool enabled;
VHostNetState *vhost_net;
unsigned host_vnet_hdr_len;
@ -117,10 +118,11 @@ static ssize_t tap_receive_iov(NetClientState *nc, const struct iovec *iov,
{
TAPState *s = DO_UPCAST(TAPState, nc, nc);
const struct iovec *iovp = iov;
struct iovec iov_copy[iovcnt + 1];
g_autofree struct iovec *iov_copy = NULL;
struct virtio_net_hdr_mrg_rxbuf hdr = { };
if (s->host_vnet_hdr_len && !s->using_vnet_hdr) {
iov_copy = g_new(struct iovec, iovcnt + 1);
iov_copy[0].iov_base = &hdr;
iov_copy[0].iov_len = s->host_vnet_hdr_len;
memcpy(&iov_copy[1], iov, iovcnt * sizeof(*iov));
@ -237,6 +239,15 @@ static bool tap_has_ufo(NetClientState *nc)
return s->has_ufo;
}
/* Report the USO capability probed when the tap fd was initialised. */
static bool tap_has_uso(NetClientState *nc)
{
    assert(nc->info->type == NET_CLIENT_DRIVER_TAP);

    TAPState *tap = DO_UPCAST(TAPState, nc, nc);
    return tap->has_uso;
}
static bool tap_has_vnet_hdr(NetClientState *nc)
{
TAPState *s = DO_UPCAST(TAPState, nc, nc);
@ -307,14 +318,14 @@ static int tap_set_vnet_be(NetClientState *nc, bool is_be)
}
static void tap_set_offload(NetClientState *nc, int csum, int tso4,
int tso6, int ecn, int ufo)
int tso6, int ecn, int ufo, int uso4, int uso6)
{
TAPState *s = DO_UPCAST(TAPState, nc, nc);
if (s->fd < 0) {
return;
}
tap_fd_set_offload(s->fd, csum, tso4, tso6, ecn, ufo);
tap_fd_set_offload(s->fd, csum, tso4, tso6, ecn, ufo, uso4, uso6);
}
static void tap_exit_notify(Notifier *notifier, void *data)
@ -384,6 +395,7 @@ static NetClientInfo net_tap_info = {
.poll = tap_poll,
.cleanup = tap_cleanup,
.has_ufo = tap_has_ufo,
.has_uso = tap_has_uso,
.has_vnet_hdr = tap_has_vnet_hdr,
.has_vnet_hdr_len = tap_has_vnet_hdr_len,
.get_using_vnet_hdr = tap_get_using_vnet_hdr,
@ -413,8 +425,9 @@ static TAPState *net_tap_fd_init(NetClientState *peer,
s->host_vnet_hdr_len = vnet_hdr ? sizeof(struct virtio_net_hdr) : 0;
s->using_vnet_hdr = false;
s->has_ufo = tap_probe_has_ufo(s->fd);
s->has_uso = tap_probe_has_uso(s->fd);
s->enabled = true;
tap_set_offload(&s->nc, 0, 0, 0, 0, 0);
tap_set_offload(&s->nc, 0, 0, 0, 0, 0, 0, 0);
/*
* Make sure host header length is set correctly in tap:
* it might have been modified by another instance of qemu.

View File

@ -37,7 +37,9 @@ void tap_set_sndbuf(int fd, const NetdevTapOptions *tap, Error **errp);
int tap_probe_vnet_hdr(int fd, Error **errp);
int tap_probe_vnet_hdr_len(int fd, int len);
int tap_probe_has_ufo(int fd);
void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo);
int tap_probe_has_uso(int fd);
void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo,
int uso4, int uso6);
void tap_fd_set_vnet_hdr_len(int fd, int len);
int tap_fd_set_vnet_le(int fd, int vnet_is_le);
int tap_fd_set_vnet_be(int fd, int vnet_is_be);

View File

@ -75,11 +75,14 @@ const int vdpa_feature_bits[] = {
VIRTIO_NET_F_GUEST_TSO4,
VIRTIO_NET_F_GUEST_TSO6,
VIRTIO_NET_F_GUEST_UFO,
VIRTIO_NET_F_GUEST_USO4,
VIRTIO_NET_F_GUEST_USO6,
VIRTIO_NET_F_HASH_REPORT,
VIRTIO_NET_F_HOST_ECN,
VIRTIO_NET_F_HOST_TSO4,
VIRTIO_NET_F_HOST_TSO6,
VIRTIO_NET_F_HOST_UFO,
VIRTIO_NET_F_HOST_USO,
VIRTIO_NET_F_MQ,
VIRTIO_NET_F_MRG_RXBUF,
VIRTIO_NET_F_MTU,

View File

@ -408,6 +408,60 @@
'ifname': 'str',
'*devname': 'str' } }
##
# @AFXDPMode:
#
# Attach mode for a default XDP program
#
# @skb: generic mode, no driver support necessary
#
# @native: DRV mode, program is attached to a driver, packets are passed to
# the socket without allocation of skb.
#
# Since: 8.2
##
{ 'enum': 'AFXDPMode',
'data': [ 'native', 'skb' ],
'if': 'CONFIG_AF_XDP' }
##
# @NetdevAFXDPOptions:
#
# AF_XDP network backend
#
# @ifname: The name of an existing network interface.
#
# @mode: Attach mode for a default XDP program. If not specified, then
# 'native' will be tried first, then 'skb'.
#
# @force-copy: Force XDP copy mode even if device supports zero-copy.
# (default: false)
#
# @queues: number of queues to be used for multiqueue interfaces (default: 1).
#
# @start-queue: Use @queues starting from this queue number (default: 0).
#
# @inhibit: Don't load a default XDP program, use one already loaded to
# the interface (default: false). Requires @sock-fds.
#
# @sock-fds: A colon (:) separated list of file descriptors for already open
# but not bound AF_XDP sockets in the queue order. One fd per queue.
# These descriptors should already be added into XDP socket map for
# corresponding queues. Requires @inhibit.
#
# Since: 8.2
##
{ 'struct': 'NetdevAFXDPOptions',
'data': {
'ifname': 'str',
'*mode': 'AFXDPMode',
'*force-copy': 'bool',
'*queues': 'int',
'*start-queue': 'int',
'*inhibit': 'bool',
'*sock-fds': 'str' },
'if': 'CONFIG_AF_XDP' }
##
# @NetdevVhostUserOptions:
#
@ -642,6 +696,7 @@
# @vmnet-bridged: since 7.1
# @stream: since 7.2
# @dgram: since 7.2
# @af-xdp: since 8.2
#
# Since: 2.7
##
@ -649,6 +704,7 @@
'data': [ 'none', 'nic', 'user', 'tap', 'l2tpv3', 'socket', 'stream',
'dgram', 'vde', 'bridge', 'hubport', 'netmap', 'vhost-user',
'vhost-vdpa',
{ 'name': 'af-xdp', 'if': 'CONFIG_AF_XDP' },
{ 'name': 'vmnet-host', 'if': 'CONFIG_VMNET' },
{ 'name': 'vmnet-shared', 'if': 'CONFIG_VMNET' },
{ 'name': 'vmnet-bridged', 'if': 'CONFIG_VMNET' }] }
@ -679,6 +735,8 @@
'bridge': 'NetdevBridgeOptions',
'hubport': 'NetdevHubPortOptions',
'netmap': 'NetdevNetmapOptions',
'af-xdp': { 'type': 'NetdevAFXDPOptions',
'if': 'CONFIG_AF_XDP' },
'vhost-user': 'NetdevVhostUserOptions',
'vhost-vdpa': 'NetdevVhostVDPAOptions',
'vmnet-host': { 'type': 'NetdevVmnetHostOptions',

View File

@ -2882,6 +2882,19 @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev,
" VALE port (created on the fly) called 'name' ('nmname' is name of the \n"
" netmap device, defaults to '/dev/netmap')\n"
#endif
#ifdef CONFIG_AF_XDP
"-netdev af-xdp,id=str,ifname=name[,mode=native|skb][,force-copy=on|off]\n"
" [,queues=n][,start-queue=m][,inhibit=on|off][,sock-fds=x:y:...:z]\n"
" attach to the existing network interface 'name' with AF_XDP socket\n"
" use 'mode=MODE' to specify an XDP program attach mode\n"
" use 'force-copy=on|off' to force XDP copy mode even if device supports zero-copy (default: off)\n"
" use 'inhibit=on|off' to inhibit loading of a default XDP program (default: off)\n"
" with inhibit=on,\n"
" use 'sock-fds' to provide file descriptors for already open AF_XDP sockets\n"
" added to a socket map in XDP program. One socket per queue.\n"
" use 'queues=n' to specify how many queues of a multiqueue interface should be used\n"
" use 'start-queue=m' to specify the first queue that should be used\n"
#endif
#ifdef CONFIG_POSIX
"-netdev vhost-user,id=str,chardev=dev[,vhostforce=on|off]\n"
" configure a vhost-user network, backed by a chardev 'dev'\n"
@ -2927,6 +2940,9 @@ DEF("nic", HAS_ARG, QEMU_OPTION_nic,
#ifdef CONFIG_NETMAP
"netmap|"
#endif
#ifdef CONFIG_AF_XDP
"af-xdp|"
#endif
#ifdef CONFIG_POSIX
"vhost-user|"
#endif
@ -2955,6 +2971,9 @@ DEF("net", HAS_ARG, QEMU_OPTION_net,
#ifdef CONFIG_NETMAP
"netmap|"
#endif
#ifdef CONFIG_AF_XDP
"af-xdp|"
#endif
#ifdef CONFIG_VMNET
"vmnet-host|vmnet-shared|vmnet-bridged|"
#endif
@ -2962,7 +2981,7 @@ DEF("net", HAS_ARG, QEMU_OPTION_net,
" old way to initialize a host network interface\n"
" (use the -netdev option if possible instead)\n", QEMU_ARCH_ALL)
SRST
``-nic [tap|bridge|user|l2tpv3|vde|netmap|vhost-user|socket][,...][,mac=macaddr][,model=mn]``
``-nic [tap|bridge|user|l2tpv3|vde|netmap|af-xdp|vhost-user|socket][,...][,mac=macaddr][,model=mn]``
This option is a shortcut for configuring both the on-board
(default) guest NIC hardware and the host network backend in one go.
The host backend options are the same as with the corresponding
@ -3376,6 +3395,55 @@ SRST
# launch QEMU instance
|qemu_system| linux.img -nic vde,sock=/tmp/myswitch
``-netdev af-xdp,id=str,ifname=name[,mode=native|skb][,force-copy=on|off][,queues=n][,start-queue=m][,inhibit=on|off][,sock-fds=x:y:...:z]``
Configure AF_XDP backend to connect to a network interface 'name'
using AF_XDP socket. A specific program attach mode for a default
XDP program can be forced with 'mode', defaults to best-effort,
where the likely most performant mode will be in use. Number of queues
'n' should generally match the number or queues in the interface,
defaults to 1. Traffic arriving on non-configured device queues will
not be delivered to the network backend.
.. parsed-literal::
# set number of queues to 4
ethtool -L eth0 combined 4
# launch QEMU instance
|qemu_system| linux.img -device virtio-net-pci,netdev=n1 \\
-netdev af-xdp,id=n1,ifname=eth0,queues=4
'start-queue' option can be specified if a particular range of queues
[m, m + n] should be in use. For example, this is may be necessary in
order to use certain NICs in native mode. Kernel allows the driver to
create a separate set of XDP queues on top of regular ones, and only
these queues can be used for AF_XDP sockets. NICs that work this way
may also require an additional traffic redirection with ethtool to these
special queues.
.. parsed-literal::
# set number of queues to 1
ethtool -L eth0 combined 1
# redirect all the traffic to the second queue (id: 1)
# note: drivers may require non-empty key/mask pair.
ethtool -N eth0 flow-type ether \\
dst 00:00:00:00:00:00 m FF:FF:FF:FF:FF:FE action 1
ethtool -N eth0 flow-type ether \\
dst 00:00:00:00:00:01 m FF:FF:FF:FF:FF:FE action 1
# launch QEMU instance
|qemu_system| linux.img -device virtio-net-pci,netdev=n1 \\
-netdev af-xdp,id=n1,ifname=eth0,queues=1,start-queue=1
XDP program can also be loaded externally. In this case 'inhibit' option
should be set to 'on' and 'sock-fds' provided with file descriptors for
already open but not bound XDP sockets already added to a socket map for
corresponding queues. One socket per queue.
.. parsed-literal::
|qemu_system| linux.img -device virtio-net-pci,netdev=n1 \\
-netdev af-xdp,id=n1,ifname=eth0,queues=3,inhibit=on,sock-fds=15:16:17
``-netdev vhost-user,chardev=id[,vhostforce=on|off][,queues=n]``
Establish a vhost-user netdev, backed by a chardev id. The chardev
should be a unix domain socket backed one. The vhost-user uses a

View File

@ -35,6 +35,7 @@
--block-drv-ro-whitelist="vmdk,vhdx,vpc,https,ssh" \
--with-coroutine=ucontext \
--tls-priority=@QEMU,SYSTEM \
--disable-af-xdp \
--disable-attr \
--disable-auth-pam \
--disable-avx2 \

View File

@ -76,6 +76,7 @@ meson_options_help() {
printf "%s\n" 'disabled with --disable-FEATURE, default is enabled if available'
printf "%s\n" '(unless built with --without-default-features):'
printf "%s\n" ''
printf "%s\n" ' af-xdp AF_XDP network backend support'
printf "%s\n" ' alsa ALSA sound support'
printf "%s\n" ' attr attr/xattr support'
printf "%s\n" ' auth-pam PAM access control'
@ -208,6 +209,8 @@ meson_options_help() {
}
_meson_option_parse() {
case $1 in
--enable-af-xdp) printf "%s" -Daf_xdp=enabled ;;
--disable-af-xdp) printf "%s" -Daf_xdp=disabled ;;
--enable-alsa) printf "%s" -Dalsa=enabled ;;
--disable-alsa) printf "%s" -Dalsa=disabled ;;
--enable-attr) printf "%s" -Dattr=enabled ;;

View File

@ -59,6 +59,7 @@ RUN apk update && \
libtasn1-dev \
liburing-dev \
libusb-dev \
libxdp-dev \
linux-pam-dev \
llvm \
lttng-ust-dev \

View File

@ -75,6 +75,7 @@ RUN dnf distro-sync -y && \
libubsan \
liburing-devel \
libusbx-devel \
libxdp-devel \
libzstd-devel \
llvm \
lttng-ust-devel \

View File

@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
g++-x86-64-linux-gnu \
gcc-x86-64-linux-gnu \
libaio-dev:amd64 \
libasan5:amd64 \
libasan6:amd64 \
libasound2-dev:amd64 \
libattr1-dev:amd64 \
libbpf-dev:amd64 \

View File

@ -32,7 +32,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
git \
hostname \
libaio-dev \
libasan5 \
libasan6 \
libasound2-dev \
libattr1-dev \
libbpf-dev \

View File

@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
g++-aarch64-linux-gnu \
gcc-aarch64-linux-gnu \
libaio-dev:arm64 \
libasan5:arm64 \
libasan6:arm64 \
libasound2-dev:arm64 \
libattr1-dev:arm64 \
libbpf-dev:arm64 \

View File

@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
g++-arm-linux-gnueabi \
gcc-arm-linux-gnueabi \
libaio-dev:armel \
libasan5:armel \
libasan6:armel \
libasound2-dev:armel \
libattr1-dev:armel \
libbpf-dev:armel \

View File

@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
g++-arm-linux-gnueabihf \
gcc-arm-linux-gnueabihf \
libaio-dev:armhf \
libasan5:armhf \
libasan6:armhf \
libasound2-dev:armhf \
libattr1-dev:armhf \
libbpf-dev:armhf \

View File

@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
g++-powerpc64le-linux-gnu \
gcc-powerpc64le-linux-gnu \
libaio-dev:ppc64el \
libasan5:ppc64el \
libasan6:ppc64el \
libasound2-dev:ppc64el \
libattr1-dev:ppc64el \
libbpf-dev:ppc64el \

View File

@ -84,7 +84,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
g++-s390x-linux-gnu \
gcc-s390x-linux-gnu \
libaio-dev:s390x \
libasan5:s390x \
libasan6:s390x \
libasound2-dev:s390x \
libattr1-dev:s390x \
libbpf-dev:s390x \

View File

@ -82,6 +82,7 @@ exec "$@"\n' > /usr/bin/nosync && \
libubsan \
liburing-devel \
libusbx-devel \
libxdp-devel \
libzstd-devel \
llvm \
lttng-ust-devel \

View File

@ -40,7 +40,7 @@ RUN zypper update -y && \
libSDL2-devel \
libSDL2_image-devel \
libaio-devel \
libasan6 \
libasan8 \
libattr-devel \
libbpf-devel \
libbz2-devel \

View File

@ -32,7 +32,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
git \
hostname \
libaio-dev \
libasan5 \
libasan6 \
libasound2-dev \
libattr1-dev \
libbrlapi-dev \

View File

@ -32,7 +32,7 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
git \
hostname \
libaio-dev \
libasan5 \
libasan6 \
libasound2-dev \
libattr1-dev \
libbpf-dev \

@ -1 +1 @@
Subproject commit bbd55b4d18cce8f89b5167675e434a6941315634
Subproject commit 5f84a21881577a5fb56cc956f6fe4e2abd6fcff0

View File

@ -69,6 +69,7 @@ packages:
- liburing
- libusbx
- libvdeplug
- libxdp
- libzstd
- llvm
- lttng-ust

View File

@ -109,6 +109,11 @@ static void igb_pci_start_hw(QOSGraphObject *obj)
E1000_RAH_AV | E1000_RAH_POOL_1 |
le16_to_cpu(*(uint16_t *)(address + 4)));
/* Set supported receive descriptor mode */
e1000e_macreg_write(&d->e1000e,
E1000_SRRCTL(0),
E1000_SRRCTL_DESCTYPE_ADV_ONEBUF);
/* Enable receive */
e1000e_macreg_write(&d->e1000e, E1000_RFCTL, E1000_RFCTL_EXTEN);
e1000e_macreg_write(&d->e1000e, E1000_RCTL, E1000_RCTL_EN);