2008-02-03 05:20:18 +03:00
|
|
|
/*
|
|
|
|
* QEMU e1000 emulation
|
|
|
|
*
|
2009-12-23 18:05:21 +03:00
|
|
|
* Software developer's manual:
|
|
|
|
* http://download.intel.com/design/network/manuals/8254x_GBe_SDM.pdf
|
|
|
|
*
|
2008-02-03 05:20:18 +03:00
|
|
|
* Nir Peleg, Tutis Systems Ltd. for Qumranet Inc.
|
|
|
|
* Copyright (c) 2008 Qumranet
|
|
|
|
* Based on work done by:
|
|
|
|
* Copyright (c) 2007 Dan Aloni
|
|
|
|
* Copyright (c) 2004 Antony T Curtis
|
|
|
|
*
|
|
|
|
* This library is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
|
|
* License as published by the Free Software Foundation; either
|
2020-10-23 15:44:24 +03:00
|
|
|
* version 2.1 of the License, or (at your option) any later version.
|
2008-02-03 05:20:18 +03:00
|
|
|
*
|
|
|
|
* This library is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
* Lesser General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU Lesser General Public
|
2009-07-17 00:47:01 +04:00
|
|
|
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
|
2008-02-03 05:20:18 +03:00
|
|
|
*/
|
|
|
|
|
|
|
|
|
2016-01-26 21:17:11 +03:00
|
|
|
#include "qemu/osdep.h"
|
2023-02-23 13:19:48 +03:00
|
|
|
#include "hw/net/mii.h"
|
2022-12-22 13:03:28 +03:00
|
|
|
#include "hw/pci/pci_device.h"
|
2019-08-12 08:23:51 +03:00
|
|
|
#include "hw/qdev-properties.h"
|
2019-08-12 08:23:45 +03:00
|
|
|
#include "migration/vmstate.h"
|
2021-07-23 10:55:10 +03:00
|
|
|
#include "net/eth.h"
|
2012-10-24 10:43:34 +04:00
|
|
|
#include "net/net.h"
|
2009-10-22 20:49:03 +04:00
|
|
|
#include "net/checksum.h"
|
2012-12-17 21:20:04 +04:00
|
|
|
#include "sysemu/sysemu.h"
|
|
|
|
#include "sysemu/dma.h"
|
2013-09-12 12:47:37 +04:00
|
|
|
#include "qemu/iov.h"
|
2019-05-23 17:35:07 +03:00
|
|
|
#include "qemu/module.h"
|
2014-12-01 21:06:52 +03:00
|
|
|
#include "qemu/range.h"
|
2008-02-03 05:20:18 +03:00
|
|
|
|
2023-02-23 13:50:51 +03:00
|
|
|
#include "e1000_common.h"
|
2016-06-01 11:23:44 +03:00
|
|
|
#include "e1000x_common.h"
|
2018-10-16 12:40:45 +03:00
|
|
|
#include "trace.h"
|
2020-09-03 23:43:22 +03:00
|
|
|
#include "qom/object.h"
|
2008-02-03 05:20:18 +03:00
|
|
|
|
2017-03-22 03:05:44 +03:00
|
|
|
/* #define E1000_DEBUG */
|
2008-02-03 05:20:18 +03:00
|
|
|
|
2010-06-14 19:05:17 +04:00
|
|
|
/*
 * Optional debug tracing.  When E1000_DEBUG is defined, DBGOUT() prints to
 * stderr (prefixed with "e1000: ") for every category whose bit is set in
 * debugflags; otherwise DBGOUT() expands to an empty statement and its
 * arguments are not evaluated.
 */
#ifdef E1000_DEBUG
enum {
    DEBUG_GENERAL,      DEBUG_IO,       DEBUG_MMIO,     DEBUG_INTERRUPT,
    DEBUG_RX,           DEBUG_TX,       DEBUG_MDIC,     DEBUG_EEPROM,
    DEBUG_UNKNOWN,      DEBUG_TXSUM,    DEBUG_TXERR,    DEBUG_RXERR,
    DEBUG_RXFILTER,     DEBUG_PHY,      DEBUG_NOTYET,
};
/* Bit mask for one debug category, e.g. DBGBIT(RX). */
#define DBGBIT(x)    (1<<DEBUG_##x)
/* Categories enabled by default: transmit errors and general messages. */
static int debugflags = DBGBIT(TXERR) | DBGBIT(GENERAL);

#define DBGOUT(what, fmt, ...) do { \
    if (debugflags & DBGBIT(what)) \
        fprintf(stderr, "e1000: " fmt, ## __VA_ARGS__); \
    } while (0)
#else
#define DBGOUT(what, fmt, ...) do {} while (0)
#endif
|
|
|
|
|
|
|
|
/* Region sizes, in bytes. */
#define IOPORT_SIZE       0x40
#define PNPMMIO_SIZE      0x20000

/*
 * ETH_HLEN plus 4 extra bytes.
 * NOTE(review): the extra 4 bytes presumably leave room for one VLAN tag;
 * confirm against the users of this macro.
 */
#define MAXIMUM_ETHERNET_HDR_LEN (ETH_HLEN + 4)
|
2013-09-12 12:47:37 +04:00
|
|
|
|
2008-02-03 05:20:18 +03:00
|
|
|
/*
 * HW models:
 *  E1000_DEV_ID_82540EM works with Windows, Linux, and OS X <= 10.8
 *  E1000_DEV_ID_82544GC_COPPER appears to work; not well tested
 *  E1000_DEV_ID_82545EM_COPPER works with Linux and OS X >= 10.6
 *  Others never tested
 */
|
|
|
|
|
2020-09-03 23:43:22 +03:00
|
|
|
struct E1000State_st {
|
2013-06-30 14:55:52 +04:00
|
|
|
/*< private >*/
|
|
|
|
PCIDevice parent_obj;
|
|
|
|
/*< public >*/
|
|
|
|
|
2009-11-25 21:49:12 +03:00
|
|
|
NICState *nic;
|
2009-10-21 17:25:31 +04:00
|
|
|
NICConf conf;
|
2011-08-08 17:09:08 +04:00
|
|
|
MemoryRegion mmio;
|
|
|
|
MemoryRegion io;
|
2008-02-03 05:20:18 +03:00
|
|
|
|
|
|
|
uint32_t mac_reg[0x8000];
|
|
|
|
uint16_t phy_reg[0x20];
|
|
|
|
uint16_t eeprom_data[64];
|
|
|
|
|
|
|
|
uint32_t rxbuf_size;
|
|
|
|
uint32_t rxbuf_min_shift;
|
|
|
|
struct e1000_tx {
|
|
|
|
unsigned char header[256];
|
2008-11-21 19:25:17 +03:00
|
|
|
unsigned char vlan_header[4];
|
2009-11-19 21:44:55 +03:00
|
|
|
/* Fields vlan and data must not be reordered or separated. */
|
2008-11-21 19:25:17 +03:00
|
|
|
unsigned char vlan[4];
|
2008-02-03 05:20:18 +03:00
|
|
|
unsigned char data[0x10000];
|
|
|
|
uint16_t size;
|
2008-11-21 19:25:17 +03:00
|
|
|
unsigned char vlan_needed;
|
2017-11-15 02:23:33 +03:00
|
|
|
unsigned char sum_needed;
|
|
|
|
bool cptse;
|
2016-06-01 11:23:44 +03:00
|
|
|
e1000x_txd_props props;
|
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption
The device is supposed to maintain two distinct contexts for transmit
offloads: one has parameters for both segmentation and checksum
offload, the other only for checksum offload. The guest driver can
send two context descriptors, one for each context (the TSE flag
specifies which). Then the guest can refer to one or the other context
in subsequent transmit data descriptors, depending on what offloads it
wants applied to each packet.
Currently the e1000 device stores just one context, and misinterprets
the TSE flags in the context and data descriptors. This is often okay:
Linux happens to send a fresh context descriptor before every data
descriptor, so forgetting the other context doesn't matter. Windows
does rely on separate contexts for TSO vs. non-TSO packets, but for
mostly-TCP traffic the two contexts have identical TCP-specific
offload parameters so confusing them doesn't matter.
One case where this confusion matters is when a Windows guest sets up
a TSO context for TCP and a non-TSO context for UDP, and then
transmits both TCP and UDP traffic in parallel. The e1000 device
sometimes ends up using TCP-specific parameters while doing checksum
offload on a UDP datagram: it writes the checksum to offset 16 (the
correct location for a TCP checksum), stomping on two bytes of UDP
data, and leaving the wrong value in the actual UDP checksum field at
offset 6. (Even worse, the host network stack may then recompute the
UDP checksum, "correcting" it to match the corrupt data before sending
it out a physical interface.)
Correct this by tracking the TSO context independently of the non-TSO
context, and selecting the appropriate context based on the TSE flag
in each transmit data descriptor.
Signed-off-by: Ed Swierk <eswierk@skyportsystems.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 02:23:34 +03:00
|
|
|
e1000x_txd_props tso_props;
|
2008-02-03 05:20:18 +03:00
|
|
|
uint16_t tso_frames;
|
2021-10-21 19:10:47 +03:00
|
|
|
bool busy;
|
2008-02-03 05:20:18 +03:00
|
|
|
} tx;
|
|
|
|
|
|
|
|
struct {
|
2015-11-11 16:52:39 +03:00
|
|
|
uint32_t val_in; /* shifted in from guest driver */
|
2008-02-03 05:20:18 +03:00
|
|
|
uint16_t bitnum_in;
|
|
|
|
uint16_t bitnum_out;
|
|
|
|
uint16_t reading;
|
|
|
|
uint32_t old_eecd;
|
|
|
|
} eecd_state;
|
2012-03-22 14:02:24 +04:00
|
|
|
|
|
|
|
QEMUTimer *autoneg_timer;
|
2013-02-14 21:11:27 +04:00
|
|
|
|
e1000: add interrupt mitigation support
This patch partially implements the e1000 interrupt mitigation mechanisms.
Using a single QEMUTimer, it emulates the ITR register (which is the newer
mitigation register, recommended by Intel) and approximately emulates
RADV and TADV registers. TIDV and RDTR register functionalities are not
emulated (RDTR is only used to validate RADV, according to the e1000 specs).
RADV, TADV, TIDV and RDTR registers make up the older e1000 mitigation
mechanism and would need a timer each to be completely emulated. However,
a single timer has been used in order to reach a good compromise between
emulation accuracy and simplicity/efficiency.
The implemented mechanism can be enabled/disabled specifying the command
line e1000-specific boolean parameter "mitigation", e.g.
qemu-system-x86_64 -device e1000,mitigation=on,... ...
For more information, see the Software developer's manual at
http://download.intel.com/design/network/manuals/8254x_GBe_SDM.pdf.
Interrupt mitigation boosts performance when the guest suffers from
an high interrupt rate (i.e. receiving short UDP packets at high packet
rate). For some numerical results see the following link
http://info.iet.unipi.it/~luigi/papers/20130520-rizzo-vm.pdf
Signed-off-by: Vincenzo Maffione <v.maffione@gmail.com>
Reviewed-by: Andreas Färber <afaerber@suse.de> (for pc-* machines)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2013-08-02 20:30:52 +04:00
|
|
|
QEMUTimer *mit_timer; /* Mitigation timer. */
|
|
|
|
bool mit_timer_on; /* Mitigation timer is running. */
|
|
|
|
bool mit_irq_level; /* Tracks interrupt pin level. */
|
|
|
|
uint32_t mit_ide; /* Tracks E1000_TXD_CMD_IDE bit. */
|
|
|
|
|
e1000: Delay flush queue when receive RCTL
Due to too early RCT0 interrput, win10x32 may hang on booting.
This problem can be reproduced by doing power cycle on win10x32 guest.
In our environment, we have 10 win10x32 and stress power cycle.
The problem will happen about 20 rounds.
Below shows some log with comment:
The normal case:
22831@1551928392.984687:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
22831@1551928392.985655:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
22831@1551928392.985801:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
22831@1551928393.056710:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: ICR read: 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
22831@1551928393.077548:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: ICR read: 0
e1000: set_ics 2, ICR 0, IMR 0
e1000: set_ics 2, ICR 2, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
22831@1551928393.102974:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
22831@1551928393.103267:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: RCTL: 255, mac_reg[RCTL] = 0x40002 <- win10x32 says it can handle
RX now
e1000: set_ics 0, ICR 2, IMR 9d <- unmask interrupt
e1000: RCTL: 255, mac_reg[RCTL] = 0x48002
e1000: set_ics 80, ICR 2, IMR 9d <- interrupt and work!
...
The bad case:
27744@1551930483.117766:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
27744@1551930483.118398:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
27744@1551930483.198063:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: ICR read: 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
27744@1551930483.218675:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: ICR read: 0
e1000: set_ics 2, ICR 0, IMR 0
e1000: set_ics 2, ICR 2, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
27744@1551930483.241768:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
27744@1551930483.241979:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: RCTL: 255, mac_reg[RCTL] = 0x40002 <- win10x32 says it can handle
RX now
e1000: set_ics 80, ICR 2, IMR 0 <- flush queue (caused by setting RCTL)
e1000: set_ics 0, ICR 82, IMR 9d <- unmask interrupt and because 0x82&0x9d
!= 0 generate interrupt, hang on here...
To workaround this problem, simply delay flush queue. Also stop receiving
when timer is going to run.
Tested on CentOS, Win7SP1x64 and Win10x32.
Signed-off-by: yuchenlin <yuchenlin@synology.com>
Reviewed-by: Dmitry Fleytman <dmitry.fleytman@gmail.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2019-03-13 09:56:49 +03:00
|
|
|
QEMUTimer *flush_queue_timer;
|
|
|
|
|
2013-02-14 21:11:27 +04:00
|
|
|
/* Compatibility flags for migration to/from qemu 1.3.0 and older */
|
2015-11-11 16:52:40 +03:00
|
|
|
#define E1000_FLAG_MAC_BIT 2
|
2018-03-28 19:36:27 +03:00
|
|
|
#define E1000_FLAG_TSO_BIT 3
|
2021-07-23 10:55:10 +03:00
|
|
|
#define E1000_FLAG_VET_BIT 4
|
2015-11-11 16:52:40 +03:00
|
|
|
#define E1000_FLAG_MAC (1 << E1000_FLAG_MAC_BIT)
|
2018-03-28 19:36:27 +03:00
|
|
|
#define E1000_FLAG_TSO (1 << E1000_FLAG_TSO_BIT)
|
2021-07-23 10:55:10 +03:00
|
|
|
#define E1000_FLAG_VET (1 << E1000_FLAG_VET_BIT)
|
|
|
|
|
2013-02-14 21:11:27 +04:00
|
|
|
uint32_t compat_flags;
|
2018-03-28 19:36:26 +03:00
|
|
|
bool received_tx_tso;
|
2018-03-28 19:36:29 +03:00
|
|
|
bool use_tso_for_migration;
|
2018-03-28 19:36:28 +03:00
|
|
|
e1000x_txd_props mig_props;
|
2020-09-03 23:43:22 +03:00
|
|
|
};
|
|
|
|
typedef struct E1000State_st E1000State;
|
2008-02-03 05:20:18 +03:00
|
|
|
|
e1000: Introduced an array to control the access to the MAC registers
The array of uint8_t's which is introduced here, contains access metadata
about the MAC registers: if a register is accessible, but partly implemented,
or if a register requires a certain compatibility flag in order to be
accessed. Currently, 6 hypothetical flags are supported (3 exist for e1000
so far), but if more than 6 flags are needed in the future, the datatype
of this array can simply be swapped for a larger one.
This patch is intended to solve the following current problems:
1) In a scenario of migration between different versions of QEMU, which
differ by the MAC registers implemented in them, some registers need not to
be active if a compatibility flag is set, in order to preserve the machine's
state perfectly for the older version. Checking this for each register
individually, would create a lot of clutter in the code.
2) Some registers are (or may be) only partly implemented (e.g.
placeholders that allow reading and writing, but lack other functions).
In such cases it is better to print a debug warning on read/write attempts.
As above, dealing with this functionality on a per-register level, would
require longer and more messy code.
Signed-off-by: Leonid Bloch <leonid.bloch@ravellosystems.com>
Signed-off-by: Dmitry Fleytman <dmitry.fleytman@ravellosystems.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2015-11-11 16:52:41 +03:00
|
|
|
/*
 * Test compatibility flag E1000_FLAG_<x> in compat_flags.  The macro reads
 * through a pointer named `s`, which must be in scope at the point of use.
 * It yields the masked bit (non-zero when set), not a normalized 0/1.
 */
#define chkflag(x) (s->compat_flags & E1000_FLAG_##x)
|
|
|
|
|
2020-09-03 23:43:22 +03:00
|
|
|
struct E1000BaseClass {
|
2014-06-02 17:33:27 +04:00
|
|
|
PCIDeviceClass parent_class;
|
|
|
|
uint16_t phy_id2;
|
2020-09-03 23:43:22 +03:00
|
|
|
};
|
|
|
|
typedef struct E1000BaseClass E1000BaseClass;
|
2014-06-02 17:33:27 +04:00
|
|
|
|
|
|
|
#define TYPE_E1000_BASE "e1000-base"
|
2013-06-24 10:50:30 +04:00
|
|
|
|
2020-09-01 00:07:33 +03:00
|
|
|
DECLARE_OBJ_CHECKERS(E1000State, E1000BaseClass,
|
|
|
|
E1000, TYPE_E1000_BASE)
|
2014-06-02 17:33:27 +04:00
|
|
|
|
2013-06-24 10:50:30 +04:00
|
|
|
|
2012-03-22 14:02:07 +04:00
|
|
|
static void
|
2016-06-01 11:23:44 +03:00
|
|
|
e1000_link_up(E1000State *s)
|
2012-03-22 14:02:07 +04:00
|
|
|
{
|
2016-06-01 11:23:44 +03:00
|
|
|
e1000x_update_regs_on_link_up(s->mac_reg, s->phy_reg);
|
|
|
|
|
|
|
|
/* E1000_STATUS_LU is tested by e1000_can_receive() */
|
|
|
|
qemu_flush_queued_packets(qemu_get_queue(s->nic));
|
2012-03-22 14:02:07 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2016-06-01 11:23:44 +03:00
|
|
|
e1000_autoneg_done(E1000State *s)
|
2012-03-22 14:02:07 +04:00
|
|
|
{
|
2016-06-01 11:23:44 +03:00
|
|
|
e1000x_update_regs_on_autoneg_done(s->mac_reg, s->phy_reg);
|
2015-06-25 12:18:05 +03:00
|
|
|
|
|
|
|
/* E1000_STATUS_LU is tested by e1000_can_receive() */
|
|
|
|
qemu_flush_queued_packets(qemu_get_queue(s->nic));
|
2012-03-22 14:02:07 +04:00
|
|
|
}
|
|
|
|
|
2014-08-06 22:07:10 +04:00
|
|
|
static bool
|
|
|
|
have_autoneg(E1000State *s)
|
|
|
|
{
|
2023-09-22 12:52:11 +03:00
|
|
|
return (s->phy_reg[MII_BMCR] & MII_BMCR_AUTOEN);
|
2014-08-06 22:07:10 +04:00
|
|
|
}
|
|
|
|
|
2012-03-22 14:02:24 +04:00
|
|
|
static void
|
|
|
|
set_phy_ctrl(E1000State *s, int index, uint16_t val)
|
|
|
|
{
|
2023-02-23 13:19:48 +03:00
|
|
|
/* bits 0-5 reserved; MII_BMCR_[ANRESTART,RESET] are self clearing */
|
|
|
|
s->phy_reg[MII_BMCR] = val & ~(0x3f |
|
|
|
|
MII_BMCR_RESET |
|
|
|
|
MII_BMCR_ANRESTART);
|
2014-08-06 22:07:10 +04:00
|
|
|
|
2013-02-14 21:11:27 +04:00
|
|
|
/*
|
|
|
|
* QEMU 1.3 does not support link auto-negotiation emulation, so if we
|
|
|
|
* migrate during auto negotiation, after migration the link will be
|
|
|
|
* down.
|
|
|
|
*/
|
2023-02-23 13:19:48 +03:00
|
|
|
if (have_autoneg(s) && (val & MII_BMCR_ANRESTART)) {
|
2016-06-01 11:23:44 +03:00
|
|
|
e1000x_restart_autoneg(s->mac_reg, s->phy_reg, s->autoneg_timer);
|
2012-03-22 14:02:24 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void (*phyreg_writeops[])(E1000State *, int, uint16_t) = {
|
2023-02-23 13:19:48 +03:00
|
|
|
[MII_BMCR] = set_phy_ctrl,
|
2012-03-22 14:02:24 +04:00
|
|
|
};
|
|
|
|
|
|
|
|
enum { NPHYWRITEOPS = ARRAY_SIZE(phyreg_writeops) };
|
|
|
|
|
2008-02-03 05:20:18 +03:00
|
|
|
enum { PHY_R = 1, PHY_W = 2, PHY_RW = PHY_R | PHY_W };
|
2008-10-02 22:24:21 +04:00
|
|
|
static const char phy_regcap[0x20] = {
|
2023-02-23 13:19:48 +03:00
|
|
|
[MII_BMSR] = PHY_R, [M88E1000_EXT_PHY_SPEC_CTRL] = PHY_RW,
|
|
|
|
[MII_PHYID1] = PHY_R, [M88E1000_PHY_SPEC_CTRL] = PHY_RW,
|
|
|
|
[MII_BMCR] = PHY_RW, [MII_CTRL1000] = PHY_RW,
|
|
|
|
[MII_ANLPAR] = PHY_R, [MII_STAT1000] = PHY_R,
|
|
|
|
[MII_ANAR] = PHY_RW, [M88E1000_RX_ERR_CNTR] = PHY_R,
|
|
|
|
[MII_PHYID2] = PHY_R, [M88E1000_PHY_SPEC_STATUS] = PHY_R,
|
|
|
|
[MII_ANER] = PHY_R,
|
2008-02-03 05:20:18 +03:00
|
|
|
};
|
|
|
|
|
2023-02-23 13:19:48 +03:00
|
|
|
/* MII_PHYID2 documented in 8254x_GBe_SDM.pdf, pp. 250 */
|
2012-02-12 17:11:53 +04:00
|
|
|
static const uint16_t phy_reg_init[] = {
|
2023-02-23 13:19:48 +03:00
|
|
|
[MII_BMCR] = MII_BMCR_SPEED1000 |
|
|
|
|
MII_BMCR_FD |
|
|
|
|
MII_BMCR_AUTOEN,
|
|
|
|
|
|
|
|
[MII_BMSR] = MII_BMSR_EXTCAP |
|
|
|
|
MII_BMSR_LINK_ST | /* link initially up */
|
|
|
|
MII_BMSR_AUTONEG |
|
|
|
|
/* MII_BMSR_AN_COMP: initially NOT completed */
|
|
|
|
MII_BMSR_MFPS |
|
|
|
|
MII_BMSR_EXTSTAT |
|
|
|
|
MII_BMSR_10T_HD |
|
|
|
|
MII_BMSR_10T_FD |
|
|
|
|
MII_BMSR_100TX_HD |
|
|
|
|
MII_BMSR_100TX_FD,
|
|
|
|
|
|
|
|
[MII_PHYID1] = 0x141,
|
|
|
|
/* [MII_PHYID2] configured per DevId, from e1000_reset() */
|
2023-02-23 13:19:52 +03:00
|
|
|
[MII_ANAR] = MII_ANAR_CSMACD | MII_ANAR_10 |
|
|
|
|
MII_ANAR_10FD | MII_ANAR_TX |
|
|
|
|
MII_ANAR_TXFD | MII_ANAR_PAUSE |
|
|
|
|
MII_ANAR_PAUSE_ASYM,
|
|
|
|
[MII_ANLPAR] = MII_ANLPAR_10 | MII_ANLPAR_10FD |
|
|
|
|
MII_ANLPAR_TX | MII_ANLPAR_TXFD,
|
|
|
|
[MII_CTRL1000] = MII_CTRL1000_FULL | MII_CTRL1000_PORT |
|
|
|
|
MII_CTRL1000_MASTER,
|
|
|
|
[MII_STAT1000] = MII_STAT1000_HALF | MII_STAT1000_FULL |
|
|
|
|
MII_STAT1000_ROK | MII_STAT1000_LOK,
|
2014-08-06 22:07:11 +04:00
|
|
|
[M88E1000_PHY_SPEC_CTRL] = 0x360,
|
2012-02-12 17:11:53 +04:00
|
|
|
[M88E1000_PHY_SPEC_STATUS] = 0xac00,
|
2014-08-06 22:07:11 +04:00
|
|
|
[M88E1000_EXT_PHY_SPEC_CTRL] = 0x0d60,
|
2012-02-12 17:11:53 +04:00
|
|
|
};
|
|
|
|
|
|
|
|
static const uint32_t mac_reg_init[] = {
|
2015-11-11 16:52:39 +03:00
|
|
|
[PBA] = 0x00100030,
|
|
|
|
[LEDCTL] = 0x602,
|
|
|
|
[CTRL] = E1000_CTRL_SWDPIN2 | E1000_CTRL_SWDPIN0 |
|
2012-02-12 17:11:53 +04:00
|
|
|
E1000_CTRL_SPD_1000 | E1000_CTRL_SLU,
|
2015-11-11 16:52:39 +03:00
|
|
|
[STATUS] = 0x80000000 | E1000_STATUS_GIO_MASTER_ENABLE |
|
2012-02-12 17:11:53 +04:00
|
|
|
E1000_STATUS_ASDV | E1000_STATUS_MTXCKOK |
|
|
|
|
E1000_STATUS_SPEED_1000 | E1000_STATUS_FD |
|
|
|
|
E1000_STATUS_LU,
|
2015-11-11 16:52:39 +03:00
|
|
|
[MANC] = E1000_MANC_EN_MNG2HOST | E1000_MANC_RCV_TCO_EN |
|
2012-02-12 17:11:53 +04:00
|
|
|
E1000_MANC_ARP_EN | E1000_MANC_0298_EN |
|
|
|
|
E1000_MANC_RMCP_EN,
|
|
|
|
};
|
|
|
|
|
e1000: add interrupt mitigation support
This patch partially implements the e1000 interrupt mitigation mechanisms.
Using a single QEMUTimer, it emulates the ITR register (which is the newer
mitigation register, recommended by Intel) and approximately emulates
RADV and TADV registers. TIDV and RDTR register functionalities are not
emulated (RDTR is only used to validate RADV, according to the e1000 specs).
RADV, TADV, TIDV and RDTR registers make up the older e1000 mitigation
mechanism and would need a timer each to be completely emulated. However,
a single timer has been used in order to reach a good compromise between
emulation accuracy and simplicity/efficiency.
The implemented mechanism can be enabled/disabled specifying the command
line e1000-specific boolean parameter "mitigation", e.g.
qemu-system-x86_64 -device e1000,mitigation=on,... ...
For more information, see the Software developer's manual at
http://download.intel.com/design/network/manuals/8254x_GBe_SDM.pdf.
Interrupt mitigation boosts performance when the guest suffers from
a high interrupt rate (e.g. when receiving short UDP packets at a high
packet rate). For some numerical results see the following link
http://info.iet.unipi.it/~luigi/papers/20130520-rizzo-vm.pdf
Signed-off-by: Vincenzo Maffione <v.maffione@gmail.com>
Reviewed-by: Andreas Färber <afaerber@suse.de> (for pc-* machines)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2013-08-02 20:30:52 +04:00
|
|
|
/*
 * Helper that keeps the smallest non-zero delay seen so far.
 *
 * @curr:  in/out accumulator; *curr == 0 means the value is not set yet
 * @value: candidate delay; 0 means "no delay requested" and is ignored
 *
 * *curr is replaced by @value when @value is non-zero and either no delay
 * has been recorded yet or @value is smaller than the recorded one.
 */
static inline void
mit_update_delay(uint32_t *curr, uint32_t value)
{
    if (value && (*curr == 0 || value < *curr)) {
        *curr = value;
    }
}
|
|
|
|
|
2008-02-03 05:20:18 +03:00
|
|
|
static void
|
|
|
|
set_interrupt_cause(E1000State *s, int index, uint32_t val)
|
|
|
|
{
|
2013-06-30 14:55:52 +04:00
|
|
|
PCIDevice *d = PCI_DEVICE(s);
|
e1000: add interrupt mitigation support
This patch partially implements the e1000 interrupt mitigation mechanisms.
Using a single QEMUTimer, it emulates the ITR register (which is the newer
mitigation register, recommended by Intel) and approximately emulates
RADV and TADV registers. TIDV and RDTR register functionalities are not
emulated (RDTR is only used to validate RADV, according to the e1000 specs).
RADV, TADV, TIDV and RDTR registers make up the older e1000 mitigation
mechanism and would need a timer each to be completely emulated. However,
a single timer has been used in order to reach a good compromise between
emulation accuracy and simplicity/efficiency.
The implemented mechanism can be enabled/disabled specifying the command
line e1000-specific boolean parameter "mitigation", e.g.
qemu-system-x86_64 -device e1000,mitigation=on,... ...
For more information, see the Software developer's manual at
http://download.intel.com/design/network/manuals/8254x_GBe_SDM.pdf.
Interrupt mitigation boosts performance when the guest suffers from
an high interrupt rate (i.e. receiving short UDP packets at high packet
rate). For some numerical results see the following link
http://info.iet.unipi.it/~luigi/papers/20130520-rizzo-vm.pdf
Signed-off-by: Vincenzo Maffione <v.maffione@gmail.com>
Reviewed-by: Andreas Färber <afaerber@suse.de> (for pc-* machines)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2013-08-02 20:30:52 +04:00
|
|
|
uint32_t pending_ints;
|
|
|
|
uint32_t mit_delay;
|
2013-06-30 14:55:52 +04:00
|
|
|
|
2008-02-03 05:20:18 +03:00
|
|
|
s->mac_reg[ICR] = val;
|
2013-01-10 01:50:00 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Make sure ICR and ICS registers have the same value.
|
|
|
|
* The spec says that the ICS register is write-only. However in practice,
|
|
|
|
* on real hardware ICS is readable, and for reads it has the same value as
|
|
|
|
* ICR (except that ICS does not have the clear on read behaviour of ICR).
|
|
|
|
*
|
|
|
|
* The VxWorks PRO/1000 driver uses this behaviour.
|
|
|
|
*/
|
2009-07-29 21:22:55 +04:00
|
|
|
s->mac_reg[ICS] = val;
|
2013-01-10 01:50:00 +04:00
|
|
|
|
e1000: add interrupt mitigation support
This patch partially implements the e1000 interrupt mitigation mechanisms.
Using a single QEMUTimer, it emulates the ITR register (which is the newer
mitigation register, recommended by Intel) and approximately emulates
RADV and TADV registers. TIDV and RDTR register functionalities are not
emulated (RDTR is only used to validate RADV, according to the e1000 specs).
RADV, TADV, TIDV and RDTR registers make up the older e1000 mitigation
mechanism and would need a timer each to be completely emulated. However,
a single timer has been used in order to reach a good compromise between
emulation accuracy and simplicity/efficiency.
The implemented mechanism can be enabled/disabled specifying the command
line e1000-specific boolean parameter "mitigation", e.g.
qemu-system-x86_64 -device e1000,mitigation=on,... ...
For more information, see the Software developer's manual at
http://download.intel.com/design/network/manuals/8254x_GBe_SDM.pdf.
Interrupt mitigation boosts performance when the guest suffers from
an high interrupt rate (i.e. receiving short UDP packets at high packet
rate). For some numerical results see the following link
http://info.iet.unipi.it/~luigi/papers/20130520-rizzo-vm.pdf
Signed-off-by: Vincenzo Maffione <v.maffione@gmail.com>
Reviewed-by: Andreas Färber <afaerber@suse.de> (for pc-* machines)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2013-08-02 20:30:52 +04:00
|
|
|
pending_ints = (s->mac_reg[IMS] & s->mac_reg[ICR]);
|
|
|
|
if (!s->mit_irq_level && pending_ints) {
|
|
|
|
/*
|
|
|
|
* Here we detect a potential raising edge. We postpone raising the
|
|
|
|
* interrupt line if we are inside the mitigation delay window
|
|
|
|
* (s->mit_timer_on == 1).
|
|
|
|
* We provide a partial implementation of interrupt mitigation,
|
|
|
|
* emulating only RADV, TADV and ITR (lower 16 bits, 1024ns units for
|
|
|
|
* RADV and TADV, 256ns units for ITR). RDTR is only used to enable
|
|
|
|
* RADV; relative timers based on TIDV and RDTR are not implemented.
|
|
|
|
*/
|
|
|
|
if (s->mit_timer_on) {
|
|
|
|
return;
|
|
|
|
}
|
2023-09-22 12:52:11 +03:00
|
|
|
|
|
|
|
/* Compute the next mitigation delay according to pending
|
|
|
|
* interrupts and the current values of RADV (provided
|
|
|
|
* RDTR!=0), TADV and ITR.
|
|
|
|
* Then rearm the timer.
|
|
|
|
*/
|
|
|
|
mit_delay = 0;
|
|
|
|
if (s->mit_ide &&
|
|
|
|
(pending_ints & (E1000_ICR_TXQE | E1000_ICR_TXDW))) {
|
|
|
|
mit_update_delay(&mit_delay, s->mac_reg[TADV] * 4);
|
e1000: add interrupt mitigation support
This patch partially implements the e1000 interrupt mitigation mechanisms.
Using a single QEMUTimer, it emulates the ITR register (which is the newer
mitigation register, recommended by Intel) and approximately emulates
RADV and TADV registers. TIDV and RDTR register functionalities are not
emulated (RDTR is only used to validate RADV, according to the e1000 specs).
RADV, TADV, TIDV and RDTR registers make up the older e1000 mitigation
mechanism and would need a timer each to be completely emulated. However,
a single timer has been used in order to reach a good compromise between
emulation accuracy and simplicity/efficiency.
The implemented mechanism can be enabled/disabled specifying the command
line e1000-specific boolean parameter "mitigation", e.g.
qemu-system-x86_64 -device e1000,mitigation=on,... ...
For more information, see the Software developer's manual at
http://download.intel.com/design/network/manuals/8254x_GBe_SDM.pdf.
Interrupt mitigation boosts performance when the guest suffers from
an high interrupt rate (i.e. receiving short UDP packets at high packet
rate). For some numerical results see the following link
http://info.iet.unipi.it/~luigi/papers/20130520-rizzo-vm.pdf
Signed-off-by: Vincenzo Maffione <v.maffione@gmail.com>
Reviewed-by: Andreas Färber <afaerber@suse.de> (for pc-* machines)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2013-08-02 20:30:52 +04:00
|
|
|
}
|
2023-09-22 12:52:11 +03:00
|
|
|
if (s->mac_reg[RDTR] && (pending_ints & E1000_ICS_RXT0)) {
|
|
|
|
mit_update_delay(&mit_delay, s->mac_reg[RADV] * 4);
|
|
|
|
}
|
|
|
|
mit_update_delay(&mit_delay, s->mac_reg[ITR]);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* According to e1000 SPEC, the Ethernet controller guarantees
|
|
|
|
* a maximum observable interrupt rate of 7813 interrupts/sec.
|
|
|
|
* Thus if mit_delay < 500 then the delay should be set to the
|
|
|
|
* minimum delay possible which is 500.
|
|
|
|
*/
|
|
|
|
mit_delay = (mit_delay < 500) ? 500 : mit_delay;
|
|
|
|
|
|
|
|
s->mit_timer_on = 1;
|
|
|
|
timer_mod(s->mit_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
|
|
|
|
mit_delay * 256);
|
|
|
|
s->mit_ide = 0;
|
e1000: add interrupt mitigation support
This patch partially implements the e1000 interrupt mitigation mechanisms.
Using a single QEMUTimer, it emulates the ITR register (which is the newer
mitigation register, recommended by Intel) and approximately emulates
RADV and TADV registers. TIDV and RDTR register functionalities are not
emulated (RDTR is only used to validate RADV, according to the e1000 specs).
RADV, TADV, TIDV and RDTR registers make up the older e1000 mitigation
mechanism and would need a timer each to be completely emulated. However,
a single timer has been used in order to reach a good compromise between
emulation accuracy and simplicity/efficiency.
The implemented mechanism can be enabled/disabled specifying the command
line e1000-specific boolean parameter "mitigation", e.g.
qemu-system-x86_64 -device e1000,mitigation=on,... ...
For more information, see the Software developer's manual at
http://download.intel.com/design/network/manuals/8254x_GBe_SDM.pdf.
Interrupt mitigation boosts performance when the guest suffers from
an high interrupt rate (i.e. receiving short UDP packets at high packet
rate). For some numerical results see the following link
http://info.iet.unipi.it/~luigi/papers/20130520-rizzo-vm.pdf
Signed-off-by: Vincenzo Maffione <v.maffione@gmail.com>
Reviewed-by: Andreas Färber <afaerber@suse.de> (for pc-* machines)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2013-08-02 20:30:52 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
s->mit_irq_level = (pending_ints != 0);
|
2013-10-07 11:36:39 +04:00
|
|
|
pci_set_irq(d, s->mit_irq_level);
|
e1000: add interrupt mitigation support
This patch partially implements the e1000 interrupt mitigation mechanisms.
Using a single QEMUTimer, it emulates the ITR register (which is the newer
mitigation register, recommended by Intel) and approximately emulates
RADV and TADV registers. TIDV and RDTR register functionalities are not
emulated (RDTR is only used to validate RADV, according to the e1000 specs).
RADV, TADV, TIDV and RDTR registers make up the older e1000 mitigation
mechanism and would need a timer each to be completely emulated. However,
a single timer has been used in order to reach a good compromise between
emulation accuracy and simplicity/efficiency.
The implemented mechanism can be enabled/disabled specifying the command
line e1000-specific boolean parameter "mitigation", e.g.
qemu-system-x86_64 -device e1000,mitigation=on,... ...
For more information, see the Software developer's manual at
http://download.intel.com/design/network/manuals/8254x_GBe_SDM.pdf.
Interrupt mitigation boosts performance when the guest suffers from
an high interrupt rate (i.e. receiving short UDP packets at high packet
rate). For some numerical results see the following link
http://info.iet.unipi.it/~luigi/papers/20130520-rizzo-vm.pdf
Signed-off-by: Vincenzo Maffione <v.maffione@gmail.com>
Reviewed-by: Andreas Färber <afaerber@suse.de> (for pc-* machines)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2013-08-02 20:30:52 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
e1000_mit_timer(void *opaque)
|
|
|
|
{
|
|
|
|
E1000State *s = opaque;
|
|
|
|
|
|
|
|
s->mit_timer_on = 0;
|
|
|
|
/* Call set_interrupt_cause to update the irq level (if necessary). */
|
|
|
|
set_interrupt_cause(s, 0, s->mac_reg[ICR]);
|
2008-02-03 05:20:18 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
set_ics(E1000State *s, int index, uint32_t val)
|
|
|
|
{
|
|
|
|
DBGOUT(INTERRUPT, "set_ics %x, ICR %x, IMR %x\n", val, s->mac_reg[ICR],
|
|
|
|
s->mac_reg[IMS]);
|
|
|
|
set_interrupt_cause(s, 0, val | s->mac_reg[ICR]);
|
|
|
|
}
|
|
|
|
|
2014-06-19 19:55:35 +04:00
|
|
|
static void
|
|
|
|
e1000_autoneg_timer(void *opaque)
|
|
|
|
{
|
|
|
|
E1000State *s = opaque;
|
|
|
|
if (!qemu_get_queue(s->nic)->link_down) {
|
2016-06-01 11:23:44 +03:00
|
|
|
e1000_autoneg_done(s);
|
2014-06-19 19:55:35 +04:00
|
|
|
set_ics(s, 0, E1000_ICS_LSC); /* signal link status change to guest */
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-07-23 10:55:10 +03:00
|
|
|
static bool e1000_vet_init_need(void *opaque)
|
|
|
|
{
|
|
|
|
E1000State *s = opaque;
|
|
|
|
|
|
|
|
return chkflag(VET);
|
|
|
|
}
|
|
|
|
|
2023-02-23 13:19:58 +03:00
|
|
|
/*
 * Reset handler: return the device model to its initial state.
 *
 * Cancels the three device timers, clears the interrupt mitigation
 * bookkeeping, reloads the PHY and MAC register files from their
 * initial-value tables, clears the transmit state, and finally
 * re-applies the link state, the configured MAC address and (when the
 * VET compatibility flag asks for it) the VLAN ethertype.
 *
 * @obj: the device object being reset.
 */
static void e1000_reset_hold(Object *obj)
{
    E1000State *dev = E1000(obj);
    E1000BaseClass *klass = E1000_GET_CLASS(dev);

    /* Stop every timer that could fire against the state reset below. */
    timer_del(dev->autoneg_timer);
    timer_del(dev->mit_timer);
    timer_del(dev->flush_queue_timer);

    /* Interrupt mitigation bookkeeping. */
    dev->mit_timer_on = 0;
    dev->mit_irq_level = 0;
    dev->mit_ide = 0;

    /* PHY registers: zero, load the init table, then patch PHYID2. */
    memset(dev->phy_reg, 0, sizeof(dev->phy_reg));
    memcpy(dev->phy_reg, phy_reg_init, sizeof(phy_reg_init));
    dev->phy_reg[MII_PHYID2] = klass->phy_id2;

    /* MAC registers: zero, then load the init table. */
    memset(dev->mac_reg, 0, sizeof(dev->mac_reg));
    memcpy(dev->mac_reg, mac_reg_init, sizeof(mac_reg_init));

    dev->rxbuf_min_shift = 1;
    memset(&dev->tx, 0, sizeof(dev->tx));

    /* Reflect a backend link that is currently down in the registers. */
    if (qemu_get_queue(dev->nic)->link_down) {
        e1000x_update_regs_on_link_down(dev->mac_reg, dev->phy_reg);
    }

    e1000x_reset_mac_addr(dev->nic, dev->mac_reg, dev->conf.macaddr.a);

    if (e1000_vet_init_need(dev)) {
        dev->mac_reg[VET] = ETH_P_VLAN;
    }
}
|
|
|
|
|
2009-05-23 13:21:33 +04:00
|
|
|
static void
|
|
|
|
set_ctrl(E1000State *s, int index, uint32_t val)
|
|
|
|
{
|
|
|
|
/* RST is self clearing */
|
|
|
|
s->mac_reg[CTRL] = val & ~E1000_CTRL_RST;
|
|
|
|
}
|
|
|
|
|
e1000: Delay flush queue when receive RCTL
Due to a too-early RCT0 interrupt, win10x32 may hang while booting.
This problem can be reproduced by doing power cycle on win10x32 guest.
In our environment, we have 10 win10x32 and stress power cycle.
The problem will happen about 20 rounds.
Below shows some log with comment:
The normal case:
22831@1551928392.984687:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
22831@1551928392.985655:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
22831@1551928392.985801:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
22831@1551928393.056710:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: ICR read: 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
22831@1551928393.077548:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: ICR read: 0
e1000: set_ics 2, ICR 0, IMR 0
e1000: set_ics 2, ICR 2, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
22831@1551928393.102974:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
22831@1551928393.103267:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: RCTL: 255, mac_reg[RCTL] = 0x40002 <- win10x32 says it can handle
RX now
e1000: set_ics 0, ICR 2, IMR 9d <- unmask interrupt
e1000: RCTL: 255, mac_reg[RCTL] = 0x48002
e1000: set_ics 80, ICR 2, IMR 9d <- interrupt and work!
...
The bad case:
27744@1551930483.117766:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
27744@1551930483.118398:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
27744@1551930483.198063:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: ICR read: 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
27744@1551930483.218675:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: ICR read: 0
e1000: set_ics 2, ICR 0, IMR 0
e1000: set_ics 2, ICR 2, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
27744@1551930483.241768:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
27744@1551930483.241979:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: RCTL: 255, mac_reg[RCTL] = 0x40002 <- win10x32 says it can handle
RX now
e1000: set_ics 80, ICR 2, IMR 0 <- flush queue (caused by setting RCTL)
e1000: set_ics 0, ICR 82, IMR 9d <- unmask interrupt and because 0x82&0x9d
!= 0 generate interrupt, hang on here...
To work around this problem, simply delay flushing the queue. Also stop receiving
when timer is going to run.
Tested on CentOS, Win7SP1x64 and Win10x32.
Signed-off-by: yuchenlin <yuchenlin@synology.com>
Reviewed-by: Dmitry Fleytman <dmitry.fleytman@gmail.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2019-03-13 09:56:49 +03:00
|
|
|
static void
|
|
|
|
e1000_flush_queue_timer(void *opaque)
|
|
|
|
{
|
|
|
|
E1000State *s = opaque;
|
|
|
|
|
|
|
|
qemu_flush_queued_packets(qemu_get_queue(s->nic));
|
|
|
|
}
|
|
|
|
|
2008-02-03 05:20:18 +03:00
|
|
|
static void
|
|
|
|
set_rx_control(E1000State *s, int index, uint32_t val)
|
|
|
|
{
|
|
|
|
s->mac_reg[RCTL] = val;
|
2016-06-01 11:23:44 +03:00
|
|
|
s->rxbuf_size = e1000x_rxbufsize(val);
|
2008-02-03 05:20:18 +03:00
|
|
|
s->rxbuf_min_shift = ((val / E1000_RCTL_RDMTS_QUAT) & 3) + 1;
|
|
|
|
DBGOUT(RX, "RCTL: %d, mac_reg[RCTL] = 0x%x\n", s->mac_reg[RDT],
|
|
|
|
s->mac_reg[RCTL]);
|
e1000: Delay flush queue when receive RCTL
Due to too early RCT0 interrput, win10x32 may hang on booting.
This problem can be reproduced by doing power cycle on win10x32 guest.
In our environment, we have 10 win10x32 and stress power cycle.
The problem will happen about 20 rounds.
Below shows some log with comment:
The normal case:
22831@1551928392.984687:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
22831@1551928392.985655:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
22831@1551928392.985801:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
22831@1551928393.056710:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: ICR read: 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
22831@1551928393.077548:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: ICR read: 0
e1000: set_ics 2, ICR 0, IMR 0
e1000: set_ics 2, ICR 2, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
22831@1551928393.102974:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
22831@1551928393.103267:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: RCTL: 255, mac_reg[RCTL] = 0x40002 <- win10x32 says it can handle
RX now
e1000: set_ics 0, ICR 2, IMR 9d <- unmask interrupt
e1000: RCTL: 255, mac_reg[RCTL] = 0x48002
e1000: set_ics 80, ICR 2, IMR 9d <- interrupt and work!
...
The bad case:
27744@1551930483.117766:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
27744@1551930483.118398:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
27744@1551930483.198063:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: ICR read: 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
27744@1551930483.218675:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: ICR read: 0
e1000: set_ics 2, ICR 0, IMR 0
e1000: set_ics 2, ICR 2, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
27744@1551930483.241768:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
27744@1551930483.241979:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: RCTL: 255, mac_reg[RCTL] = 0x40002 <- win10x32 says it can handle
RX now
e1000: set_ics 80, ICR 2, IMR 0 <- flush queue (caused by setting RCTL)
e1000: set_ics 0, ICR 82, IMR 9d <- unmask interrupt and because 0x82&0x9d
!= 0 generate interrupt, hang on here...
To workaround this problem, simply delay flush queue. Also stop receiving
when timer is going to run.
Tested on CentOS, Win7SP1x64 and Win10x32.
Signed-off-by: yuchenlin <yuchenlin@synology.com>
Reviewed-by: Dmitry Fleytman <dmitry.fleytman@gmail.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2019-03-13 09:56:49 +03:00
|
|
|
timer_mod(s->flush_queue_timer,
|
|
|
|
qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + 1000);
|
2008-02-03 05:20:18 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
set_mdic(E1000State *s, int index, uint32_t val)
|
|
|
|
{
|
|
|
|
uint32_t data = val & E1000_MDIC_DATA_MASK;
|
|
|
|
uint32_t addr = ((val & E1000_MDIC_REG_MASK) >> E1000_MDIC_REG_SHIFT);
|
|
|
|
|
|
|
|
if ((val & E1000_MDIC_PHY_MASK) >> E1000_MDIC_PHY_SHIFT != 1) // phy #
|
|
|
|
val = s->mac_reg[MDIC] | E1000_MDIC_ERROR;
|
|
|
|
else if (val & E1000_MDIC_OP_READ) {
|
|
|
|
DBGOUT(MDIC, "MDIC read reg 0x%x\n", addr);
|
|
|
|
if (!(phy_regcap[addr] & PHY_R)) {
|
|
|
|
DBGOUT(MDIC, "MDIC read reg %x unhandled\n", addr);
|
|
|
|
val |= E1000_MDIC_ERROR;
|
|
|
|
} else
|
|
|
|
val = (val ^ data) | s->phy_reg[addr];
|
|
|
|
} else if (val & E1000_MDIC_OP_WRITE) {
|
|
|
|
DBGOUT(MDIC, "MDIC write reg 0x%x, value 0x%x\n", addr, data);
|
|
|
|
if (!(phy_regcap[addr] & PHY_W)) {
|
|
|
|
DBGOUT(MDIC, "MDIC write reg %x unhandled\n", addr);
|
|
|
|
val |= E1000_MDIC_ERROR;
|
2012-03-22 14:02:24 +04:00
|
|
|
} else {
|
|
|
|
if (addr < NPHYWRITEOPS && phyreg_writeops[addr]) {
|
|
|
|
phyreg_writeops[addr](s, index, data);
|
2014-08-06 22:07:10 +04:00
|
|
|
} else {
|
|
|
|
s->phy_reg[addr] = data;
|
2012-03-22 14:02:24 +04:00
|
|
|
}
|
|
|
|
}
|
2008-02-03 05:20:18 +03:00
|
|
|
}
|
|
|
|
s->mac_reg[MDIC] = val | E1000_MDIC_READY;
|
2012-03-22 14:01:50 +04:00
|
|
|
|
|
|
|
if (val & E1000_MDIC_INT_EN) {
|
|
|
|
set_ics(s, 0, E1000_ICR_MDAC);
|
|
|
|
}
|
2008-02-03 05:20:18 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static uint32_t
|
|
|
|
get_eecd(E1000State *s, int index)
|
|
|
|
{
|
|
|
|
uint32_t ret = E1000_EECD_PRES|E1000_EECD_GNT | s->eecd_state.old_eecd;
|
|
|
|
|
|
|
|
DBGOUT(EEPROM, "reading eeprom bit %d (reading %d)\n",
|
|
|
|
s->eecd_state.bitnum_out, s->eecd_state.reading);
|
|
|
|
if (!s->eecd_state.reading ||
|
|
|
|
((s->eeprom_data[(s->eecd_state.bitnum_out >> 4) & 0x3f] >>
|
|
|
|
((s->eecd_state.bitnum_out & 0xf) ^ 0xf))) & 1)
|
|
|
|
ret |= E1000_EECD_DO;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
set_eecd(E1000State *s, int index, uint32_t val)
|
|
|
|
{
|
|
|
|
uint32_t oldval = s->eecd_state.old_eecd;
|
|
|
|
|
|
|
|
s->eecd_state.old_eecd = val & (E1000_EECD_SK | E1000_EECD_CS |
|
|
|
|
E1000_EECD_DI|E1000_EECD_FWE_MASK|E1000_EECD_REQ);
|
2015-11-11 16:52:39 +03:00
|
|
|
if (!(E1000_EECD_CS & val)) { /* CS inactive; nothing to do */
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (E1000_EECD_CS & (val ^ oldval)) { /* CS rise edge; reset state */
|
|
|
|
s->eecd_state.val_in = 0;
|
|
|
|
s->eecd_state.bitnum_in = 0;
|
|
|
|
s->eecd_state.bitnum_out = 0;
|
|
|
|
s->eecd_state.reading = 0;
|
2010-07-10 18:03:45 +04:00
|
|
|
}
|
2015-11-11 16:52:39 +03:00
|
|
|
if (!(E1000_EECD_SK & (val ^ oldval))) { /* no clock edge */
|
2008-02-03 05:20:18 +03:00
|
|
|
return;
|
2015-11-11 16:52:39 +03:00
|
|
|
}
|
|
|
|
if (!(E1000_EECD_SK & val)) { /* falling edge */
|
2008-02-03 05:20:18 +03:00
|
|
|
s->eecd_state.bitnum_out++;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
s->eecd_state.val_in <<= 1;
|
|
|
|
if (val & E1000_EECD_DI)
|
|
|
|
s->eecd_state.val_in |= 1;
|
|
|
|
if (++s->eecd_state.bitnum_in == 9 && !s->eecd_state.reading) {
|
|
|
|
s->eecd_state.bitnum_out = ((s->eecd_state.val_in & 0x3f)<<4)-1;
|
|
|
|
s->eecd_state.reading = (((s->eecd_state.val_in >> 6) & 7) ==
|
|
|
|
EEPROM_READ_OPCODE_MICROWIRE);
|
|
|
|
}
|
|
|
|
DBGOUT(EEPROM, "eeprom bitnum in %d out %d, reading %d\n",
|
|
|
|
s->eecd_state.bitnum_in, s->eecd_state.bitnum_out,
|
|
|
|
s->eecd_state.reading);
|
|
|
|
}
|
|
|
|
|
|
|
|
static uint32_t
|
|
|
|
flash_eerd_read(E1000State *s, int x)
|
|
|
|
{
|
|
|
|
unsigned int index, r = s->mac_reg[EERD] & ~E1000_EEPROM_RW_REG_START;
|
|
|
|
|
2009-07-29 21:22:55 +04:00
|
|
|
if ((s->mac_reg[EERD] & E1000_EEPROM_RW_REG_START) == 0)
|
|
|
|
return (s->mac_reg[EERD]);
|
|
|
|
|
2008-02-03 05:20:18 +03:00
|
|
|
if ((index = r >> E1000_EEPROM_RW_ADDR_SHIFT) > EEPROM_CHECKSUM_REG)
|
2009-07-29 21:22:55 +04:00
|
|
|
return (E1000_EEPROM_RW_REG_DONE | r);
|
|
|
|
|
|
|
|
return ((s->eeprom_data[index] << E1000_EEPROM_RW_REG_DATA) |
|
|
|
|
E1000_EEPROM_RW_REG_DONE | r);
|
2008-02-03 05:20:18 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Compute an Internet checksum over part of a packet and store it,
 * big-endian, inside the packet.
 *
 * @data: packet buffer.
 * @n:    number of valid bytes in @data.
 * @sloc: byte offset at which the 16-bit checksum is stored.
 * @css:  byte offset at which checksumming starts.
 * @cse:  offset of the last byte to checksum; 0, or a value not below
 *        @n, means "up to the end of the packet".
 *
 * Nothing is written unless both bytes of the checksum field lie inside
 * the (possibly shortened) range, i.e. @sloc < end - 1.
 */
static void
putsum(uint8_t *data, uint32_t n, uint32_t sloc, uint32_t css, uint32_t cse)
{
    uint32_t end = n;
    uint32_t sum;

    if (cse && cse < n) {
        end = cse + 1;
    }

    if (sloc >= end - 1) {
        return;
    }

    sum = net_checksum_add(end - css, data + css);
    stw_be_p(data + sloc, net_checksum_finish_nozero(sum));
}
|
|
|
|
|
2015-11-11 16:52:46 +03:00
|
|
|
static inline void
|
|
|
|
inc_tx_bcast_or_mcast_count(E1000State *s, const unsigned char *arr)
|
|
|
|
{
|
2023-02-23 13:19:52 +03:00
|
|
|
if (is_broadcast_ether_addr(arr)) {
|
2016-06-01 11:23:44 +03:00
|
|
|
e1000x_inc_reg_if_not_full(s->mac_reg, BPTC);
|
2023-02-23 13:19:52 +03:00
|
|
|
} else if (is_multicast_ether_addr(arr)) {
|
2016-06-01 11:23:44 +03:00
|
|
|
e1000x_inc_reg_if_not_full(s->mac_reg, MPTC);
|
2015-11-11 16:52:46 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-03-22 14:01:59 +04:00
|
|
|
static void
|
|
|
|
e1000_send_packet(E1000State *s, const uint8_t *buf, int size)
|
|
|
|
{
|
2015-11-11 16:52:46 +03:00
|
|
|
static const int PTCregs[6] = { PTC64, PTC127, PTC255, PTC511,
|
|
|
|
PTC1023, PTC1522 };
|
|
|
|
|
2013-01-30 15:12:22 +04:00
|
|
|
NetClientState *nc = qemu_get_queue(s->nic);
|
2023-02-23 13:19:48 +03:00
|
|
|
if (s->phy_reg[MII_BMCR] & MII_BMCR_LOOPBACK) {
|
2021-02-24 07:13:22 +03:00
|
|
|
qemu_receive_packet(nc, buf, size);
|
2012-03-22 14:01:59 +04:00
|
|
|
} else {
|
2013-01-30 15:12:22 +04:00
|
|
|
qemu_send_packet(nc, buf, size);
|
2012-03-22 14:01:59 +04:00
|
|
|
}
|
2015-11-11 16:52:46 +03:00
|
|
|
inc_tx_bcast_or_mcast_count(s, buf);
|
2023-02-23 13:20:15 +03:00
|
|
|
e1000x_increase_size_stats(s->mac_reg, PTCregs, size + 4);
|
2012-03-22 14:01:59 +04:00
|
|
|
}
|
|
|
|
|
2008-02-03 05:20:18 +03:00
|
|
|
static void
|
|
|
|
xmit_seg(E1000State *s)
|
|
|
|
{
|
2016-06-16 20:17:26 +03:00
|
|
|
uint16_t len;
|
2015-11-11 16:52:44 +03:00
|
|
|
unsigned int frames = s->tx.tso_frames, css, sofar;
|
2008-02-03 05:20:18 +03:00
|
|
|
struct e1000_tx *tp = &s->tx;
|
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption
The device is supposed to maintain two distinct contexts for transmit
offloads: one has parameters for both segmentation and checksum
offload, the other only for checksum offload. The guest driver can
send two context descriptors, one for each context (the TSE flag
specifies which). Then the guest can refer to one or the other context
in subsequent transmit data descriptors, depending on what offloads it
wants applied to each packet.
Currently the e1000 device stores just one context, and misinterprets
the TSE flags in the context and data descriptors. This is often okay:
Linux happens to send a fresh context descriptor before every data
descriptor, so forgetting the other context doesn't matter. Windows
does rely on separate contexts for TSO vs. non-TSO packets, but for
mostly-TCP traffic the two contexts have identical TCP-specific
offload parameters so confusing them doesn't matter.
One case where this confusion matters is when a Windows guest sets up
a TSO context for TCP and a non-TSO context for UDP, and then
transmits both TCP and UDP traffic in parallel. The e1000 device
sometimes ends up using TCP-specific parameters while doing checksum
offload on a UDP datagram: it writes the checksum to offset 16 (the
correct location for a TCP checksum), stomping on two bytes of UDP
data, and leaving the wrong value in the actual UDP checksum field at
offset 6. (Even worse, the host network stack may then recompute the
UDP checksum, "correcting" it to match the corrupt data before sending
it out a physical interface.)
Correct this by tracking the TSO context independently of the non-TSO
context, and selecting the appropriate context based on the TSE flag
in each transmit data descriptor.
Signed-off-by: Ed Swierk <eswierk@skyportsystems.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 02:23:34 +03:00
|
|
|
struct e1000x_txd_props *props = tp->cptse ? &tp->tso_props : &tp->props;
|
2008-02-03 05:20:18 +03:00
|
|
|
|
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption
The device is supposed to maintain two distinct contexts for transmit
offloads: one has parameters for both segmentation and checksum
offload, the other only for checksum offload. The guest driver can
send two context descriptors, one for each context (the TSE flag
specifies which). Then the guest can refer to one or the other context
in subsequent transmit data descriptors, depending on what offloads it
wants applied to each packet.
Currently the e1000 device stores just one context, and misinterprets
the TSE flags in the context and data descriptors. This is often okay:
Linux happens to send a fresh context descriptor before every data
descriptor, so forgetting the other context doesn't matter. Windows
does rely on separate contexts for TSO vs. non-TSO packets, but for
mostly-TCP traffic the two contexts have identical TCP-specific
offload parameters so confusing them doesn't matter.
One case where this confusion matters is when a Windows guest sets up
a TSO context for TCP and a non-TSO context for UDP, and then
transmits both TCP and UDP traffic in parallel. The e1000 device
sometimes ends up using TCP-specific parameters while doing checksum
offload on a UDP datagram: it writes the checksum to offset 16 (the
correct location for a TCP checksum), stomping on two bytes of UDP
data, and leaving the wrong value in the actual UDP checksum field at
offset 6. (Even worse, the host network stack may then recompute the
UDP checksum, "correcting" it to match the corrupt data before sending
it out a physical interface.)
Correct this by tracking the TSO context independently of the non-TSO
context, and selecting the appropriate context based on the TSE flag
in each transmit data descriptor.
Signed-off-by: Ed Swierk <eswierk@skyportsystems.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 02:23:34 +03:00
|
|
|
if (tp->cptse) {
|
|
|
|
css = props->ipcss;
|
2008-02-03 05:20:18 +03:00
|
|
|
DBGOUT(TXSUM, "frames %d size %d ipcss %d\n",
|
|
|
|
frames, tp->size, css);
|
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption
The device is supposed to maintain two distinct contexts for transmit
offloads: one has parameters for both segmentation and checksum
offload, the other only for checksum offload. The guest driver can
send two context descriptors, one for each context (the TSE flag
specifies which). Then the guest can refer to one or the other context
in subsequent transmit data descriptors, depending on what offloads it
wants applied to each packet.
Currently the e1000 device stores just one context, and misinterprets
the TSE flags in the context and data descriptors. This is often okay:
Linux happens to send a fresh context descriptor before every data
descriptor, so forgetting the other context doesn't matter. Windows
does rely on separate contexts for TSO vs. non-TSO packets, but for
mostly-TCP traffic the two contexts have identical TCP-specific
offload parameters so confusing them doesn't matter.
One case where this confusion matters is when a Windows guest sets up
a TSO context for TCP and a non-TSO context for UDP, and then
transmits both TCP and UDP traffic in parallel. The e1000 device
sometimes ends up using TCP-specific parameters while doing checksum
offload on a UDP datagram: it writes the checksum to offset 16 (the
correct location for a TCP checksum), stomping on two bytes of UDP
data, and leaving the wrong value in the actual UDP checksum field at
offset 6. (Even worse, the host network stack may then recompute the
UDP checksum, "correcting" it to match the corrupt data before sending
it out a physical interface.)
Correct this by tracking the TSO context independently of the non-TSO
context, and selecting the appropriate context based on the TSE flag
in each transmit data descriptor.
Signed-off-by: Ed Swierk <eswierk@skyportsystems.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 02:23:34 +03:00
|
|
|
if (props->ip) { /* IPv4 */
|
2013-11-05 20:38:34 +04:00
|
|
|
stw_be_p(tp->data+css+2, tp->size - css);
|
|
|
|
stw_be_p(tp->data+css+4,
|
2016-06-16 20:17:26 +03:00
|
|
|
lduw_be_p(tp->data + css + 4) + frames);
|
2015-11-11 16:52:39 +03:00
|
|
|
} else { /* IPv6 */
|
2013-11-05 20:38:34 +04:00
|
|
|
stw_be_p(tp->data+css+4, tp->size - css);
|
2015-11-11 16:52:39 +03:00
|
|
|
}
|
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption
The device is supposed to maintain two distinct contexts for transmit
offloads: one has parameters for both segmentation and checksum
offload, the other only for checksum offload. The guest driver can
send two context descriptors, one for each context (the TSE flag
specifies which). Then the guest can refer to one or the other context
in subsequent transmit data descriptors, depending on what offloads it
wants applied to each packet.
Currently the e1000 device stores just one context, and misinterprets
the TSE flags in the context and data descriptors. This is often okay:
Linux happens to send a fresh context descriptor before every data
descriptor, so forgetting the other context doesn't matter. Windows
does rely on separate contexts for TSO vs. non-TSO packets, but for
mostly-TCP traffic the two contexts have identical TCP-specific
offload parameters so confusing them doesn't matter.
One case where this confusion matters is when a Windows guest sets up
a TSO context for TCP and a non-TSO context for UDP, and then
transmits both TCP and UDP traffic in parallel. The e1000 device
sometimes ends up using TCP-specific parameters while doing checksum
offload on a UDP datagram: it writes the checksum to offset 16 (the
correct location for a TCP checksum), stomping on two bytes of UDP
data, and leaving the wrong value in the actual UDP checksum field at
offset 6. (Even worse, the host network stack may then recompute the
UDP checksum, "correcting" it to match the corrupt data before sending
it out a physical interface.)
Correct this by tracking the TSO context independently of the non-TSO
context, and selecting the appropriate context based on the TSE flag
in each transmit data descriptor.
Signed-off-by: Ed Swierk <eswierk@skyportsystems.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 02:23:34 +03:00
|
|
|
css = props->tucss;
|
2008-02-03 05:20:18 +03:00
|
|
|
len = tp->size - css;
|
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption
The device is supposed to maintain two distinct contexts for transmit
offloads: one has parameters for both segmentation and checksum
offload, the other only for checksum offload. The guest driver can
send two context descriptors, one for each context (the TSE flag
specifies which). Then the guest can refer to one or the other context
in subsequent transmit data descriptors, depending on what offloads it
wants applied to each packet.
Currently the e1000 device stores just one context, and misinterprets
the TSE flags in the context and data descriptors. This is often okay:
Linux happens to send a fresh context descriptor before every data
descriptor, so forgetting the other context doesn't matter. Windows
does rely on separate contexts for TSO vs. non-TSO packets, but for
mostly-TCP traffic the two contexts have identical TCP-specific
offload parameters so confusing them doesn't matter.
One case where this confusion matters is when a Windows guest sets up
a TSO context for TCP and a non-TSO context for UDP, and then
transmits both TCP and UDP traffic in parallel. The e1000 device
sometimes ends up using TCP-specific parameters while doing checksum
offload on a UDP datagram: it writes the checksum to offset 16 (the
correct location for a TCP checksum), stomping on two bytes of UDP
data, and leaving the wrong value in the actual UDP checksum field at
offset 6. (Even worse, the host network stack may then recompute the
UDP checksum, "correcting" it to match the corrupt data before sending
it out a physical interface.)
Correct this by tracking the TSO context independently of the non-TSO
context, and selecting the appropriate context based on the TSE flag
in each transmit data descriptor.
Signed-off-by: Ed Swierk <eswierk@skyportsystems.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 02:23:34 +03:00
|
|
|
DBGOUT(TXSUM, "tcp %d tucss %d len %d\n", props->tcp, css, len);
|
|
|
|
if (props->tcp) {
|
|
|
|
sofar = frames * props->mss;
|
2013-11-05 20:38:35 +04:00
|
|
|
stl_be_p(tp->data+css+4, ldl_be_p(tp->data+css+4)+sofar); /* seq */
|
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption
The device is supposed to maintain two distinct contexts for transmit
offloads: one has parameters for both segmentation and checksum
offload, the other only for checksum offload. The guest driver can
send two context descriptors, one for each context (the TSE flag
specifies which). Then the guest can refer to one or the other context
in subsequent transmit data descriptors, depending on what offloads it
wants applied to each packet.
Currently the e1000 device stores just one context, and misinterprets
the TSE flags in the context and data descriptors. This is often okay:
Linux happens to send a fresh context descriptor before every data
descriptor, so forgetting the other context doesn't matter. Windows
does rely on separate contexts for TSO vs. non-TSO packets, but for
mostly-TCP traffic the two contexts have identical TCP-specific
offload parameters so confusing them doesn't matter.
One case where this confusion matters is when a Windows guest sets up
a TSO context for TCP and a non-TSO context for UDP, and then
transmits both TCP and UDP traffic in parallel. The e1000 device
sometimes ends up using TCP-specific parameters while doing checksum
offload on a UDP datagram: it writes the checksum to offset 16 (the
correct location for a TCP checksum), stomping on two bytes of UDP
data, and leaving the wrong value in the actual UDP checksum field at
offset 6. (Even worse, the host network stack may then recompute the
UDP checksum, "correcting" it to match the corrupt data before sending
it out a physical interface.)
Correct this by tracking the TSO context independently of the non-TSO
context, and selecting the appropriate context based on the TSE flag
in each transmit data descriptor.
Signed-off-by: Ed Swierk <eswierk@skyportsystems.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 02:23:34 +03:00
|
|
|
if (props->paylen - sofar > props->mss) {
|
2015-11-11 16:52:39 +03:00
|
|
|
tp->data[css + 13] &= ~9; /* PSH, FIN */
|
2015-11-11 16:52:46 +03:00
|
|
|
} else if (frames) {
|
2016-06-01 11:23:44 +03:00
|
|
|
e1000x_inc_reg_if_not_full(s->mac_reg, TSCTC);
|
2015-11-11 16:52:46 +03:00
|
|
|
}
|
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption
The device is supposed to maintain two distinct contexts for transmit
offloads: one has parameters for both segmentation and checksum
offload, the other only for checksum offload. The guest driver can
send two context descriptors, one for each context (the TSE flag
specifies which). Then the guest can refer to one or the other context
in subsequent transmit data descriptors, depending on what offloads it
wants applied to each packet.
Currently the e1000 device stores just one context, and misinterprets
the TSE flags in the context and data descriptors. This is often okay:
Linux happens to send a fresh context descriptor before every data
descriptor, so forgetting the other context doesn't matter. Windows
does rely on separate contexts for TSO vs. non-TSO packets, but for
mostly-TCP traffic the two contexts have identical TCP-specific
offload parameters so confusing them doesn't matter.
One case where this confusion matters is when a Windows guest sets up
a TSO context for TCP and a non-TSO context for UDP, and then
transmits both TCP and UDP traffic in parallel. The e1000 device
sometimes ends up using TCP-specific parameters while doing checksum
offload on a UDP datagram: it writes the checksum to offset 16 (the
correct location for a TCP checksum), stomping on two bytes of UDP
data, and leaving the wrong value in the actual UDP checksum field at
offset 6. (Even worse, the host network stack may then recompute the
UDP checksum, "correcting" it to match the corrupt data before sending
it out a physical interface.)
Correct this by tracking the TSO context independently of the non-TSO
context, and selecting the appropriate context based on the TSE flag
in each transmit data descriptor.
Signed-off-by: Ed Swierk <eswierk@skyportsystems.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 02:23:34 +03:00
|
|
|
} else { /* UDP */
|
2013-11-05 20:38:34 +04:00
|
|
|
stw_be_p(tp->data+css+4, len);
|
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption
The device is supposed to maintain two distinct contexts for transmit
offloads: one has parameters for both segmentation and checksum
offload, the other only for checksum offload. The guest driver can
send two context descriptors, one for each context (the TSE flag
specifies which). Then the guest can refer to one or the other context
in subsequent transmit data descriptors, depending on what offloads it
wants applied to each packet.
Currently the e1000 device stores just one context, and misinterprets
the TSE flags in the context and data descriptors. This is often okay:
Linux happens to send a fresh context descriptor before every data
descriptor, so forgetting the other context doesn't matter. Windows
does rely on separate contexts for TSO vs. non-TSO packets, but for
mostly-TCP traffic the two contexts have identical TCP-specific
offload parameters so confusing them doesn't matter.
One case where this confusion matters is when a Windows guest sets up
a TSO context for TCP and a non-TSO context for UDP, and then
transmits both TCP and UDP traffic in parallel. The e1000 device
sometimes ends up using TCP-specific parameters while doing checksum
offload on a UDP datagram: it writes the checksum to offset 16 (the
correct location for a TCP checksum), stomping on two bytes of UDP
data, and leaving the wrong value in the actual UDP checksum field at
offset 6. (Even worse, the host network stack may then recompute the
UDP checksum, "correcting" it to match the corrupt data before sending
it out a physical interface.)
Correct this by tracking the TSO context independently of the non-TSO
context, and selecting the appropriate context based on the TSE flag
in each transmit data descriptor.
Signed-off-by: Ed Swierk <eswierk@skyportsystems.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 02:23:34 +03:00
|
|
|
}
|
2017-11-15 02:23:33 +03:00
|
|
|
if (tp->sum_needed & E1000_TXD_POPTS_TXSM) {
|
2010-11-05 23:52:08 +03:00
|
|
|
unsigned int phsum;
|
2008-02-03 05:20:18 +03:00
|
|
|
// add pseudo-header length before checksum calculation
|
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption
The device is supposed to maintain two distinct contexts for transmit
offloads: one has parameters for both segmentation and checksum
offload, the other only for checksum offload. The guest driver can
send two context descriptors, one for each context (the TSE flag
specifies which). Then the guest can refer to one or the other context
in subsequent transmit data descriptors, depending on what offloads it
wants applied to each packet.
Currently the e1000 device stores just one context, and misinterprets
the TSE flags in the context and data descriptors. This is often okay:
Linux happens to send a fresh context descriptor before every data
descriptor, so forgetting the other context doesn't matter. Windows
does rely on separate contexts for TSO vs. non-TSO packets, but for
mostly-TCP traffic the two contexts have identical TCP-specific
offload parameters so confusing them doesn't matter.
One case where this confusion matters is when a Windows guest sets up
a TSO context for TCP and a non-TSO context for UDP, and then
transmits both TCP and UDP traffic in parallel. The e1000 device
sometimes ends up using TCP-specific parameters while doing checksum
offload on a UDP datagram: it writes the checksum to offset 16 (the
correct location for a TCP checksum), stomping on two bytes of UDP
data, and leaving the wrong value in the actual UDP checksum field at
offset 6. (Even worse, the host network stack may then recompute the
UDP checksum, "correcting" it to match the corrupt data before sending
it out a physical interface.)
Correct this by tracking the TSO context independently of the non-TSO
context, and selecting the appropriate context based on the TSE flag
in each transmit data descriptor.
Signed-off-by: Ed Swierk <eswierk@skyportsystems.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 02:23:34 +03:00
|
|
|
void *sp = tp->data + props->tucso;
|
2016-06-16 20:17:26 +03:00
|
|
|
|
|
|
|
phsum = lduw_be_p(sp) + len;
|
2010-11-05 23:52:08 +03:00
|
|
|
phsum = (phsum >> 16) + (phsum & 0xffff);
|
2013-11-05 20:38:34 +04:00
|
|
|
stw_be_p(sp, phsum);
|
2008-02-03 05:20:18 +03:00
|
|
|
}
|
|
|
|
tp->tso_frames++;
|
|
|
|
}
|
|
|
|
|
2017-11-15 02:23:33 +03:00
|
|
|
if (tp->sum_needed & E1000_TXD_POPTS_TXSM) {
|
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption
The device is supposed to maintain two distinct contexts for transmit
offloads: one has parameters for both segmentation and checksum
offload, the other only for checksum offload. The guest driver can
send two context descriptors, one for each context (the TSE flag
specifies which). Then the guest can refer to one or the other context
in subsequent transmit data descriptors, depending on what offloads it
wants applied to each packet.
Currently the e1000 device stores just one context, and misinterprets
the TSE flags in the context and data descriptors. This is often okay:
Linux happens to send a fresh context descriptor before every data
descriptor, so forgetting the other context doesn't matter. Windows
does rely on separate contexts for TSO vs. non-TSO packets, but for
mostly-TCP traffic the two contexts have identical TCP-specific
offload parameters so confusing them doesn't matter.
One case where this confusion matters is when a Windows guest sets up
a TSO context for TCP and a non-TSO context for UDP, and then
transmits both TCP and UDP traffic in parallel. The e1000 device
sometimes ends up using TCP-specific parameters while doing checksum
offload on a UDP datagram: it writes the checksum to offset 16 (the
correct location for a TCP checksum), stomping on two bytes of UDP
data, and leaving the wrong value in the actual UDP checksum field at
offset 6. (Even worse, the host network stack may then recompute the
UDP checksum, "correcting" it to match the corrupt data before sending
it out a physical interface.)
Correct this by tracking the TSO context independently of the non-TSO
context, and selecting the appropriate context based on the TSE flag
in each transmit data descriptor.
Signed-off-by: Ed Swierk <eswierk@skyportsystems.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 02:23:34 +03:00
|
|
|
putsum(tp->data, tp->size, props->tucso, props->tucss, props->tucse);
|
2016-06-01 11:23:44 +03:00
|
|
|
}
|
2017-11-15 02:23:33 +03:00
|
|
|
if (tp->sum_needed & E1000_TXD_POPTS_IXSM) {
|
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption
The device is supposed to maintain two distinct contexts for transmit
offloads: one has parameters for both segmentation and checksum
offload, the other only for checksum offload. The guest driver can
send two context descriptors, one for each context (the TSE flag
specifies which). Then the guest can refer to one or the other context
in subsequent transmit data descriptors, depending on what offloads it
wants applied to each packet.
Currently the e1000 device stores just one context, and misinterprets
the TSE flags in the context and data descriptors. This is often okay:
Linux happens to send a fresh context descriptor before every data
descriptor, so forgetting the other context doesn't matter. Windows
does rely on separate contexts for TSO vs. non-TSO packets, but for
mostly-TCP traffic the two contexts have identical TCP-specific
offload parameters so confusing them doesn't matter.
One case where this confusion matters is when a Windows guest sets up
a TSO context for TCP and a non-TSO context for UDP, and then
transmits both TCP and UDP traffic in parallel. The e1000 device
sometimes ends up using TCP-specific parameters while doing checksum
offload on a UDP datagram: it writes the checksum to offset 16 (the
correct location for a TCP checksum), stomping on two bytes of UDP
data, and leaving the wrong value in the actual UDP checksum field at
offset 6. (Even worse, the host network stack may then recompute the
UDP checksum, "correcting" it to match the corrupt data before sending
it out a physical interface.)
Correct this by tracking the TSO context independently of the non-TSO
context, and selecting the appropriate context based on the TSE flag
in each transmit data descriptor.
Signed-off-by: Ed Swierk <eswierk@skyportsystems.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 02:23:34 +03:00
|
|
|
putsum(tp->data, tp->size, props->ipcso, props->ipcss, props->ipcse);
|
2016-06-01 11:23:44 +03:00
|
|
|
}
|
2008-11-21 19:25:17 +03:00
|
|
|
if (tp->vlan_needed) {
|
2009-11-19 21:44:55 +03:00
|
|
|
memmove(tp->vlan, tp->data, 4);
|
|
|
|
memmove(tp->data, tp->data + 4, 8);
|
2008-11-21 19:25:17 +03:00
|
|
|
memcpy(tp->data + 8, tp->vlan_header, 4);
|
2012-03-22 14:01:59 +04:00
|
|
|
e1000_send_packet(s, tp->vlan, tp->size + 4);
|
2015-11-11 16:52:39 +03:00
|
|
|
} else {
|
2012-03-22 14:01:59 +04:00
|
|
|
e1000_send_packet(s, tp->data, tp->size);
|
2015-11-11 16:52:39 +03:00
|
|
|
}
|
|
|
|
|
2016-06-01 11:23:44 +03:00
|
|
|
e1000x_inc_reg_if_not_full(s->mac_reg, TPT);
|
2023-02-23 13:20:15 +03:00
|
|
|
e1000x_grow_8reg_if_not_full(s->mac_reg, TOTL, s->tx.size + 4);
|
2023-04-10 18:27:48 +03:00
|
|
|
e1000x_inc_reg_if_not_full(s->mac_reg, GPTC);
|
|
|
|
e1000x_grow_8reg_if_not_full(s->mac_reg, GOTCL, s->tx.size + 4);
|
2008-02-03 05:20:18 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
process_tx_desc(E1000State *s, struct e1000_tx_desc *dp)
|
|
|
|
{
|
2013-06-30 14:55:52 +04:00
|
|
|
PCIDevice *d = PCI_DEVICE(s);
|
2008-02-03 05:20:18 +03:00
|
|
|
uint32_t txd_lower = le32_to_cpu(dp->lower.data);
|
|
|
|
uint32_t dtype = txd_lower & (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D);
|
2016-06-01 11:23:44 +03:00
|
|
|
unsigned int split_size = txd_lower & 0xffff, bytes, sz;
|
2013-06-04 12:49:48 +04:00
|
|
|
unsigned int msh = 0xfffff;
|
2008-02-03 05:20:18 +03:00
|
|
|
uint64_t addr;
|
|
|
|
struct e1000_context_desc *xp = (struct e1000_context_desc *)dp;
|
|
|
|
struct e1000_tx *tp = &s->tx;
|
|
|
|
|
e1000: add interrupt mitigation support
This patch partially implements the e1000 interrupt mitigation mechanisms.
Using a single QEMUTimer, it emulates the ITR register (which is the newer
mitigation register, recommended by Intel) and approximately emulates
RADV and TADV registers. TIDV and RDTR register functionalities are not
emulated (RDTR is only used to validate RADV, according to the e1000 specs).
RADV, TADV, TIDV and RDTR registers make up the older e1000 mitigation
mechanism and would need a timer each to be completely emulated. However,
a single timer has been used in order to reach a good compromise between
emulation accuracy and simplicity/efficiency.
The implemented mechanism can be enabled/disabled specifying the command
line e1000-specific boolean parameter "mitigation", e.g.
qemu-system-x86_64 -device e1000,mitigation=on,... ...
For more information, see the Software developer's manual at
http://download.intel.com/design/network/manuals/8254x_GBe_SDM.pdf.
Interrupt mitigation boosts performance when the guest suffers from
a high interrupt rate (e.g. receiving short UDP packets at high packet
rate). For some numerical results see the following link
http://info.iet.unipi.it/~luigi/papers/20130520-rizzo-vm.pdf
Signed-off-by: Vincenzo Maffione <v.maffione@gmail.com>
Reviewed-by: Andreas Färber <afaerber@suse.de> (for pc-* machines)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2013-08-02 20:30:52 +04:00
|
|
|
s->mit_ide |= (txd_lower & E1000_TXD_CMD_IDE);
|
2015-11-11 16:52:39 +03:00
|
|
|
if (dtype == E1000_TXD_CMD_DEXT) { /* context descriptor */
|
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption
The device is supposed to maintain two distinct contexts for transmit
offloads: one has parameters for both segmentation and checksum
offload, the other only for checksum offload. The guest driver can
send two context descriptors, one for each context (the TSE flag
specifies which). Then the guest can refer to one or the other context
in subsequent transmit data descriptors, depending on what offloads it
wants applied to each packet.
Currently the e1000 device stores just one context, and misinterprets
the TSE flags in the context and data descriptors. This is often okay:
Linux happens to send a fresh context descriptor before every data
descriptor, so forgetting the other context doesn't matter. Windows
does rely on separate contexts for TSO vs. non-TSO packets, but for
mostly-TCP traffic the two contexts have identical TCP-specific
offload parameters so confusing them doesn't matter.
One case where this confusion matters is when a Windows guest sets up
a TSO context for TCP and a non-TSO context for UDP, and then
transmits both TCP and UDP traffic in parallel. The e1000 device
sometimes ends up using TCP-specific parameters while doing checksum
offload on a UDP datagram: it writes the checksum to offset 16 (the
correct location for a TCP checksum), stomping on two bytes of UDP
data, and leaving the wrong value in the actual UDP checksum field at
offset 6. (Even worse, the host network stack may then recompute the
UDP checksum, "correcting" it to match the corrupt data before sending
it out a physical interface.)
Correct this by tracking the TSO context independently of the non-TSO
context, and selecting the appropriate context based on the TSE flag
in each transmit data descriptor.
Signed-off-by: Ed Swierk <eswierk@skyportsystems.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 02:23:34 +03:00
|
|
|
if (le32_to_cpu(xp->cmd_and_length) & E1000_TXD_CMD_TSE) {
|
|
|
|
e1000x_read_tx_ctx_descr(xp, &tp->tso_props);
|
2018-03-28 19:36:29 +03:00
|
|
|
s->use_tso_for_migration = 1;
|
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption
The device is supposed to maintain two distinct contexts for transmit
offloads: one has parameters for both segmentation and checksum
offload, the other only for checksum offload. The guest driver can
send two context descriptors, one for each context (the TSE flag
specifies which). Then the guest can refer to one or the other context
in subsequent transmit data descriptors, depending on what offloads it
wants applied to each packet.
Currently the e1000 device stores just one context, and misinterprets
the TSE flags in the context and data descriptors. This is often okay:
Linux happens to send a fresh context descriptor before every data
descriptor, so forgetting the other context doesn't matter. Windows
does rely on separate contexts for TSO vs. non-TSO packets, but for
mostly-TCP traffic the two contexts have identical TCP-specific
offload parameters so confusing them doesn't matter.
One case where this confusion matters is when a Windows guest sets up
a TSO context for TCP and a non-TSO context for UDP, and then
transmits both TCP and UDP traffic in parallel. The e1000 device
sometimes ends up using TCP-specific parameters while doing checksum
offload on a UDP datagram: it writes the checksum to offset 16 (the
correct location for a TCP checksum), stomping on two bytes of UDP
data, and leaving the wrong value in the actual UDP checksum field at
offset 6. (Even worse, the host network stack may then recompute the
UDP checksum, "correcting" it to match the corrupt data before sending
it out a physical interface.)
Correct this by tracking the TSO context independently of the non-TSO
context, and selecting the appropriate context based on the TSE flag
in each transmit data descriptor.
Signed-off-by: Ed Swierk <eswierk@skyportsystems.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 02:23:34 +03:00
|
|
|
tp->tso_frames = 0;
|
|
|
|
} else {
|
|
|
|
e1000x_read_tx_ctx_descr(xp, &tp->props);
|
2018-03-28 19:36:29 +03:00
|
|
|
s->use_tso_for_migration = 0;
|
2008-02-03 05:20:18 +03:00
|
|
|
}
|
|
|
|
return;
|
2008-07-16 16:39:45 +04:00
|
|
|
} else if (dtype == (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D)) {
|
|
|
|
// data descriptor
|
2011-03-07 23:04:07 +03:00
|
|
|
if (tp->size == 0) {
|
2017-11-15 02:23:33 +03:00
|
|
|
tp->sum_needed = le32_to_cpu(dp->upper.data) >> 8;
|
2011-03-07 23:04:07 +03:00
|
|
|
}
|
2017-11-15 02:23:33 +03:00
|
|
|
tp->cptse = (txd_lower & E1000_TXD_CMD_TSE) ? 1 : 0;
|
2010-11-11 18:10:04 +03:00
|
|
|
} else {
|
2008-07-16 16:39:45 +04:00
|
|
|
// legacy descriptor
|
2017-11-15 02:23:33 +03:00
|
|
|
tp->cptse = 0;
|
2010-11-11 18:10:04 +03:00
|
|
|
}
|
2008-02-03 05:20:18 +03:00
|
|
|
|
2016-06-01 11:23:44 +03:00
|
|
|
if (e1000x_vlan_enabled(s->mac_reg) &&
|
|
|
|
e1000x_is_vlan_txd(txd_lower) &&
|
2017-11-15 02:23:33 +03:00
|
|
|
(tp->cptse || txd_lower & E1000_TXD_CMD_EOP)) {
|
2008-11-21 19:25:17 +03:00
|
|
|
tp->vlan_needed = 1;
|
2013-11-05 20:38:34 +04:00
|
|
|
stw_be_p(tp->vlan_header,
|
2015-03-13 08:21:59 +03:00
|
|
|
le16_to_cpu(s->mac_reg[VET]));
|
2013-11-05 20:38:34 +04:00
|
|
|
stw_be_p(tp->vlan_header + 2,
|
2008-11-21 19:25:17 +03:00
|
|
|
le16_to_cpu(dp->upper.fields.special));
|
|
|
|
}
|
2015-11-11 16:52:39 +03:00
|
|
|
|
2008-02-03 05:20:18 +03:00
|
|
|
addr = le64_to_cpu(dp->buffer_addr);
|
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption
The device is supposed to maintain two distinct contexts for transmit
offloads: one has parameters for both segmentation and checksum
offload, the other only for checksum offload. The guest driver can
send two context descriptors, one for each context (the TSE flag
specifies which). Then the guest can refer to one or the other context
in subsequent transmit data descriptors, depending on what offloads it
wants applied to each packet.
Currently the e1000 device stores just one context, and misinterprets
the TSE flags in the context and data descriptors. This is often okay:
Linux happens to send a fresh context descriptor before every data
descriptor, so forgetting the other context doesn't matter. Windows
does rely on separate contexts for TSO vs. non-TSO packets, but for
mostly-TCP traffic the two contexts have identical TCP-specific
offload parameters so confusing them doesn't matter.
One case where this confusion matters is when a Windows guest sets up
a TSO context for TCP and a non-TSO context for UDP, and then
transmits both TCP and UDP traffic in parallel. The e1000 device
sometimes ends up using TCP-specific parameters while doing checksum
offload on a UDP datagram: it writes the checksum to offset 16 (the
correct location for a TCP checksum), stomping on two bytes of UDP
data, and leaving the wrong value in the actual UDP checksum field at
offset 6. (Even worse, the host network stack may then recompute the
UDP checksum, "correcting" it to match the corrupt data before sending
it out a physical interface.)
Correct this by tracking the TSO context independently of the non-TSO
context, and selecting the appropriate context based on the TSE flag
in each transmit data descriptor.
Signed-off-by: Ed Swierk <eswierk@skyportsystems.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 02:23:34 +03:00
|
|
|
if (tp->cptse) {
|
|
|
|
msh = tp->tso_props.hdr_len + tp->tso_props.mss;
|
2008-07-16 16:39:45 +04:00
|
|
|
do {
|
|
|
|
bytes = split_size;
|
2021-02-24 08:45:28 +03:00
|
|
|
if (tp->size >= msh) {
|
|
|
|
goto eop;
|
|
|
|
}
|
2008-07-16 16:39:45 +04:00
|
|
|
if (tp->size + bytes > msh)
|
|
|
|
bytes = msh - tp->size;
|
2012-01-23 17:30:43 +04:00
|
|
|
|
|
|
|
bytes = MIN(sizeof(tp->data) - tp->size, bytes);
|
2013-06-30 14:55:52 +04:00
|
|
|
pci_dma_read(d, addr, tp->data + tp->size, bytes);
|
2013-06-04 12:49:48 +04:00
|
|
|
sz = tp->size + bytes;
|
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption
The device is supposed to maintain two distinct contexts for transmit
offloads: one has parameters for both segmentation and checksum
offload, the other only for checksum offload. The guest driver can
send two context descriptors, one for each context (the TSE flag
specifies which). Then the guest can refer to one or the other context
in subsequent transmit data descriptors, depending on what offloads it
wants applied to each packet.
Currently the e1000 device stores just one context, and misinterprets
the TSE flags in the context and data descriptors. This is often okay:
Linux happens to send a fresh context descriptor before every data
descriptor, so forgetting the other context doesn't matter. Windows
does rely on separate contexts for TSO vs. non-TSO packets, but for
mostly-TCP traffic the two contexts have identical TCP-specific
offload parameters so confusing them doesn't matter.
One case where this confusion matters is when a Windows guest sets up
a TSO context for TCP and a non-TSO context for UDP, and then
transmits both TCP and UDP traffic in parallel. The e1000 device
sometimes ends up using TCP-specific parameters while doing checksum
offload on a UDP datagram: it writes the checksum to offset 16 (the
correct location for a TCP checksum), stomping on two bytes of UDP
data, and leaving the wrong value in the actual UDP checksum field at
offset 6. (Even worse, the host network stack may then recompute the
UDP checksum, "correcting" it to match the corrupt data before sending
it out a physical interface.)
Correct this by tracking the TSO context independently of the non-TSO
context, and selecting the appropriate context based on the TSE flag
in each transmit data descriptor.
Signed-off-by: Ed Swierk <eswierk@skyportsystems.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 02:23:34 +03:00
|
|
|
if (sz >= tp->tso_props.hdr_len
|
|
|
|
&& tp->size < tp->tso_props.hdr_len) {
|
|
|
|
memmove(tp->header, tp->data, tp->tso_props.hdr_len);
|
2013-06-04 12:49:48 +04:00
|
|
|
}
|
2008-07-16 16:39:45 +04:00
|
|
|
tp->size = sz;
|
|
|
|
addr += bytes;
|
|
|
|
if (sz == msh) {
|
|
|
|
xmit_seg(s);
|
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption
The device is supposed to maintain two distinct contexts for transmit
offloads: one has parameters for both segmentation and checksum
offload, the other only for checksum offload. The guest driver can
send two context descriptors, one for each context (the TSE flag
specifies which). Then the guest can refer to one or the other context
in subsequent transmit data descriptors, depending on what offloads it
wants applied to each packet.
Currently the e1000 device stores just one context, and misinterprets
the TSE flags in the context and data descriptors. This is often okay:
Linux happens to send a fresh context descriptor before every data
descriptor, so forgetting the other context doesn't matter. Windows
does rely on separate contexts for TSO vs. non-TSO packets, but for
mostly-TCP traffic the two contexts have identical TCP-specific
offload parameters so confusing them doesn't matter.
One case where this confusion matters is when a Windows guest sets up
a TSO context for TCP and a non-TSO context for UDP, and then
transmits both TCP and UDP traffic in parallel. The e1000 device
sometimes ends up using TCP-specific parameters while doing checksum
offload on a UDP datagram: it writes the checksum to offset 16 (the
correct location for a TCP checksum), stomping on two bytes of UDP
data, and leaving the wrong value in the actual UDP checksum field at
offset 6. (Even worse, the host network stack may then recompute the
UDP checksum, "correcting" it to match the corrupt data before sending
it out a physical interface.)
Correct this by tracking the TSO context independently of the non-TSO
context, and selecting the appropriate context based on the TSE flag
in each transmit data descriptor.
Signed-off-by: Ed Swierk <eswierk@skyportsystems.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 02:23:34 +03:00
|
|
|
memmove(tp->data, tp->header, tp->tso_props.hdr_len);
|
|
|
|
tp->size = tp->tso_props.hdr_len;
|
2008-07-16 16:39:45 +04:00
|
|
|
}
|
2015-09-04 19:21:06 +03:00
|
|
|
split_size -= bytes;
|
|
|
|
} while (bytes && split_size);
|
2008-07-16 16:39:45 +04:00
|
|
|
} else {
|
2012-01-23 17:30:43 +04:00
|
|
|
split_size = MIN(sizeof(tp->data) - tp->size, split_size);
|
2013-06-30 14:55:52 +04:00
|
|
|
pci_dma_read(d, addr, tp->data + tp->size, split_size);
|
2008-07-16 16:39:45 +04:00
|
|
|
tp->size += split_size;
|
2008-02-03 05:20:18 +03:00
|
|
|
}
|
|
|
|
|
2021-02-24 08:45:28 +03:00
|
|
|
eop:
|
2008-02-03 05:20:18 +03:00
|
|
|
if (!(txd_lower & E1000_TXD_CMD_EOP))
|
|
|
|
return;
|
e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption
The device is supposed to maintain two distinct contexts for transmit
offloads: one has parameters for both segmentation and checksum
offload, the other only for checksum offload. The guest driver can
send two context descriptors, one for each context (the TSE flag
specifies which). Then the guest can refer to one or the other context
in subsequent transmit data descriptors, depending on what offloads it
wants applied to each packet.
Currently the e1000 device stores just one context, and misinterprets
the TSE flags in the context and data descriptors. This is often okay:
Linux happens to send a fresh context descriptor before every data
descriptor, so forgetting the other context doesn't matter. Windows
does rely on separate contexts for TSO vs. non-TSO packets, but for
mostly-TCP traffic the two contexts have identical TCP-specific
offload parameters so confusing them doesn't matter.
One case where this confusion matters is when a Windows guest sets up
a TSO context for TCP and a non-TSO context for UDP, and then
transmits both TCP and UDP traffic in parallel. The e1000 device
sometimes ends up using TCP-specific parameters while doing checksum
offload on a UDP datagram: it writes the checksum to offset 16 (the
correct location for a TCP checksum), stomping on two bytes of UDP
data, and leaving the wrong value in the actual UDP checksum field at
offset 6. (Even worse, the host network stack may then recompute the
UDP checksum, "correcting" it to match the corrupt data before sending
it out a physical interface.)
Correct this by tracking the TSO context independently of the non-TSO
context, and selecting the appropriate context based on the TSE flag
in each transmit data descriptor.
Signed-off-by: Ed Swierk <eswierk@skyportsystems.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2017-11-15 02:23:34 +03:00
|
|
|
if (!(tp->cptse && tp->size < tp->tso_props.hdr_len)) {
|
2008-02-03 05:20:18 +03:00
|
|
|
xmit_seg(s);
|
2013-06-04 12:49:48 +04:00
|
|
|
}
|
2008-02-03 05:20:18 +03:00
|
|
|
tp->tso_frames = 0;
|
2017-11-15 02:23:33 +03:00
|
|
|
tp->sum_needed = 0;
|
2008-11-21 19:25:17 +03:00
|
|
|
tp->vlan_needed = 0;
|
2008-02-03 05:20:18 +03:00
|
|
|
tp->size = 0;
|
2017-11-15 02:23:33 +03:00
|
|
|
tp->cptse = 0;
|
2008-02-03 05:20:18 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static uint32_t
|
2011-10-31 10:06:52 +04:00
|
|
|
txdesc_writeback(E1000State *s, dma_addr_t base, struct e1000_tx_desc *dp)
|
2008-02-03 05:20:18 +03:00
|
|
|
{
|
2013-06-30 14:55:52 +04:00
|
|
|
PCIDevice *d = PCI_DEVICE(s);
|
2008-02-03 05:20:18 +03:00
|
|
|
uint32_t txd_upper, txd_lower = le32_to_cpu(dp->lower.data);
|
|
|
|
|
|
|
|
if (!(txd_lower & (E1000_TXD_CMD_RS|E1000_TXD_CMD_RPS)))
|
|
|
|
return 0;
|
|
|
|
txd_upper = (le32_to_cpu(dp->upper.data) | E1000_TXD_STAT_DD) &
|
|
|
|
~(E1000_TXD_STAT_EC | E1000_TXD_STAT_LC | E1000_TXD_STAT_TU);
|
|
|
|
dp->upper.data = cpu_to_le32(txd_upper);
|
2013-06-30 14:55:52 +04:00
|
|
|
pci_dma_write(d, base + ((char *)&dp->upper - (char *)dp),
|
2011-11-04 05:03:33 +04:00
|
|
|
&dp->upper, sizeof(dp->upper));
|
2008-02-03 05:20:18 +03:00
|
|
|
return E1000_ICR_TXDW;
|
|
|
|
}
|
|
|
|
|
2011-03-26 21:37:56 +03:00
|
|
|
/*
 * Return the 64-bit guest address of the transmit descriptor ring:
 * TDBAH supplies bits 63..32, TDBAL supplies the low half with its
 * bottom four bits masked off.
 */
static uint64_t tx_desc_base(E1000State *s)
{
    uint64_t high = s->mac_reg[TDBAH];
    uint64_t low = s->mac_reg[TDBAL] & ~0xf;

    return (high << 32) | low;
}
|
|
|
|
|
2008-02-03 05:20:18 +03:00
|
|
|
static void
|
|
|
|
start_xmit(E1000State *s)
|
|
|
|
{
|
2013-06-30 14:55:52 +04:00
|
|
|
PCIDevice *d = PCI_DEVICE(s);
|
2011-10-31 10:06:52 +04:00
|
|
|
dma_addr_t base;
|
2008-02-03 05:20:18 +03:00
|
|
|
struct e1000_tx_desc desc;
|
|
|
|
uint32_t tdh_start = s->mac_reg[TDH], cause = E1000_ICS_TXQE;
|
|
|
|
|
|
|
|
if (!(s->mac_reg[TCTL] & E1000_TCTL_EN)) {
|
|
|
|
DBGOUT(TX, "tx disabled\n");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2021-10-21 19:10:47 +03:00
|
|
|
if (s->tx.busy) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
s->tx.busy = true;
|
|
|
|
|
2008-02-03 05:20:18 +03:00
|
|
|
while (s->mac_reg[TDH] != s->mac_reg[TDT]) {
|
2011-03-26 21:37:56 +03:00
|
|
|
base = tx_desc_base(s) +
|
2008-02-03 05:20:18 +03:00
|
|
|
sizeof(struct e1000_tx_desc) * s->mac_reg[TDH];
|
2013-06-30 14:55:52 +04:00
|
|
|
pci_dma_read(d, base, &desc, sizeof(desc));
|
2008-02-03 05:20:18 +03:00
|
|
|
|
|
|
|
DBGOUT(TX, "index %d: %p : %x %x\n", s->mac_reg[TDH],
|
2008-05-13 18:35:34 +04:00
|
|
|
(void *)(intptr_t)desc.buffer_addr, desc.lower.data,
|
2008-02-03 05:20:18 +03:00
|
|
|
desc.upper.data);
|
|
|
|
|
|
|
|
process_tx_desc(s, &desc);
|
2011-10-31 10:06:52 +04:00
|
|
|
cause |= txdesc_writeback(s, base, &desc);
|
2008-02-03 05:20:18 +03:00
|
|
|
|
|
|
|
if (++s->mac_reg[TDH] * sizeof(desc) >= s->mac_reg[TDLEN])
|
|
|
|
s->mac_reg[TDH] = 0;
|
|
|
|
/*
|
|
|
|
* the following could happen only if guest sw assigns
|
|
|
|
* bogus values to TDT/TDLEN.
|
|
|
|
* there's nothing too intelligent we could do about this.
|
|
|
|
*/
|
e1000: eliminate infinite loops on out-of-bounds transfer start
The start_xmit() and e1000_receive_iov() functions implement DMA transfers
iterating over a set of descriptors that the guest's e1000 driver
prepares:
- the TDLEN and RDLEN registers store the total size of the descriptor
area,
- while the TDH and RDH registers store the offset (in whole tx / rx
descriptors) into the area where the transfer is supposed to start.
Each time a descriptor is processed, the TDH and RDH register is bumped
(as appropriate for the transfer direction).
QEMU already contains logic to deal with bogus transfers submitted by the
guest:
- Normally, the transmit case wants to increase TDH from its initial value
to TDT. (TDT is allowed to be numerically smaller than the initial TDH
value; wrapping at or above TDLEN bytes to zero is normal.) The failsafe
that QEMU currently has here is a check against reaching the original
TDH value again -- a complete wraparound, which should never happen.
- In the receive case RDH is increased from its initial value until
"total_size" bytes have been received; preferably in a single step, or
in "s->rxbuf_size" byte steps, if the latter is smaller. However, null
RX descriptors are skipped without receiving data, while RDH is
incremented just the same. QEMU tries to prevent an infinite loop
(processing only null RX descriptors) by detecting whether RDH assumes
its original value during the loop. (Again, wrapping from RDLEN to 0 is
normal.)
What both directions miss is that the guest could program TDLEN and RDLEN
so low, and the initial TDH and RDH so high, that these registers will
immediately be truncated to zero, and then never reassume their initial
values in the loop -- a full wraparound will never occur.
The condition that expresses this is:
xdh_start >= s->mac_reg[XDLEN] / sizeof(desc)
i.e., TDH or RDH start out after the last whole rx or tx descriptor that
fits into the TDLEN or RDLEN sized area.
This condition could be checked before we enter the loops, but
pci_dma_read() / pci_dma_write() knows how to fill in buffers safely for
bogus DMA addresses, so we just extend the existing failsafes with the
above condition.
This is CVE-2016-1981.
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Petr Matousek <pmatouse@redhat.com>
Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Cc: Prasad Pandit <ppandit@redhat.com>
Cc: Michael Roth <mdroth@linux.vnet.ibm.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: qemu-stable@nongnu.org
RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1296044
Signed-off-by: Laszlo Ersek <lersek@redhat.com>
Reviewed-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2016-01-19 16:17:20 +03:00
|
|
|
if (s->mac_reg[TDH] == tdh_start ||
|
|
|
|
tdh_start >= s->mac_reg[TDLEN] / sizeof(desc)) {
|
2008-02-03 05:20:18 +03:00
|
|
|
DBGOUT(TXERR, "TDH wraparound @%x, TDT %x, TDLEN %x\n",
|
|
|
|
tdh_start, s->mac_reg[TDT], s->mac_reg[TDLEN]);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2021-10-21 19:10:47 +03:00
|
|
|
s->tx.busy = false;
|
2008-02-03 05:20:18 +03:00
|
|
|
set_ics(s, 0, cause);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
2023-05-23 05:43:06 +03:00
|
|
|
receive_filter(E1000State *s, const void *buf)
|
|
|
|
{
|
|
|
|
return (!e1000x_is_vlan_packet(buf, s->mac_reg[VET]) ||
|
|
|
|
e1000x_rx_vlan_filter(s->mac_reg, PKT_GET_VLAN_HDR(buf))) &&
|
|
|
|
e1000x_rx_group_filter(s->mac_reg, buf);
|
2008-02-03 05:20:18 +03:00
|
|
|
}
|
|
|
|
|
2009-01-08 22:45:50 +03:00
|
|
|
static void
|
2012-07-24 19:35:13 +04:00
|
|
|
e1000_set_link_status(NetClientState *nc)
|
2009-01-08 22:45:50 +03:00
|
|
|
{
|
2013-01-30 15:12:23 +04:00
|
|
|
E1000State *s = qemu_get_nic_opaque(nc);
|
2009-01-08 22:45:50 +03:00
|
|
|
uint32_t old_status = s->mac_reg[STATUS];
|
|
|
|
|
2011-08-17 13:03:14 +04:00
|
|
|
if (nc->link_down) {
|
2016-06-01 11:23:44 +03:00
|
|
|
e1000x_update_regs_on_link_down(s->mac_reg, s->phy_reg);
|
2011-08-17 13:03:14 +04:00
|
|
|
} else {
|
2014-06-19 23:40:51 +04:00
|
|
|
if (have_autoneg(s) &&
|
2023-02-23 13:19:48 +03:00
|
|
|
!(s->phy_reg[MII_BMSR] & MII_BMSR_AN_COMP)) {
|
2016-06-01 11:23:44 +03:00
|
|
|
e1000x_restart_autoneg(s->mac_reg, s->phy_reg, s->autoneg_timer);
|
2014-06-19 19:55:33 +04:00
|
|
|
} else {
|
|
|
|
e1000_link_up(s);
|
|
|
|
}
|
2011-08-17 13:03:14 +04:00
|
|
|
}
|
2009-01-08 22:45:50 +03:00
|
|
|
|
|
|
|
if (s->mac_reg[STATUS] != old_status)
|
|
|
|
set_ics(s, 0, E1000_ICR_LSC);
|
|
|
|
}
|
|
|
|
|
2011-02-15 19:27:55 +03:00
|
|
|
/*
 * Report whether the receive ring currently has room for a packet of
 * @total_size bytes.
 *
 * The number of available descriptors is the distance from RDH to RDT,
 * taking the wrap at RDLEN into account; RDH == RDT means none are
 * available.  Each descriptor is counted as s->rxbuf_size bytes.
 */
static bool e1000_has_rxbufs(E1000State *s, size_t total_size)
{
    int bufs;

    /* Fast path: a packet that fits in one buffer needs just one descriptor. */
    if (total_size <= s->rxbuf_size) {
        return s->mac_reg[RDH] != s->mac_reg[RDT];
    }

    if (s->mac_reg[RDH] == s->mac_reg[RDT]) {
        return false;
    }

    if (s->mac_reg[RDH] < s->mac_reg[RDT]) {
        bufs = s->mac_reg[RDT] - s->mac_reg[RDH];
    } else {
        /* The tail has wrapped behind the head: add the ring length. */
        bufs = s->mac_reg[RDLEN] / sizeof(struct e1000_rx_desc) +
               s->mac_reg[RDT] - s->mac_reg[RDH];
    }

    return total_size <= bufs * s->rxbuf_size;
}
|
|
|
|
|
2020-03-05 20:56:49 +03:00
|
|
|
static bool
|
2012-07-24 19:35:13 +04:00
|
|
|
e1000_can_receive(NetClientState *nc)
|
2011-03-27 15:37:35 +04:00
|
|
|
{
|
2013-01-30 15:12:23 +04:00
|
|
|
E1000State *s = qemu_get_nic_opaque(nc);
|
2011-03-27 15:37:35 +04:00
|
|
|
|
2016-06-01 11:23:44 +03:00
|
|
|
return e1000x_rx_ready(&s->parent_obj, s->mac_reg) &&
|
e1000: Delay flush queue when receive RCTL
Due to too early RCT0 interrput, win10x32 may hang on booting.
This problem can be reproduced by doing power cycle on win10x32 guest.
In our environment, we have 10 win10x32 and stress power cycle.
The problem will happen about 20 rounds.
Below shows some log with comment:
The normal case:
22831@1551928392.984687:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
22831@1551928392.985655:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
22831@1551928392.985801:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
22831@1551928393.056710:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: ICR read: 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
22831@1551928393.077548:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: ICR read: 0
e1000: set_ics 2, ICR 0, IMR 0
e1000: set_ics 2, ICR 2, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
22831@1551928393.102974:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
22831@1551928393.103267:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: RCTL: 255, mac_reg[RCTL] = 0x40002 <- win10x32 says it can handle
RX now
e1000: set_ics 0, ICR 2, IMR 9d <- unmask interrupt
e1000: RCTL: 255, mac_reg[RCTL] = 0x48002
e1000: set_ics 80, ICR 2, IMR 9d <- interrupt and work!
...
The bad case:
27744@1551930483.117766:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
27744@1551930483.118398:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
27744@1551930483.198063:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: ICR read: 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
27744@1551930483.218675:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: ICR read: 0
e1000: set_ics 2, ICR 0, IMR 0
e1000: set_ics 2, ICR 2, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
27744@1551930483.241768:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
27744@1551930483.241979:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: RCTL: 255, mac_reg[RCTL] = 0x40002 <- win10x32 says it can handle
RX now
e1000: set_ics 80, ICR 2, IMR 0 <- flush queue (caused by setting RCTL)
e1000: set_ics 0, ICR 82, IMR 9d <- unmask interrupt and because 0x82&0x9d
!= 0 generate interrupt, hang on here...
To workaround this problem, simply delay flush queue. Also stop receiving
when timer is going to run.
Tested on CentOS, Win7SP1x64 and Win10x32.
Signed-off-by: yuchenlin <yuchenlin@synology.com>
Reviewed-by: Dmitry Fleytman <dmitry.fleytman@gmail.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2019-03-13 09:56:49 +03:00
|
|
|
e1000_has_rxbufs(s, 1) && !timer_pending(s->flush_queue_timer);
|
2011-03-27 15:37:35 +04:00
|
|
|
}
|
|
|
|
|
2011-03-26 21:37:56 +03:00
|
|
|
/*
 * Return the 64-bit guest address of the receive descriptor ring:
 * RDBAH supplies bits 63..32, RDBAL supplies the low half with its
 * bottom four bits masked off.
 */
static uint64_t rx_desc_base(E1000State *s)
{
    uint64_t high = s->mac_reg[RDBAH];
    uint64_t low = s->mac_reg[RDBAL] & ~0xf;

    return (high << 32) | low;
}
|
|
|
|
|
2018-10-16 12:40:45 +03:00
|
|
|
static void
|
|
|
|
e1000_receiver_overrun(E1000State *s, size_t size)
|
|
|
|
{
|
|
|
|
trace_e1000_receiver_overrun(size, s->mac_reg[RDH], s->mac_reg[RDT]);
|
|
|
|
e1000x_inc_reg_if_not_full(s->mac_reg, RNBC);
|
|
|
|
e1000x_inc_reg_if_not_full(s->mac_reg, MPC);
|
|
|
|
set_ics(s, 0, E1000_ICS_RXO);
|
|
|
|
}
|
|
|
|
|
2009-05-18 16:40:55 +04:00
|
|
|
static ssize_t
|
2013-09-12 12:47:37 +04:00
|
|
|
e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt)
|
2008-02-03 05:20:18 +03:00
|
|
|
{
|
2013-01-30 15:12:23 +04:00
|
|
|
E1000State *s = qemu_get_nic_opaque(nc);
|
2013-06-30 14:55:52 +04:00
|
|
|
PCIDevice *d = PCI_DEVICE(s);
|
2008-02-03 05:20:18 +03:00
|
|
|
struct e1000_rx_desc desc;
|
2011-10-31 10:06:52 +04:00
|
|
|
dma_addr_t base;
|
2008-02-03 05:20:18 +03:00
|
|
|
unsigned int n, rdt;
|
|
|
|
uint32_t rdh_start;
|
2008-11-21 19:25:17 +03:00
|
|
|
uint16_t vlan_special = 0;
|
2013-09-12 12:47:37 +04:00
|
|
|
uint8_t vlan_status = 0;
|
2023-02-23 13:19:52 +03:00
|
|
|
uint8_t min_buf[ETH_ZLEN];
|
2013-09-12 12:47:37 +04:00
|
|
|
uint8_t *filter_buf = iov->iov_base;
|
|
|
|
size_t size = iov_size(iov, iovcnt);
|
|
|
|
size_t iov_ofs = 0;
|
2011-02-15 19:27:48 +03:00
|
|
|
size_t desc_offset;
|
|
|
|
size_t desc_size;
|
|
|
|
size_t total_size;
|
2023-05-23 05:42:54 +03:00
|
|
|
eth_pkt_types_e pkt_type;
|
2013-02-05 23:00:21 +04:00
|
|
|
|
2016-06-01 11:23:44 +03:00
|
|
|
if (!e1000x_hw_rx_enabled(s->mac_reg)) {
|
2009-05-18 16:40:55 +04:00
|
|
|
return -1;
|
2013-02-05 23:00:21 +04:00
|
|
|
}
|
2008-02-03 05:20:18 +03:00
|
|
|
|
e1000: Delay flush queue when receive RCTL
Due to a too-early RXT0 interrupt, win10x32 may hang on booting.
This problem can be reproduced by doing power cycle on win10x32 guest.
In our environment, we have 10 win10x32 and stress power cycle.
The problem will happen about 20 rounds.
Below shows some log with comment:
The normal case:
22831@1551928392.984687:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
22831@1551928392.985655:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
22831@1551928392.985801:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
22831@1551928393.056710:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: ICR read: 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
22831@1551928393.077548:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: ICR read: 0
e1000: set_ics 2, ICR 0, IMR 0
e1000: set_ics 2, ICR 2, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
22831@1551928393.102974:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
22831@1551928393.103267:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: RCTL: 255, mac_reg[RCTL] = 0x40002 <- win10x32 says it can handle
RX now
e1000: set_ics 0, ICR 2, IMR 9d <- unmask interrupt
e1000: RCTL: 255, mac_reg[RCTL] = 0x48002
e1000: set_ics 80, ICR 2, IMR 9d <- interrupt and work!
...
The bad case:
27744@1551930483.117766:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
27744@1551930483.118398:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
27744@1551930483.198063:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: ICR read: 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
27744@1551930483.218675:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: ICR read: 0
e1000: set_ics 2, ICR 0, IMR 0
e1000: set_ics 2, ICR 2, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
27744@1551930483.241768:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
27744@1551930483.241979:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: RCTL: 255, mac_reg[RCTL] = 0x40002 <- win10x32 says it can handle
RX now
e1000: set_ics 80, ICR 2, IMR 0 <- flush queue (caused by setting RCTL)
e1000: set_ics 0, ICR 82, IMR 9d <- unmask interrupt and because 0x82&0x9d
!= 0 generate interrupt, hang on here...
To work around this problem, simply delay flushing the queue. Also stop receiving
while the timer is pending.
Tested on CentOS, Win7SP1x64 and Win10x32.
Signed-off-by: yuchenlin <yuchenlin@synology.com>
Reviewed-by: Dmitry Fleytman <dmitry.fleytman@gmail.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2019-03-13 09:56:49 +03:00
|
|
|
if (timer_pending(s->flush_queue_timer)) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2023-06-25 04:53:13 +03:00
|
|
|
if (iov->iov_len < MAXIMUM_ETHERNET_HDR_LEN) {
|
2013-09-12 12:47:37 +04:00
|
|
|
/* This is very unlikely, but may happen. */
|
|
|
|
iov_to_buf(iov, iovcnt, 0, min_buf, MAXIMUM_ETHERNET_HDR_LEN);
|
|
|
|
filter_buf = min_buf;
|
2010-09-19 00:43:45 +04:00
|
|
|
}
|
|
|
|
|
2012-12-03 08:11:22 +04:00
|
|
|
/* Discard oversized packets if !LPE and !SBP. */
|
2016-06-01 11:23:44 +03:00
|
|
|
if (e1000x_is_oversized(s->mac_reg, size)) {
|
2012-12-03 08:11:22 +04:00
|
|
|
return size;
|
|
|
|
}
|
|
|
|
|
2023-05-23 05:43:06 +03:00
|
|
|
if (!receive_filter(s, filter_buf)) {
|
2009-05-18 16:40:55 +04:00
|
|
|
return size;
|
2013-09-12 12:47:37 +04:00
|
|
|
}
|
2008-02-03 05:20:18 +03:00
|
|
|
|
2016-06-01 11:23:44 +03:00
|
|
|
if (e1000x_vlan_enabled(s->mac_reg) &&
|
|
|
|
e1000x_is_vlan_packet(filter_buf, le16_to_cpu(s->mac_reg[VET]))) {
|
2016-06-16 20:17:26 +03:00
|
|
|
vlan_special = cpu_to_le16(lduw_be_p(filter_buf + 14));
|
2013-09-12 12:47:37 +04:00
|
|
|
iov_ofs = 4;
|
|
|
|
if (filter_buf == iov->iov_base) {
|
|
|
|
memmove(filter_buf + 4, filter_buf, 12);
|
|
|
|
} else {
|
|
|
|
iov_from_buf(iov, iovcnt, 4, filter_buf, 12);
|
|
|
|
while (iov->iov_len <= iov_ofs) {
|
|
|
|
iov_ofs -= iov->iov_len;
|
|
|
|
iov++;
|
|
|
|
}
|
|
|
|
}
|
2008-11-21 19:25:17 +03:00
|
|
|
vlan_status = E1000_RXD_STAT_VP;
|
|
|
|
size -= 4;
|
|
|
|
}
|
|
|
|
|
2023-05-23 05:42:54 +03:00
|
|
|
pkt_type = get_eth_packet_type(PKT_GET_ETH_HDR(filter_buf));
|
2008-02-03 05:20:18 +03:00
|
|
|
rdh_start = s->mac_reg[RDH];
|
2011-02-15 19:27:48 +03:00
|
|
|
desc_offset = 0;
|
2016-06-01 11:23:44 +03:00
|
|
|
total_size = size + e1000x_fcs_len(s->mac_reg);
|
2011-02-15 19:27:55 +03:00
|
|
|
if (!e1000_has_rxbufs(s, total_size)) {
|
2018-10-16 12:40:45 +03:00
|
|
|
e1000_receiver_overrun(s, total_size);
|
|
|
|
return -1;
|
2011-02-15 19:27:55 +03:00
|
|
|
}
|
2008-02-03 05:20:18 +03:00
|
|
|
do {
|
2011-02-15 19:27:48 +03:00
|
|
|
desc_size = total_size - desc_offset;
|
|
|
|
if (desc_size > s->rxbuf_size) {
|
|
|
|
desc_size = s->rxbuf_size;
|
|
|
|
}
|
2011-03-26 21:37:56 +03:00
|
|
|
base = rx_desc_base(s) + sizeof(desc) * s->mac_reg[RDH];
|
2013-06-30 14:55:52 +04:00
|
|
|
pci_dma_read(d, base, &desc, sizeof(desc));
|
2008-11-21 19:25:17 +03:00
|
|
|
desc.special = vlan_special;
|
e1000: set RX descriptor status in a separate operation
The code that sets the RX descriptor status field may have worked fine
previously; however, with newer glibc versions it shows two
issues when a guest uses DPDK to receive packets:
1. DPDK occasionally reads a wrong buffer_addr.
The impact may not be obvious, e.g. a packet is lost once in
a while.
2. DPDK may consume a packet twice when it scans the RX descriptor queue
again.
This leads to an infinite wait in QEMU, since RDT
(the tail pointer) is unexpectedly increased until it equals RDH,
which is treated as the RX descriptor queue being full.
Writing the whole RX descriptor with the DD flag set is not quite correct:
when the underlying memcpy implementation uses XMM registers to
copy the e1000_rx_desc (when AVX or a similar CPU feature is available),
the order in which the descriptor's bytes reach memory is indeterminate.
We can use full-scale test case to reproduce the issue-2 by
https://github.com/BASM/qemu_dpdk_e1000_test (thanks to Leonid Myravjev)
I also write a POC test case at https://github.com/cdkey/e1000_poc
which can reproduce both of them, and easy to verify the patch effect.
A hardware watchpoint also shows that when QEMU writes the 16-byte
e1000_rx_desc with XMM instructions concurrently with DPDK writing the
1-byte status with movb, the value that ends up in memory is one
of the two; if it is QEMU's, which has the DD flag set, DPDK will consume
the descriptor again.
Setting DD status in a separate operation, can prevent the impact of
disorder memory writing by memcpy, also avoid unexpected data when
concurrent writing status by qemu and guest dpdk.
Links: https://lore.kernel.org/qemu-devel/20200102110504.GG121208@stefanha-x1.localdomain/T/
Reported-by: Leonid Myravjev <asm@asm.pp.ru>
Cc: Stefan Hajnoczi <stefanha@gmail.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: qemu-stable@nongnu.org
Tested-by: Jing Zhang <zhangjing@sangfor.com.cn>
Reviewed-by: Frank Lee <lifan38153@sangfor.com.cn>
Signed-off-by: Ding Hui <dinghui@sangfor.com.cn>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2022-06-29 12:40:26 +03:00
|
|
|
desc.status &= ~E1000_RXD_STAT_DD;
|
2008-02-03 05:20:18 +03:00
|
|
|
if (desc.buffer_addr) {
|
2011-02-15 19:27:48 +03:00
|
|
|
if (desc_offset < size) {
|
2013-09-12 12:47:37 +04:00
|
|
|
size_t iov_copy;
|
|
|
|
hwaddr ba = le64_to_cpu(desc.buffer_addr);
|
2011-02-15 19:27:48 +03:00
|
|
|
size_t copy_size = size - desc_offset;
|
|
|
|
if (copy_size > s->rxbuf_size) {
|
|
|
|
copy_size = s->rxbuf_size;
|
|
|
|
}
|
2013-09-12 12:47:37 +04:00
|
|
|
do {
|
|
|
|
iov_copy = MIN(copy_size, iov->iov_len - iov_ofs);
|
|
|
|
pci_dma_write(d, ba, iov->iov_base + iov_ofs, iov_copy);
|
|
|
|
copy_size -= iov_copy;
|
|
|
|
ba += iov_copy;
|
|
|
|
iov_ofs += iov_copy;
|
|
|
|
if (iov_ofs == iov->iov_len) {
|
|
|
|
iov++;
|
|
|
|
iov_ofs = 0;
|
|
|
|
}
|
|
|
|
} while (copy_size);
|
2011-02-15 19:27:48 +03:00
|
|
|
}
|
|
|
|
desc_offset += desc_size;
|
2011-02-15 19:27:52 +03:00
|
|
|
desc.length = cpu_to_le16(desc_size);
|
2011-02-15 19:27:48 +03:00
|
|
|
if (desc_offset >= total_size) {
|
|
|
|
desc.status |= E1000_RXD_STAT_EOP | E1000_RXD_STAT_IXSM;
|
|
|
|
} else {
|
2011-02-15 19:27:52 +03:00
|
|
|
/* Guest zeroing out status is not a hardware requirement.
|
|
|
|
Clear EOP in case guest didn't do it. */
|
|
|
|
desc.status &= ~E1000_RXD_STAT_EOP;
|
2011-02-15 19:27:48 +03:00
|
|
|
}
|
2010-11-11 18:10:04 +03:00
|
|
|
} else { // as per intel docs; skip descriptors with null buf addr
|
2008-02-03 05:20:18 +03:00
|
|
|
DBGOUT(RX, "Null RX descriptor!!\n");
|
2010-11-11 18:10:04 +03:00
|
|
|
}
|
2013-06-30 14:55:52 +04:00
|
|
|
pci_dma_write(d, base, &desc, sizeof(desc));
|
e1000: set RX descriptor status in a separate operation
The code of setting RX descriptor status field maybe work fine in
previously, however with the update of glibc version, it shows two
issues when guest using dpdk receive packets:
1. The dpdk has a certain probability getting wrong buffer_addr
this impact may be not obvious, such as lost a packet once in
a while
2. The dpdk may consume a packet twice when scan the RX desc queue
over again
this impact will lead to an infinite wait in QEMU, since the RDT
(tail pointer) is unexpectedly increased until it equals RDH,
which is interpreted as the RX desc queue being full
Write a whole of RX desc with DD flag on is not quite correct, because
when the underlying implementation of memcpy using XMM registers to
copy e1000_rx_desc (when AVX or something else CPU feature is usable),
the byte order in which the desc is written to memory is indeterminate
We can use full-scale test case to reproduce the issue-2 by
https://github.com/BASM/qemu_dpdk_e1000_test (thanks to Leonid Myravjev)
I also write a POC test case at https://github.com/cdkey/e1000_poc
which can reproduce both of them, and easy to verify the patch effect.
The hw watchpoint also shows that, when Qemu using XMM related instructions
writing 16 bytes e1000_rx_desc, concurrent with DPDK using movb
writing 1 byte status, the final result of writing to memory will be one
of them, if it made by Qemu which DD flag is on, DPDK will consume it
again.
Setting DD status in a separate operation, can prevent the impact of
disorder memory writing by memcpy, also avoid unexpected data when
concurrent writing status by qemu and guest dpdk.
Links: https://lore.kernel.org/qemu-devel/20200102110504.GG121208@stefanha-x1.localdomain/T/
Reported-by: Leonid Myravjev <asm@asm.pp.ru>
Cc: Stefan Hajnoczi <stefanha@gmail.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: qemu-stable@nongnu.org
Tested-by: Jing Zhang <zhangjing@sangfor.com.cn>
Reviewed-by: Frank Lee <lifan38153@sangfor.com.cn>
Signed-off-by: Ding Hui <dinghui@sangfor.com.cn>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2022-06-29 12:40:26 +03:00
|
|
|
desc.status |= (vlan_status | E1000_RXD_STAT_DD);
|
|
|
|
pci_dma_write(d, base + offsetof(struct e1000_rx_desc, status),
|
|
|
|
&desc.status, sizeof(desc.status));
|
2008-02-03 05:20:18 +03:00
|
|
|
|
|
|
|
if (++s->mac_reg[RDH] * sizeof(desc) >= s->mac_reg[RDLEN])
|
|
|
|
s->mac_reg[RDH] = 0;
|
|
|
|
/* see comment in start_xmit; same here */
|
e1000: eliminate infinite loops on out-of-bounds transfer start
The start_xmit() and e1000_receive_iov() functions implement DMA transfers
iterating over a set of descriptors that the guest's e1000 driver
prepares:
- the TDLEN and RDLEN registers store the total size of the descriptor
area,
- while the TDH and RDH registers store the offset (in whole tx / rx
descriptors) into the area where the transfer is supposed to start.
Each time a descriptor is processed, the TDH and RDH register is bumped
(as appropriate for the transfer direction).
QEMU already contains logic to deal with bogus transfers submitted by the
guest:
- Normally, the transmit case wants to increase TDH from its initial value
to TDT. (TDT is allowed to be numerically smaller than the initial TDH
value; wrapping at or above TDLEN bytes to zero is normal.) The failsafe
that QEMU currently has here is a check against reaching the original
TDH value again -- a complete wraparound, which should never happen.
- In the receive case RDH is increased from its initial value until
"total_size" bytes have been received; preferably in a single step, or
in "s->rxbuf_size" byte steps, if the latter is smaller. However, null
RX descriptors are skipped without receiving data, while RDH is
incremented just the same. QEMU tries to prevent an infinite loop
(processing only null RX descriptors) by detecting whether RDH assumes
its original value during the loop. (Again, wrapping from RDLEN to 0 is
normal.)
What both directions miss is that the guest could program TDLEN and RDLEN
so low, and the initial TDH and RDH so high, that these registers will
immediately be truncated to zero, and then never reassume their initial
values in the loop -- a full wraparound will never occur.
The condition that expresses this is:
xdh_start >= s->mac_reg[XDLEN] / sizeof(desc)
i.e., TDH or RDH start out after the last whole rx or tx descriptor that
fits into the TDLEN or RDLEN sized area.
This condition could be checked before we enter the loops, but
pci_dma_read() / pci_dma_write() knows how to fill in buffers safely for
bogus DMA addresses, so we just extend the existing failsafes with the
above condition.
This is CVE-2016-1981.
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Petr Matousek <pmatouse@redhat.com>
Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Cc: Prasad Pandit <ppandit@redhat.com>
Cc: Michael Roth <mdroth@linux.vnet.ibm.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: qemu-stable@nongnu.org
RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1296044
Signed-off-by: Laszlo Ersek <lersek@redhat.com>
Reviewed-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2016-01-19 16:17:20 +03:00
|
|
|
if (s->mac_reg[RDH] == rdh_start ||
|
|
|
|
rdh_start >= s->mac_reg[RDLEN] / sizeof(desc)) {
|
2008-02-03 05:20:18 +03:00
|
|
|
DBGOUT(RXERR, "RDH wraparound @%x, RDT %x, RDLEN %x\n",
|
|
|
|
rdh_start, s->mac_reg[RDT], s->mac_reg[RDLEN]);
|
2018-10-16 12:40:45 +03:00
|
|
|
e1000_receiver_overrun(s, total_size);
|
2009-05-18 16:40:55 +04:00
|
|
|
return -1;
|
2008-02-03 05:20:18 +03:00
|
|
|
}
|
2011-02-15 19:27:48 +03:00
|
|
|
} while (desc_offset < total_size);
|
2008-02-03 05:20:18 +03:00
|
|
|
|
2023-05-23 05:42:54 +03:00
|
|
|
e1000x_update_rx_total_stats(s->mac_reg, pkt_type, size, total_size);
|
2008-02-03 05:20:18 +03:00
|
|
|
|
|
|
|
n = E1000_ICS_RXT0;
|
|
|
|
if ((rdt = s->mac_reg[RDT]) < s->mac_reg[RDH])
|
|
|
|
rdt += s->mac_reg[RDLEN] / sizeof(desc);
|
2009-03-20 19:13:47 +03:00
|
|
|
if (((rdt - s->mac_reg[RDH]) * sizeof(desc)) <= s->mac_reg[RDLEN] >>
|
|
|
|
s->rxbuf_min_shift)
|
2008-02-03 05:20:18 +03:00
|
|
|
n |= E1000_ICS_RXDMT0;
|
|
|
|
|
|
|
|
set_ics(s, 0, n);
|
2009-05-18 16:40:55 +04:00
|
|
|
|
|
|
|
return size;
|
2008-02-03 05:20:18 +03:00
|
|
|
}
|
|
|
|
|
2013-09-12 12:47:37 +04:00
|
|
|
static ssize_t
|
|
|
|
e1000_receive(NetClientState *nc, const uint8_t *buf, size_t size)
|
|
|
|
{
|
|
|
|
const struct iovec iov = {
|
|
|
|
.iov_base = (uint8_t *)buf,
|
|
|
|
.iov_len = size
|
|
|
|
};
|
|
|
|
|
|
|
|
return e1000_receive_iov(nc, &iov, 1);
|
|
|
|
}
|
|
|
|
|
2008-02-03 05:20:18 +03:00
|
|
|
static uint32_t
|
|
|
|
mac_readreg(E1000State *s, int index)
|
|
|
|
{
|
|
|
|
return s->mac_reg[index];
|
|
|
|
}
|
|
|
|
|
|
|
|
static uint32_t
|
|
|
|
mac_icr_read(E1000State *s, int index)
|
|
|
|
{
|
|
|
|
uint32_t ret = s->mac_reg[ICR];
|
|
|
|
|
|
|
|
DBGOUT(INTERRUPT, "ICR read: %x\n", ret);
|
|
|
|
set_interrupt_cause(s, 0, 0);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static uint32_t
|
|
|
|
mac_read_clr4(E1000State *s, int index)
|
|
|
|
{
|
|
|
|
uint32_t ret = s->mac_reg[index];
|
|
|
|
|
|
|
|
s->mac_reg[index] = 0;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static uint32_t
|
|
|
|
mac_read_clr8(E1000State *s, int index)
|
|
|
|
{
|
|
|
|
uint32_t ret = s->mac_reg[index];
|
|
|
|
|
|
|
|
s->mac_reg[index] = 0;
|
|
|
|
s->mac_reg[index-1] = 0;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
mac_writereg(E1000State *s, int index, uint32_t val)
|
|
|
|
{
|
2013-10-17 11:02:49 +04:00
|
|
|
uint32_t macaddr[2];
|
|
|
|
|
2008-02-03 05:20:18 +03:00
|
|
|
s->mac_reg[index] = val;
|
2013-10-17 11:02:49 +04:00
|
|
|
|
2013-11-18 23:41:44 +04:00
|
|
|
if (index == RA + 1) {
|
2013-10-17 11:02:49 +04:00
|
|
|
macaddr[0] = cpu_to_le32(s->mac_reg[RA]);
|
|
|
|
macaddr[1] = cpu_to_le32(s->mac_reg[RA + 1]);
|
|
|
|
qemu_format_nic_info_str(qemu_get_queue(s->nic), (uint8_t *)macaddr);
|
|
|
|
}
|
2008-02-03 05:20:18 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
set_rdt(E1000State *s, int index, uint32_t val)
|
|
|
|
{
|
|
|
|
s->mac_reg[index] = val & 0xffff;
|
2012-08-09 18:45:56 +04:00
|
|
|
if (e1000_has_rxbufs(s, 1)) {
|
2013-01-30 15:12:22 +04:00
|
|
|
qemu_flush_queued_packets(qemu_get_queue(s->nic));
|
2012-08-09 18:45:56 +04:00
|
|
|
}
|
2008-02-03 05:20:18 +03:00
|
|
|
}
|
|
|
|
|
2023-02-23 13:19:49 +03:00
|
|
|
#define LOW_BITS_SET_FUNC(num) \
|
|
|
|
static void \
|
|
|
|
set_##num##bit(E1000State *s, int index, uint32_t val) \
|
|
|
|
{ \
|
|
|
|
s->mac_reg[index] = val & (BIT(num) - 1); \
|
|
|
|
}
|
|
|
|
|
|
|
|
LOW_BITS_SET_FUNC(4)
|
|
|
|
LOW_BITS_SET_FUNC(11)
|
|
|
|
LOW_BITS_SET_FUNC(13)
|
|
|
|
LOW_BITS_SET_FUNC(16)
|
2008-02-03 05:20:18 +03:00
|
|
|
|
|
|
|
static void
|
|
|
|
set_dlen(E1000State *s, int index, uint32_t val)
|
|
|
|
{
|
|
|
|
s->mac_reg[index] = val & 0xfff80;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
set_tctl(E1000State *s, int index, uint32_t val)
|
|
|
|
{
|
|
|
|
s->mac_reg[index] = val;
|
|
|
|
s->mac_reg[TDT] &= 0xffff;
|
|
|
|
start_xmit(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
set_icr(E1000State *s, int index, uint32_t val)
|
|
|
|
{
|
|
|
|
DBGOUT(INTERRUPT, "set_icr %x\n", val);
|
|
|
|
set_interrupt_cause(s, 0, s->mac_reg[ICR] & ~val);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
set_imc(E1000State *s, int index, uint32_t val)
|
|
|
|
{
|
|
|
|
s->mac_reg[IMS] &= ~val;
|
|
|
|
set_ics(s, 0, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
set_ims(E1000State *s, int index, uint32_t val)
|
|
|
|
{
|
|
|
|
s->mac_reg[IMS] |= val;
|
|
|
|
set_ics(s, 0, 0);
|
|
|
|
}
|
|
|
|
|
2015-11-11 16:52:39 +03:00
|
|
|
#define getreg(x) [x] = mac_readreg
|
2020-03-05 04:04:44 +03:00
|
|
|
typedef uint32_t (*readops)(E1000State *, int);
|
2020-03-05 04:04:45 +03:00
|
|
|
static const readops macreg_readops[] = {
|
2015-11-11 16:52:39 +03:00
|
|
|
getreg(PBA), getreg(RCTL), getreg(TDH), getreg(TXDCTL),
|
|
|
|
getreg(WUFC), getreg(TDT), getreg(CTRL), getreg(LEDCTL),
|
|
|
|
getreg(MANC), getreg(MDIC), getreg(SWSM), getreg(STATUS),
|
|
|
|
getreg(TORL), getreg(TOTL), getreg(IMS), getreg(TCTL),
|
|
|
|
getreg(RDH), getreg(RDT), getreg(VET), getreg(ICS),
|
|
|
|
getreg(TDBAL), getreg(TDBAH), getreg(RDBAH), getreg(RDBAL),
|
|
|
|
getreg(TDLEN), getreg(RDLEN), getreg(RDTR), getreg(RADV),
|
2015-11-11 16:52:42 +03:00
|
|
|
getreg(TADV), getreg(ITR), getreg(FCRUC), getreg(IPAV),
|
|
|
|
getreg(WUC), getreg(WUS), getreg(SCC), getreg(ECOL),
|
|
|
|
getreg(MCC), getreg(LATECOL), getreg(COLC), getreg(DC),
|
2017-09-03 19:37:26 +03:00
|
|
|
getreg(TNCRS), getreg(SEQEC), getreg(CEXTERR), getreg(RLEC),
|
2015-11-11 16:52:42 +03:00
|
|
|
getreg(XONRXC), getreg(XONTXC), getreg(XOFFRXC), getreg(XOFFTXC),
|
|
|
|
getreg(RFC), getreg(RJC), getreg(RNBC), getreg(TSCTFC),
|
2015-11-11 16:52:46 +03:00
|
|
|
getreg(MGTPRC), getreg(MGTPDC), getreg(MGTPTC), getreg(GORCL),
|
2023-02-23 13:19:49 +03:00
|
|
|
getreg(GOTCL), getreg(RDFH), getreg(RDFT), getreg(RDFHS),
|
|
|
|
getreg(RDFTS), getreg(RDFPC), getreg(TDFH), getreg(TDFT),
|
|
|
|
getreg(TDFHS), getreg(TDFTS), getreg(TDFPC), getreg(AIT),
|
2015-11-11 16:52:39 +03:00
|
|
|
|
|
|
|
[TOTH] = mac_read_clr8, [TORH] = mac_read_clr8,
|
2015-11-11 16:52:46 +03:00
|
|
|
[GOTCH] = mac_read_clr8, [GORCH] = mac_read_clr8,
|
|
|
|
[PRC64] = mac_read_clr4, [PRC127] = mac_read_clr4,
|
|
|
|
[PRC255] = mac_read_clr4, [PRC511] = mac_read_clr4,
|
|
|
|
[PRC1023] = mac_read_clr4, [PRC1522] = mac_read_clr4,
|
|
|
|
[PTC64] = mac_read_clr4, [PTC127] = mac_read_clr4,
|
|
|
|
[PTC255] = mac_read_clr4, [PTC511] = mac_read_clr4,
|
|
|
|
[PTC1023] = mac_read_clr4, [PTC1522] = mac_read_clr4,
|
2015-11-11 16:52:39 +03:00
|
|
|
[GPRC] = mac_read_clr4, [GPTC] = mac_read_clr4,
|
|
|
|
[TPT] = mac_read_clr4, [TPR] = mac_read_clr4,
|
2015-11-11 16:52:46 +03:00
|
|
|
[RUC] = mac_read_clr4, [ROC] = mac_read_clr4,
|
|
|
|
[BPRC] = mac_read_clr4, [MPRC] = mac_read_clr4,
|
|
|
|
[TSCTC] = mac_read_clr4, [BPTC] = mac_read_clr4,
|
|
|
|
[MPTC] = mac_read_clr4,
|
2015-11-11 16:52:39 +03:00
|
|
|
[ICR] = mac_icr_read, [EECD] = get_eecd,
|
|
|
|
[EERD] = flash_eerd_read,
|
|
|
|
|
|
|
|
[CRCERRS ... MPC] = &mac_readreg,
|
2015-11-11 16:52:42 +03:00
|
|
|
[IP6AT ... IP6AT + 3] = &mac_readreg, [IP4AT ... IP4AT + 6] = &mac_readreg,
|
2023-02-23 13:19:49 +03:00
|
|
|
[FFLT ... FFLT + 6] = &mac_readreg,
|
2015-11-11 16:52:39 +03:00
|
|
|
[RA ... RA + 31] = &mac_readreg,
|
2015-11-11 16:52:42 +03:00
|
|
|
[WUPM ... WUPM + 31] = &mac_readreg,
|
2023-02-23 13:19:52 +03:00
|
|
|
[MTA ... MTA + E1000_MC_TBL_SIZE - 1] = &mac_readreg,
|
|
|
|
[VFTA ... VFTA + E1000_VLAN_FILTER_TBL_SIZE - 1] = &mac_readreg,
|
2023-02-23 13:19:49 +03:00
|
|
|
[FFMT ... FFMT + 254] = &mac_readreg,
|
2015-11-11 16:52:42 +03:00
|
|
|
[FFVT ... FFVT + 254] = &mac_readreg,
|
|
|
|
[PBM ... PBM + 16383] = &mac_readreg,
|
2008-02-03 05:20:18 +03:00
|
|
|
};
|
2008-12-22 23:33:55 +03:00
|
|
|
enum { NREADOPS = ARRAY_SIZE(macreg_readops) };
|
2008-02-03 05:20:18 +03:00
|
|
|
|
2015-11-11 16:52:39 +03:00
|
|
|
#define putreg(x) [x] = mac_writereg
|
2020-03-05 04:04:44 +03:00
|
|
|
typedef void (*writeops)(E1000State *, int, uint32_t);
|
2020-03-05 04:04:45 +03:00
|
|
|
static const writeops macreg_writeops[] = {
|
2015-11-11 16:52:39 +03:00
|
|
|
putreg(PBA), putreg(EERD), putreg(SWSM), putreg(WUFC),
|
|
|
|
putreg(TDBAL), putreg(TDBAH), putreg(TXDCTL), putreg(RDBAH),
|
2015-11-11 16:52:42 +03:00
|
|
|
putreg(RDBAL), putreg(LEDCTL), putreg(VET), putreg(FCRUC),
|
2023-02-23 13:19:49 +03:00
|
|
|
putreg(IPAV), putreg(WUC),
|
|
|
|
putreg(WUS),
|
|
|
|
|
|
|
|
[TDLEN] = set_dlen, [RDLEN] = set_dlen, [TCTL] = set_tctl,
|
|
|
|
[TDT] = set_tctl, [MDIC] = set_mdic, [ICS] = set_ics,
|
|
|
|
[TDH] = set_16bit, [RDH] = set_16bit, [RDT] = set_rdt,
|
|
|
|
[IMC] = set_imc, [IMS] = set_ims, [ICR] = set_icr,
|
|
|
|
[EECD] = set_eecd, [RCTL] = set_rx_control, [CTRL] = set_ctrl,
|
|
|
|
[RDTR] = set_16bit, [RADV] = set_16bit, [TADV] = set_16bit,
|
|
|
|
[ITR] = set_16bit, [TDFH] = set_11bit, [TDFT] = set_11bit,
|
|
|
|
[TDFHS] = set_13bit, [TDFTS] = set_13bit, [TDFPC] = set_13bit,
|
|
|
|
[RDFH] = set_13bit, [RDFT] = set_13bit, [RDFHS] = set_13bit,
|
|
|
|
[RDFTS] = set_13bit, [RDFPC] = set_13bit, [AIT] = set_16bit,
|
2015-11-11 16:52:39 +03:00
|
|
|
|
2015-11-11 16:52:42 +03:00
|
|
|
[IP6AT ... IP6AT + 3] = &mac_writereg, [IP4AT ... IP4AT + 6] = &mac_writereg,
|
2023-02-23 13:19:49 +03:00
|
|
|
[FFLT ... FFLT + 6] = &set_11bit,
|
2015-11-11 16:52:39 +03:00
|
|
|
[RA ... RA + 31] = &mac_writereg,
|
2015-11-11 16:52:42 +03:00
|
|
|
[WUPM ... WUPM + 31] = &mac_writereg,
|
2023-02-23 13:19:52 +03:00
|
|
|
[MTA ... MTA + E1000_MC_TBL_SIZE - 1] = &mac_writereg,
|
|
|
|
[VFTA ... VFTA + E1000_VLAN_FILTER_TBL_SIZE - 1] = &mac_writereg,
|
2023-02-23 13:19:49 +03:00
|
|
|
[FFMT ... FFMT + 254] = &set_4bit, [FFVT ... FFVT + 254] = &mac_writereg,
|
2015-11-11 16:52:42 +03:00
|
|
|
[PBM ... PBM + 16383] = &mac_writereg,
|
2008-02-03 05:20:18 +03:00
|
|
|
};
|
2012-03-22 14:02:24 +04:00
|
|
|
|
2008-12-22 23:33:55 +03:00
|
|
|
enum { NWRITEOPS = ARRAY_SIZE(macreg_writeops) };
|
2008-02-03 05:20:18 +03:00
|
|
|
|
e1000: Introduced an array to control the access to the MAC registers
The array of uint8_t's which is introduced here, contains access metadata
about the MAC registers: if a register is accessible, but partly implemented,
or if a register requires a certain compatibility flag in order to be
accessed. Currently, 6 hypothetical flags are supported (3 exist for e1000
so far) but in the future, if more than 6 flags will be needed, the datatype
of this array can simply be swapped for a larger one.
This patch is intended to solve the following current problems:
1) In a scenario of migration between different versions of QEMU, which
differ by the MAC registers implemented in them, some registers need not to
be active if a compatibility flag is set, in order to preserve the machine's
state perfectly for the older version. Checking this for each register
individually, would create a lot of clutter in the code.
2) Some registers are (or may be) only partly implemented (e.g.
placeholders that allow reading and writing, but lack other functions).
In such cases it is better to print a debug warning on read/write attempts.
As above, dealing with this functionality on a per-register level, would
require longer and more messy code.
Signed-off-by: Leonid Bloch <leonid.bloch@ravellosystems.com>
Signed-off-by: Dmitry Fleytman <dmitry.fleytman@ravellosystems.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2015-11-11 16:52:41 +03:00
|
|
|
enum { MAC_ACCESS_PARTIAL = 1, MAC_ACCESS_FLAG_NEEDED = 2 };
|
|
|
|
|
|
|
|
#define markflag(x) ((E1000_FLAG_##x << 2) | MAC_ACCESS_FLAG_NEEDED)
|
|
|
|
/* In the array below the meaning of the bits is: [f|f|f|f|f|f|n|p]
|
|
|
|
* f - flag bits (up to 6 possible flags)
|
|
|
|
* n - flag needed
|
|
|
|
* p - partially implenented */
|
|
|
|
static const uint8_t mac_reg_access[0x8000] = {
|
2015-11-11 16:52:42 +03:00
|
|
|
[IPAV] = markflag(MAC), [WUC] = markflag(MAC),
|
|
|
|
[IP6AT] = markflag(MAC), [IP4AT] = markflag(MAC),
|
|
|
|
[FFVT] = markflag(MAC), [WUPM] = markflag(MAC),
|
|
|
|
[ECOL] = markflag(MAC), [MCC] = markflag(MAC),
|
|
|
|
[DC] = markflag(MAC), [TNCRS] = markflag(MAC),
|
|
|
|
[RLEC] = markflag(MAC), [XONRXC] = markflag(MAC),
|
|
|
|
[XOFFTXC] = markflag(MAC), [RFC] = markflag(MAC),
|
|
|
|
[TSCTFC] = markflag(MAC), [MGTPRC] = markflag(MAC),
|
|
|
|
[WUS] = markflag(MAC), [AIT] = markflag(MAC),
|
|
|
|
[FFLT] = markflag(MAC), [FFMT] = markflag(MAC),
|
|
|
|
[SCC] = markflag(MAC), [FCRUC] = markflag(MAC),
|
|
|
|
[LATECOL] = markflag(MAC), [COLC] = markflag(MAC),
|
2017-09-03 19:37:26 +03:00
|
|
|
[SEQEC] = markflag(MAC), [CEXTERR] = markflag(MAC),
|
2015-11-11 16:52:42 +03:00
|
|
|
[XONTXC] = markflag(MAC), [XOFFRXC] = markflag(MAC),
|
|
|
|
[RJC] = markflag(MAC), [RNBC] = markflag(MAC),
|
|
|
|
[MGTPDC] = markflag(MAC), [MGTPTC] = markflag(MAC),
|
2015-11-11 16:52:46 +03:00
|
|
|
[RUC] = markflag(MAC), [ROC] = markflag(MAC),
|
|
|
|
[GORCL] = markflag(MAC), [GORCH] = markflag(MAC),
|
|
|
|
[GOTCL] = markflag(MAC), [GOTCH] = markflag(MAC),
|
|
|
|
[BPRC] = markflag(MAC), [MPRC] = markflag(MAC),
|
|
|
|
[TSCTC] = markflag(MAC), [PRC64] = markflag(MAC),
|
|
|
|
[PRC127] = markflag(MAC), [PRC255] = markflag(MAC),
|
|
|
|
[PRC511] = markflag(MAC), [PRC1023] = markflag(MAC),
|
|
|
|
[PRC1522] = markflag(MAC), [PTC64] = markflag(MAC),
|
|
|
|
[PTC127] = markflag(MAC), [PTC255] = markflag(MAC),
|
|
|
|
[PTC511] = markflag(MAC), [PTC1023] = markflag(MAC),
|
|
|
|
[PTC1522] = markflag(MAC), [MPTC] = markflag(MAC),
|
|
|
|
[BPTC] = markflag(MAC),
|
2015-11-11 16:52:42 +03:00
|
|
|
|
|
|
|
[TDFH] = markflag(MAC) | MAC_ACCESS_PARTIAL,
|
|
|
|
[TDFT] = markflag(MAC) | MAC_ACCESS_PARTIAL,
|
|
|
|
[TDFHS] = markflag(MAC) | MAC_ACCESS_PARTIAL,
|
|
|
|
[TDFTS] = markflag(MAC) | MAC_ACCESS_PARTIAL,
|
|
|
|
[TDFPC] = markflag(MAC) | MAC_ACCESS_PARTIAL,
|
|
|
|
[RDFH] = markflag(MAC) | MAC_ACCESS_PARTIAL,
|
|
|
|
[RDFT] = markflag(MAC) | MAC_ACCESS_PARTIAL,
|
|
|
|
[RDFHS] = markflag(MAC) | MAC_ACCESS_PARTIAL,
|
|
|
|
[RDFTS] = markflag(MAC) | MAC_ACCESS_PARTIAL,
|
|
|
|
[RDFPC] = markflag(MAC) | MAC_ACCESS_PARTIAL,
|
|
|
|
[PBM] = markflag(MAC) | MAC_ACCESS_PARTIAL,
|
e1000: Introduced an array to control the access to the MAC registers
The array of uint8_t's which is introduced here, contains access metadata
about the MAC registers: if a register is accessible, but partly implemented,
or if a register requires a certain compatibility flag in order to be
accessed. Currently, 6 hypothetical flags are supported (3 exist for e1000
so far) but in the future, if more than 6 flags will be needed, the datatype
of this array can simply be swapped for a larger one.
This patch is intended to solve the following current problems:
1) In a scenario of migration between different versions of QEMU, which
differ by the MAC registers implemented in them, some registers need not to
be active if a compatibility flag is set, in order to preserve the machine's
state perfectly for the older version. Checking this for each register
individually, would create a lot of clutter in the code.
2) Some registers are (or may be) only partly implemented (e.g.
placeholders that allow reading and writing, but lack other functions).
In such cases it is better to print a debug warning on read/write attempts.
As above, dealing with this functionality on a per-register level, would
require longer and more messy code.
Signed-off-by: Leonid Bloch <leonid.bloch@ravellosystems.com>
Signed-off-by: Dmitry Fleytman <dmitry.fleytman@ravellosystems.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2015-11-11 16:52:41 +03:00
|
|
|
};
|
|
|
|
|
2008-02-03 05:20:18 +03:00
|
|
|
static void
|
2012-10-23 14:30:10 +04:00
|
|
|
e1000_mmio_write(void *opaque, hwaddr addr, uint64_t val,
|
2011-08-08 17:09:08 +04:00
|
|
|
unsigned size)
|
2008-02-03 05:20:18 +03:00
|
|
|
{
|
|
|
|
E1000State *s = opaque;
|
2008-12-01 21:59:50 +03:00
|
|
|
unsigned int index = (addr & 0x1ffff) >> 2;
|
2008-02-03 05:20:18 +03:00
|
|
|
|
2010-11-11 18:10:04 +03:00
|
|
|
if (index < NWRITEOPS && macreg_writeops[index]) {
|
e1000: Introduced an array to control the access to the MAC registers
The array of uint8_t's which is introduced here, contains access metadata
about the MAC registers: if a register is accessible, but partly implemented,
or if a register requires a certain compatibility flag in order to be
accessed. Currently, 6 hypothetical flags are supported (3 exist for e1000
so far) but in the future, if more than 6 flags will be needed, the datatype
of this array can simply be swapped for a larger one.
This patch is intended to solve the following current problems:
1) In a scenario of migration between different versions of QEMU, which
differ by the MAC registers implemented in them, some registers need not to
be active if a compatibility flag is set, in order to preserve the machine's
state perfectly for the older version. Checking this for each register
individually, would create a lot of clutter in the code.
2) Some registers are (or may be) only partly implemented (e.g.
placeholders that allow reading and writing, but lack other functions).
In such cases it is better to print a debug warning on read/write attempts.
As above, dealing with this functionality on a per-register level, would
require longer and more messy code.
Signed-off-by: Leonid Bloch <leonid.bloch@ravellosystems.com>
Signed-off-by: Dmitry Fleytman <dmitry.fleytman@ravellosystems.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2015-11-11 16:52:41 +03:00
|
|
|
if (!(mac_reg_access[index] & MAC_ACCESS_FLAG_NEEDED)
|
|
|
|
|| (s->compat_flags & (mac_reg_access[index] >> 2))) {
|
|
|
|
if (mac_reg_access[index] & MAC_ACCESS_PARTIAL) {
|
|
|
|
DBGOUT(GENERAL, "Writing to register at offset: 0x%08x. "
|
|
|
|
"It is not fully implemented.\n", index<<2);
|
|
|
|
}
|
|
|
|
macreg_writeops[index](s, index, val);
|
|
|
|
} else { /* "flag needed" bit is set, but the flag is not active */
|
|
|
|
DBGOUT(MMIO, "MMIO write attempt to disabled reg. addr=0x%08x\n",
|
|
|
|
index<<2);
|
|
|
|
}
|
2010-11-11 18:10:04 +03:00
|
|
|
} else if (index < NREADOPS && macreg_readops[index]) {
|
e1000: Introduced an array to control the access to the MAC registers
The array of uint8_t's which is introduced here, contains access metadata
about the MAC registers: if a register is accessible, but partly implemented,
or if a register requires a certain compatibility flag in order to be
accessed. Currently, 6 hypothetical flags are supported (3 exist for e1000
so far) but in the future, if more than 6 flags will be needed, the datatype
of this array can simply be swapped for a larger one.
This patch is intended to solve the following current problems:
1) In a scenario of migration between different versions of QEMU, which
differ by the MAC registers implemented in them, some registers need not to
be active if a compatibility flag is set, in order to preserve the machine's
state perfectly for the older version. Checking this for each register
individually, would create a lot of clutter in the code.
2) Some registers are (or may be) only partly implemented (e.g.
placeholders that allow reading and writing, but lack other functions).
In such cases it is better to print a debug warning on read/write attempts.
As above, dealing with this functionality on a per-register level, would
require longer and more messy code.
Signed-off-by: Leonid Bloch <leonid.bloch@ravellosystems.com>
Signed-off-by: Dmitry Fleytman <dmitry.fleytman@ravellosystems.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2015-11-11 16:52:41 +03:00
|
|
|
DBGOUT(MMIO, "e1000_mmio_writel RO %x: 0x%04"PRIx64"\n",
|
|
|
|
index<<2, val);
|
2010-11-11 18:10:04 +03:00
|
|
|
} else {
|
2011-08-08 17:09:08 +04:00
|
|
|
DBGOUT(UNKNOWN, "MMIO unknown write addr=0x%08x,val=0x%08"PRIx64"\n",
|
2008-02-03 05:20:18 +03:00
|
|
|
index<<2, val);
|
2010-11-11 18:10:04 +03:00
|
|
|
}
|
2008-02-03 05:20:18 +03:00
|
|
|
}
|
|
|
|
|
2011-08-08 17:09:08 +04:00
|
|
|
static uint64_t
|
2012-10-23 14:30:10 +04:00
|
|
|
e1000_mmio_read(void *opaque, hwaddr addr, unsigned size)
|
2008-02-03 05:20:18 +03:00
|
|
|
{
|
|
|
|
E1000State *s = opaque;
|
2008-12-01 21:59:50 +03:00
|
|
|
unsigned int index = (addr & 0x1ffff) >> 2;
|
2008-02-03 05:20:18 +03:00
|
|
|
|
e1000: Introduced an array to control the access to the MAC registers
The array of uint8_t's which is introduced here, contains access metadata
about the MAC registers: if a register is accessible, but partly implemented,
or if a register requires a certain compatibility flag in order to be
accessed. Currently, 6 hypothetical flags are supported (3 exist for e1000
so far) but in the future, if more than 6 flags will be needed, the datatype
of this array can simply be swapped for a larger one.
This patch is intended to solve the following current problems:
1) In a scenario of migration between different versions of QEMU, which
differ by the MAC registers implemented in them, some registers need not to
be active if a compatibility flag is set, in order to preserve the machine's
state perfectly for the older version. Checking this for each register
individually, would create a lot of clutter in the code.
2) Some registers are (or may be) only partly implemented (e.g.
placeholders that allow reading and writing, but lack other functions).
In such cases it is better to print a debug warning on read/write attempts.
As above, dealing with this functionality on a per-register level, would
require longer and more messy code.
Signed-off-by: Leonid Bloch <leonid.bloch@ravellosystems.com>
Signed-off-by: Dmitry Fleytman <dmitry.fleytman@ravellosystems.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2015-11-11 16:52:41 +03:00
|
|
|
if (index < NREADOPS && macreg_readops[index]) {
|
|
|
|
if (!(mac_reg_access[index] & MAC_ACCESS_FLAG_NEEDED)
|
|
|
|
|| (s->compat_flags & (mac_reg_access[index] >> 2))) {
|
|
|
|
if (mac_reg_access[index] & MAC_ACCESS_PARTIAL) {
|
|
|
|
DBGOUT(GENERAL, "Reading register at offset: 0x%08x. "
|
|
|
|
"It is not fully implemented.\n", index<<2);
|
|
|
|
}
|
|
|
|
return macreg_readops[index](s, index);
|
|
|
|
} else { /* "flag needed" bit is set, but the flag is not active */
|
|
|
|
DBGOUT(MMIO, "MMIO read attempt of disabled reg. addr=0x%08x\n",
|
|
|
|
index<<2);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
DBGOUT(UNKNOWN, "MMIO unknown read addr=0x%08x\n", index<<2);
|
2008-03-13 22:18:26 +03:00
|
|
|
}
|
2008-02-03 05:20:18 +03:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2011-08-08 17:09:08 +04:00
|
|
|
static const MemoryRegionOps e1000_mmio_ops = {
|
|
|
|
.read = e1000_mmio_read,
|
|
|
|
.write = e1000_mmio_write,
|
|
|
|
.endianness = DEVICE_LITTLE_ENDIAN,
|
|
|
|
.impl = {
|
|
|
|
.min_access_size = 4,
|
|
|
|
.max_access_size = 4,
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
2012-10-23 14:30:10 +04:00
|
|
|
/* I/O-port read callback: a stub that ignores the access and returns 0. */
static uint64_t e1000_io_read(void *opaque, hwaddr addr,
                              unsigned size)
{
    (void)opaque;

    return 0;
}
|
|
|
|
|
2012-10-23 14:30:10 +04:00
|
|
|
/* I/O-port write callback: a stub that discards the written value. */
static void e1000_io_write(void *opaque, hwaddr addr,
                           uint64_t val, unsigned size)
{
    (void)opaque;
}
|
|
|
|
|
2011-08-08 17:09:08 +04:00
|
|
|
static const MemoryRegionOps e1000_io_ops = {
|
|
|
|
.read = e1000_io_read,
|
|
|
|
.write = e1000_io_write,
|
|
|
|
.endianness = DEVICE_LITTLE_ENDIAN,
|
|
|
|
};
|
|
|
|
|
2009-10-19 22:06:05 +04:00
|
|
|
/* Predicate: true only when @version_id is 1. @opaque is unused. */
static bool is_version_1(void *opaque, int version_id)
{
    const bool matches = (version_id == 1);

    return matches;
}
|
|
|
|
|
2017-09-25 14:29:12 +03:00
|
|
|
static int e1000_pre_save(void *opaque)
|
2013-02-05 23:00:21 +04:00
|
|
|
{
|
|
|
|
E1000State *s = opaque;
|
|
|
|
NetClientState *nc = qemu_get_queue(s->nic);
|
2013-02-14 21:11:27 +04:00
|
|
|
|
2013-02-05 23:00:21 +04:00
|
|
|
/*
|
2014-06-19 19:55:33 +04:00
|
|
|
* If link is down and auto-negotiation is supported and ongoing,
|
|
|
|
* complete auto-negotiation immediately. This allows us to look
|
2023-02-23 13:19:48 +03:00
|
|
|
* at MII_BMSR_AN_COMP to infer link status on load.
|
2013-02-05 23:00:21 +04:00
|
|
|
*/
|
2014-06-19 23:40:51 +04:00
|
|
|
if (nc->link_down && have_autoneg(s)) {
|
2023-02-23 13:19:48 +03:00
|
|
|
s->phy_reg[MII_BMSR] |= MII_BMSR_AN_COMP;
|
2013-02-05 23:00:21 +04:00
|
|
|
}
|
2017-09-25 14:29:12 +03:00
|
|
|
|
2018-03-28 19:36:29 +03:00
|
|
|
/* Decide which set of props to migrate in the main structure */
|
|
|
|
if (chkflag(TSO) || !s->use_tso_for_migration) {
|
|
|
|
/* Either we're migrating with the extra subsection, in which
|
|
|
|
* case the mig_props is always 'props' OR
|
|
|
|
* we've not got the subsection, but 'props' was the last
|
|
|
|
* updated.
|
|
|
|
*/
|
|
|
|
s->mig_props = s->tx.props;
|
|
|
|
} else {
|
|
|
|
/* We're not using the subsection, and 'tso_props' was
|
|
|
|
* the last updated.
|
|
|
|
*/
|
|
|
|
s->mig_props = s->tx.tso_props;
|
|
|
|
}
|
2017-09-25 14:29:12 +03:00
|
|
|
return 0;
|
2013-02-05 23:00:21 +04:00
|
|
|
}
|
|
|
|
|
2012-09-28 06:06:01 +04:00
|
|
|
static int e1000_post_load(void *opaque, int version_id)
|
|
|
|
{
|
|
|
|
E1000State *s = opaque;
|
2013-01-30 15:12:22 +04:00
|
|
|
NetClientState *nc = qemu_get_queue(s->nic);
|
2012-09-28 06:06:01 +04:00
|
|
|
|
e1000: add interrupt mitigation support
This patch partially implements the e1000 interrupt mitigation mechanisms.
Using a single QEMUTimer, it emulates the ITR register (which is the newer
mitigation register, recommended by Intel) and approximately emulates
RADV and TADV registers. TIDV and RDTR register functionalities are not
emulated (RDTR is only used to validate RADV, according to the e1000 specs).
RADV, TADV, TIDV and RDTR registers make up the older e1000 mitigation
mechanism and would need a timer each to be completely emulated. However,
a single timer has been used in order to reach a good compromise between
emulation accuracy and simplicity/efficiency.
The implemented mechanism can be enabled/disabled specifying the command
line e1000-specific boolean parameter "mitigation", e.g.
qemu-system-x86_64 -device e1000,mitigation=on,... ...
For more information, see the Software developer's manual at
http://download.intel.com/design/network/manuals/8254x_GBe_SDM.pdf.
Interrupt mitigation boosts performance when the guest suffers from
an high interrupt rate (i.e. receiving short UDP packets at high packet
rate). For some numerical results see the following link
http://info.iet.unipi.it/~luigi/papers/20130520-rizzo-vm.pdf
Signed-off-by: Vincenzo Maffione <v.maffione@gmail.com>
Reviewed-by: Andreas Färber <afaerber@suse.de> (for pc-* machines)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2013-08-02 20:30:52 +04:00
|
|
|
s->mit_ide = 0;
|
2019-07-10 06:52:53 +03:00
|
|
|
s->mit_timer_on = true;
|
|
|
|
timer_mod(s->mit_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 1);
|
e1000: add interrupt mitigation support
This patch partially implements the e1000 interrupt mitigation mechanisms.
Using a single QEMUTimer, it emulates the ITR register (which is the newer
mitigation register, recommended by Intel) and approximately emulates
RADV and TADV registers. TIDV and RDTR register functionalities are not
emulated (RDTR is only used to validate RADV, according to the e1000 specs).
RADV, TADV, TIDV and RDTR registers make up the older e1000 mitigation
mechanism and would need a timer each to be completely emulated. However,
a single timer has been used in order to reach a good compromise between
emulation accuracy and simplicity/efficiency.
The implemented mechanism can be enabled/disabled specifying the command
line e1000-specific boolean parameter "mitigation", e.g.
qemu-system-x86_64 -device e1000,mitigation=on,... ...
For more information, see the Software developer's manual at
http://download.intel.com/design/network/manuals/8254x_GBe_SDM.pdf.
Interrupt mitigation boosts performance when the guest suffers from
an high interrupt rate (i.e. receiving short UDP packets at high packet
rate). For some numerical results see the following link
http://info.iet.unipi.it/~luigi/papers/20130520-rizzo-vm.pdf
Signed-off-by: Vincenzo Maffione <v.maffione@gmail.com>
Reviewed-by: Andreas Färber <afaerber@suse.de> (for pc-* machines)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2013-08-02 20:30:52 +04:00
|
|
|
|
2012-09-28 06:06:01 +04:00
|
|
|
/* nc.link_down can't be migrated, so infer link_down according
|
2013-02-05 23:00:21 +04:00
|
|
|
* to link status bit in mac_reg[STATUS].
|
|
|
|
* Alternatively, restart link negotiation if it was in progress. */
|
2013-01-30 15:12:22 +04:00
|
|
|
nc->link_down = (s->mac_reg[STATUS] & E1000_STATUS_LU) == 0;
|
2013-02-14 21:11:27 +04:00
|
|
|
|
2023-02-23 13:19:48 +03:00
|
|
|
if (have_autoneg(s) && !(s->phy_reg[MII_BMSR] & MII_BMSR_AN_COMP)) {
|
2013-02-05 23:00:21 +04:00
|
|
|
nc->link_down = false;
|
2014-06-19 23:40:51 +04:00
|
|
|
timer_mod(s->autoneg_timer,
|
|
|
|
qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + 500);
|
2013-02-05 23:00:21 +04:00
|
|
|
}
|
2012-09-28 06:06:01 +04:00
|
|
|
|
2018-03-28 19:36:28 +03:00
|
|
|
s->tx.props = s->mig_props;
|
2018-03-28 19:36:26 +03:00
|
|
|
if (!s->received_tx_tso) {
|
|
|
|
/* We received only one set of offload data (tx.props)
|
|
|
|
* and haven't got tx.tso_props. The best we can do
|
|
|
|
* is dupe the data.
|
|
|
|
*/
|
2018-03-28 19:36:28 +03:00
|
|
|
s->tx.tso_props = s->mig_props;
|
2018-03-28 19:36:26 +03:00
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int e1000_tx_tso_post_load(void *opaque, int version_id)
|
|
|
|
{
|
|
|
|
E1000State *s = opaque;
|
|
|
|
s->received_tx_tso = true;
|
2012-09-28 06:06:01 +04:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-11-11 16:52:40 +03:00
|
|
|
static bool e1000_full_mac_needed(void *opaque)
|
|
|
|
{
|
|
|
|
E1000State *s = opaque;
|
|
|
|
|
e1000: Introduced an array to control the access to the MAC registers
The array of uint8_t's which is introduced here, contains access metadata
about the MAC registers: if a register is accessible, but partly implemented,
or if a register requires a certain compatibility flag in order to be
accessed. Currently, 6 hypothetical flags are supported (3 exist for e1000
so far) but in the future, if more than 6 flags will be needed, the datatype
of this array can simply be swapped for a larger one.
This patch is intended to solve the following current problems:
1) In a scenario of migration between different versions of QEMU, which
differ by the MAC registers implemented in them, some registers need not to
be active if a compatibility flag is set, in order to preserve the machine's
state perfectly for the older version. Checking this for each register
individually, would create a lot of clutter in the code.
2) Some registers are (or may be) only partly implemented (e.g.
placeholders that allow reading and writing, but lack other functions).
In such cases it is better to print a debug warning on read/write attempts.
As above, dealing with this functionality on a per-register level, would
require longer and more messy code.
Signed-off-by: Leonid Bloch <leonid.bloch@ravellosystems.com>
Signed-off-by: Dmitry Fleytman <dmitry.fleytman@ravellosystems.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2015-11-11 16:52:41 +03:00
|
|
|
return chkflag(MAC);
|
2015-11-11 16:52:40 +03:00
|
|
|
}
|
|
|
|
|
2018-03-28 19:36:27 +03:00
|
|
|
static bool e1000_tso_state_needed(void *opaque)
|
|
|
|
{
|
|
|
|
E1000State *s = opaque;
|
|
|
|
|
|
|
|
return chkflag(TSO);
|
|
|
|
}
|
|
|
|
|
e1000: add interrupt mitigation support
This patch partially implements the e1000 interrupt mitigation mechanisms.
Using a single QEMUTimer, it emulates the ITR register (which is the newer
mitigation register, recommended by Intel) and approximately emulates
RADV and TADV registers. TIDV and RDTR register functionalities are not
emulated (RDTR is only used to validate RADV, according to the e1000 specs).
RADV, TADV, TIDV and RDTR registers make up the older e1000 mitigation
mechanism and would need a timer each to be completely emulated. However,
a single timer has been used in order to reach a good compromise between
emulation accuracy and simplicity/efficiency.
The implemented mechanism can be enabled/disabled specifying the command
line e1000-specific boolean parameter "mitigation", e.g.
qemu-system-x86_64 -device e1000,mitigation=on,... ...
For more information, see the Software developer's manual at
http://download.intel.com/design/network/manuals/8254x_GBe_SDM.pdf.
Interrupt mitigation boosts performance when the guest suffers from
a high interrupt rate (e.g. when receiving short UDP packets at a high
packet rate). For some numerical results see the following link
http://info.iet.unipi.it/~luigi/papers/20130520-rizzo-vm.pdf
Signed-off-by: Vincenzo Maffione <v.maffione@gmail.com>
Reviewed-by: Andreas Färber <afaerber@suse.de> (for pc-* machines)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2013-08-02 20:30:52 +04:00
|
|
|
static const VMStateDescription vmstate_e1000_mit_state = {
|
|
|
|
.name = "e1000/mit_state",
|
|
|
|
.version_id = 1,
|
|
|
|
.minimum_version_id = 1,
|
2014-04-16 17:32:32 +04:00
|
|
|
.fields = (VMStateField[]) {
|
e1000: add interrupt mitigation support
This patch partially implements the e1000 interrupt mitigation mechanisms.
Using a single QEMUTimer, it emulates the ITR register (which is the newer
mitigation register, recommended by Intel) and approximately emulates
RADV and TADV registers. TIDV and RDTR register functionalities are not
emulated (RDTR is only used to validate RADV, according to the e1000 specs).
RADV, TADV, TIDV and RDTR registers make up the older e1000 mitigation
mechanism and would need a timer each to be completely emulated. However,
a single timer has been used in order to reach a good compromise between
emulation accuracy and simplicity/efficiency.
The implemented mechanism can be enabled/disabled specifying the command
line e1000-specific boolean parameter "mitigation", e.g.
qemu-system-x86_64 -device e1000,mitigation=on,... ...
For more information, see the Software developer's manual at
http://download.intel.com/design/network/manuals/8254x_GBe_SDM.pdf.
Interrupt mitigation boosts performance when the guest suffers from
an high interrupt rate (i.e. receiving short UDP packets at high packet
rate). For some numerical results see the following link
http://info.iet.unipi.it/~luigi/papers/20130520-rizzo-vm.pdf
Signed-off-by: Vincenzo Maffione <v.maffione@gmail.com>
Reviewed-by: Andreas Färber <afaerber@suse.de> (for pc-* machines)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2013-08-02 20:30:52 +04:00
|
|
|
VMSTATE_UINT32(mac_reg[RDTR], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[RADV], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[TADV], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[ITR], E1000State),
|
|
|
|
VMSTATE_BOOL(mit_irq_level, E1000State),
|
|
|
|
VMSTATE_END_OF_LIST()
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2015-11-11 16:52:40 +03:00
|
|
|
static const VMStateDescription vmstate_e1000_full_mac_state = {
|
|
|
|
.name = "e1000/full_mac_state",
|
|
|
|
.version_id = 1,
|
|
|
|
.minimum_version_id = 1,
|
|
|
|
.needed = e1000_full_mac_needed,
|
|
|
|
.fields = (VMStateField[]) {
|
|
|
|
VMSTATE_UINT32_ARRAY(mac_reg, E1000State, 0x8000),
|
|
|
|
VMSTATE_END_OF_LIST()
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2018-03-28 19:36:25 +03:00
|
|
|
static const VMStateDescription vmstate_e1000_tx_tso_state = {
|
|
|
|
.name = "e1000/tx_tso_state",
|
|
|
|
.version_id = 1,
|
|
|
|
.minimum_version_id = 1,
|
2018-03-28 19:36:27 +03:00
|
|
|
.needed = e1000_tso_state_needed,
|
2018-03-28 19:36:26 +03:00
|
|
|
.post_load = e1000_tx_tso_post_load,
|
2018-03-28 19:36:25 +03:00
|
|
|
.fields = (VMStateField[]) {
|
|
|
|
VMSTATE_UINT8(tx.tso_props.ipcss, E1000State),
|
|
|
|
VMSTATE_UINT8(tx.tso_props.ipcso, E1000State),
|
|
|
|
VMSTATE_UINT16(tx.tso_props.ipcse, E1000State),
|
|
|
|
VMSTATE_UINT8(tx.tso_props.tucss, E1000State),
|
|
|
|
VMSTATE_UINT8(tx.tso_props.tucso, E1000State),
|
|
|
|
VMSTATE_UINT16(tx.tso_props.tucse, E1000State),
|
|
|
|
VMSTATE_UINT32(tx.tso_props.paylen, E1000State),
|
|
|
|
VMSTATE_UINT8(tx.tso_props.hdr_len, E1000State),
|
|
|
|
VMSTATE_UINT16(tx.tso_props.mss, E1000State),
|
|
|
|
VMSTATE_INT8(tx.tso_props.ip, E1000State),
|
|
|
|
VMSTATE_INT8(tx.tso_props.tcp, E1000State),
|
|
|
|
VMSTATE_END_OF_LIST()
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2009-10-19 22:06:05 +04:00
|
|
|
static const VMStateDescription vmstate_e1000 = {
|
|
|
|
.name = "e1000",
|
2018-03-28 19:36:25 +03:00
|
|
|
.version_id = 2,
|
2009-10-19 22:06:05 +04:00
|
|
|
.minimum_version_id = 1,
|
2013-02-05 23:00:21 +04:00
|
|
|
.pre_save = e1000_pre_save,
|
2012-09-28 06:06:01 +04:00
|
|
|
.post_load = e1000_post_load,
|
2014-04-16 17:32:32 +04:00
|
|
|
.fields = (VMStateField[]) {
|
2013-06-30 14:55:52 +04:00
|
|
|
VMSTATE_PCI_DEVICE(parent_obj, E1000State),
|
2009-10-19 22:06:05 +04:00
|
|
|
VMSTATE_UNUSED_TEST(is_version_1, 4), /* was instance id */
|
|
|
|
VMSTATE_UNUSED(4), /* Was mmio_base. */
|
|
|
|
VMSTATE_UINT32(rxbuf_size, E1000State),
|
|
|
|
VMSTATE_UINT32(rxbuf_min_shift, E1000State),
|
|
|
|
VMSTATE_UINT32(eecd_state.val_in, E1000State),
|
|
|
|
VMSTATE_UINT16(eecd_state.bitnum_in, E1000State),
|
|
|
|
VMSTATE_UINT16(eecd_state.bitnum_out, E1000State),
|
|
|
|
VMSTATE_UINT16(eecd_state.reading, E1000State),
|
|
|
|
VMSTATE_UINT32(eecd_state.old_eecd, E1000State),
|
2018-03-28 19:36:28 +03:00
|
|
|
VMSTATE_UINT8(mig_props.ipcss, E1000State),
|
|
|
|
VMSTATE_UINT8(mig_props.ipcso, E1000State),
|
|
|
|
VMSTATE_UINT16(mig_props.ipcse, E1000State),
|
|
|
|
VMSTATE_UINT8(mig_props.tucss, E1000State),
|
|
|
|
VMSTATE_UINT8(mig_props.tucso, E1000State),
|
|
|
|
VMSTATE_UINT16(mig_props.tucse, E1000State),
|
|
|
|
VMSTATE_UINT32(mig_props.paylen, E1000State),
|
|
|
|
VMSTATE_UINT8(mig_props.hdr_len, E1000State),
|
|
|
|
VMSTATE_UINT16(mig_props.mss, E1000State),
|
2009-10-19 22:06:05 +04:00
|
|
|
VMSTATE_UINT16(tx.size, E1000State),
|
|
|
|
VMSTATE_UINT16(tx.tso_frames, E1000State),
|
2017-11-15 02:23:33 +03:00
|
|
|
VMSTATE_UINT8(tx.sum_needed, E1000State),
|
2018-03-28 19:36:28 +03:00
|
|
|
VMSTATE_INT8(mig_props.ip, E1000State),
|
|
|
|
VMSTATE_INT8(mig_props.tcp, E1000State),
|
2009-10-19 22:06:05 +04:00
|
|
|
VMSTATE_BUFFER(tx.header, E1000State),
|
|
|
|
VMSTATE_BUFFER(tx.data, E1000State),
|
|
|
|
VMSTATE_UINT16_ARRAY(eeprom_data, E1000State, 64),
|
|
|
|
VMSTATE_UINT16_ARRAY(phy_reg, E1000State, 0x20),
|
|
|
|
VMSTATE_UINT32(mac_reg[CTRL], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[EECD], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[EERD], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[GPRC], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[GPTC], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[ICR], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[ICS], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[IMC], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[IMS], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[LEDCTL], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[MANC], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[MDIC], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[MPC], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[PBA], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[RCTL], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[RDBAH], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[RDBAL], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[RDH], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[RDLEN], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[RDT], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[STATUS], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[SWSM], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[TCTL], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[TDBAH], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[TDBAL], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[TDH], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[TDLEN], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[TDT], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[TORH], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[TORL], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[TOTH], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[TOTL], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[TPR], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[TPT], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[TXDCTL], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[WUFC], E1000State),
|
|
|
|
VMSTATE_UINT32(mac_reg[VET], E1000State),
|
|
|
|
VMSTATE_UINT32_SUB_ARRAY(mac_reg, E1000State, RA, 32),
|
2023-02-23 13:19:52 +03:00
|
|
|
VMSTATE_UINT32_SUB_ARRAY(mac_reg, E1000State, MTA, E1000_MC_TBL_SIZE),
|
|
|
|
VMSTATE_UINT32_SUB_ARRAY(mac_reg, E1000State, VFTA,
|
|
|
|
E1000_VLAN_FILTER_TBL_SIZE),
|
2009-10-19 22:06:05 +04:00
|
|
|
VMSTATE_END_OF_LIST()
|
e1000: add interrupt mitigation support
This patch partially implements the e1000 interrupt mitigation mechanisms.
Using a single QEMUTimer, it emulates the ITR register (which is the newer
mitigation register, recommended by Intel) and approximately emulates
RADV and TADV registers. TIDV and RDTR register functionalities are not
emulated (RDTR is only used to validate RADV, according to the e1000 specs).
RADV, TADV, TIDV and RDTR registers make up the older e1000 mitigation
mechanism and would need a timer each to be completely emulated. However,
a single timer has been used in order to reach a good compromise between
emulation accuracy and simplicity/efficiency.
The implemented mechanism can be enabled/disabled specifying the command
line e1000-specific boolean parameter "mitigation", e.g.
qemu-system-x86_64 -device e1000,mitigation=on,... ...
For more information, see the Software developer's manual at
http://download.intel.com/design/network/manuals/8254x_GBe_SDM.pdf.
Interrupt mitigation boosts performance when the guest suffers from
an high interrupt rate (i.e. receiving short UDP packets at high packet
rate). For some numerical results see the following link
http://info.iet.unipi.it/~luigi/papers/20130520-rizzo-vm.pdf
Signed-off-by: Vincenzo Maffione <v.maffione@gmail.com>
Reviewed-by: Andreas Färber <afaerber@suse.de> (for pc-* machines)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2013-08-02 20:30:52 +04:00
|
|
|
},
|
2014-09-23 16:09:54 +04:00
|
|
|
.subsections = (const VMStateDescription*[]) {
|
|
|
|
&vmstate_e1000_mit_state,
|
2015-11-11 16:52:40 +03:00
|
|
|
&vmstate_e1000_full_mac_state,
|
2018-03-28 19:36:25 +03:00
|
|
|
&vmstate_e1000_tx_tso_state,
|
2014-09-23 16:09:54 +04:00
|
|
|
NULL
|
2009-10-19 22:06:05 +04:00
|
|
|
}
|
|
|
|
};
|
2008-02-03 05:20:18 +03:00
|
|
|
|
2014-06-02 17:33:27 +04:00
|
|
|
/*
 * Default EEPROM image, 64 words.  The contents are documented in
 * Tables 5-2 and 5-3, pp. 98-102.
 *
 * Note: the two DevId words (offsets 0x0b and 0x0d) are zero here; a valid
 * DevId will be inserted during pci_e1000_realize().
 */
static const uint16_t e1000_eeprom_template[64] = {
    /* 0x00 */ 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0x0000, 0x0000, 0x0000,
    /* 0x08 */ 0x3000, 0x1000, 0x6403, 0 /*DevId*/, 0x8086, 0 /*DevId*/, 0x8086, 0x3040,
    /* 0x10 */ 0x0008, 0x2000, 0x7e14, 0x0048, 0x1000, 0x00d8, 0x0000, 0x2700,
    /* 0x18 */ 0x6cc9, 0x3150, 0x0722, 0x040b, 0x0984, 0x0000, 0xc000, 0x0706,
    /* 0x20 */ 0x1008, 0x0000, 0x0f04, 0x7fff, 0x4d01, 0xffff, 0xffff, 0xffff,
    /* 0x28 */ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
    /* 0x30 */ 0x0100, 0x4000, 0x121c, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
    /* 0x38 */ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000,
};
|
|
|
|
|
|
|
|
/* PCI interface */
|
|
|
|
|
|
|
|
static void
|
2011-08-08 17:09:08 +04:00
|
|
|
e1000_mmio_setup(E1000State *d)
|
2008-02-03 05:20:18 +03:00
|
|
|
{
|
2008-12-09 23:09:57 +03:00
|
|
|
int i;
|
|
|
|
const uint32_t excluded_regs[] = {
|
|
|
|
E1000_MDIC, E1000_ICR, E1000_ICS, E1000_IMS,
|
|
|
|
E1000_IMC, E1000_TCTL, E1000_TDT, PNPMMIO_SIZE
|
|
|
|
};
|
|
|
|
|
2013-06-07 05:25:08 +04:00
|
|
|
memory_region_init_io(&d->mmio, OBJECT(d), &e1000_mmio_ops, d,
|
|
|
|
"e1000-mmio", PNPMMIO_SIZE);
|
2011-08-08 17:09:08 +04:00
|
|
|
memory_region_add_coalescing(&d->mmio, 0, excluded_regs[0]);
|
2008-12-09 23:09:57 +03:00
|
|
|
for (i = 0; excluded_regs[i] != PNPMMIO_SIZE; i++)
|
2011-08-08 17:09:08 +04:00
|
|
|
memory_region_add_coalescing(&d->mmio, excluded_regs[i] + 4,
|
|
|
|
excluded_regs[i+1] - excluded_regs[i] - 4);
|
2013-06-07 05:25:08 +04:00
|
|
|
memory_region_init_io(&d->io, OBJECT(d), &e1000_io_ops, d, "e1000-io", IOPORT_SIZE);
|
2008-02-03 05:20:18 +03:00
|
|
|
}
|
|
|
|
|
2012-07-04 08:39:27 +04:00
|
|
|
static void
|
2009-02-11 18:21:22 +03:00
|
|
|
pci_e1000_uninit(PCIDevice *dev)
|
|
|
|
{
|
2013-06-24 10:50:30 +04:00
|
|
|
E1000State *d = E1000(dev);
|
2009-02-11 18:21:22 +03:00
|
|
|
|
2013-08-21 19:03:08 +04:00
|
|
|
timer_free(d->autoneg_timer);
|
e1000: add interrupt mitigation support
This patch partially implements the e1000 interrupt mitigation mechanisms.
Using a single QEMUTimer, it emulates the ITR register (which is the newer
mitigation register, recommended by Intel) and approximately emulates
RADV and TADV registers. TIDV and RDTR register functionalities are not
emulated (RDTR is only used to validate RADV, according to the e1000 specs).
RADV, TADV, TIDV and RDTR registers make up the older e1000 mitigation
mechanism and would need a timer each to be completely emulated. However,
a single timer has been used in order to reach a good compromise between
emulation accuracy and simplicity/efficiency.
The implemented mechanism can be enabled/disabled specifying the command
line e1000-specific boolean parameter "mitigation", e.g.
qemu-system-x86_64 -device e1000,mitigation=on,... ...
For more information, see the Software developer's manual at
http://download.intel.com/design/network/manuals/8254x_GBe_SDM.pdf.
Interrupt mitigation boosts performance when the guest suffers from
an high interrupt rate (i.e. receiving short UDP packets at high packet
rate). For some numerical results see the following link
http://info.iet.unipi.it/~luigi/papers/20130520-rizzo-vm.pdf
Signed-off-by: Vincenzo Maffione <v.maffione@gmail.com>
Reviewed-by: Andreas Färber <afaerber@suse.de> (for pc-* machines)
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2013-08-02 20:30:52 +04:00
|
|
|
timer_free(d->mit_timer);
|
e1000: Delay flush queue when receive RCTL
Due to too early RCT0 interrput, win10x32 may hang on booting.
This problem can be reproduced by doing power cycle on win10x32 guest.
In our environment, we have 10 win10x32 and stress power cycle.
The problem will happen about 20 rounds.
Below shows some log with comment:
The normal case:
22831@1551928392.984687:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
22831@1551928392.985655:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
22831@1551928392.985801:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
22831@1551928393.056710:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: ICR read: 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
22831@1551928393.077548:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: ICR read: 0
e1000: set_ics 2, ICR 0, IMR 0
e1000: set_ics 2, ICR 2, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
22831@1551928393.102974:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
22831@1551928393.103267:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: RCTL: 255, mac_reg[RCTL] = 0x40002 <- win10x32 says it can handle
RX now
e1000: set_ics 0, ICR 2, IMR 9d <- unmask interrupt
e1000: RCTL: 255, mac_reg[RCTL] = 0x48002
e1000: set_ics 80, ICR 2, IMR 9d <- interrupt and work!
...
The bad case:
27744@1551930483.117766:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
27744@1551930483.118398:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
27744@1551930483.198063:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: ICR read: 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
27744@1551930483.218675:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: set_ics 0, ICR 0, IMR 0
e1000: ICR read: 0
e1000: set_ics 2, ICR 0, IMR 0
e1000: set_ics 2, ICR 2, IMR 0
e1000: RCTL: 0, mac_reg[RCTL] = 0x0
27744@1551930483.241768:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
27744@1551930483.241979:e1000x_rx_disabled Received packet dropped
because receive is disabled RCTL = 0
e1000: RCTL: 255, mac_reg[RCTL] = 0x40002 <- win10x32 says it can handle
RX now
e1000: set_ics 80, ICR 2, IMR 0 <- flush queue (caused by setting RCTL)
e1000: set_ics 0, ICR 82, IMR 9d <- unmask interrupt and because 0x82&0x9d
!= 0 generate interrupt, hang on here...
To workaround this problem, simply delay flush queue. Also stop receiving
when timer is going to run.
Tested on CentOS, Win7SP1x64 and Win10x32.
Signed-off-by: yuchenlin <yuchenlin@synology.com>
Reviewed-by: Dmitry Fleytman <dmitry.fleytman@gmail.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2019-03-13 09:56:49 +03:00
|
|
|
timer_free(d->flush_queue_timer);
|
2013-01-30 15:12:24 +04:00
|
|
|
qemu_del_nic(d->nic);
|
2009-02-11 18:21:22 +03:00
|
|
|
}
|
|
|
|
|
2009-11-25 21:49:12 +03:00
|
|
|
static NetClientInfo net_e1000_info = {
|
qapi: Change Netdev into a flat union
This is a mostly-mechanical conversion that creates a new flat
union 'Netdev' QAPI type that covers all the branches of the
former 'NetClientOptions' simple union, where the branches are
now listed in a new 'NetClientDriver' enum rather than generated
from the simple union. The existence of a flat union has no
change to the command line syntax accepted for new code, and
will make it possible for a future patch to switch the QMP
command to parse a boxed union for no change to valid QMP; but
it does have some ripple effect on the C code when dealing with
the new types.
While making the conversion, note that the 'NetLegacy' type
remains unchanged: it applies only to legacy command line options,
and will not be ported to QMP, so it should remain a wrapper
around a simple union; to avoid confusion, the type named
'NetClientOptions' is now gone, and we introduce 'NetLegacyOptions'
in its place. Then, in the C code, we convert from NetLegacy to
Netdev as soon as possible, so that the bulk of the net stack
only has to deal with one QAPI type, not two. Note that since
the old legacy code always rejected 'hubport', we can just omit
that branch from the new 'NetLegacyOptions' simple union.
Based on an idea originally by Zoltán Kővágó <DirtY.iCE.hu@gmail.com>:
Message-Id: <01a527fbf1a5de880091f98cf011616a78adeeee.1441627176.git.DirtY.iCE.hu@gmail.com>
although the sed script in that patch no longer applies due to
other changes in the tree since then, and I also did some manual
cleanups (such as fixing whitespace to keep checkpatch happy).
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <1468468228-27827-13-git-send-email-eblake@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
[Fixup from Eric squashed in]
Signed-off-by: Markus Armbruster <armbru@redhat.com>
2016-07-14 06:50:23 +03:00
|
|
|
.type = NET_CLIENT_DRIVER_NIC,
|
2009-11-25 21:49:12 +03:00
|
|
|
.size = sizeof(NICState),
|
|
|
|
.can_receive = e1000_can_receive,
|
|
|
|
.receive = e1000_receive,
|
2013-09-12 12:47:37 +04:00
|
|
|
.receive_iov = e1000_receive_iov,
|
2009-11-25 21:49:12 +03:00
|
|
|
.link_status_changed = e1000_set_link_status,
|
|
|
|
};
|
|
|
|
|
2014-12-01 21:06:52 +03:00
|
|
|
/*
 * PCI config-space write hook (installed by pci_e1000_realize()).
 *
 * @pci_dev: device being written.
 * @address: config-space offset of the write.
 * @val:     value written.
 * @len:     length of the write.
 *
 * Performs the default config write.  Afterwards, if the write covered the
 * PCI_COMMAND byte and PCI_COMMAND_MASTER is set in the command register,
 * the packets queued on the NIC's queue are flushed.
 */
static void e1000_write_config(PCIDevice *pci_dev, uint32_t address,
                               uint32_t val, int len)
{
    E1000State *state = E1000(pci_dev);

    pci_default_write_config(pci_dev, address, val, len);

    if (!range_covers_byte(address, len, PCI_COMMAND)) {
        return;
    }
    if (!(pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
        return;
    }

    qemu_flush_queued_packets(qemu_get_queue(state->nic));
}
|
|
|
|
|
2015-01-19 17:52:30 +03:00
|
|
|
/*
 * PCI realize callback.
 *
 * @pci_dev: device being realized.
 * @errp:    not written by this function.
 *
 * Installs the config-write hook, fills in two config-space bytes, creates
 * and registers the MMIO (BAR 0) and I/O (BAR 1) regions, prepares the
 * EEPROM image from e1000_eeprom_template, creates the NIC and allocates
 * the auto-negotiation, mitigation and flush-queue timers.
 */
static void pci_e1000_realize(PCIDevice *pci_dev, Error **errp)
{
    DeviceState *qdev = DEVICE(pci_dev);
    E1000State *state = E1000(pci_dev);
    uint8_t *mac;

    pci_dev->config_write = e1000_write_config;

    /* TODO: RST# value should be 0, PCI spec 6.2.4 */
    pci_dev->config[PCI_CACHE_LINE_SIZE] = 0x10;

    /* interrupt pin A */
    pci_dev->config[PCI_INTERRUPT_PIN] = 1;

    e1000_mmio_setup(state);
    pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY, &state->mmio);
    pci_register_bar(pci_dev, 1, PCI_BASE_ADDRESS_SPACE_IO, &state->io);

    /* Pick a default MAC address if the configuration has none. */
    qemu_macaddr_default_if_unset(&state->conf.macaddr);
    mac = state->conf.macaddr.a;

    /* Build the EEPROM image from the template, class device id and MAC. */
    e1000x_core_prepare_eeprom(state->eeprom_data,
                               e1000_eeprom_template,
                               sizeof(e1000_eeprom_template),
                               PCI_DEVICE_GET_CLASS(pci_dev)->device_id,
                               mac);

    state->nic = qemu_new_nic(&net_e1000_info, &state->conf,
                              object_get_typename(OBJECT(state)),
                              qdev->id, state);
    qemu_format_nic_info_str(qemu_get_queue(state->nic), mac);

    /* All three timers run on QEMU_CLOCK_VIRTUAL. */
    state->autoneg_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
                                        e1000_autoneg_timer, state);
    state->mit_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                    e1000_mit_timer, state);
    state->flush_queue_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
                                            e1000_flush_queue_timer, state);
}
|
2009-02-11 18:19:52 +03:00
|
|
|
|
2011-12-04 22:22:06 +04:00
|
|
|
static Property e1000_properties[] = {
|
|
|
|
DEFINE_NIC_PROPERTIES(E1000State, conf),
|
2015-11-11 16:52:47 +03:00
|
|
|
DEFINE_PROP_BIT("extra_mac_registers", E1000State,
|
|
|
|
compat_flags, E1000_FLAG_MAC_BIT, true),
|
2018-03-28 19:36:27 +03:00
|
|
|
DEFINE_PROP_BIT("migrate_tso_props", E1000State,
|
|
|
|
compat_flags, E1000_FLAG_TSO_BIT, true),
|
2021-07-23 10:55:10 +03:00
|
|
|
DEFINE_PROP_BIT("init-vet", E1000State,
|
|
|
|
compat_flags, E1000_FLAG_VET_BIT, true),
|
2011-12-04 22:22:06 +04:00
|
|
|
DEFINE_PROP_END_OF_LIST(),
|
|
|
|
};
|
|
|
|
|
2014-06-02 17:33:27 +04:00
|
|
|
/*
 * Static description of one concrete e1000 variant.  An instance is handed
 * to e1000_class_init() as class data and copied into the class structures.
 */
typedef struct E1000Info E1000Info;

struct E1000Info {
    const char *name;   /* QOM type name the variant is registered under */
    uint16_t device_id; /* copied into PCIDeviceClass.device_id */
    uint8_t revision;   /* copied into PCIDeviceClass.revision */
    uint16_t phy_id2;   /* copied into E1000BaseClass.phy_id2 */
};
|
|
|
|
|
2011-12-04 22:22:06 +04:00
|
|
|
/*
 * Class initializer shared by every concrete e1000 variant.
 *
 * @klass: the class being initialized
 * @data:  the variant's E1000Info entry (passed as TypeInfo.class_data)
 *
 * Fills in the PCI identity and callbacks, the variant-specific PHY id,
 * the reset "hold" phase handler and the generic device metadata
 * (category, description, migration state and properties).
 */
static void e1000_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dev_class = DEVICE_CLASS(klass);
    ResettableClass *reset_class = RESETTABLE_CLASS(klass);
    PCIDeviceClass *pci_class = PCI_DEVICE_CLASS(klass);
    E1000BaseClass *e1000_class = E1000_CLASS(klass);
    const E1000Info *variant = data;

    /* PCI-level callbacks and identity. */
    pci_class->realize = pci_e1000_realize;
    pci_class->exit = pci_e1000_uninit;
    pci_class->romfile = "efi-e1000.rom";
    pci_class->vendor_id = PCI_VENDOR_ID_INTEL;
    pci_class->class_id = PCI_CLASS_NETWORK_ETHERNET;

    /* Values that differ between the registered variants. */
    pci_class->device_id = variant->device_id;
    pci_class->revision = variant->revision;
    e1000_class->phy_id2 = variant->phy_id2;

    reset_class->phases.hold = e1000_reset_hold;

    /* Generic device metadata. */
    set_bit(DEVICE_CATEGORY_NETWORK, dev_class->categories);
    dev_class->desc = "Intel Gigabit Ethernet";
    dev_class->vmsd = &vmstate_e1000;
    device_class_set_props(dev_class, e1000_properties);
}
|
|
|
|
|
2014-10-07 12:00:13 +04:00
|
|
|
/*
 * Per-instance initializer: adds the "bootindex" property, backed by the
 * bootindex field of the instance's NIC configuration, with the suffix
 * "/ethernet-phy@0".
 */
static void e1000_instance_init(Object *obj)
{
    E1000State *s = E1000(obj);

    device_add_bootindex_property(obj, &s->conf.bootindex, "bootindex",
                                  "/ethernet-phy@0", DEVICE(s));
}
|
|
|
|
|
2014-06-02 17:33:27 +04:00
|
|
|
static const TypeInfo e1000_base_info = {
|
|
|
|
.name = TYPE_E1000_BASE,
|
2011-12-08 07:34:16 +04:00
|
|
|
.parent = TYPE_PCI_DEVICE,
|
|
|
|
.instance_size = sizeof(E1000State),
|
2014-10-07 12:00:13 +04:00
|
|
|
.instance_init = e1000_instance_init,
|
2014-06-02 17:33:27 +04:00
|
|
|
.class_size = sizeof(E1000BaseClass),
|
|
|
|
.abstract = true,
|
2017-09-27 22:56:34 +03:00
|
|
|
.interfaces = (InterfaceInfo[]) {
|
|
|
|
{ INTERFACE_CONVENTIONAL_PCI_DEVICE },
|
|
|
|
{ },
|
|
|
|
},
|
2014-06-02 17:33:27 +04:00
|
|
|
};
|
|
|
|
|
|
|
|
static const E1000Info e1000_devices[] = {
|
|
|
|
{
|
2015-09-28 08:37:26 +03:00
|
|
|
.name = "e1000",
|
2014-06-02 17:33:27 +04:00
|
|
|
.device_id = E1000_DEV_ID_82540EM,
|
|
|
|
.revision = 0x03,
|
|
|
|
.phy_id2 = E1000_PHY_ID2_8254xx_DEFAULT,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = "e1000-82544gc",
|
|
|
|
.device_id = E1000_DEV_ID_82544GC_COPPER,
|
|
|
|
.revision = 0x03,
|
|
|
|
.phy_id2 = E1000_PHY_ID2_82544x,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = "e1000-82545em",
|
|
|
|
.device_id = E1000_DEV_ID_82545EM_COPPER,
|
|
|
|
.revision = 0x03,
|
|
|
|
.phy_id2 = E1000_PHY_ID2_8254xx_DEFAULT,
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
2012-02-09 18:20:55 +04:00
|
|
|
static void e1000_register_types(void)
|
2009-05-15 01:35:07 +04:00
|
|
|
{
|
2014-06-02 17:33:27 +04:00
|
|
|
int i;
|
|
|
|
|
|
|
|
type_register_static(&e1000_base_info);
|
|
|
|
for (i = 0; i < ARRAY_SIZE(e1000_devices); i++) {
|
|
|
|
const E1000Info *info = &e1000_devices[i];
|
|
|
|
TypeInfo type_info = {};
|
|
|
|
|
|
|
|
type_info.name = info->name;
|
|
|
|
type_info.parent = TYPE_E1000_BASE;
|
|
|
|
type_info.class_data = (void *)info;
|
|
|
|
type_info.class_init = e1000_class_init;
|
|
|
|
|
|
|
|
type_register(&type_info);
|
|
|
|
}
|
2008-02-03 05:20:18 +03:00
|
|
|
}
|
2009-05-15 01:35:07 +04:00
|
|
|
|
2012-02-09 18:20:55 +04:00
|
|
|
/*
 * Hook e1000_register_types() into QEMU's type initialization.
 * NOTE(review): presumably invoked once during startup, before any device
 * is created -- confirm against the type_init() definition.
 */
type_init(e1000_register_types)
|