2011-04-01 08:15:29 +04:00
|
|
|
/*
|
|
|
|
* QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator
|
|
|
|
*
|
|
|
|
* PAPR Inter-VM Logical Lan, aka ibmveth
|
|
|
|
*
|
|
|
|
* Copyright (c) 2010,2011 David Gibson, IBM Corporation.
|
|
|
|
*
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
|
|
* of this software and associated documentation files (the "Software"), to deal
|
|
|
|
* in the Software without restriction, including without limitation the rights
|
|
|
|
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
|
|
* copies of the Software, and to permit persons to whom the Software is
|
|
|
|
* furnished to do so, subject to the following conditions:
|
|
|
|
*
|
|
|
|
* The above copyright notice and this permission notice shall be included in
|
|
|
|
* all copies or substantial portions of the Software.
|
|
|
|
*
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
|
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
|
|
* THE SOFTWARE.
|
|
|
|
*
|
|
|
|
*/
|
2019-05-23 17:35:07 +03:00
|
|
|
|
2016-01-26 21:16:58 +03:00
|
|
|
#include "qemu/osdep.h"
|
2016-01-19 23:51:44 +03:00
|
|
|
#include "cpu.h"
|
2015-12-15 15:16:16 +03:00
|
|
|
#include "qemu/log.h"
|
2019-05-23 17:35:07 +03:00
|
|
|
#include "qemu/module.h"
|
2012-10-24 10:43:34 +04:00
|
|
|
#include "net/net.h"
|
2019-08-12 08:23:45 +03:00
|
|
|
#include "migration/vmstate.h"
|
2013-02-05 20:06:20 +04:00
|
|
|
#include "hw/ppc/spapr.h"
|
|
|
|
#include "hw/ppc/spapr_vio.h"
|
2019-08-12 08:23:51 +03:00
|
|
|
#include "hw/qdev-properties.h"
|
2014-03-17 06:40:24 +04:00
|
|
|
#include "sysemu/sysemu.h"
|
2016-09-14 21:48:26 +03:00
|
|
|
#include "trace.h"
|
2011-04-01 08:15:29 +04:00
|
|
|
|
|
|
|
#include <libfdt.h>
|
2020-09-03 23:43:22 +03:00
|
|
|
#include "qom/object.h"
|
2011-04-01 08:15:29 +04:00
|
|
|
|
|
|
|
#define ETH_ALEN 6
|
|
|
|
#define MAX_PACKET_SIZE 65536
|
|
|
|
|
hw/net/spapr_llan: Fix receive buffer handling for better performance
tl;dr:
This patch introduces an alternate way of handling the receive
buffers of the spapr-vlan device, resulting in much better
receive performance for the guest.
Full story:
One of our testers recently discovered that the performance of the
spapr-vlan device is very poor compared to other NICs, and that
a simple "ping -i 0.2 -s 65507 someip" in the guest can result
in more than 50% lost ping packets (especially with older guest
kernels < 3.17).
After doing some analysis, it was clear that there is a problem
with the way we handle the receive buffers in spapr_llan.c: The
ibmveth driver of the guest Linux kernel tries to add a lot of
buffers into several buffer pools (with 512, 2048 and 65536 byte
sizes by default, but it can be changed via the entries in the
/sys/devices/vio/1000/pool* directories of the guest). However,
the spapr-vlan device of QEMU only tries to squeeze all receive
buffer descriptors into one single page which has been supplied
by the guest during the H_REGISTER_LOGICAL_LAN call, without
taking care of different buffer sizes. This has two bad effects:
First, only a very limited number of buffer descriptors is accepted
at all. Second, we also hand 64k buffers to the guest even if
the 2k buffers would fit better - and this results in dropped packets
in the IP layer of the guest since too much skbuf memory is used.
Though it seems at a first glance like PAPR says that we should store
the receive buffer descriptors in the page that is supplied during
the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec
declares that "the contents of these descriptors are architecturally
opaque, none of these descriptors are manipulated by code above
the architected interfaces". That means we don't have to store
the RX buffer descriptors in this page, but can also manage the
receive buffers at the hypervisor level only. This is now what we
are doing here: Introducing proper RX buffer pools which are also
sorted by size of the buffers, so we can hand out a buffer with
the best fitting size when a packet has been received.
To avoid problems with migration from/to older version of QEMU,
the old behavior is also retained and enabled by default. The new
buffer management has to be enabled via a new "use-rx-buffer-pools"
property.
Now with the new buffer pool management enabled, the problem with
"ping -s 65507" is fixed for me, and the throughput of a simple
test with wget increases from creeping 3MB/s up to 20MB/s!
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-21 19:25:23 +03:00
|
|
|
/* Compatibility flags for migration */
|
|
|
|
#define SPAPRVLAN_FLAG_RX_BUF_POOLS_BIT 0
|
|
|
|
#define SPAPRVLAN_FLAG_RX_BUF_POOLS (1 << SPAPRVLAN_FLAG_RX_BUF_POOLS_BIT)
|
|
|
|
|
2011-04-01 08:15:29 +04:00
|
|
|
/*
|
|
|
|
* Virtual LAN device
|
|
|
|
*/
|
|
|
|
|
|
|
|
typedef uint64_t vlan_bd_t;
|
|
|
|
|
|
|
|
#define VLAN_BD_VALID 0x8000000000000000ULL
|
|
|
|
#define VLAN_BD_TOGGLE 0x4000000000000000ULL
|
|
|
|
#define VLAN_BD_NO_CSUM 0x0200000000000000ULL
|
|
|
|
#define VLAN_BD_CSUM_GOOD 0x0100000000000000ULL
|
|
|
|
#define VLAN_BD_LEN_MASK 0x00ffffff00000000ULL
|
|
|
|
#define VLAN_BD_LEN(bd) (((bd) & VLAN_BD_LEN_MASK) >> 32)
|
|
|
|
#define VLAN_BD_ADDR_MASK 0x00000000ffffffffULL
|
|
|
|
#define VLAN_BD_ADDR(bd) ((bd) & VLAN_BD_ADDR_MASK)
|
|
|
|
|
|
|
|
#define VLAN_VALID_BD(addr, len) (VLAN_BD_VALID | \
|
|
|
|
(((len) << 32) & VLAN_BD_LEN_MASK) | \
|
|
|
|
(addr & VLAN_BD_ADDR_MASK))
|
|
|
|
|
|
|
|
#define VLAN_RXQC_TOGGLE 0x80
|
|
|
|
#define VLAN_RXQC_VALID 0x40
|
|
|
|
#define VLAN_RXQC_NO_CSUM 0x02
|
|
|
|
#define VLAN_RXQC_CSUM_GOOD 0x01
|
|
|
|
|
|
|
|
#define VLAN_RQ_ALIGNMENT 16
|
|
|
|
#define VLAN_RXQ_BD_OFF 0
|
|
|
|
#define VLAN_FILTER_BD_OFF 8
|
|
|
|
#define VLAN_RX_BDS_OFF 16
|
2014-08-22 05:50:57 +04:00
|
|
|
/*
|
|
|
|
* The final 8 bytes of the buffer list is a counter of frames dropped
|
|
|
|
* because there was not a buffer in the buffer list capable of holding
|
|
|
|
* the frame. We must avoid it, or the operating system will report garbage
|
|
|
|
* for this statistic.
|
|
|
|
*/
|
|
|
|
#define VLAN_RX_BDS_LEN (SPAPR_TCE_PAGE_SIZE - VLAN_RX_BDS_OFF - 8)
|
|
|
|
#define VLAN_MAX_BUFS (VLAN_RX_BDS_LEN / 8)
|
2011-04-01 08:15:29 +04:00
|
|
|
|
2013-04-07 23:08:16 +04:00
|
|
|
#define TYPE_VIO_SPAPR_VLAN_DEVICE "spapr-vlan"
|
2020-09-16 21:25:19 +03:00
|
|
|
OBJECT_DECLARE_SIMPLE_TYPE(SpaprVioVlan, VIO_SPAPR_VLAN_DEVICE)
|
2013-04-07 23:08:16 +04:00
|
|
|
|
hw/net/spapr_llan: Fix receive buffer handling for better performance
tl;dr:
This patch introduces an alternate way of handling the receive
buffers of the spapr-vlan device, resulting in much better
receive performance for the guest.
Full story:
One of our testers recently discovered that the performance of the
spapr-vlan device is very poor compared to other NICs, and that
a simple "ping -i 0.2 -s 65507 someip" in the guest can result
in more than 50% lost ping packets (especially with older guest
kernels < 3.17).
After doing some analysis, it was clear that there is a problem
with the way we handle the receive buffers in spapr_llan.c: The
ibmveth driver of the guest Linux kernel tries to add a lot of
buffers into several buffer pools (with 512, 2048 and 65536 byte
sizes by default, but it can be changed via the entries in the
/sys/devices/vio/1000/pool* directories of the guest). However,
the spapr-vlan device of QEMU only tries to squeeze all receive
buffer descriptors into one single page which has been supplied
by the guest during the H_REGISTER_LOGICAL_LAN call, without
taking care of different buffer sizes. This has two bad effects:
First, only a very limited number of buffer descriptors is accepted
at all. Second, we also hand 64k buffers to the guest even if
the 2k buffers would fit better - and this results in dropped packets
in the IP layer of the guest since too much skbuf memory is used.
Though it seems at a first glance like PAPR says that we should store
the receive buffer descriptors in the page that is supplied during
the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec
declares that "the contents of these descriptors are architecturally
opaque, none of these descriptors are manipulated by code above
the architected interfaces". That means we don't have to store
the RX buffer descriptors in this page, but can also manage the
receive buffers at the hypervisor level only. This is now what we
are doing here: Introducing proper RX buffer pools which are also
sorted by size of the buffers, so we can hand out a buffer with
the best fitting size when a packet has been received.
To avoid problems with migration from/to older version of QEMU,
the old behavior is also retained and enabled by default. The new
buffer management has to be enabled via a new "use-rx-buffer-pools"
property.
Now with the new buffer pool management enabled, the problem with
"ping -s 65507" is fixed for me, and the throughput of a simple
test with wget increases from creeping 3MB/s up to 20MB/s!
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-21 19:25:23 +03:00
|
|
|
#define RX_POOL_MAX_BDS 4096
|
|
|
|
#define RX_MAX_POOLS 5
|
|
|
|
|
|
|
|
typedef struct {
|
|
|
|
int32_t bufsize;
|
|
|
|
int32_t count;
|
|
|
|
vlan_bd_t bds[RX_POOL_MAX_BDS];
|
|
|
|
} RxBufPool;
|
|
|
|
|
2020-09-03 23:43:22 +03:00
|
|
|
struct SpaprVioVlan {
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
SpaprVioDevice sdev;
|
2011-04-01 08:15:29 +04:00
|
|
|
NICConf nicconf;
|
|
|
|
NICState *nic;
|
2016-09-01 11:10:49 +03:00
|
|
|
MACAddr perm_mac;
|
2013-07-18 23:32:56 +04:00
|
|
|
bool isopen;
|
2014-06-27 09:58:27 +04:00
|
|
|
hwaddr buf_list;
|
2013-07-18 23:32:56 +04:00
|
|
|
uint32_t add_buf_ptr, use_buf_ptr, rx_bufs;
|
2014-06-27 09:58:27 +04:00
|
|
|
hwaddr rxq_ptr;
|
hw/net/spapr_llan: Delay flushing of the RX queue while adding new RX buffers
Currently, the spapr-vlan device is trying to flush the RX queue
after each RX buffer that has been added by the guest via the
H_ADD_LOGICAL_LAN_BUFFER hypercall. In case the receive buffer pool
was empty before, we only pass single packets to the guest this
way. This can cause very bad performance if a sender is trying
to stream fragmented UDP packets to the guest. For example when
using the UDP_STREAM test from netperf with UDP packets that are
much bigger than the MTU size, almost all UDP packets are dropped
in the guest since the chances are quite high that at least one of
the fragments got lost on the way.
When flushing the receive queue, it's much better if we'd have
a bunch of receive buffers available already, so that fragmented
packets can be passed to the guest in one go. To do this, the
spapr_vlan_receive() function should return 0 instead of -1 if there
are no more receive buffers available, so that receive_disabled = 1
gets temporarily set for the receive queue, and we have to delay
the queue flushing at the end of h_add_logical_lan_buffer() a little
bit by using a timer, so that the guest gets a chance to add multiple
RX buffers before we flush the queue again.
This improves the UDP_STREAM test with the spapr-vlan device a lot:
Running
netserver -p 44444 -L <guestip> -f -D -4
in the guest, and
netperf -p 44444 -L <hostip> -H <guestip> -t UDP_STREAM -l 60 -- -m 16384
in the host, I get the following values _without_ this patch:
Socket Message Elapsed Messages
Size Size Time Okay Errors Throughput
bytes bytes secs # # 10^6bits/sec
229376 16384 60.00 1738970 0 3798.83
229376 60.00 23 0.05
That "0.05" means that almost all UDP packets got lost/discarded
at the receiving side.
With this patch applied, the value look much better:
Socket Message Elapsed Messages
Size Size Time Okay Errors Throughput
bytes bytes secs # # 10^6bits/sec
229376 16384 60.00 1789104 0 3908.35
229376 60.00 22818 49.85
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-31 14:47:05 +03:00
|
|
|
QEMUTimer *rxp_timer;
|
2016-11-19 22:29:26 +03:00
|
|
|
uint32_t compat_flags; /* Compatibility flags for migration */
|
hw/net/spapr_llan: Fix receive buffer handling for better performance
tl;dr:
This patch introduces an alternate way of handling the receive
buffers of the spapr-vlan device, resulting in much better
receive performance for the guest.
Full story:
One of our testers recently discovered that the performance of the
spapr-vlan device is very poor compared to other NICs, and that
a simple "ping -i 0.2 -s 65507 someip" in the guest can result
in more than 50% lost ping packets (especially with older guest
kernels < 3.17).
After doing some analysis, it was clear that there is a problem
with the way we handle the receive buffers in spapr_llan.c: The
ibmveth driver of the guest Linux kernel tries to add a lot of
buffers into several buffer pools (with 512, 2048 and 65536 byte
sizes by default, but it can be changed via the entries in the
/sys/devices/vio/1000/pool* directories of the guest). However,
the spapr-vlan device of QEMU only tries to squeeze all receive
buffer descriptors into one single page which has been supplied
by the guest during the H_REGISTER_LOGICAL_LAN call, without
taking care of different buffer sizes. This has two bad effects:
First, only a very limited number of buffer descriptors is accepted
at all. Second, we also hand 64k buffers to the guest even if
the 2k buffers would fit better - and this results in dropped packets
in the IP layer of the guest since too much skbuf memory is used.
Though it seems at a first glance like PAPR says that we should store
the receive buffer descriptors in the page that is supplied during
the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec
declares that "the contents of these descriptors are architecturally
opaque, none of these descriptors are manipulated by code above
the architected interfaces". That means we don't have to store
the RX buffer descriptors in this page, but can also manage the
receive buffers at the hypervisor level only. This is now what we
are doing here: Introducing proper RX buffer pools which are also
sorted by size of the buffers, so we can hand out a buffer with
the best fitting size when a packet has been received.
To avoid problems with migration from/to older version of QEMU,
the old behavior is also retained and enabled by default. The new
buffer management has to be enabled via a new "use-rx-buffer-pools"
property.
Now with the new buffer pool management enabled, the problem with
"ping -s 65507" is fixed for me, and the throughput of a simple
test with wget increases from creeping 3MB/s up to 20MB/s!
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-21 19:25:23 +03:00
|
|
|
RxBufPool *rx_pool[RX_MAX_POOLS]; /* Receive buffer descriptor pools */
|
2020-09-03 23:43:22 +03:00
|
|
|
};
|
2011-04-01 08:15:29 +04:00
|
|
|
|
2020-03-05 20:56:49 +03:00
|
|
|
static bool spapr_vlan_can_receive(NetClientState *nc)
|
2011-04-01 08:15:29 +04:00
|
|
|
{
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
SpaprVioVlan *dev = qemu_get_nic_opaque(nc);
|
2011-04-01 08:15:29 +04:00
|
|
|
|
2020-03-05 20:56:49 +03:00
|
|
|
return dev->isopen && dev->rx_bufs > 0;
|
2011-04-01 08:15:29 +04:00
|
|
|
}
|
|
|
|
|
2016-04-04 13:13:10 +03:00
|
|
|
/**
|
|
|
|
* The last 8 bytes of the receive buffer list page (that has been
|
|
|
|
* supplied by the guest with the H_REGISTER_LOGICAL_LAN call) contain
|
|
|
|
* a counter for frames that have been dropped because there was no
|
|
|
|
* suitable receive buffer available. This function is used to increase
|
|
|
|
* this counter by one.
|
|
|
|
*/
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
static void spapr_vlan_record_dropped_rx_frame(SpaprVioVlan *dev)
|
2016-04-04 13:13:10 +03:00
|
|
|
{
|
|
|
|
uint64_t cnt;
|
|
|
|
|
|
|
|
cnt = vio_ldq(&dev->sdev, dev->buf_list + 4096 - 8);
|
|
|
|
vio_stq(&dev->sdev, dev->buf_list + 4096 - 8, cnt + 1);
|
|
|
|
}
|
|
|
|
|
hw/net/spapr_llan: Fix receive buffer handling for better performance
tl;dr:
This patch introduces an alternate way of handling the receive
buffers of the spapr-vlan device, resulting in much better
receive performance for the guest.
Full story:
One of our testers recently discovered that the performance of the
spapr-vlan device is very poor compared to other NICs, and that
a simple "ping -i 0.2 -s 65507 someip" in the guest can result
in more than 50% lost ping packets (especially with older guest
kernels < 3.17).
After doing some analysis, it was clear that there is a problem
with the way we handle the receive buffers in spapr_llan.c: The
ibmveth driver of the guest Linux kernel tries to add a lot of
buffers into several buffer pools (with 512, 2048 and 65536 byte
sizes by default, but it can be changed via the entries in the
/sys/devices/vio/1000/pool* directories of the guest). However,
the spapr-vlan device of QEMU only tries to squeeze all receive
buffer descriptors into one single page which has been supplied
by the guest during the H_REGISTER_LOGICAL_LAN call, without
taking care of different buffer sizes. This has two bad effects:
First, only a very limited number of buffer descriptors is accepted
at all. Second, we also hand 64k buffers to the guest even if
the 2k buffers would fit better - and this results in dropped packets
in the IP layer of the guest since too much skbuf memory is used.
Though it seems at a first glance like PAPR says that we should store
the receive buffer descriptors in the page that is supplied during
the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec
declares that "the contents of these descriptors are architecturally
opaque, none of these descriptors are manipulated by code above
the architected interfaces". That means we don't have to store
the RX buffer descriptors in this page, but can also manage the
receive buffers at the hypervisor level only. This is now what we
are doing here: Introducing proper RX buffer pools which are also
sorted by size of the buffers, so we can hand out a buffer with
the best fitting size when a packet has been received.
To avoid problems with migration from/to older version of QEMU,
the old behavior is also retained and enabled by default. The new
buffer management has to be enabled via a new "use-rx-buffer-pools"
property.
Now with the new buffer pool management enabled, the problem with
"ping -s 65507" is fixed for me, and the throughput of a simple
test with wget increases from creeping 3MB/s up to 20MB/s!
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-21 19:25:23 +03:00
|
|
|
/**
|
|
|
|
* Get buffer descriptor from one of our receive buffer pools
|
|
|
|
*/
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
static vlan_bd_t spapr_vlan_get_rx_bd_from_pool(SpaprVioVlan *dev,
|
hw/net/spapr_llan: Fix receive buffer handling for better performance
tl;dr:
This patch introduces an alternate way of handling the receive
buffers of the spapr-vlan device, resulting in much better
receive performance for the guest.
Full story:
One of our testers recently discovered that the performance of the
spapr-vlan device is very poor compared to other NICs, and that
a simple "ping -i 0.2 -s 65507 someip" in the guest can result
in more than 50% lost ping packets (especially with older guest
kernels < 3.17).
After doing some analysis, it was clear that there is a problem
with the way we handle the receive buffers in spapr_llan.c: The
ibmveth driver of the guest Linux kernel tries to add a lot of
buffers into several buffer pools (with 512, 2048 and 65536 byte
sizes by default, but it can be changed via the entries in the
/sys/devices/vio/1000/pool* directories of the guest). However,
the spapr-vlan device of QEMU only tries to squeeze all receive
buffer descriptors into one single page which has been supplied
by the guest during the H_REGISTER_LOGICAL_LAN call, without
taking care of different buffer sizes. This has two bad effects:
First, only a very limited number of buffer descriptors is accepted
at all. Second, we also hand 64k buffers to the guest even if
the 2k buffers would fit better - and this results in dropped packets
in the IP layer of the guest since too much skbuf memory is used.
Though it seems at a first glance like PAPR says that we should store
the receive buffer descriptors in the page that is supplied during
the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec
declares that "the contents of these descriptors are architecturally
opaque, none of these descriptors are manipulated by code above
the architected interfaces". That means we don't have to store
the RX buffer descriptors in this page, but can also manage the
receive buffers at the hypervisor level only. This is now what we
are doing here: Introducing proper RX buffer pools which are also
sorted by size of the buffers, so we can hand out a buffer with
the best fitting size when a packet has been received.
To avoid problems with migration from/to older version of QEMU,
the old behavior is also retained and enabled by default. The new
buffer management has to be enabled via a new "use-rx-buffer-pools"
property.
Now with the new buffer pool management enabled, the problem with
"ping -s 65507" is fixed for me, and the throughput of a simple
test with wget increases from creeping 3MB/s up to 20MB/s!
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-21 19:25:23 +03:00
|
|
|
size_t size)
|
|
|
|
{
|
|
|
|
vlan_bd_t bd;
|
|
|
|
int pool;
|
|
|
|
|
|
|
|
for (pool = 0; pool < RX_MAX_POOLS; pool++) {
|
|
|
|
if (dev->rx_pool[pool]->count > 0 &&
|
|
|
|
dev->rx_pool[pool]->bufsize >= size + 8) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (pool == RX_MAX_POOLS) {
|
|
|
|
/* Failed to find a suitable buffer */
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2016-09-14 21:48:26 +03:00
|
|
|
|
|
|
|
trace_spapr_vlan_get_rx_bd_from_pool_found(pool,
|
|
|
|
dev->rx_pool[pool]->count,
|
|
|
|
dev->rx_bufs);
|
hw/net/spapr_llan: Fix receive buffer handling for better performance
tl;dr:
This patch introduces an alternate way of handling the receive
buffers of the spapr-vlan device, resulting in much better
receive performance for the guest.
Full story:
One of our testers recently discovered that the performance of the
spapr-vlan device is very poor compared to other NICs, and that
a simple "ping -i 0.2 -s 65507 someip" in the guest can result
in more than 50% lost ping packets (especially with older guest
kernels < 3.17).
After doing some analysis, it was clear that there is a problem
with the way we handle the receive buffers in spapr_llan.c: The
ibmveth driver of the guest Linux kernel tries to add a lot of
buffers into several buffer pools (with 512, 2048 and 65536 byte
sizes by default, but it can be changed via the entries in the
/sys/devices/vio/1000/pool* directories of the guest). However,
the spapr-vlan device of QEMU only tries to squeeze all receive
buffer descriptors into one single page which has been supplied
by the guest during the H_REGISTER_LOGICAL_LAN call, without
taking care of different buffer sizes. This has two bad effects:
First, only a very limited number of buffer descriptors is accepted
at all. Second, we also hand 64k buffers to the guest even if
the 2k buffers would fit better - and this results in dropped packets
in the IP layer of the guest since too much skbuf memory is used.
Though it seems at a first glance like PAPR says that we should store
the receive buffer descriptors in the page that is supplied during
the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec
declares that "the contents of these descriptors are architecturally
opaque, none of these descriptors are manipulated by code above
the architected interfaces". That means we don't have to store
the RX buffer descriptors in this page, but can also manage the
receive buffers at the hypervisor level only. This is now what we
are doing here: Introducing proper RX buffer pools which are also
sorted by size of the buffers, so we can hand out a buffer with
the best fitting size when a packet has been received.
To avoid problems with migration from/to older version of QEMU,
the old behavior is also retained and enabled by default. The new
buffer management has to be enabled via a new "use-rx-buffer-pools"
property.
Now with the new buffer pool management enabled, the problem with
"ping -s 65507" is fixed for me, and the throughput of a simple
test with wget increases from creeping 3MB/s up to 20MB/s!
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-21 19:25:23 +03:00
|
|
|
|
|
|
|
/* Remove the buffer from the pool */
|
|
|
|
dev->rx_pool[pool]->count--;
|
|
|
|
bd = dev->rx_pool[pool]->bds[dev->rx_pool[pool]->count];
|
|
|
|
dev->rx_pool[pool]->bds[dev->rx_pool[pool]->count] = 0;
|
|
|
|
|
|
|
|
return bd;
|
|
|
|
}
|
|
|
|
|
2016-03-21 19:25:22 +03:00
|
|
|
/**
|
|
|
|
* Get buffer descriptor from the receive buffer list page that has been
|
|
|
|
* supplied by the guest with the H_REGISTER_LOGICAL_LAN call
|
|
|
|
*/
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
static vlan_bd_t spapr_vlan_get_rx_bd_from_page(SpaprVioVlan *dev,
|
2016-03-21 19:25:22 +03:00
|
|
|
size_t size)
|
|
|
|
{
|
|
|
|
int buf_ptr = dev->use_buf_ptr;
|
|
|
|
vlan_bd_t bd;
|
|
|
|
|
|
|
|
do {
|
|
|
|
buf_ptr += 8;
|
|
|
|
if (buf_ptr >= VLAN_RX_BDS_LEN + VLAN_RX_BDS_OFF) {
|
|
|
|
buf_ptr = VLAN_RX_BDS_OFF;
|
|
|
|
}
|
|
|
|
|
|
|
|
bd = vio_ldq(&dev->sdev, dev->buf_list + buf_ptr);
|
2016-09-14 21:48:26 +03:00
|
|
|
|
|
|
|
trace_spapr_vlan_get_rx_bd_from_page(buf_ptr, (uint64_t)bd);
|
2016-03-21 19:25:22 +03:00
|
|
|
} while ((!(bd & VLAN_BD_VALID) || VLAN_BD_LEN(bd) < size + 8)
|
|
|
|
&& buf_ptr != dev->use_buf_ptr);
|
|
|
|
|
|
|
|
if (!(bd & VLAN_BD_VALID) || VLAN_BD_LEN(bd) < size + 8) {
|
|
|
|
/* Failed to find a suitable buffer */
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Remove the buffer from the pool */
|
|
|
|
dev->use_buf_ptr = buf_ptr;
|
|
|
|
vio_stq(&dev->sdev, dev->buf_list + dev->use_buf_ptr, 0);
|
|
|
|
|
2016-09-14 21:48:26 +03:00
|
|
|
trace_spapr_vlan_get_rx_bd_from_page_found(dev->use_buf_ptr, dev->rx_bufs);
|
2016-03-21 19:25:22 +03:00
|
|
|
|
|
|
|
return bd;
|
|
|
|
}
|
|
|
|
|
2012-07-24 19:35:13 +04:00
|
|
|
static ssize_t spapr_vlan_receive(NetClientState *nc, const uint8_t *buf,
|
2011-04-01 08:15:29 +04:00
|
|
|
size_t size)
|
|
|
|
{
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
SpaprVioVlan *dev = qemu_get_nic_opaque(nc);
|
|
|
|
SpaprVioDevice *sdev = VIO_SPAPR_DEVICE(dev);
|
2012-06-27 08:50:44 +04:00
|
|
|
vlan_bd_t rxq_bd = vio_ldq(sdev, dev->buf_list + VLAN_RXQ_BD_OFF);
|
2011-04-01 08:15:29 +04:00
|
|
|
vlan_bd_t bd;
|
|
|
|
uint64_t handle;
|
|
|
|
uint8_t control;
|
|
|
|
|
2016-09-14 21:48:26 +03:00
|
|
|
trace_spapr_vlan_receive(sdev->qdev.id, dev->rx_bufs);
|
2011-04-01 08:15:29 +04:00
|
|
|
|
|
|
|
if (!dev->isopen) {
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!dev->rx_bufs) {
|
2016-04-04 13:13:10 +03:00
|
|
|
spapr_vlan_record_dropped_rx_frame(dev);
|
hw/net/spapr_llan: Delay flushing of the RX queue while adding new RX buffers
Currently, the spapr-vlan device is trying to flush the RX queue
after each RX buffer that has been added by the guest via the
H_ADD_LOGICAL_LAN_BUFFER hypercall. In case the receive buffer pool
was empty before, we only pass single packets to the guest this
way. This can cause very bad performance if a sender is trying
to stream fragmented UDP packets to the guest. For example when
using the UDP_STREAM test from netperf with UDP packets that are
much bigger than the MTU size, almost all UDP packets are dropped
in the guest since the chances are quite high that at least one of
the fragments got lost on the way.
When flushing the receive queue, it's much better if we'd have
a bunch of receive buffers available already, so that fragmented
packets can be passed to the guest in one go. To do this, the
spapr_vlan_receive() function should return 0 instead of -1 if there
are no more receive buffers available, so that receive_disabled = 1
gets temporarily set for the receive queue, and we have to delay
the queue flushing at the end of h_add_logical_lan_buffer() a little
bit by using a timer, so that the guest gets a chance to add multiple
RX buffers before we flush the queue again.
This improves the UDP_STREAM test with the spapr-vlan device a lot:
Running
netserver -p 44444 -L <guestip> -f -D -4
in the guest, and
netperf -p 44444 -L <hostip> -H <guestip> -t UDP_STREAM -l 60 -- -m 16384
in the host, I get the following values _without_ this patch:
Socket Message Elapsed Messages
Size Size Time Okay Errors Throughput
bytes bytes secs # # 10^6bits/sec
229376 16384 60.00 1738970 0 3798.83
229376 60.00 23 0.05
That "0.05" means that almost all UDP packets got lost/discarded
at the receiving side.
With this patch applied, the value look much better:
Socket Message Elapsed Messages
Size Size Time Okay Errors Throughput
bytes bytes secs # # 10^6bits/sec
229376 16384 60.00 1789104 0 3908.35
229376 60.00 22818 49.85
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-31 14:47:05 +03:00
|
|
|
return 0;
|
2011-04-01 08:15:29 +04:00
|
|
|
}
|
|
|
|
|
hw/net/spapr_llan: Fix receive buffer handling for better performance
tl;dr:
This patch introduces an alternate way of handling the receive
buffers of the spapr-vlan device, resulting in much better
receive performance for the guest.
Full story:
One of our testers recently discovered that the performance of the
spapr-vlan device is very poor compared to other NICs, and that
a simple "ping -i 0.2 -s 65507 someip" in the guest can result
in more than 50% lost ping packets (especially with older guest
kernels < 3.17).
After doing some analysis, it was clear that there is a problem
with the way we handle the receive buffers in spapr_llan.c: The
ibmveth driver of the guest Linux kernel tries to add a lot of
buffers into several buffer pools (with 512, 2048 and 65536 byte
sizes by default, but it can be changed via the entries in the
/sys/devices/vio/1000/pool* directories of the guest). However,
the spapr-vlan device of QEMU only tries to squeeze all receive
buffer descriptors into one single page which has been supplied
by the guest during the H_REGISTER_LOGICAL_LAN call, without
taking care of different buffer sizes. This has two bad effects:
First, only a very limited number of buffer descriptors is accepted
at all. Second, we also hand 64k buffers to the guest even if
the 2k buffers would fit better - and this results in dropped packets
in the IP layer of the guest since too much skbuf memory is used.
Though it seems at a first glance like PAPR says that we should store
the receive buffer descriptors in the page that is supplied during
the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec
declares that "the contents of these descriptors are architecturally
opaque, none of these descriptors are manipulated by code above
the architected interfaces". That means we don't have to store
the RX buffer descriptors in this page, but can also manage the
receive buffers at the hypervisor level only. This is now what we
are doing here: Introducing proper RX buffer pools which are also
sorted by size of the buffers, so we can hand out a buffer with
the best fitting size when a packet has been received.
To avoid problems with migration from/to older version of QEMU,
the old behavior is also retained and enabled by default. The new
buffer management has to be enabled via a new "use-rx-buffer-pools"
property.
Now with the new buffer pool management enabled, the problem with
"ping -s 65507" is fixed for me, and the throughput of a simple
test with wget increases from creeping 3MB/s up to 20MB/s!
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-21 19:25:23 +03:00
|
|
|
if (dev->compat_flags & SPAPRVLAN_FLAG_RX_BUF_POOLS) {
|
|
|
|
bd = spapr_vlan_get_rx_bd_from_pool(dev, size);
|
|
|
|
} else {
|
|
|
|
bd = spapr_vlan_get_rx_bd_from_page(dev, size);
|
|
|
|
}
|
2016-03-21 19:25:22 +03:00
|
|
|
if (!bd) {
|
2016-04-04 13:13:10 +03:00
|
|
|
spapr_vlan_record_dropped_rx_frame(dev);
|
hw/net/spapr_llan: Delay flushing of the RX queue while adding new RX buffers
Currently, the spapr-vlan device is trying to flush the RX queue
after each RX buffer that has been added by the guest via the
H_ADD_LOGICAL_LAN_BUFFER hypercall. In case the receive buffer pool
was empty before, we only pass single packets to the guest this
way. This can cause very bad performance if a sender is trying
to stream fragmented UDP packets to the guest. For example when
using the UDP_STREAM test from netperf with UDP packets that are
much bigger than the MTU size, almost all UDP packets are dropped
in the guest since the chances are quite high that at least one of
the fragments got lost on the way.
When flushing the receive queue, it's much better if we'd have
a bunch of receive buffers available already, so that fragmented
packets can be passed to the guest in one go. To do this, the
spapr_vlan_receive() function should return 0 instead of -1 if there
are no more receive buffers available, so that receive_disabled = 1
gets temporarily set for the receive queue, and we have to delay
the queue flushing at the end of h_add_logical_lan_buffer() a little
bit by using a timer, so that the guest gets a chance to add multiple
RX buffers before we flush the queue again.
This improves the UDP_STREAM test with the spapr-vlan device a lot:
Running
netserver -p 44444 -L <guestip> -f -D -4
in the guest, and
netperf -p 44444 -L <hostip> -H <guestip> -t UDP_STREAM -l 60 -- -m 16384
in the host, I get the following values _without_ this patch:
Socket Message Elapsed Messages
Size Size Time Okay Errors Throughput
bytes bytes secs # # 10^6bits/sec
229376 16384 60.00 1738970 0 3798.83
229376 60.00 23 0.05
That "0.05" means that almost all UDP packets got lost/discarded
at the receiving side.
With this patch applied, the value look much better:
Socket Message Elapsed Messages
Size Size Time Okay Errors Throughput
bytes bytes secs # # 10^6bits/sec
229376 16384 60.00 1789104 0 3908.35
229376 60.00 22818 49.85
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-31 14:47:05 +03:00
|
|
|
return 0;
|
2011-04-01 08:15:29 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
dev->rx_bufs--;
|
|
|
|
|
|
|
|
/* Transfer the packet data */
|
2012-06-27 08:50:44 +04:00
|
|
|
if (spapr_vio_dma_write(sdev, VLAN_BD_ADDR(bd) + 8, buf, size) < 0) {
|
2011-04-01 08:15:29 +04:00
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
2016-09-14 21:48:26 +03:00
|
|
|
trace_spapr_vlan_receive_dma_completed();
|
2011-04-01 08:15:29 +04:00
|
|
|
|
|
|
|
/* Update the receive queue */
|
|
|
|
control = VLAN_RXQC_TOGGLE | VLAN_RXQC_VALID;
|
|
|
|
if (rxq_bd & VLAN_BD_TOGGLE) {
|
|
|
|
control ^= VLAN_RXQC_TOGGLE;
|
|
|
|
}
|
|
|
|
|
2012-06-27 08:50:44 +04:00
|
|
|
handle = vio_ldq(sdev, VLAN_BD_ADDR(bd));
|
|
|
|
vio_stq(sdev, VLAN_BD_ADDR(rxq_bd) + dev->rxq_ptr + 8, handle);
|
|
|
|
vio_stl(sdev, VLAN_BD_ADDR(rxq_bd) + dev->rxq_ptr + 4, size);
|
|
|
|
vio_sth(sdev, VLAN_BD_ADDR(rxq_bd) + dev->rxq_ptr + 2, 8);
|
|
|
|
vio_stb(sdev, VLAN_BD_ADDR(rxq_bd) + dev->rxq_ptr, control);
|
2011-04-01 08:15:29 +04:00
|
|
|
|
2016-09-14 21:48:26 +03:00
|
|
|
trace_spapr_vlan_receive_wrote(dev->rxq_ptr,
|
|
|
|
vio_ldq(sdev, VLAN_BD_ADDR(rxq_bd) +
|
|
|
|
dev->rxq_ptr),
|
|
|
|
vio_ldq(sdev, VLAN_BD_ADDR(rxq_bd) +
|
|
|
|
dev->rxq_ptr + 8));
|
2011-04-01 08:15:29 +04:00
|
|
|
|
|
|
|
dev->rxq_ptr += 16;
|
|
|
|
if (dev->rxq_ptr >= VLAN_BD_LEN(rxq_bd)) {
|
|
|
|
dev->rxq_ptr = 0;
|
2012-06-27 08:50:44 +04:00
|
|
|
vio_stq(sdev, dev->buf_list + VLAN_RXQ_BD_OFF, rxq_bd ^ VLAN_BD_TOGGLE);
|
2011-04-01 08:15:29 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
if (sdev->signal_state & 1) {
|
2019-09-23 08:50:09 +03:00
|
|
|
spapr_vio_irq_pulse(sdev);
|
2011-04-01 08:15:29 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
return size;
|
|
|
|
}
|
|
|
|
|
|
|
|
static NetClientInfo net_spapr_vlan_info = {
|
qapi: Change Netdev into a flat union
This is a mostly-mechanical conversion that creates a new flat
union 'Netdev' QAPI type that covers all the branches of the
former 'NetClientOptions' simple union, where the branches are
now listed in a new 'NetClientDriver' enum rather than generated
from the simple union. The existence of a flat union has no
change to the command line syntax accepted for new code, and
will make it possible for a future patch to switch the QMP
command to parse a boxed union for no change to valid QMP; but
it does have some ripple effect on the C code when dealing with
the new types.
While making the conversion, note that the 'NetLegacy' type
remains unchanged: it applies only to legacy command line options,
and will not be ported to QMP, so it should remain a wrapper
around a simple union; to avoid confusion, the type named
'NetClientOptions' is now gone, and we introduce 'NetLegacyOptions'
in its place. Then, in the C code, we convert from NetLegacy to
Netdev as soon as possible, so that the bulk of the net stack
only has to deal with one QAPI type, not two. Note that since
the old legacy code always rejected 'hubport', we can just omit
that branch from the new 'NetLegacyOptions' simple union.
Based on an idea originally by Zoltán Kővágó <DirtY.iCE.hu@gmail.com>:
Message-Id: <01a527fbf1a5de880091f98cf011616a78adeeee.1441627176.git.DirtY.iCE.hu@gmail.com>
although the sed script in that patch no longer applies due to
other changes in the tree since then, and I also did some manual
cleanups (such as fixing whitespace to keep checkpatch happy).
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <1468468228-27827-13-git-send-email-eblake@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
[Fixup from Eric squashed in]
Signed-off-by: Markus Armbruster <armbru@redhat.com>
2016-07-14 06:50:23 +03:00
|
|
|
.type = NET_CLIENT_DRIVER_NIC,
|
2011-04-01 08:15:29 +04:00
|
|
|
.size = sizeof(NICState),
|
|
|
|
.can_receive = spapr_vlan_can_receive,
|
|
|
|
.receive = spapr_vlan_receive,
|
|
|
|
};
|
|
|
|
|
hw/net/spapr_llan: Delay flushing of the RX queue while adding new RX buffers
Currently, the spapr-vlan device is trying to flush the RX queue
after each RX buffer that has been added by the guest via the
H_ADD_LOGICAL_LAN_BUFFER hypercall. In case the receive buffer pool
was empty before, we only pass single packets to the guest this
way. This can cause very bad performance if a sender is trying
to stream fragmented UDP packets to the guest. For example when
using the UDP_STREAM test from netperf with UDP packets that are
much bigger than the MTU size, almost all UDP packets are dropped
in the guest since the chances are quite high that at least one of
the fragments got lost on the way.
When flushing the receive queue, it's much better if we'd have
a bunch of receive buffers available already, so that fragmented
packets can be passed to the guest in one go. To do this, the
spapr_vlan_receive() function should return 0 instead of -1 if there
are no more receive buffers available, so that receive_disabled = 1
gets temporarily set for the receive queue, and we have to delay
the queue flushing at the end of h_add_logical_lan_buffer() a little
bit by using a timer, so that the guest gets a chance to add multiple
RX buffers before we flush the queue again.
This improves the UDP_STREAM test with the spapr-vlan device a lot:
Running
netserver -p 44444 -L <guestip> -f -D -4
in the guest, and
netperf -p 44444 -L <hostip> -H <guestip> -t UDP_STREAM -l 60 -- -m 16384
in the host, I get the following values _without_ this patch:
Socket Message Elapsed Messages
Size Size Time Okay Errors Throughput
bytes bytes secs # # 10^6bits/sec
229376 16384 60.00 1738970 0 3798.83
229376 60.00 23 0.05
That "0.05" means that almost all UDP packets got lost/discarded
at the receiving side.
With this patch applied, the value look much better:
Socket Message Elapsed Messages
Size Size Time Okay Errors Throughput
bytes bytes secs # # 10^6bits/sec
229376 16384 60.00 1789104 0 3908.35
229376 60.00 22818 49.85
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-31 14:47:05 +03:00
|
|
|
static void spapr_vlan_flush_rx_queue(void *opaque)
|
|
|
|
{
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
SpaprVioVlan *dev = opaque;
|
hw/net/spapr_llan: Delay flushing of the RX queue while adding new RX buffers
Currently, the spapr-vlan device is trying to flush the RX queue
after each RX buffer that has been added by the guest via the
H_ADD_LOGICAL_LAN_BUFFER hypercall. In case the receive buffer pool
was empty before, we only pass single packets to the guest this
way. This can cause very bad performance if a sender is trying
to stream fragmented UDP packets to the guest. For example when
using the UDP_STREAM test from netperf with UDP packets that are
much bigger than the MTU size, almost all UDP packets are dropped
in the guest since the chances are quite high that at least one of
the fragments got lost on the way.
When flushing the receive queue, it's much better if we'd have
a bunch of receive buffers available already, so that fragmented
packets can be passed to the guest in one go. To do this, the
spapr_vlan_receive() function should return 0 instead of -1 if there
are no more receive buffers available, so that receive_disabled = 1
gets temporarily set for the receive queue, and we have to delay
the queue flushing at the end of h_add_logical_lan_buffer() a little
bit by using a timer, so that the guest gets a chance to add multiple
RX buffers before we flush the queue again.
This improves the UDP_STREAM test with the spapr-vlan device a lot:
Running
netserver -p 44444 -L <guestip> -f -D -4
in the guest, and
netperf -p 44444 -L <hostip> -H <guestip> -t UDP_STREAM -l 60 -- -m 16384
in the host, I get the following values _without_ this patch:
Socket Message Elapsed Messages
Size Size Time Okay Errors Throughput
bytes bytes secs # # 10^6bits/sec
229376 16384 60.00 1738970 0 3798.83
229376 60.00 23 0.05
That "0.05" means that almost all UDP packets got lost/discarded
at the receiving side.
With this patch applied, the value look much better:
Socket Message Elapsed Messages
Size Size Time Okay Errors Throughput
bytes bytes secs # # 10^6bits/sec
229376 16384 60.00 1789104 0 3908.35
229376 60.00 22818 49.85
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-31 14:47:05 +03:00
|
|
|
|
|
|
|
qemu_flush_queued_packets(qemu_get_queue(dev->nic));
|
|
|
|
}
|
|
|
|
|
hw/net/spapr_llan: Fix receive buffer handling for better performance
tl;dr:
This patch introduces an alternate way of handling the receive
buffers of the spapr-vlan device, resulting in much better
receive performance for the guest.
Full story:
One of our testers recently discovered that the performance of the
spapr-vlan device is very poor compared to other NICs, and that
a simple "ping -i 0.2 -s 65507 someip" in the guest can result
in more than 50% lost ping packets (especially with older guest
kernels < 3.17).
After doing some analysis, it was clear that there is a problem
with the way we handle the receive buffers in spapr_llan.c: The
ibmveth driver of the guest Linux kernel tries to add a lot of
buffers into several buffer pools (with 512, 2048 and 65536 byte
sizes by default, but it can be changed via the entries in the
/sys/devices/vio/1000/pool* directories of the guest). However,
the spapr-vlan device of QEMU only tries to squeeze all receive
buffer descriptors into one single page which has been supplied
by the guest during the H_REGISTER_LOGICAL_LAN call, without
taking care of different buffer sizes. This has two bad effects:
First, only a very limited number of buffer descriptors is accepted
at all. Second, we also hand 64k buffers to the guest even if
the 2k buffers would fit better - and this results in dropped packets
in the IP layer of the guest since too much skbuf memory is used.
Though it seems at a first glance like PAPR says that we should store
the receive buffer descriptors in the page that is supplied during
the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec
declares that "the contents of these descriptors are architecturally
opaque, none of these descriptors are manipulated by code above
the architected interfaces". That means we don't have to store
the RX buffer descriptors in this page, but can also manage the
receive buffers at the hypervisor level only. This is now what we
are doing here: Introducing proper RX buffer pools which are also
sorted by size of the buffers, so we can hand out a buffer with
the best fitting size when a packet has been received.
To avoid problems with migration from/to older version of QEMU,
the old behavior is also retained and enabled by default. The new
buffer management has to be enabled via a new "use-rx-buffer-pools"
property.
Now with the new buffer pool management enabled, the problem with
"ping -s 65507" is fixed for me, and the throughput of a simple
test with wget increases from creeping 3MB/s up to 20MB/s!
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-21 19:25:23 +03:00
|
|
|
static void spapr_vlan_reset_rx_pool(RxBufPool *rxp)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Use INT_MAX as bufsize so that unused buffers are moved to the end
|
|
|
|
* of the list during the qsort in spapr_vlan_add_rxbuf_to_pool() later.
|
|
|
|
*/
|
|
|
|
rxp->bufsize = INT_MAX;
|
|
|
|
rxp->count = 0;
|
|
|
|
memset(rxp->bds, 0, sizeof(rxp->bds));
|
|
|
|
}
|
|
|
|
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
static void spapr_vlan_reset(SpaprVioDevice *sdev)
|
2012-04-12 06:44:15 +04:00
|
|
|
{
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
SpaprVioVlan *dev = VIO_SPAPR_VLAN_DEVICE(sdev);
|
hw/net/spapr_llan: Fix receive buffer handling for better performance
tl;dr:
This patch introduces an alternate way of handling the receive
buffers of the spapr-vlan device, resulting in much better
receive performance for the guest.
Full story:
One of our testers recently discovered that the performance of the
spapr-vlan device is very poor compared to other NICs, and that
a simple "ping -i 0.2 -s 65507 someip" in the guest can result
in more than 50% lost ping packets (especially with older guest
kernels < 3.17).
After doing some analysis, it was clear that there is a problem
with the way we handle the receive buffers in spapr_llan.c: The
ibmveth driver of the guest Linux kernel tries to add a lot of
buffers into several buffer pools (with 512, 2048 and 65536 byte
sizes by default, but it can be changed via the entries in the
/sys/devices/vio/1000/pool* directories of the guest). However,
the spapr-vlan device of QEMU only tries to squeeze all receive
buffer descriptors into one single page which has been supplied
by the guest during the H_REGISTER_LOGICAL_LAN call, without
taking care of different buffer sizes. This has two bad effects:
First, only a very limited number of buffer descriptors is accepted
at all. Second, we also hand 64k buffers to the guest even if
the 2k buffers would fit better - and this results in dropped packets
in the IP layer of the guest since too much skbuf memory is used.
Though it seems at a first glance like PAPR says that we should store
the receive buffer descriptors in the page that is supplied during
the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec
declares that "the contents of these descriptors are architecturally
opaque, none of these descriptors are manipulated by code above
the architected interfaces". That means we don't have to store
the RX buffer descriptors in this page, but can also manage the
receive buffers at the hypervisor level only. This is now what we
are doing here: Introducing proper RX buffer pools which are also
sorted by size of the buffers, so we can hand out a buffer with
the best fitting size when a packet has been received.
To avoid problems with migration from/to older version of QEMU,
the old behavior is also retained and enabled by default. The new
buffer management has to be enabled via a new "use-rx-buffer-pools"
property.
Now with the new buffer pool management enabled, the problem with
"ping -s 65507" is fixed for me, and the throughput of a simple
test with wget increases from creeping 3MB/s up to 20MB/s!
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-21 19:25:23 +03:00
|
|
|
int i;
|
2012-04-12 06:44:15 +04:00
|
|
|
|
|
|
|
dev->buf_list = 0;
|
|
|
|
dev->rx_bufs = 0;
|
|
|
|
dev->isopen = 0;
|
hw/net/spapr_llan: Fix receive buffer handling for better performance
tl;dr:
This patch introduces an alternate way of handling the receive
buffers of the spapr-vlan device, resulting in much better
receive performance for the guest.
Full story:
One of our testers recently discovered that the performance of the
spapr-vlan device is very poor compared to other NICs, and that
a simple "ping -i 0.2 -s 65507 someip" in the guest can result
in more than 50% lost ping packets (especially with older guest
kernels < 3.17).
After doing some analysis, it was clear that there is a problem
with the way we handle the receive buffers in spapr_llan.c: The
ibmveth driver of the guest Linux kernel tries to add a lot of
buffers into several buffer pools (with 512, 2048 and 65536 byte
sizes by default, but it can be changed via the entries in the
/sys/devices/vio/1000/pool* directories of the guest). However,
the spapr-vlan device of QEMU only tries to squeeze all receive
buffer descriptors into one single page which has been supplied
by the guest during the H_REGISTER_LOGICAL_LAN call, without
taking care of different buffer sizes. This has two bad effects:
First, only a very limited number of buffer descriptors is accepted
at all. Second, we also hand 64k buffers to the guest even if
the 2k buffers would fit better - and this results in dropped packets
in the IP layer of the guest since too much skbuf memory is used.
Though it seems at a first glance like PAPR says that we should store
the receive buffer descriptors in the page that is supplied during
the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec
declares that "the contents of these descriptors are architecturally
opaque, none of these descriptors are manipulated by code above
the architected interfaces". That means we don't have to store
the RX buffer descriptors in this page, but can also manage the
receive buffers at the hypervisor level only. This is now what we
are doing here: Introducing proper RX buffer pools which are also
sorted by size of the buffers, so we can hand out a buffer with
the best fitting size when a packet has been received.
To avoid problems with migration from/to older version of QEMU,
the old behavior is also retained and enabled by default. The new
buffer management has to be enabled via a new "use-rx-buffer-pools"
property.
Now with the new buffer pool management enabled, the problem with
"ping -s 65507" is fixed for me, and the throughput of a simple
test with wget increases from creeping 3MB/s up to 20MB/s!
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-21 19:25:23 +03:00
|
|
|
|
|
|
|
if (dev->compat_flags & SPAPRVLAN_FLAG_RX_BUF_POOLS) {
|
|
|
|
for (i = 0; i < RX_MAX_POOLS; i++) {
|
|
|
|
spapr_vlan_reset_rx_pool(dev->rx_pool[i]);
|
|
|
|
}
|
|
|
|
}
|
2016-09-01 11:10:49 +03:00
|
|
|
|
|
|
|
memcpy(&dev->nicconf.macaddr.a, &dev->perm_mac.a,
|
|
|
|
sizeof(dev->nicconf.macaddr.a));
|
|
|
|
qemu_format_nic_info_str(qemu_get_queue(dev->nic), dev->nicconf.macaddr.a);
|
2012-04-12 06:44:15 +04:00
|
|
|
}
|
|
|
|
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
static void spapr_vlan_realize(SpaprVioDevice *sdev, Error **errp)
|
2011-04-01 08:15:29 +04:00
|
|
|
{
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
SpaprVioVlan *dev = VIO_SPAPR_VLAN_DEVICE(sdev);
|
2011-04-01 08:15:29 +04:00
|
|
|
|
|
|
|
qemu_macaddr_default_if_unset(&dev->nicconf.macaddr);
|
|
|
|
|
2016-09-01 11:10:49 +03:00
|
|
|
memcpy(&dev->perm_mac.a, &dev->nicconf.macaddr.a, sizeof(dev->perm_mac.a));
|
|
|
|
|
2011-04-01 08:15:29 +04:00
|
|
|
dev->nic = qemu_new_nic(&net_spapr_vlan_info, &dev->nicconf,
|
2011-12-04 21:17:51 +04:00
|
|
|
object_get_typename(OBJECT(sdev)), sdev->qdev.id, dev);
|
2013-01-30 15:12:22 +04:00
|
|
|
qemu_format_nic_info_str(qemu_get_queue(dev->nic), dev->nicconf.macaddr.a);
|
hw/net/spapr_llan: Delay flushing of the RX queue while adding new RX buffers
Currently, the spapr-vlan device is trying to flush the RX queue
after each RX buffer that has been added by the guest via the
H_ADD_LOGICAL_LAN_BUFFER hypercall. In case the receive buffer pool
was empty before, we only pass single packets to the guest this
way. This can cause very bad performance if a sender is trying
to stream fragmented UDP packets to the guest. For example when
using the UDP_STREAM test from netperf with UDP packets that are
much bigger than the MTU size, almost all UDP packets are dropped
in the guest since the chances are quite high that at least one of
the fragments got lost on the way.
When flushing the receive queue, it's much better if we'd have
a bunch of receive buffers available already, so that fragmented
packets can be passed to the guest in one go. To do this, the
spapr_vlan_receive() function should return 0 instead of -1 if there
are no more receive buffers available, so that receive_disabled = 1
gets temporarily set for the receive queue, and we have to delay
the queue flushing at the end of h_add_logical_lan_buffer() a little
bit by using a timer, so that the guest gets a chance to add multiple
RX buffers before we flush the queue again.
This improves the UDP_STREAM test with the spapr-vlan device a lot:
Running
netserver -p 44444 -L <guestip> -f -D -4
in the guest, and
netperf -p 44444 -L <hostip> -H <guestip> -t UDP_STREAM -l 60 -- -m 16384
in the host, I get the following values _without_ this patch:
Socket Message Elapsed Messages
Size Size Time Okay Errors Throughput
bytes bytes secs # # 10^6bits/sec
229376 16384 60.00 1738970 0 3798.83
229376 60.00 23 0.05
That "0.05" means that almost all UDP packets got lost/discarded
at the receiving side.
With this patch applied, the value look much better:
Socket Message Elapsed Messages
Size Size Time Okay Errors Throughput
bytes bytes secs # # 10^6bits/sec
229376 16384 60.00 1789104 0 3908.35
229376 60.00 22818 49.85
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-31 14:47:05 +03:00
|
|
|
|
|
|
|
dev->rxp_timer = timer_new_us(QEMU_CLOCK_VIRTUAL, spapr_vlan_flush_rx_queue,
|
|
|
|
dev);
|
2011-04-01 08:15:29 +04:00
|
|
|
}
|
|
|
|
|
2014-10-07 12:00:18 +04:00
|
|
|
static void spapr_vlan_instance_init(Object *obj)
|
|
|
|
{
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
SpaprVioVlan *dev = VIO_SPAPR_VLAN_DEVICE(obj);
|
hw/net/spapr_llan: Fix receive buffer handling for better performance
tl;dr:
This patch introduces an alternate way of handling the receive
buffers of the spapr-vlan device, resulting in much better
receive performance for the guest.
Full story:
One of our testers recently discovered that the performance of the
spapr-vlan device is very poor compared to other NICs, and that
a simple "ping -i 0.2 -s 65507 someip" in the guest can result
in more than 50% lost ping packets (especially with older guest
kernels < 3.17).
After doing some analysis, it was clear that there is a problem
with the way we handle the receive buffers in spapr_llan.c: The
ibmveth driver of the guest Linux kernel tries to add a lot of
buffers into several buffer pools (with 512, 2048 and 65536 byte
sizes by default, but it can be changed via the entries in the
/sys/devices/vio/1000/pool* directories of the guest). However,
the spapr-vlan device of QEMU only tries to squeeze all receive
buffer descriptors into one single page which has been supplied
by the guest during the H_REGISTER_LOGICAL_LAN call, without
taking care of different buffer sizes. This has two bad effects:
First, only a very limited number of buffer descriptors is accepted
at all. Second, we also hand 64k buffers to the guest even if
the 2k buffers would fit better - and this results in dropped packets
in the IP layer of the guest since too much skbuf memory is used.
Though it seems at a first glance like PAPR says that we should store
the receive buffer descriptors in the page that is supplied during
the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec
declares that "the contents of these descriptors are architecturally
opaque, none of these descriptors are manipulated by code above
the architected interfaces". That means we don't have to store
the RX buffer descriptors in this page, but can also manage the
receive buffers at the hypervisor level only. This is now what we
are doing here: Introducing proper RX buffer pools which are also
sorted by size of the buffers, so we can hand out a buffer with
the best fitting size when a packet has been received.
To avoid problems with migration from/to older version of QEMU,
the old behavior is also retained and enabled by default. The new
buffer management has to be enabled via a new "use-rx-buffer-pools"
property.
Now with the new buffer pool management enabled, the problem with
"ping -s 65507" is fixed for me, and the throughput of a simple
test with wget increases from creeping 3MB/s up to 20MB/s!
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-21 19:25:23 +03:00
|
|
|
int i;
|
2014-10-07 12:00:18 +04:00
|
|
|
|
|
|
|
device_add_bootindex_property(obj, &dev->nicconf.bootindex,
|
|
|
|
"bootindex", "",
|
2020-05-05 18:29:23 +03:00
|
|
|
DEVICE(dev));
|
hw/net/spapr_llan: Fix receive buffer handling for better performance
tl;dr:
This patch introduces an alternate way of handling the receive
buffers of the spapr-vlan device, resulting in much better
receive performance for the guest.
Full story:
One of our testers recently discovered that the performance of the
spapr-vlan device is very poor compared to other NICs, and that
a simple "ping -i 0.2 -s 65507 someip" in the guest can result
in more than 50% lost ping packets (especially with older guest
kernels < 3.17).
After doing some analysis, it was clear that there is a problem
with the way we handle the receive buffers in spapr_llan.c: The
ibmveth driver of the guest Linux kernel tries to add a lot of
buffers into several buffer pools (with 512, 2048 and 65536 byte
sizes by default, but it can be changed via the entries in the
/sys/devices/vio/1000/pool* directories of the guest). However,
the spapr-vlan device of QEMU only tries to squeeze all receive
buffer descriptors into one single page which has been supplied
by the guest during the H_REGISTER_LOGICAL_LAN call, without
taking care of different buffer sizes. This has two bad effects:
First, only a very limited number of buffer descriptors is accepted
at all. Second, we also hand 64k buffers to the guest even if
the 2k buffers would fit better - and this results in dropped packets
in the IP layer of the guest since too much skbuf memory is used.
Though it seems at a first glance like PAPR says that we should store
the receive buffer descriptors in the page that is supplied during
the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec
declares that "the contents of these descriptors are architecturally
opaque, none of these descriptors are manipulated by code above
the architected interfaces". That means we don't have to store
the RX buffer descriptors in this page, but can also manage the
receive buffers at the hypervisor level only. This is now what we
are doing here: Introducing proper RX buffer pools which are also
sorted by size of the buffers, so we can hand out a buffer with
the best fitting size when a packet has been received.
To avoid problems with migration from/to older version of QEMU,
the old behavior is also retained and enabled by default. The new
buffer management has to be enabled via a new "use-rx-buffer-pools"
property.
Now with the new buffer pool management enabled, the problem with
"ping -s 65507" is fixed for me, and the throughput of a simple
test with wget increases from creeping 3MB/s up to 20MB/s!
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-21 19:25:23 +03:00
|
|
|
|
|
|
|
if (dev->compat_flags & SPAPRVLAN_FLAG_RX_BUF_POOLS) {
|
|
|
|
for (i = 0; i < RX_MAX_POOLS; i++) {
|
|
|
|
dev->rx_pool[i] = g_new(RxBufPool, 1);
|
|
|
|
spapr_vlan_reset_rx_pool(dev->rx_pool[i]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void spapr_vlan_instance_finalize(Object *obj)
|
|
|
|
{
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
SpaprVioVlan *dev = VIO_SPAPR_VLAN_DEVICE(obj);
|
hw/net/spapr_llan: Fix receive buffer handling for better performance
tl;dr:
This patch introduces an alternate way of handling the receive
buffers of the spapr-vlan device, resulting in much better
receive performance for the guest.
Full story:
One of our testers recently discovered that the performance of the
spapr-vlan device is very poor compared to other NICs, and that
a simple "ping -i 0.2 -s 65507 someip" in the guest can result
in more than 50% lost ping packets (especially with older guest
kernels < 3.17).
After doing some analysis, it was clear that there is a problem
with the way we handle the receive buffers in spapr_llan.c: The
ibmveth driver of the guest Linux kernel tries to add a lot of
buffers into several buffer pools (with 512, 2048 and 65536 byte
sizes by default, but it can be changed via the entries in the
/sys/devices/vio/1000/pool* directories of the guest). However,
the spapr-vlan device of QEMU only tries to squeeze all receive
buffer descriptors into one single page which has been supplied
by the guest during the H_REGISTER_LOGICAL_LAN call, without
taking care of different buffer sizes. This has two bad effects:
First, only a very limited number of buffer descriptors is accepted
at all. Second, we also hand 64k buffers to the guest even if
the 2k buffers would fit better - and this results in dropped packets
in the IP layer of the guest since too much skbuf memory is used.
Though it seems at a first glance like PAPR says that we should store
the receive buffer descriptors in the page that is supplied during
the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec
declares that "the contents of these descriptors are architecturally
opaque, none of these descriptors are manipulated by code above
the architected interfaces". That means we don't have to store
the RX buffer descriptors in this page, but can also manage the
receive buffers at the hypervisor level only. This is now what we
are doing here: Introducing proper RX buffer pools which are also
sorted by size of the buffers, so we can hand out a buffer with
the best fitting size when a packet has been received.
To avoid problems with migration from/to older version of QEMU,
the old behavior is also retained and enabled by default. The new
buffer management has to be enabled via a new "use-rx-buffer-pools"
property.
Now with the new buffer pool management enabled, the problem with
"ping -s 65507" is fixed for me, and the throughput of a simple
test with wget increases from creeping 3MB/s up to 20MB/s!
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-21 19:25:23 +03:00
|
|
|
int i;
|
|
|
|
|
|
|
|
if (dev->compat_flags & SPAPRVLAN_FLAG_RX_BUF_POOLS) {
|
|
|
|
for (i = 0; i < RX_MAX_POOLS; i++) {
|
|
|
|
g_free(dev->rx_pool[i]);
|
|
|
|
dev->rx_pool[i] = NULL;
|
|
|
|
}
|
|
|
|
}
|
hw/net/spapr_llan: Delay flushing of the RX queue while adding new RX buffers
Currently, the spapr-vlan device is trying to flush the RX queue
after each RX buffer that has been added by the guest via the
H_ADD_LOGICAL_LAN_BUFFER hypercall. In case the receive buffer pool
was empty before, we only pass single packets to the guest this
way. This can cause very bad performance if a sender is trying
to stream fragmented UDP packets to the guest. For example when
using the UDP_STREAM test from netperf with UDP packets that are
much bigger than the MTU size, almost all UDP packets are dropped
in the guest since the chances are quite high that at least one of
the fragments got lost on the way.
When flushing the receive queue, it's much better if we'd have
a bunch of receive buffers available already, so that fragmented
packets can be passed to the guest in one go. To do this, the
spapr_vlan_receive() function should return 0 instead of -1 if there
are no more receive buffers available, so that receive_disabled = 1
gets temporarily set for the receive queue, and we have to delay
the queue flushing at the end of h_add_logical_lan_buffer() a little
bit by using a timer, so that the guest gets a chance to add multiple
RX buffers before we flush the queue again.
This improves the UDP_STREAM test with the spapr-vlan device a lot:
Running
netserver -p 44444 -L <guestip> -f -D -4
in the guest, and
netperf -p 44444 -L <hostip> -H <guestip> -t UDP_STREAM -l 60 -- -m 16384
in the host, I get the following values _without_ this patch:
Socket Message Elapsed Messages
Size Size Time Okay Errors Throughput
bytes bytes secs # # 10^6bits/sec
229376 16384 60.00 1738970 0 3798.83
229376 60.00 23 0.05
That "0.05" means that almost all UDP packets got lost/discarded
at the receiving side.
With this patch applied, the value look much better:
Socket Message Elapsed Messages
Size Size Time Okay Errors Throughput
bytes bytes secs # # 10^6bits/sec
229376 16384 60.00 1789104 0 3908.35
229376 60.00 22818 49.85
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-31 14:47:05 +03:00
|
|
|
|
|
|
|
if (dev->rxp_timer) {
|
|
|
|
timer_free(dev->rxp_timer);
|
|
|
|
}
|
2014-10-07 12:00:18 +04:00
|
|
|
}
|
|
|
|
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
void spapr_vlan_create(SpaprVioBus *bus, NICInfo *nd)
|
2011-04-01 08:15:29 +04:00
|
|
|
{
|
|
|
|
DeviceState *dev;
|
|
|
|
|
qdev: Convert uses of qdev_create() with Coccinelle
This is the transformation explained in the commit before previous.
Takes care of just one pattern that needs conversion. More to come in
this series.
Coccinelle script:
@ depends on !(file in "hw/arm/highbank.c")@
expression bus, type_name, dev, expr;
@@
- dev = qdev_create(bus, type_name);
+ dev = qdev_new(type_name);
... when != dev = expr
- qdev_init_nofail(dev);
+ qdev_realize_and_unref(dev, bus, &error_fatal);
@@
expression bus, type_name, dev, expr;
identifier DOWN;
@@
- dev = DOWN(qdev_create(bus, type_name));
+ dev = DOWN(qdev_new(type_name));
... when != dev = expr
- qdev_init_nofail(DEVICE(dev));
+ qdev_realize_and_unref(DEVICE(dev), bus, &error_fatal);
@@
expression bus, type_name, expr;
identifier dev;
@@
- DeviceState *dev = qdev_create(bus, type_name);
+ DeviceState *dev = qdev_new(type_name);
... when != dev = expr
- qdev_init_nofail(dev);
+ qdev_realize_and_unref(dev, bus, &error_fatal);
@@
expression bus, type_name, dev, expr, errp;
symbol true;
@@
- dev = qdev_create(bus, type_name);
+ dev = qdev_new(type_name);
... when != dev = expr
- object_property_set_bool(OBJECT(dev), true, "realized", errp);
+ qdev_realize_and_unref(dev, bus, errp);
@@
expression bus, type_name, expr, errp;
identifier dev;
symbol true;
@@
- DeviceState *dev = qdev_create(bus, type_name);
+ DeviceState *dev = qdev_new(type_name);
... when != dev = expr
- object_property_set_bool(OBJECT(dev), true, "realized", errp);
+ qdev_realize_and_unref(dev, bus, errp);
The first rule exempts hw/arm/highbank.c, because it matches along two
control flow paths there, with different @type_name. Covered by the
next commit's manual conversions.
Missing #include "qapi/error.h" added manually.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20200610053247.1583243-10-armbru@redhat.com>
[Conflicts in hw/misc/empty_slot.c and hw/sparc/leon3.c resolved]
2020-06-10 08:31:58 +03:00
|
|
|
dev = qdev_new("spapr-vlan");
|
2011-04-01 08:15:29 +04:00
|
|
|
|
|
|
|
qdev_set_nic_properties(dev, nd);
|
|
|
|
|
qdev: Convert uses of qdev_create() with Coccinelle
This is the transformation explained in the commit before previous.
Takes care of just one pattern that needs conversion. More to come in
this series.
Coccinelle script:
@ depends on !(file in "hw/arm/highbank.c")@
expression bus, type_name, dev, expr;
@@
- dev = qdev_create(bus, type_name);
+ dev = qdev_new(type_name);
... when != dev = expr
- qdev_init_nofail(dev);
+ qdev_realize_and_unref(dev, bus, &error_fatal);
@@
expression bus, type_name, dev, expr;
identifier DOWN;
@@
- dev = DOWN(qdev_create(bus, type_name));
+ dev = DOWN(qdev_new(type_name));
... when != dev = expr
- qdev_init_nofail(DEVICE(dev));
+ qdev_realize_and_unref(DEVICE(dev), bus, &error_fatal);
@@
expression bus, type_name, expr;
identifier dev;
@@
- DeviceState *dev = qdev_create(bus, type_name);
+ DeviceState *dev = qdev_new(type_name);
... when != dev = expr
- qdev_init_nofail(dev);
+ qdev_realize_and_unref(dev, bus, &error_fatal);
@@
expression bus, type_name, dev, expr, errp;
symbol true;
@@
- dev = qdev_create(bus, type_name);
+ dev = qdev_new(type_name);
... when != dev = expr
- object_property_set_bool(OBJECT(dev), true, "realized", errp);
+ qdev_realize_and_unref(dev, bus, errp);
@@
expression bus, type_name, expr, errp;
identifier dev;
symbol true;
@@
- DeviceState *dev = qdev_create(bus, type_name);
+ DeviceState *dev = qdev_new(type_name);
... when != dev = expr
- object_property_set_bool(OBJECT(dev), true, "realized", errp);
+ qdev_realize_and_unref(dev, bus, errp);
The first rule exempts hw/arm/highbank.c, because it matches along two
control flow paths there, with different @type_name. Covered by the
next commit's manual conversions.
Missing #include "qapi/error.h" added manually.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20200610053247.1583243-10-armbru@redhat.com>
[Conflicts in hw/misc/empty_slot.c and hw/sparc/leon3.c resolved]
2020-06-10 08:31:58 +03:00
|
|
|
qdev_realize_and_unref(dev, &bus->bus, &error_fatal);
|
2011-04-01 08:15:29 +04:00
|
|
|
}
|
|
|
|
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
static int spapr_vlan_devnode(SpaprVioDevice *dev, void *fdt, int node_off)
|
2011-04-01 08:15:29 +04:00
|
|
|
{
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
SpaprVioVlan *vdev = VIO_SPAPR_VLAN_DEVICE(dev);
|
2011-04-01 08:15:29 +04:00
|
|
|
uint8_t padded_mac[8] = {0, 0};
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
/* Some old phyp versions give the mac address in an 8-byte
|
2016-11-22 02:19:38 +03:00
|
|
|
* property. The kernel driver (before 3.10) has an insane workaround;
|
2011-04-01 08:15:29 +04:00
|
|
|
* rather than doing the obvious thing and checking the property
|
|
|
|
* length, it checks whether the first byte has 0b10 in the low
|
|
|
|
* bits. If a correct 6-byte property has a different first byte
|
|
|
|
* the kernel will get the wrong mac address, overrunning its
|
|
|
|
* buffer in the process (read only, thank goodness).
|
|
|
|
*
|
2016-11-22 02:19:38 +03:00
|
|
|
* Here we return a 6-byte address unless that would break a pre-3.10
|
|
|
|
* driver. In that case we return a padded 8-byte address to allow the old
|
|
|
|
* workaround to succeed. */
|
|
|
|
if ((vdev->nicconf.macaddr.a[0] & 0x3) == 0x2) {
|
|
|
|
ret = fdt_setprop(fdt, node_off, "local-mac-address",
|
|
|
|
&vdev->nicconf.macaddr, ETH_ALEN);
|
|
|
|
} else {
|
|
|
|
memcpy(&padded_mac[2], &vdev->nicconf.macaddr, ETH_ALEN);
|
|
|
|
ret = fdt_setprop(fdt, node_off, "local-mac-address",
|
|
|
|
padded_mac, sizeof(padded_mac));
|
|
|
|
}
|
2011-04-01 08:15:29 +04:00
|
|
|
if (ret < 0) {
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = fdt_setprop_cell(fdt, node_off, "ibm,mac-address-filters", 0);
|
|
|
|
if (ret < 0) {
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
static int check_bd(SpaprVioVlan *dev, vlan_bd_t bd,
|
2011-04-01 08:15:29 +04:00
|
|
|
target_ulong alignment)
|
|
|
|
{
|
|
|
|
if ((VLAN_BD_ADDR(bd) % alignment)
|
|
|
|
|| (VLAN_BD_LEN(bd) % alignment)) {
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
2012-06-27 08:50:44 +04:00
|
|
|
if (!spapr_vio_dma_valid(&dev->sdev, VLAN_BD_ADDR(bd),
|
|
|
|
VLAN_BD_LEN(bd), DMA_DIRECTION_FROM_DEVICE)
|
|
|
|
|| !spapr_vio_dma_valid(&dev->sdev, VLAN_BD_ADDR(bd),
|
|
|
|
VLAN_BD_LEN(bd), DMA_DIRECTION_TO_DEVICE)) {
|
2011-04-01 08:15:29 +04:00
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2012-05-03 08:23:01 +04:00
|
|
|
static target_ulong h_register_logical_lan(PowerPCCPU *cpu,
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
SpaprMachineState *spapr,
|
2011-04-01 08:15:29 +04:00
|
|
|
target_ulong opcode,
|
|
|
|
target_ulong *args)
|
|
|
|
{
|
|
|
|
target_ulong reg = args[0];
|
|
|
|
target_ulong buf_list = args[1];
|
|
|
|
target_ulong rec_queue = args[2];
|
|
|
|
target_ulong filter_list = args[3];
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
SpaprVioDevice *sdev = spapr_vio_find_by_reg(spapr->vio_bus, reg);
|
|
|
|
SpaprVioVlan *dev = VIO_SPAPR_VLAN_DEVICE(sdev);
|
2011-04-01 08:15:29 +04:00
|
|
|
vlan_bd_t filter_list_bd;
|
|
|
|
|
|
|
|
if (!dev) {
|
|
|
|
return H_PARAMETER;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (dev->isopen) {
|
|
|
|
hcall_dprintf("H_REGISTER_LOGICAL_LAN called twice without "
|
|
|
|
"H_FREE_LOGICAL_LAN\n");
|
|
|
|
return H_RESOURCE;
|
|
|
|
}
|
|
|
|
|
2012-06-27 08:50:44 +04:00
|
|
|
if (check_bd(dev, VLAN_VALID_BD(buf_list, SPAPR_TCE_PAGE_SIZE),
|
|
|
|
SPAPR_TCE_PAGE_SIZE) < 0) {
|
2012-03-29 01:39:45 +04:00
|
|
|
hcall_dprintf("Bad buf_list 0x" TARGET_FMT_lx "\n", buf_list);
|
2011-04-01 08:15:29 +04:00
|
|
|
return H_PARAMETER;
|
|
|
|
}
|
|
|
|
|
2012-06-27 08:50:44 +04:00
|
|
|
filter_list_bd = VLAN_VALID_BD(filter_list, SPAPR_TCE_PAGE_SIZE);
|
|
|
|
if (check_bd(dev, filter_list_bd, SPAPR_TCE_PAGE_SIZE) < 0) {
|
2012-03-29 01:39:45 +04:00
|
|
|
hcall_dprintf("Bad filter_list 0x" TARGET_FMT_lx "\n", filter_list);
|
2011-04-01 08:15:29 +04:00
|
|
|
return H_PARAMETER;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!(rec_queue & VLAN_BD_VALID)
|
|
|
|
|| (check_bd(dev, rec_queue, VLAN_RQ_ALIGNMENT) < 0)) {
|
2012-03-29 01:39:45 +04:00
|
|
|
hcall_dprintf("Bad receive queue\n");
|
2011-04-01 08:15:29 +04:00
|
|
|
return H_PARAMETER;
|
|
|
|
}
|
|
|
|
|
|
|
|
dev->buf_list = buf_list;
|
|
|
|
sdev->signal_state = 0;
|
|
|
|
|
|
|
|
rec_queue &= ~VLAN_BD_TOGGLE;
|
|
|
|
|
|
|
|
/* Initialize the buffer list */
|
2012-06-27 08:50:44 +04:00
|
|
|
vio_stq(sdev, buf_list, rec_queue);
|
|
|
|
vio_stq(sdev, buf_list + 8, filter_list_bd);
|
|
|
|
spapr_vio_dma_set(sdev, buf_list + VLAN_RX_BDS_OFF, 0,
|
|
|
|
SPAPR_TCE_PAGE_SIZE - VLAN_RX_BDS_OFF);
|
2011-04-01 08:15:29 +04:00
|
|
|
dev->add_buf_ptr = VLAN_RX_BDS_OFF - 8;
|
|
|
|
dev->use_buf_ptr = VLAN_RX_BDS_OFF - 8;
|
|
|
|
dev->rx_bufs = 0;
|
|
|
|
dev->rxq_ptr = 0;
|
|
|
|
|
|
|
|
/* Initialize the receive queue */
|
2012-06-27 08:50:44 +04:00
|
|
|
spapr_vio_dma_set(sdev, VLAN_BD_ADDR(rec_queue), 0, VLAN_BD_LEN(rec_queue));
|
2011-04-01 08:15:29 +04:00
|
|
|
|
|
|
|
dev->isopen = 1;
|
2013-05-03 00:22:03 +04:00
|
|
|
qemu_flush_queued_packets(qemu_get_queue(dev->nic));
|
|
|
|
|
2011-04-01 08:15:29 +04:00
|
|
|
return H_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2015-07-02 09:23:04 +03:00
|
|
|
static target_ulong h_free_logical_lan(PowerPCCPU *cpu,
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
SpaprMachineState *spapr,
|
2011-04-01 08:15:29 +04:00
|
|
|
target_ulong opcode, target_ulong *args)
|
|
|
|
{
|
|
|
|
target_ulong reg = args[0];
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
SpaprVioDevice *sdev = spapr_vio_find_by_reg(spapr->vio_bus, reg);
|
|
|
|
SpaprVioVlan *dev = VIO_SPAPR_VLAN_DEVICE(sdev);
|
2011-04-01 08:15:29 +04:00
|
|
|
|
|
|
|
if (!dev) {
|
|
|
|
return H_PARAMETER;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!dev->isopen) {
|
|
|
|
hcall_dprintf("H_FREE_LOGICAL_LAN called without "
|
|
|
|
"H_REGISTER_LOGICAL_LAN\n");
|
|
|
|
return H_RESOURCE;
|
|
|
|
}
|
|
|
|
|
2012-04-12 06:44:15 +04:00
|
|
|
spapr_vlan_reset(sdev);
|
2011-04-01 08:15:29 +04:00
|
|
|
return H_SUCCESS;
|
|
|
|
}
|
|
|
|
|
hw/net/spapr_llan: Fix receive buffer handling for better performance
tl;dr:
This patch introduces an alternate way of handling the receive
buffers of the spapr-vlan device, resulting in much better
receive performance for the guest.
Full story:
One of our testers recently discovered that the performance of the
spapr-vlan device is very poor compared to other NICs, and that
a simple "ping -i 0.2 -s 65507 someip" in the guest can result
in more than 50% lost ping packets (especially with older guest
kernels < 3.17).
After doing some analysis, it was clear that there is a problem
with the way we handle the receive buffers in spapr_llan.c: The
ibmveth driver of the guest Linux kernel tries to add a lot of
buffers into several buffer pools (with 512, 2048 and 65536 byte
sizes by default, but it can be changed via the entries in the
/sys/devices/vio/1000/pool* directories of the guest). However,
the spapr-vlan device of QEMU only tries to squeeze all receive
buffer descriptors into one single page which has been supplied
by the guest during the H_REGISTER_LOGICAL_LAN call, without
taking care of different buffer sizes. This has two bad effects:
First, only a very limited number of buffer descriptors is accepted
at all. Second, we also hand 64k buffers to the guest even if
the 2k buffers would fit better - and this results in dropped packets
in the IP layer of the guest since too much skbuf memory is used.
Though it seems at a first glance like PAPR says that we should store
the receive buffer descriptors in the page that is supplied during
the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec
declares that "the contents of these descriptors are architecturally
opaque, none of these descriptors are manipulated by code above
the architected interfaces". That means we don't have to store
the RX buffer descriptors in this page, but can also manage the
receive buffers at the hypervisor level only. This is now what we
are doing here: Introducing proper RX buffer pools which are also
sorted by size of the buffers, so we can hand out a buffer with
the best fitting size when a packet has been received.
To avoid problems with migration from/to older version of QEMU,
the old behavior is also retained and enabled by default. The new
buffer management has to be enabled via a new "use-rx-buffer-pools"
property.
Now with the new buffer pool management enabled, the problem with
"ping -s 65507" is fixed for me, and the throughput of a simple
test with wget increases from creeping 3MB/s up to 20MB/s!
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-21 19:25:23 +03:00
|
|
|
/**
|
|
|
|
* Used for qsort, this function compares two RxBufPools by size.
|
|
|
|
*/
|
|
|
|
static int rx_pool_size_compare(const void *p1, const void *p2)
|
|
|
|
{
|
|
|
|
const RxBufPool *pool1 = *(RxBufPool **)p1;
|
|
|
|
const RxBufPool *pool2 = *(RxBufPool **)p2;
|
|
|
|
|
|
|
|
if (pool1->bufsize < pool2->bufsize) {
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
return pool1->bufsize > pool2->bufsize;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Search for a matching buffer pool with exact matching size,
|
|
|
|
* or return -1 if no matching pool has been found.
|
|
|
|
*/
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
static int spapr_vlan_get_rx_pool_id(SpaprVioVlan *dev, int size)
|
hw/net/spapr_llan: Fix receive buffer handling for better performance
tl;dr:
This patch introduces an alternate way of handling the receive
buffers of the spapr-vlan device, resulting in much better
receive performance for the guest.
Full story:
One of our testers recently discovered that the performance of the
spapr-vlan device is very poor compared to other NICs, and that
a simple "ping -i 0.2 -s 65507 someip" in the guest can result
in more than 50% lost ping packets (especially with older guest
kernels < 3.17).
After doing some analysis, it was clear that there is a problem
with the way we handle the receive buffers in spapr_llan.c: The
ibmveth driver of the guest Linux kernel tries to add a lot of
buffers into several buffer pools (with 512, 2048 and 65536 byte
sizes by default, but it can be changed via the entries in the
/sys/devices/vio/1000/pool* directories of the guest). However,
the spapr-vlan device of QEMU only tries to squeeze all receive
buffer descriptors into one single page which has been supplied
by the guest during the H_REGISTER_LOGICAL_LAN call, without
taking care of different buffer sizes. This has two bad effects:
First, only a very limited number of buffer descriptors is accepted
at all. Second, we also hand 64k buffers to the guest even if
the 2k buffers would fit better - and this results in dropped packets
in the IP layer of the guest since too much skbuf memory is used.
Though it seems at a first glance like PAPR says that we should store
the receive buffer descriptors in the page that is supplied during
the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec
declares that "the contents of these descriptors are architecturally
opaque, none of these descriptors are manipulated by code above
the architected interfaces". That means we don't have to store
the RX buffer descriptors in this page, but can also manage the
receive buffers at the hypervisor level only. This is now what we
are doing here: Introducing proper RX buffer pools which are also
sorted by size of the buffers, so we can hand out a buffer with
the best fitting size when a packet has been received.
To avoid problems with migration from/to older version of QEMU,
the old behavior is also retained and enabled by default. The new
buffer management has to be enabled via a new "use-rx-buffer-pools"
property.
Now with the new buffer pool management enabled, the problem with
"ping -s 65507" is fixed for me, and the throughput of a simple
test with wget increases from creeping 3MB/s up to 20MB/s!
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-21 19:25:23 +03:00
|
|
|
{
|
|
|
|
int pool;
|
|
|
|
|
|
|
|
for (pool = 0; pool < RX_MAX_POOLS; pool++) {
|
|
|
|
if (dev->rx_pool[pool]->bufsize == size) {
|
|
|
|
return pool;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Enqueuing receive buffer by adding it to one of our receive buffer pools
|
|
|
|
*/
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
static target_long spapr_vlan_add_rxbuf_to_pool(SpaprVioVlan *dev,
|
hw/net/spapr_llan: Fix receive buffer handling for better performance
tl;dr:
This patch introduces an alternate way of handling the receive
buffers of the spapr-vlan device, resulting in much better
receive performance for the guest.
Full story:
One of our testers recently discovered that the performance of the
spapr-vlan device is very poor compared to other NICs, and that
a simple "ping -i 0.2 -s 65507 someip" in the guest can result
in more than 50% lost ping packets (especially with older guest
kernels < 3.17).
After doing some analysis, it was clear that there is a problem
with the way we handle the receive buffers in spapr_llan.c: The
ibmveth driver of the guest Linux kernel tries to add a lot of
buffers into several buffer pools (with 512, 2048 and 65536 byte
sizes by default, but it can be changed via the entries in the
/sys/devices/vio/1000/pool* directories of the guest). However,
the spapr-vlan device of QEMU only tries to squeeze all receive
buffer descriptors into one single page which has been supplied
by the guest during the H_REGISTER_LOGICAL_LAN call, without
taking care of different buffer sizes. This has two bad effects:
First, only a very limited number of buffer descriptors is accepted
at all. Second, we also hand 64k buffers to the guest even if
the 2k buffers would fit better - and this results in dropped packets
in the IP layer of the guest since too much skbuf memory is used.
Though it seems at a first glance like PAPR says that we should store
the receive buffer descriptors in the page that is supplied during
the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec
declares that "the contents of these descriptors are architecturally
opaque, none of these descriptors are manipulated by code above
the architected interfaces". That means we don't have to store
the RX buffer descriptors in this page, but can also manage the
receive buffers at the hypervisor level only. This is now what we
are doing here: Introducing proper RX buffer pools which are also
sorted by size of the buffers, so we can hand out a buffer with
the best fitting size when a packet has been received.
To avoid problems with migration from/to older version of QEMU,
the old behavior is also retained and enabled by default. The new
buffer management has to be enabled via a new "use-rx-buffer-pools"
property.
Now with the new buffer pool management enabled, the problem with
"ping -s 65507" is fixed for me, and the throughput of a simple
test with wget increases from creeping 3MB/s up to 20MB/s!
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-21 19:25:23 +03:00
|
|
|
target_ulong buf)
|
|
|
|
{
|
|
|
|
int size = VLAN_BD_LEN(buf);
|
|
|
|
int pool;
|
|
|
|
|
|
|
|
pool = spapr_vlan_get_rx_pool_id(dev, size);
|
|
|
|
if (pool < 0) {
|
|
|
|
/*
|
|
|
|
* No matching pool found? Try to use a new one. If the guest used all
|
2016-11-19 22:29:26 +03:00
|
|
|
* pools before, but changed the size of one pool in the meantime, we might
|
hw/net/spapr_llan: Fix receive buffer handling for better performance
tl;dr:
This patch introduces an alternate way of handling the receive
buffers of the spapr-vlan device, resulting in much better
receive performance for the guest.
Full story:
One of our testers recently discovered that the performance of the
spapr-vlan device is very poor compared to other NICs, and that
a simple "ping -i 0.2 -s 65507 someip" in the guest can result
in more than 50% lost ping packets (especially with older guest
kernels < 3.17).
After doing some analysis, it was clear that there is a problem
with the way we handle the receive buffers in spapr_llan.c: The
ibmveth driver of the guest Linux kernel tries to add a lot of
buffers into several buffer pools (with 512, 2048 and 65536 byte
sizes by default, but it can be changed via the entries in the
/sys/devices/vio/1000/pool* directories of the guest). However,
the spapr-vlan device of QEMU only tries to squeeze all receive
buffer descriptors into one single page which has been supplied
by the guest during the H_REGISTER_LOGICAL_LAN call, without
taking care of different buffer sizes. This has two bad effects:
First, only a very limited number of buffer descriptors is accepted
at all. Second, we also hand 64k buffers to the guest even if
the 2k buffers would fit better - and this results in dropped packets
in the IP layer of the guest since too much skbuf memory is used.
Though it seems at a first glance like PAPR says that we should store
the receive buffer descriptors in the page that is supplied during
the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec
declares that "the contents of these descriptors are architecturally
opaque, none of these descriptors are manipulated by code above
the architected interfaces". That means we don't have to store
the RX buffer descriptors in this page, but can also manage the
receive buffers at the hypervisor level only. This is now what we
are doing here: Introducing proper RX buffer pools which are also
sorted by size of the buffers, so we can hand out a buffer with
the best fitting size when a packet has been received.
To avoid problems with migration from/to older version of QEMU,
the old behavior is also retained and enabled by default. The new
buffer management has to be enabled via a new "use-rx-buffer-pools"
property.
Now with the new buffer pool management enabled, the problem with
"ping -s 65507" is fixed for me, and the throughput of a simple
test with wget increases from creeping 3MB/s up to 20MB/s!
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-21 19:25:23 +03:00
|
|
|
* need to recycle that pool here (if it's empty already). Thus scan
|
|
|
|
* all buffer pools now, starting with the last (likely empty) one.
|
|
|
|
*/
|
|
|
|
for (pool = RX_MAX_POOLS - 1; pool >= 0 ; pool--) {
|
|
|
|
if (dev->rx_pool[pool]->count == 0) {
|
|
|
|
dev->rx_pool[pool]->bufsize = size;
|
|
|
|
/*
|
|
|
|
* Sort pools by size so that spapr_vlan_receive()
|
|
|
|
* can later find the smallest buffer pool easily.
|
|
|
|
*/
|
|
|
|
qsort(dev->rx_pool, RX_MAX_POOLS, sizeof(dev->rx_pool[0]),
|
|
|
|
rx_pool_size_compare);
|
|
|
|
pool = spapr_vlan_get_rx_pool_id(dev, size);
|
2016-09-14 21:48:26 +03:00
|
|
|
trace_spapr_vlan_add_rxbuf_to_pool_create(pool,
|
|
|
|
VLAN_BD_LEN(buf));
|
hw/net/spapr_llan: Fix receive buffer handling for better performance
tl;dr:
This patch introduces an alternate way of handling the receive
buffers of the spapr-vlan device, resulting in much better
receive performance for the guest.
Full story:
One of our testers recently discovered that the performance of the
spapr-vlan device is very poor compared to other NICs, and that
a simple "ping -i 0.2 -s 65507 someip" in the guest can result
in more than 50% lost ping packets (especially with older guest
kernels < 3.17).
After doing some analysis, it was clear that there is a problem
with the way we handle the receive buffers in spapr_llan.c: The
ibmveth driver of the guest Linux kernel tries to add a lot of
buffers into several buffer pools (with 512, 2048 and 65536 byte
sizes by default, but it can be changed via the entries in the
/sys/devices/vio/1000/pool* directories of the guest). However,
the spapr-vlan device of QEMU only tries to squeeze all receive
buffer descriptors into one single page which has been supplied
by the guest during the H_REGISTER_LOGICAL_LAN call, without
taking care of different buffer sizes. This has two bad effects:
First, only a very limited number of buffer descriptors is accepted
at all. Second, we also hand 64k buffers to the guest even if
the 2k buffers would fit better - and this results in dropped packets
in the IP layer of the guest since too much skbuf memory is used.
Though it seems at a first glance like PAPR says that we should store
the receive buffer descriptors in the page that is supplied during
the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec
declares that "the contents of these descriptors are architecturally
opaque, none of these descriptors are manipulated by code above
the architected interfaces". That means we don't have to store
the RX buffer descriptors in this page, but can also manage the
receive buffers at the hypervisor level only. This is now what we
are doing here: Introducing proper RX buffer pools which are also
sorted by size of the buffers, so we can hand out a buffer with
the best fitting size when a packet has been received.
To avoid problems with migration from/to older version of QEMU,
the old behavior is also retained and enabled by default. The new
buffer management has to be enabled via a new "use-rx-buffer-pools"
property.
Now with the new buffer pool management enabled, the problem with
"ping -s 65507" is fixed for me, and the throughput of a simple
test with wget increases from creeping 3MB/s up to 20MB/s!
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-21 19:25:23 +03:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/* Still no usable pool? Give up */
|
|
|
|
if (pool < 0 || dev->rx_pool[pool]->count >= RX_POOL_MAX_BDS) {
|
|
|
|
return H_RESOURCE;
|
|
|
|
}
|
|
|
|
|
2016-09-14 21:48:26 +03:00
|
|
|
trace_spapr_vlan_add_rxbuf_to_pool(pool, VLAN_BD_LEN(buf),
|
|
|
|
dev->rx_pool[pool]->count);
|
hw/net/spapr_llan: Fix receive buffer handling for better performance
tl;dr:
This patch introduces an alternate way of handling the receive
buffers of the spapr-vlan device, resulting in much better
receive performance for the guest.
Full story:
One of our testers recently discovered that the performance of the
spapr-vlan device is very poor compared to other NICs, and that
a simple "ping -i 0.2 -s 65507 someip" in the guest can result
in more than 50% lost ping packets (especially with older guest
kernels < 3.17).
After doing some analysis, it was clear that there is a problem
with the way we handle the receive buffers in spapr_llan.c: The
ibmveth driver of the guest Linux kernel tries to add a lot of
buffers into several buffer pools (with 512, 2048 and 65536 byte
sizes by default, but it can be changed via the entries in the
/sys/devices/vio/1000/pool* directories of the guest). However,
the spapr-vlan device of QEMU only tries to squeeze all receive
buffer descriptors into one single page which has been supplied
by the guest during the H_REGISTER_LOGICAL_LAN call, without
taking care of different buffer sizes. This has two bad effects:
First, only a very limited number of buffer descriptors is accepted
at all. Second, we also hand 64k buffers to the guest even if
the 2k buffers would fit better - and this results in dropped packets
in the IP layer of the guest since too much skbuf memory is used.
Though it seems at a first glance like PAPR says that we should store
the receive buffer descriptors in the page that is supplied during
the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec
declares that "the contents of these descriptors are architecturally
opaque, none of these descriptors are manipulated by code above
the architected interfaces". That means we don't have to store
the RX buffer descriptors in this page, but can also manage the
receive buffers at the hypervisor level only. This is now what we
are doing here: Introducing proper RX buffer pools which are also
sorted by size of the buffers, so we can hand out a buffer with
the best fitting size when a packet has been received.
To avoid problems with migration from/to older version of QEMU,
the old behavior is also retained and enabled by default. The new
buffer management has to be enabled via a new "use-rx-buffer-pools"
property.
Now with the new buffer pool management enabled, the problem with
"ping -s 65507" is fixed for me, and the throughput of a simple
test with wget increases from creeping 3MB/s up to 20MB/s!
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-21 19:25:23 +03:00
|
|
|
|
|
|
|
dev->rx_pool[pool]->bds[dev->rx_pool[pool]->count++] = buf;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* This is the old way of enqueuing receive buffers: Add it to the rx queue
|
|
|
|
* page that has been supplied by the guest (which is quite limited in size).
|
|
|
|
*/
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
static target_long spapr_vlan_add_rxbuf_to_page(SpaprVioVlan *dev,
|
2016-03-21 19:25:22 +03:00
|
|
|
target_ulong buf)
|
|
|
|
{
|
|
|
|
vlan_bd_t bd;
|
|
|
|
|
|
|
|
if (dev->rx_bufs >= VLAN_MAX_BUFS) {
|
|
|
|
return H_RESOURCE;
|
|
|
|
}
|
|
|
|
|
|
|
|
do {
|
|
|
|
dev->add_buf_ptr += 8;
|
|
|
|
if (dev->add_buf_ptr >= VLAN_RX_BDS_LEN + VLAN_RX_BDS_OFF) {
|
|
|
|
dev->add_buf_ptr = VLAN_RX_BDS_OFF;
|
|
|
|
}
|
|
|
|
|
|
|
|
bd = vio_ldq(&dev->sdev, dev->buf_list + dev->add_buf_ptr);
|
|
|
|
} while (bd & VLAN_BD_VALID);
|
|
|
|
|
|
|
|
vio_stq(&dev->sdev, dev->buf_list + dev->add_buf_ptr, buf);
|
|
|
|
|
2016-09-14 21:48:26 +03:00
|
|
|
trace_spapr_vlan_add_rxbuf_to_page(dev->add_buf_ptr, dev->rx_bufs, buf);
|
2016-03-21 19:25:22 +03:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2012-05-03 08:23:01 +04:00
|
|
|
static target_ulong h_add_logical_lan_buffer(PowerPCCPU *cpu,
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
SpaprMachineState *spapr,
|
2011-04-01 08:15:29 +04:00
|
|
|
target_ulong opcode,
|
|
|
|
target_ulong *args)
|
|
|
|
{
|
|
|
|
target_ulong reg = args[0];
|
|
|
|
target_ulong buf = args[1];
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
SpaprVioDevice *sdev = spapr_vio_find_by_reg(spapr->vio_bus, reg);
|
|
|
|
SpaprVioVlan *dev = VIO_SPAPR_VLAN_DEVICE(sdev);
|
2016-03-21 19:25:22 +03:00
|
|
|
target_long ret;
|
2011-04-01 08:15:29 +04:00
|
|
|
|
2016-09-14 21:48:26 +03:00
|
|
|
trace_spapr_vlan_h_add_logical_lan_buffer(reg, buf);
|
2011-04-01 08:15:29 +04:00
|
|
|
|
|
|
|
if (!sdev) {
|
2012-03-29 01:39:45 +04:00
|
|
|
hcall_dprintf("Bad device\n");
|
2011-04-01 08:15:29 +04:00
|
|
|
return H_PARAMETER;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((check_bd(dev, buf, 4) < 0)
|
|
|
|
|| (VLAN_BD_LEN(buf) < 16)) {
|
2012-03-29 01:39:45 +04:00
|
|
|
hcall_dprintf("Bad buffer enqueued\n");
|
2011-04-01 08:15:29 +04:00
|
|
|
return H_PARAMETER;
|
|
|
|
}
|
|
|
|
|
2016-03-21 19:25:22 +03:00
|
|
|
if (!dev->isopen) {
|
2011-04-01 08:15:29 +04:00
|
|
|
return H_RESOURCE;
|
|
|
|
}
|
|
|
|
|
hw/net/spapr_llan: Fix receive buffer handling for better performance
tl;dr:
This patch introduces an alternate way of handling the receive
buffers of the spapr-vlan device, resulting in much better
receive performance for the guest.
Full story:
One of our testers recently discovered that the performance of the
spapr-vlan device is very poor compared to other NICs, and that
a simple "ping -i 0.2 -s 65507 someip" in the guest can result
in more than 50% lost ping packets (especially with older guest
kernels < 3.17).
After doing some analysis, it was clear that there is a problem
with the way we handle the receive buffers in spapr_llan.c: The
ibmveth driver of the guest Linux kernel tries to add a lot of
buffers into several buffer pools (with 512, 2048 and 65536 byte
sizes by default, but it can be changed via the entries in the
/sys/devices/vio/1000/pool* directories of the guest). However,
the spapr-vlan device of QEMU only tries to squeeze all receive
buffer descriptors into one single page which has been supplied
by the guest during the H_REGISTER_LOGICAL_LAN call, without
taking care of different buffer sizes. This has two bad effects:
First, only a very limited number of buffer descriptors is accepted
at all. Second, we also hand 64k buffers to the guest even if
the 2k buffers would fit better - and this results in dropped packets
in the IP layer of the guest since too much skbuf memory is used.
Though it seems at a first glance like PAPR says that we should store
the receive buffer descriptors in the page that is supplied during
the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec
declares that "the contents of these descriptors are architecturally
opaque, none of these descriptors are manipulated by code above
the architected interfaces". That means we don't have to store
the RX buffer descriptors in this page, but can also manage the
receive buffers at the hypervisor level only. This is now what we
are doing here: Introducing proper RX buffer pools which are also
sorted by size of the buffers, so we can hand out a buffer with
the best fitting size when a packet has been received.
To avoid problems with migration from/to older version of QEMU,
the old behavior is also retained and enabled by default. The new
buffer management has to be enabled via a new "use-rx-buffer-pools"
property.
Now with the new buffer pool management enabled, the problem with
"ping -s 65507" is fixed for me, and the throughput of a simple
test with wget increases from creeping 3MB/s up to 20MB/s!
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-21 19:25:23 +03:00
|
|
|
if (dev->compat_flags & SPAPRVLAN_FLAG_RX_BUF_POOLS) {
|
|
|
|
ret = spapr_vlan_add_rxbuf_to_pool(dev, buf);
|
|
|
|
} else {
|
|
|
|
ret = spapr_vlan_add_rxbuf_to_page(dev, buf);
|
|
|
|
}
|
2016-03-21 19:25:22 +03:00
|
|
|
if (ret) {
|
|
|
|
return ret;
|
|
|
|
}
|
2011-04-01 08:15:29 +04:00
|
|
|
|
|
|
|
dev->rx_bufs++;
|
|
|
|
|
hw/net/spapr_llan: Delay flushing of the RX queue while adding new RX buffers
Currently, the spapr-vlan device is trying to flush the RX queue
after each RX buffer that has been added by the guest via the
H_ADD_LOGICAL_LAN_BUFFER hypercall. In case the receive buffer pool
was empty before, we only pass single packets to the guest this
way. This can cause very bad performance if a sender is trying
to stream fragmented UDP packets to the guest. For example when
using the UDP_STREAM test from netperf with UDP packets that are
much bigger than the MTU size, almost all UDP packets are dropped
in the guest since the chances are quite high that at least one of
the fragments got lost on the way.
When flushing the receive queue, it's much better if we'd have
a bunch of receive buffers available already, so that fragmented
packets can be passed to the guest in one go. To do this, the
spapr_vlan_receive() function should return 0 instead of -1 if there
are no more receive buffers available, so that receive_disabled = 1
gets temporarily set for the receive queue, and we have to delay
the queue flushing at the end of h_add_logical_lan_buffer() a little
bit by using a timer, so that the guest gets a chance to add multiple
RX buffers before we flush the queue again.
This improves the UDP_STREAM test with the spapr-vlan device a lot:
Running
netserver -p 44444 -L <guestip> -f -D -4
in the guest, and
netperf -p 44444 -L <hostip> -H <guestip> -t UDP_STREAM -l 60 -- -m 16384
in the host, I get the following values _without_ this patch:
Socket Message Elapsed Messages
Size Size Time Okay Errors Throughput
bytes bytes secs # # 10^6bits/sec
229376 16384 60.00 1738970 0 3798.83
229376 60.00 23 0.05
That "0.05" means that almost all UDP packets got lost/discarded
at the receiving side.
With this patch applied, the value look much better:
Socket Message Elapsed Messages
Size Size Time Okay Errors Throughput
bytes bytes secs # # 10^6bits/sec
229376 16384 60.00 1789104 0 3908.35
229376 60.00 22818 49.85
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-31 14:47:05 +03:00
|
|
|
/*
|
|
|
|
* Give guest some more time to add additional RX buffers before we
|
|
|
|
* flush the receive queue, so that e.g. fragmented IP packets can
|
|
|
|
* be passed to the guest in one go later (instead of passing single
|
|
|
|
* fragments if there is only one receive buffer available).
|
|
|
|
*/
|
|
|
|
timer_mod(dev->rxp_timer, qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + 500);
|
2014-02-14 05:27:04 +04:00
|
|
|
|
2011-04-01 08:15:29 +04:00
|
|
|
return H_SUCCESS;
|
|
|
|
}
|
|
|
|
|
2015-07-02 09:23:04 +03:00
|
|
|
static target_ulong h_send_logical_lan(PowerPCCPU *cpu,
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
SpaprMachineState *spapr,
|
2011-04-01 08:15:29 +04:00
|
|
|
target_ulong opcode, target_ulong *args)
|
|
|
|
{
|
|
|
|
target_ulong reg = args[0];
|
|
|
|
target_ulong *bufs = args + 1;
|
|
|
|
target_ulong continue_token = args[7];
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
SpaprVioDevice *sdev = spapr_vio_find_by_reg(spapr->vio_bus, reg);
|
|
|
|
SpaprVioVlan *dev = VIO_SPAPR_VLAN_DEVICE(sdev);
|
2011-04-01 08:15:29 +04:00
|
|
|
unsigned total_len;
|
2020-10-09 16:41:36 +03:00
|
|
|
uint8_t *p;
|
|
|
|
g_autofree uint8_t *lbuf = NULL;
|
2011-04-01 08:15:29 +04:00
|
|
|
int i, nbufs;
|
|
|
|
int ret;
|
|
|
|
|
2016-09-14 21:48:26 +03:00
|
|
|
trace_spapr_vlan_h_send_logical_lan(reg, continue_token);
|
2011-04-01 08:15:29 +04:00
|
|
|
|
|
|
|
if (!sdev) {
|
|
|
|
return H_PARAMETER;
|
|
|
|
}
|
|
|
|
|
2016-09-14 21:48:26 +03:00
|
|
|
trace_spapr_vlan_h_send_logical_lan_rxbufs(dev->rx_bufs);
|
2011-04-01 08:15:29 +04:00
|
|
|
|
|
|
|
if (!dev->isopen) {
|
|
|
|
return H_DROPPED;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (continue_token) {
|
|
|
|
return H_HARDWARE; /* FIXME actually handle this */
|
|
|
|
}
|
|
|
|
|
|
|
|
total_len = 0;
|
|
|
|
for (i = 0; i < 6; i++) {
|
2016-09-14 21:48:26 +03:00
|
|
|
trace_spapr_vlan_h_send_logical_lan_buf_desc(bufs[i]);
|
2011-04-01 08:15:29 +04:00
|
|
|
if (!(bufs[i] & VLAN_BD_VALID)) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
total_len += VLAN_BD_LEN(bufs[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
nbufs = i;
|
2016-09-14 21:48:26 +03:00
|
|
|
trace_spapr_vlan_h_send_logical_lan_total(nbufs, total_len);
|
2011-04-01 08:15:29 +04:00
|
|
|
|
|
|
|
if (total_len == 0) {
|
|
|
|
return H_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (total_len > MAX_PACKET_SIZE) {
|
|
|
|
/* Don't let the guest force too large an allocation */
|
|
|
|
return H_RESOURCE;
|
|
|
|
}
|
|
|
|
|
2020-10-09 16:41:36 +03:00
|
|
|
lbuf = g_malloc(total_len);
|
2011-04-01 08:15:29 +04:00
|
|
|
p = lbuf;
|
|
|
|
for (i = 0; i < nbufs; i++) {
|
2012-06-27 08:50:44 +04:00
|
|
|
ret = spapr_vio_dma_read(sdev, VLAN_BD_ADDR(bufs[i]),
|
2011-04-01 08:15:29 +04:00
|
|
|
p, VLAN_BD_LEN(bufs[i]));
|
|
|
|
if (ret < 0) {
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
p += VLAN_BD_LEN(bufs[i]);
|
|
|
|
}
|
|
|
|
|
2013-01-30 15:12:22 +04:00
|
|
|
qemu_send_packet(qemu_get_queue(dev->nic), lbuf, total_len);
|
2011-04-01 08:15:29 +04:00
|
|
|
|
|
|
|
return H_SUCCESS;
|
|
|
|
}
|
|
|
|
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
static target_ulong h_multicast_ctrl(PowerPCCPU *cpu, SpaprMachineState *spapr,
|
2011-04-01 08:15:29 +04:00
|
|
|
target_ulong opcode, target_ulong *args)
|
|
|
|
{
|
|
|
|
target_ulong reg = args[0];
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
SpaprVioDevice *dev = spapr_vio_find_by_reg(spapr->vio_bus, reg);
|
2011-04-01 08:15:29 +04:00
|
|
|
|
|
|
|
if (!dev) {
|
|
|
|
return H_PARAMETER;
|
|
|
|
}
|
|
|
|
|
|
|
|
return H_SUCCESS;
|
|
|
|
}
|
|
|
|
|
2016-09-01 11:10:49 +03:00
|
|
|
static target_ulong h_change_logical_lan_mac(PowerPCCPU *cpu,
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
SpaprMachineState *spapr,
|
2016-09-01 11:10:49 +03:00
|
|
|
target_ulong opcode,
|
|
|
|
target_ulong *args)
|
|
|
|
{
|
|
|
|
target_ulong reg = args[0];
|
|
|
|
target_ulong macaddr = args[1];
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
SpaprVioDevice *sdev = spapr_vio_find_by_reg(spapr->vio_bus, reg);
|
|
|
|
SpaprVioVlan *dev = VIO_SPAPR_VLAN_DEVICE(sdev);
|
2016-09-01 11:10:49 +03:00
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < ETH_ALEN; i++) {
|
|
|
|
dev->nicconf.macaddr.a[ETH_ALEN - i - 1] = macaddr & 0xff;
|
|
|
|
macaddr >>= 8;
|
|
|
|
}
|
|
|
|
|
|
|
|
qemu_format_nic_info_str(qemu_get_queue(dev->nic), dev->nicconf.macaddr.a);
|
|
|
|
|
|
|
|
return H_SUCCESS;
|
|
|
|
}
|
|
|
|
|
2011-12-16 02:31:06 +04:00
|
|
|
static Property spapr_vlan_properties[] = {
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
DEFINE_SPAPR_PROPERTIES(SpaprVioVlan, sdev),
|
|
|
|
DEFINE_NIC_PROPERTIES(SpaprVioVlan, nicconf),
|
|
|
|
DEFINE_PROP_BIT("use-rx-buffer-pools", SpaprVioVlan,
|
2016-03-21 19:25:24 +03:00
|
|
|
compat_flags, SPAPRVLAN_FLAG_RX_BUF_POOLS_BIT, true),
|
2011-12-16 02:31:06 +04:00
|
|
|
DEFINE_PROP_END_OF_LIST(),
|
|
|
|
};
|
|
|
|
|
hw/net/spapr_llan: Fix receive buffer handling for better performance
tl;dr:
This patch introduces an alternate way of handling the receive
buffers of the spapr-vlan device, resulting in much better
receive performance for the guest.
Full story:
One of our testers recently discovered that the performance of the
spapr-vlan device is very poor compared to other NICs, and that
a simple "ping -i 0.2 -s 65507 someip" in the guest can result
in more than 50% lost ping packets (especially with older guest
kernels < 3.17).
After doing some analysis, it was clear that there is a problem
with the way we handle the receive buffers in spapr_llan.c: The
ibmveth driver of the guest Linux kernel tries to add a lot of
buffers into several buffer pools (with 512, 2048 and 65536 byte
sizes by default, but it can be changed via the entries in the
/sys/devices/vio/1000/pool* directories of the guest). However,
the spapr-vlan device of QEMU only tries to squeeze all receive
buffer descriptors into one single page which has been supplied
by the guest during the H_REGISTER_LOGICAL_LAN call, without
taking care of different buffer sizes. This has two bad effects:
First, only a very limited number of buffer descriptors is accepted
at all. Second, we also hand 64k buffers to the guest even if
the 2k buffers would fit better - and this results in dropped packets
in the IP layer of the guest since too much skbuf memory is used.
Though it seems at a first glance like PAPR says that we should store
the receive buffer descriptors in the page that is supplied during
the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec
declares that "the contents of these descriptors are architecturally
opaque, none of these descriptors are manipulated by code above
the architected interfaces". That means we don't have to store
the RX buffer descriptors in this page, but can also manage the
receive buffers at the hypervisor level only. This is now what we
are doing here: Introducing proper RX buffer pools which are also
sorted by size of the buffers, so we can hand out a buffer with
the best fitting size when a packet has been received.
To avoid problems with migration from/to older version of QEMU,
the old behavior is also retained and enabled by default. The new
buffer management has to be enabled via a new "use-rx-buffer-pools"
property.
Now with the new buffer pool management enabled, the problem with
"ping -s 65507" is fixed for me, and the throughput of a simple
test with wget increases from creeping 3MB/s up to 20MB/s!
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-21 19:25:23 +03:00
|
|
|
static bool spapr_vlan_rx_buffer_pools_needed(void *opaque)
|
|
|
|
{
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
SpaprVioVlan *dev = opaque;
|
hw/net/spapr_llan: Fix receive buffer handling for better performance
tl;dr:
This patch introduces an alternate way of handling the receive
buffers of the spapr-vlan device, resulting in much better
receive performance for the guest.
Full story:
One of our testers recently discovered that the performance of the
spapr-vlan device is very poor compared to other NICs, and that
a simple "ping -i 0.2 -s 65507 someip" in the guest can result
in more than 50% lost ping packets (especially with older guest
kernels < 3.17).
After doing some analysis, it was clear that there is a problem
with the way we handle the receive buffers in spapr_llan.c: The
ibmveth driver of the guest Linux kernel tries to add a lot of
buffers into several buffer pools (with 512, 2048 and 65536 byte
sizes by default, but it can be changed via the entries in the
/sys/devices/vio/1000/pool* directories of the guest). However,
the spapr-vlan device of QEMU only tries to squeeze all receive
buffer descriptors into one single page which has been supplied
by the guest during the H_REGISTER_LOGICAL_LAN call, without
taking care of different buffer sizes. This has two bad effects:
First, only a very limited number of buffer descriptors is accepted
at all. Second, we also hand 64k buffers to the guest even if
the 2k buffers would fit better - and this results in dropped packets
in the IP layer of the guest since too much skbuf memory is used.
Though it seems at a first glance like PAPR says that we should store
the receive buffer descriptors in the page that is supplied during
the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec
declares that "the contents of these descriptors are architecturally
opaque, none of these descriptors are manipulated by code above
the architected interfaces". That means we don't have to store
the RX buffer descriptors in this page, but can also manage the
receive buffers at the hypervisor level only. This is now what we
are doing here: Introducing proper RX buffer pools which are also
sorted by size of the buffers, so we can hand out a buffer with
the best fitting size when a packet has been received.
To avoid problems with migration from/to older version of QEMU,
the old behavior is also retained and enabled by default. The new
buffer management has to be enabled via a new "use-rx-buffer-pools"
property.
Now with the new buffer pool management enabled, the problem with
"ping -s 65507" is fixed for me, and the throughput of a simple
test with wget increases from creeping 3MB/s up to 20MB/s!
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-21 19:25:23 +03:00
|
|
|
|
|
|
|
return (dev->compat_flags & SPAPRVLAN_FLAG_RX_BUF_POOLS) != 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static const VMStateDescription vmstate_rx_buffer_pool = {
|
|
|
|
.name = "spapr_llan/rx_buffer_pool",
|
|
|
|
.version_id = 1,
|
|
|
|
.minimum_version_id = 1,
|
|
|
|
.needed = spapr_vlan_rx_buffer_pools_needed,
|
|
|
|
.fields = (VMStateField[]) {
|
|
|
|
VMSTATE_INT32(bufsize, RxBufPool),
|
|
|
|
VMSTATE_INT32(count, RxBufPool),
|
|
|
|
VMSTATE_UINT64_ARRAY(bds, RxBufPool, RX_POOL_MAX_BDS),
|
|
|
|
VMSTATE_END_OF_LIST()
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
static const VMStateDescription vmstate_rx_pools = {
|
|
|
|
.name = "spapr_llan/rx_pools",
|
|
|
|
.version_id = 1,
|
|
|
|
.minimum_version_id = 1,
|
|
|
|
.needed = spapr_vlan_rx_buffer_pools_needed,
|
|
|
|
.fields = (VMStateField[]) {
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
VMSTATE_ARRAY_OF_POINTER_TO_STRUCT(rx_pool, SpaprVioVlan,
|
hw/net/spapr_llan: Fix receive buffer handling for better performance
tl;dr:
This patch introduces an alternate way of handling the receive
buffers of the spapr-vlan device, resulting in much better
receive performance for the guest.
Full story:
One of our testers recently discovered that the performance of the
spapr-vlan device is very poor compared to other NICs, and that
a simple "ping -i 0.2 -s 65507 someip" in the guest can result
in more than 50% lost ping packets (especially with older guest
kernels < 3.17).
After doing some analysis, it was clear that there is a problem
with the way we handle the receive buffers in spapr_llan.c: The
ibmveth driver of the guest Linux kernel tries to add a lot of
buffers into several buffer pools (with 512, 2048 and 65536 byte
sizes by default, but it can be changed via the entries in the
/sys/devices/vio/1000/pool* directories of the guest). However,
the spapr-vlan device of QEMU only tries to squeeze all receive
buffer descriptors into one single page which has been supplied
by the guest during the H_REGISTER_LOGICAL_LAN call, without
taking care of different buffer sizes. This has two bad effects:
First, only a very limited number of buffer descriptors is accepted
at all. Second, we also hand 64k buffers to the guest even if
the 2k buffers would fit better - and this results in dropped packets
in the IP layer of the guest since too much skbuf memory is used.
Though it seems at a first glance like PAPR says that we should store
the receive buffer descriptors in the page that is supplied during
the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec
declares that "the contents of these descriptors are architecturally
opaque, none of these descriptors are manipulated by code above
the architected interfaces". That means we don't have to store
the RX buffer descriptors in this page, but can also manage the
receive buffers at the hypervisor level only. This is now what we
are doing here: Introducing proper RX buffer pools which are also
sorted by size of the buffers, so we can hand out a buffer with
the best fitting size when a packet has been received.
To avoid problems with migration from/to older version of QEMU,
the old behavior is also retained and enabled by default. The new
buffer management has to be enabled via a new "use-rx-buffer-pools"
property.
Now with the new buffer pool management enabled, the problem with
"ping -s 65507" is fixed for me, and the throughput of a simple
test with wget increases from creeping 3MB/s up to 20MB/s!
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-21 19:25:23 +03:00
|
|
|
RX_MAX_POOLS, 1,
|
|
|
|
vmstate_rx_buffer_pool, RxBufPool),
|
|
|
|
VMSTATE_END_OF_LIST()
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2013-07-18 23:32:56 +04:00
|
|
|
static const VMStateDescription vmstate_spapr_llan = {
|
|
|
|
.name = "spapr_llan",
|
|
|
|
.version_id = 1,
|
|
|
|
.minimum_version_id = 1,
|
2014-04-16 17:24:04 +04:00
|
|
|
.fields = (VMStateField[]) {
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
VMSTATE_SPAPR_VIO(sdev, SpaprVioVlan),
|
2013-07-18 23:32:56 +04:00
|
|
|
/* LLAN state */
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
VMSTATE_BOOL(isopen, SpaprVioVlan),
|
|
|
|
VMSTATE_UINT64(buf_list, SpaprVioVlan),
|
|
|
|
VMSTATE_UINT32(add_buf_ptr, SpaprVioVlan),
|
|
|
|
VMSTATE_UINT32(use_buf_ptr, SpaprVioVlan),
|
|
|
|
VMSTATE_UINT32(rx_bufs, SpaprVioVlan),
|
|
|
|
VMSTATE_UINT64(rxq_ptr, SpaprVioVlan),
|
2013-07-18 23:32:56 +04:00
|
|
|
|
|
|
|
VMSTATE_END_OF_LIST()
|
|
|
|
},
|
hw/net/spapr_llan: Fix receive buffer handling for better performance
tl;dr:
This patch introduces an alternate way of handling the receive
buffers of the spapr-vlan device, resulting in much better
receive performance for the guest.
Full story:
One of our testers recently discovered that the performance of the
spapr-vlan device is very poor compared to other NICs, and that
a simple "ping -i 0.2 -s 65507 someip" in the guest can result
in more than 50% lost ping packets (especially with older guest
kernels < 3.17).
After doing some analysis, it was clear that there is a problem
with the way we handle the receive buffers in spapr_llan.c: The
ibmveth driver of the guest Linux kernel tries to add a lot of
buffers into several buffer pools (with 512, 2048 and 65536 byte
sizes by default, but it can be changed via the entries in the
/sys/devices/vio/1000/pool* directories of the guest). However,
the spapr-vlan device of QEMU only tries to squeeze all receive
buffer descriptors into one single page which has been supplied
by the guest during the H_REGISTER_LOGICAL_LAN call, without
taking care of different buffer sizes. This has two bad effects:
First, only a very limited number of buffer descriptors is accepted
at all. Second, we also hand 64k buffers to the guest even if
the 2k buffers would fit better - and this results in dropped packets
in the IP layer of the guest since too much skbuf memory is used.
Though it seems at a first glance like PAPR says that we should store
the receive buffer descriptors in the page that is supplied during
the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec
declares that "the contents of these descriptors are architecturally
opaque, none of these descriptors are manipulated by code above
the architected interfaces". That means we don't have to store
the RX buffer descriptors in this page, but can also manage the
receive buffers at the hypervisor level only. This is now what we
are doing here: Introducing proper RX buffer pools which are also
sorted by size of the buffers, so we can hand out a buffer with
the best fitting size when a packet has been received.
To avoid problems with migration from/to older version of QEMU,
the old behavior is also retained and enabled by default. The new
buffer management has to be enabled via a new "use-rx-buffer-pools"
property.
Now with the new buffer pool management enabled, the problem with
"ping -s 65507" is fixed for me, and the throughput of a simple
test with wget increases from creeping 3MB/s up to 20MB/s!
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-21 19:25:23 +03:00
|
|
|
.subsections = (const VMStateDescription * []) {
|
|
|
|
&vmstate_rx_pools,
|
|
|
|
NULL
|
|
|
|
}
|
2013-07-18 23:32:56 +04:00
|
|
|
};
|
|
|
|
|
2011-12-16 02:31:06 +04:00
|
|
|
static void spapr_vlan_class_init(ObjectClass *klass, void *data)
|
|
|
|
{
|
2011-12-08 07:34:16 +04:00
|
|
|
DeviceClass *dc = DEVICE_CLASS(klass);
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
SpaprVioDeviceClass *k = VIO_SPAPR_DEVICE_CLASS(klass);
|
2011-12-16 02:31:06 +04:00
|
|
|
|
2015-02-27 13:52:17 +03:00
|
|
|
k->realize = spapr_vlan_realize;
|
2012-04-12 06:44:15 +04:00
|
|
|
k->reset = spapr_vlan_reset;
|
2011-12-16 02:31:06 +04:00
|
|
|
k->devnode = spapr_vlan_devnode;
|
|
|
|
k->dt_name = "l-lan";
|
|
|
|
k->dt_type = "network";
|
|
|
|
k->dt_compatible = "IBM,l-lan";
|
|
|
|
k->signal_mask = 0x1;
|
2013-10-11 07:08:20 +04:00
|
|
|
set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
|
2020-01-10 18:30:32 +03:00
|
|
|
device_class_set_props(dc, spapr_vlan_properties);
|
2012-06-27 08:50:44 +04:00
|
|
|
k->rtce_window_size = 0x10000000;
|
2013-07-18 23:32:56 +04:00
|
|
|
dc->vmsd = &vmstate_spapr_llan;
|
2011-12-16 02:31:06 +04:00
|
|
|
}
|
|
|
|
|
2013-01-10 19:19:07 +04:00
|
|
|
static const TypeInfo spapr_vlan_info = {
|
2013-04-07 23:08:16 +04:00
|
|
|
.name = TYPE_VIO_SPAPR_VLAN_DEVICE,
|
2011-12-08 07:34:16 +04:00
|
|
|
.parent = TYPE_VIO_SPAPR_DEVICE,
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
.instance_size = sizeof(SpaprVioVlan),
|
2011-12-08 07:34:16 +04:00
|
|
|
.class_init = spapr_vlan_class_init,
|
2014-10-07 12:00:18 +04:00
|
|
|
.instance_init = spapr_vlan_instance_init,
|
hw/net/spapr_llan: Fix receive buffer handling for better performance
tl;dr:
This patch introduces an alternate way of handling the receive
buffers of the spapr-vlan device, resulting in much better
receive performance for the guest.
Full story:
One of our testers recently discovered that the performance of the
spapr-vlan device is very poor compared to other NICs, and that
a simple "ping -i 0.2 -s 65507 someip" in the guest can result
in more than 50% lost ping packets (especially with older guest
kernels < 3.17).
After doing some analysis, it was clear that there is a problem
with the way we handle the receive buffers in spapr_llan.c: The
ibmveth driver of the guest Linux kernel tries to add a lot of
buffers into several buffer pools (with 512, 2048 and 65536 byte
sizes by default, but it can be changed via the entries in the
/sys/devices/vio/1000/pool* directories of the guest). However,
the spapr-vlan device of QEMU only tries to squeeze all receive
buffer descriptors into one single page which has been supplied
by the guest during the H_REGISTER_LOGICAL_LAN call, without
taking care of different buffer sizes. This has two bad effects:
First, only a very limited number of buffer descriptors is accepted
at all. Second, we also hand 64k buffers to the guest even if
the 2k buffers would fit better - and this results in dropped packets
in the IP layer of the guest since too much skbuf memory is used.
Though it seems at a first glance like PAPR says that we should store
the receive buffer descriptors in the page that is supplied during
the H_REGISTER_LOGICAL_LAN call, chapter 16.4.1.2 in the LoPAPR spec
declares that "the contents of these descriptors are architecturally
opaque, none of these descriptors are manipulated by code above
the architected interfaces". That means we don't have to store
the RX buffer descriptors in this page, but can also manage the
receive buffers at the hypervisor level only. This is now what we
are doing here: Introducing proper RX buffer pools which are also
sorted by size of the buffers, so we can hand out a buffer with
the best fitting size when a packet has been received.
To avoid problems with migration from/to older version of QEMU,
the old behavior is also retained and enabled by default. The new
buffer management has to be enabled via a new "use-rx-buffer-pools"
property.
Now with the new buffer pool management enabled, the problem with
"ping -s 65507" is fixed for me, and the throughput of a simple
test with wget increases from creeping 3MB/s up to 20MB/s!
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2016-03-21 19:25:23 +03:00
|
|
|
.instance_finalize = spapr_vlan_instance_finalize,
|
2011-04-01 08:15:29 +04:00
|
|
|
};
|
|
|
|
|
2012-02-09 18:20:55 +04:00
|
|
|
static void spapr_vlan_register_types(void)
|
2011-04-01 08:15:29 +04:00
|
|
|
{
|
2011-12-12 22:24:28 +04:00
|
|
|
spapr_register_hypercall(H_REGISTER_LOGICAL_LAN, h_register_logical_lan);
|
|
|
|
spapr_register_hypercall(H_FREE_LOGICAL_LAN, h_free_logical_lan);
|
|
|
|
spapr_register_hypercall(H_SEND_LOGICAL_LAN, h_send_logical_lan);
|
|
|
|
spapr_register_hypercall(H_ADD_LOGICAL_LAN_BUFFER,
|
|
|
|
h_add_logical_lan_buffer);
|
|
|
|
spapr_register_hypercall(H_MULTICAST_CTRL, h_multicast_ctrl);
|
2016-09-01 11:10:49 +03:00
|
|
|
spapr_register_hypercall(H_CHANGE_LOGICAL_LAN_MAC,
|
|
|
|
h_change_logical_lan_mac);
|
2011-12-08 07:34:16 +04:00
|
|
|
type_register_static(&spapr_vlan_info);
|
2011-04-01 08:15:29 +04:00
|
|
|
}
|
2012-02-09 18:20:55 +04:00
|
|
|
|
|
|
|
type_init(spapr_vlan_register_types)
|