qemu/include/exec/cpu-defs.h

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

222 lines
6.6 KiB
C
Raw Normal View History

/*
* common defines for all CPUs
*
* Copyright (c) 2003 Fabrice Bellard
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef CPU_DEFS_H
#define CPU_DEFS_H
#ifndef NEED_CPU_H
#error cpu.h included from common code
#endif
#include "qemu/host-utils.h"
cputlb: serialize tlb updates with env->tlb_lock Currently we rely on atomic operations for cross-CPU invalidations. There are two cases that these atomics miss: cross-CPU invalidations can race with either (1) vCPU threads flushing their TLB, which happens via memset, or (2) vCPUs calling tlb_reset_dirty on their TLB, which updates .addr_write with a regular store. This results in undefined behaviour, since we're mixing regular and atomic ops on concurrent accesses. Fix it by using tlb_lock, a per-vCPU lock. All updaters of tlb_table and the corresponding victim cache now hold the lock. The readers that do not hold tlb_lock must use atomic reads when reading .addr_write, since this field can be updated by other threads; the conversion to atomic reads is done in the next patch. Note that an alternative fix would be to expand the use of atomic ops. However, in the case of TLB flushes this would have a huge performance impact, since (1) TLB flushes can happen very frequently and (2) we currently use a full memory barrier to flush each TLB entry, and a TLB has many entries. Instead, acquiring the lock is barely slower than a full memory barrier since it is uncontended, and with a single lock acquisition we can flush the entire TLB. Tested-by: Alex Bennée <alex.bennee@linaro.org> Reviewed-by: Alex Bennée <alex.bennee@linaro.org> Signed-off-by: Emilio G. Cota <cota@braap.org> Message-Id: <20181009174557.16125-6-cota@braap.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2018-10-09 20:45:56 +03:00
#include "qemu/thread.h"
#ifndef CONFIG_USER_ONLY
#include "exec/hwaddr.h"
#endif
#include "exec/memattrs.h"
#include "hw/core/cpu.h"
#include "cpu-param.h"
#ifndef TARGET_LONG_BITS
# error TARGET_LONG_BITS must be defined in cpu-param.h
#endif
#ifndef TARGET_PHYS_ADDR_SPACE_BITS
# error TARGET_PHYS_ADDR_SPACE_BITS must be defined in cpu-param.h
#endif
#ifndef TARGET_VIRT_ADDR_SPACE_BITS
# error TARGET_VIRT_ADDR_SPACE_BITS must be defined in cpu-param.h
#endif
#ifndef TARGET_PAGE_BITS
# ifdef TARGET_PAGE_BITS_VARY
# ifndef TARGET_PAGE_BITS_MIN
# error TARGET_PAGE_BITS_MIN must be defined in cpu-param.h
# endif
# else
# error TARGET_PAGE_BITS must be defined in cpu-param.h
# endif
#endif
#include "exec/target_long.h"
/*
* Fix the number of mmu modes to 16, which is also the maximum
* supported by the softmmu tlb api.
*/
#define NB_MMU_MODES 16
#if defined(CONFIG_SOFTMMU) && defined(CONFIG_TCG)
#include "exec/tlb-common.h"
implementing victim TLB for QEMU system emulated TLB QEMU system mode page table walks are expensive. Taken by running QEMU qemu-system-x86_64 system mode on Intel PIN , a TLB miss and walking a 4-level page tables in guest Linux OS takes ~450 X86 instructions on average. QEMU system mode TLB is implemented using a directly-mapped hashtable. This structure suffers from conflict misses. Increasing the associativity of the TLB may not be the solution to conflict misses as all the ways may have to be walked in serial. A victim TLB is a TLB used to hold translations evicted from the primary TLB upon replacement. The victim TLB lies between the main TLB and its refill path. Victim TLB is of greater associativity (fully associative in this patch). It takes longer to lookup the victim TLB, but its likely better than a full page table walk. The memory translation path is changed as follows : Before Victim TLB: 1. Inline TLB lookup 2. Exit code cache on TLB miss. 3. Check for unaligned, IO accesses 4. TLB refill. 5. Do the memory access. 6. Return to code cache. After Victim TLB: 1. Inline TLB lookup 2. Exit code cache on TLB miss. 3. Check for unaligned, IO accesses 4. Victim TLB lookup. 5. If victim TLB misses, TLB refill 6. Do the memory access. 7. Return to code cache The advantage is that victim TLB can offer more associativity to a directly mapped TLB and thus potentially fewer page table walks while still keeping the time taken to flush within reasonable limits. However, placing a victim TLB before the refill path increase TLB refill path as the victim TLB is consulted before the TLB refill. The performance results demonstrate that the pros outweigh the cons. some performance results taken on SPECINT2006 train datasets and kernel boot and qemu configure script on an Intel(R) Xeon(R) CPU E5620 @ 2.40GHz Linux machine are shown in the Google Doc link below. https://docs.google.com/spreadsheets/d/1eiItzekZwNQOal_h-5iJmC4tMDi051m9qidi5_nwvH4/edit?usp=sharing In summary, victim TLB improves the performance of qemu-system-x86_64 by 11% on average on SPECINT2006, kernelboot and qemu configscript and with highest improvement of in 26% in 456.hmmer. And victim TLB does not result in any performance degradation in any of the measured benchmarks. Furthermore, the implemented victim TLB is architecture independent and is expected to benefit other architectures in QEMU as well. Although there are measurement fluctuations, the performance improvement is very significant and by no means in the range of noises. Signed-off-by: Xin Tong <trent.tong@gmail.com> Message-id: 1407202523-23553-1-git-send-email-trent.tong@gmail.com Reviewed-by: Peter Maydell <peter.maydell@linaro.org> Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2014-08-05 05:35:23 +04:00
/* use a fully associative victim tlb of 8 entries */
#define CPU_VTLB_SIZE 8
#define CPU_TLB_DYN_MIN_BITS 6
#define CPU_TLB_DYN_DEFAULT_BITS 8
# if HOST_LONG_BITS == 32
/* Make sure we do not require a double-word shift for the TLB load */
# define CPU_TLB_DYN_MAX_BITS (32 - TARGET_PAGE_BITS)
# else /* HOST_LONG_BITS == 64 */
/*
* Assuming TARGET_PAGE_BITS==12, with 2**22 entries we can cover 2**(22+12) ==
* 2**34 == 16G of address space. This is roughly what one would expect a
* TLB to cover in a modern (as of 2018) x86_64 CPU. For instance, Intel
* Skylake's Level-2 STLB has 16 1G entries.
* Also, make sure we do not size the TLB past the guest's address space.
*/
osdep: Make MIN/MAX evaluate arguments only once I'm not aware of any immediate bugs in qemu where a second runtime evaluation of the arguments to MIN() or MAX() causes a problem, but proactively preventing such abuse is easier than falling prey to an unintended case down the road. At any rate, here's the conversation that sparked the current patch: https://lists.gnu.org/archive/html/qemu-devel/2018-12/msg05718.html Update the MIN/MAX macros to only evaluate their argument once at runtime; this uses typeof(1 ? (a) : (b)) to ensure that we are promoting the temporaries to the same type as the final comparison (we have to trigger type promotion, as typeof(bitfield) won't compile; and we can't use typeof((a) + (b)) or even typeof((a) + 0), as some of our uses of MAX are on void* pointers where such addition is undefined). However, we are unable to work around gcc refusing to compile ({}) in a constant context (such as the array length of a static variable), even when only used in the dead branch of a __builtin_choose_expr(), so we have to provide a second macro pair MIN_CONST and MAX_CONST for use when both arguments are known to be compile-time constants and where the result must also be usable as a constant; this second form evaluates arguments multiple times but that doesn't matter for constants. By using a void expression as the expansion if a non-constant is presented to this second form, we can enlist the compiler to ensure the double evaluation is not attempted on non-constants. Alas, as both macros now rely on compiler intrinsics, they are no longer usable in preprocessor #if conditions; those will just have to be open-coded or the logic rewritten into #define or runtime 'if' conditions (but where the compiler dead-code-elimination will probably still apply). I tested that both gcc 10.1.1 and clang 10.0.0 produce errors for all forms of macro mis-use. As the errors can sometimes be cryptic, I'm demonstrating the gcc output: Use of MIN when MIN_CONST is needed: In file included from /home/eblake/qemu/qemu-img.c:25: /home/eblake/qemu/include/qemu/osdep.h:249:5: error: braced-group within expression allowed only inside a function 249 | ({ \ | ^ /home/eblake/qemu/qemu-img.c:92:12: note: in expansion of macro ‘MIN’ 92 | char array[MIN(1, 2)] = ""; | ^~~ Use of MIN_CONST when MIN is needed: /home/eblake/qemu/qemu-img.c: In function ‘is_allocated_sectors’: /home/eblake/qemu/qemu-img.c:1225:15: error: void value not ignored as it ought to be 1225 | i = MIN_CONST(i, n); | ^ Use of MIN in the preprocessor: In file included from /home/eblake/qemu/accel/tcg/translate-all.c:20: /home/eblake/qemu/accel/tcg/translate-all.c: In function ‘page_check_range’: /home/eblake/qemu/include/qemu/osdep.h:249:6: error: token "{" is not valid in preprocessor expressions 249 | ({ \ | ^ Fix the resulting callsites that used #if or computed a compile-time constant min or max to use the new macros. cpu-defs.h is interesting, as CPU_TLB_DYN_MAX_BITS is sometimes used as a constant and sometimes dynamic. It may be worth improving glib's MIN/MAX definitions to be saner, but that is a task for another day. Signed-off-by: Eric Blake <eblake@redhat.com> Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com> Message-Id: <20200625162602.700741-1-eblake@redhat.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2020-06-25 19:26:02 +03:00
# ifdef TARGET_PAGE_BITS_VARY
# define CPU_TLB_DYN_MAX_BITS \
MIN(22, TARGET_VIRT_ADDR_SPACE_BITS - TARGET_PAGE_BITS)
osdep: Make MIN/MAX evaluate arguments only once I'm not aware of any immediate bugs in qemu where a second runtime evaluation of the arguments to MIN() or MAX() causes a problem, but proactively preventing such abuse is easier than falling prey to an unintended case down the road. At any rate, here's the conversation that sparked the current patch: https://lists.gnu.org/archive/html/qemu-devel/2018-12/msg05718.html Update the MIN/MAX macros to only evaluate their argument once at runtime; this uses typeof(1 ? (a) : (b)) to ensure that we are promoting the temporaries to the same type as the final comparison (we have to trigger type promotion, as typeof(bitfield) won't compile; and we can't use typeof((a) + (b)) or even typeof((a) + 0), as some of our uses of MAX are on void* pointers where such addition is undefined). However, we are unable to work around gcc refusing to compile ({}) in a constant context (such as the array length of a static variable), even when only used in the dead branch of a __builtin_choose_expr(), so we have to provide a second macro pair MIN_CONST and MAX_CONST for use when both arguments are known to be compile-time constants and where the result must also be usable as a constant; this second form evaluates arguments multiple times but that doesn't matter for constants. By using a void expression as the expansion if a non-constant is presented to this second form, we can enlist the compiler to ensure the double evaluation is not attempted on non-constants. Alas, as both macros now rely on compiler intrinsics, they are no longer usable in preprocessor #if conditions; those will just have to be open-coded or the logic rewritten into #define or runtime 'if' conditions (but where the compiler dead-code-elimination will probably still apply). I tested that both gcc 10.1.1 and clang 10.0.0 produce errors for all forms of macro mis-use. As the errors can sometimes be cryptic, I'm demonstrating the gcc output: Use of MIN when MIN_CONST is needed: In file included from /home/eblake/qemu/qemu-img.c:25: /home/eblake/qemu/include/qemu/osdep.h:249:5: error: braced-group within expression allowed only inside a function 249 | ({ \ | ^ /home/eblake/qemu/qemu-img.c:92:12: note: in expansion of macro ‘MIN’ 92 | char array[MIN(1, 2)] = ""; | ^~~ Use of MIN_CONST when MIN is needed: /home/eblake/qemu/qemu-img.c: In function ‘is_allocated_sectors’: /home/eblake/qemu/qemu-img.c:1225:15: error: void value not ignored as it ought to be 1225 | i = MIN_CONST(i, n); | ^ Use of MIN in the preprocessor: In file included from /home/eblake/qemu/accel/tcg/translate-all.c:20: /home/eblake/qemu/accel/tcg/translate-all.c: In function ‘page_check_range’: /home/eblake/qemu/include/qemu/osdep.h:249:6: error: token "{" is not valid in preprocessor expressions 249 | ({ \ | ^ Fix the resulting callsites that used #if or computed a compile-time constant min or max to use the new macros. cpu-defs.h is interesting, as CPU_TLB_DYN_MAX_BITS is sometimes used as a constant and sometimes dynamic. It may be worth improving glib's MIN/MAX definitions to be saner, but that is a task for another day. Signed-off-by: Eric Blake <eblake@redhat.com> Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com> Message-Id: <20200625162602.700741-1-eblake@redhat.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2020-06-25 19:26:02 +03:00
# else
# define CPU_TLB_DYN_MAX_BITS \
MIN_CONST(22, TARGET_VIRT_ADDR_SPACE_BITS - TARGET_PAGE_BITS)
# endif
# endif
#endif /* CONFIG_SOFTMMU && CONFIG_TCG */
#if defined(CONFIG_SOFTMMU)
/*
* The full TLB entry, which is not accessed by generated TCG code,
* so the layout is not as critical as that of CPUTLBEntry. This is
* also why we don't want to combine the two structs.
*/
typedef struct CPUTLBEntryFull {
/*
* @xlat_section contains:
* - For ram, an offset which must be added to the virtual address
* to obtain the ram_addr_t of the target RAM
* - For other memory regions,
* + in the lower TARGET_PAGE_BITS, the physical section number
* + with the TARGET_PAGE_BITS masked off, the offset within
* the target MemoryRegion
*/
hwaddr xlat_section;
/*
* @phys_addr contains the physical address in the address space
* given by cpu_asidx_from_attrs(cpu, @attrs).
*/
hwaddr phys_addr;
/* @attrs contains the memory transaction attributes for the page. */
MemTxAttrs attrs;
/* @prot contains the complete protections for the page. */
uint8_t prot;
/* @lg_page_size contains the log2 of the page size. */
uint8_t lg_page_size;
/*
* Additional tlb flags for use by the slow path. If non-zero,
* the corresponding CPUTLBEntry comparator must have TLB_FORCE_SLOW.
*/
uint8_t slow_flags[MMU_ACCESS_COUNT];
/*
* Allow target-specific additions to this structure.
* This may be used to cache items from the guest cpu
* page tables for later use by the implementation.
*/
#ifdef TARGET_PAGE_ENTRY_EXTRA
TARGET_PAGE_ENTRY_EXTRA
#endif
} CPUTLBEntryFull;
#endif /* CONFIG_SOFTMMU */
#if defined(CONFIG_SOFTMMU) && defined(CONFIG_TCG)
/*
* Data elements that are per MMU mode, minus the bits accessed by
* the TCG fast path.
*/
typedef struct CPUTLBDesc {
/*
* Describe a region covering all of the large pages allocated
* into the tlb. When any page within this region is flushed,
* we must flush the entire tlb. The region is matched if
* (addr & large_page_mask) == large_page_addr.
*/
vaddr large_page_addr;
vaddr large_page_mask;
/* host time (in ns) at the beginning of the time window */
int64_t window_begin_ns;
/* maximum number of entries observed in the window */
size_t window_max_entries;
size_t n_used_entries;
/* The next index to use in the tlb victim table. */
size_t vindex;
/* The tlb victim table, in two parts. */
CPUTLBEntry vtable[CPU_VTLB_SIZE];
CPUTLBEntryFull vfulltlb[CPU_VTLB_SIZE];
CPUTLBEntryFull *fulltlb;
} CPUTLBDesc;
/*
* Data elements that are shared between all MMU modes.
*/
typedef struct CPUTLBCommon {
/* Serialize updates to f.table and d.vtable, and others as noted. */
QemuSpin lock;
/*
* Within dirty, for each bit N, modifications have been made to
* mmu_idx N since the last time that mmu_idx was flushed.
* Protected by tlb_c.lock.
*/
uint16_t dirty;
/*
* Statistics. These are not lock protected, but are read and
* written atomically. This allows the monitor to print a snapshot
* of the stats without interfering with the cpu.
*/
size_t full_flush_count;
size_t part_flush_count;
size_t elide_flush_count;
} CPUTLBCommon;
/*
* The entire softmmu tlb, for all MMU modes.
* The meaning of each of the MMU modes is defined in the target code.
* Since this is placed within CPUNegativeOffsetState, the smallest
* negative offsets are at the end of the struct.
*/
typedef struct CPUTLB {
CPUTLBCommon c;
CPUTLBDesc d[NB_MMU_MODES];
CPUTLBDescFast f[NB_MMU_MODES];
} CPUTLB;
#else
typedef struct CPUTLB { } CPUTLB;
#endif /* CONFIG_SOFTMMU && CONFIG_TCG */
/*
* This structure must be placed in ArchCPU immediately
* before CPUArchState, as a field named "neg".
*/
typedef struct CPUNegativeOffsetState {
CPUTLB tlb;
IcountDecr icount_decr;
} CPUNegativeOffsetState;
#endif