qemu/include/exec/cpu_ldst.h

483 lines
11 KiB
C
Raw Normal View History

/*
* Software MMU support
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*
*/
/*
* Generate inline load/store functions for all MMU modes (typically
* at least _user and _kernel) as well as _data versions, for all data
* sizes.
*
* Used by target op helpers.
*
* The syntax for the accessors is:
*
* load: cpu_ld{sign}{size}_{mmusuffix}(env, ptr)
*
* store: cpu_st{sign}{size}_{mmusuffix}(env, ptr, val)
*
* sign is:
* (empty): for 32 and 64 bit sizes
* u : unsigned
* s : signed
*
* size is:
* b: 8 bits
* w: 16 bits
* l: 32 bits
* q: 64 bits
*
* mmusuffix is one of the generic suffixes "data" or "code", or
* (for softmmu configs) a target-specific MMU mode suffix as defined
* in target cpu.h.
*/
#ifndef CPU_LDST_H
#define CPU_LDST_H
#if defined(CONFIG_USER_ONLY)
/* sparc32plus has 64bit long but 32bit space address
* this can make bad result with g2h() and h2g()
*/
#if TARGET_VIRT_ADDR_SPACE_BITS <= 32
typedef uint32_t abi_ptr;
#define TARGET_ABI_FMT_ptr "%x"
#else
typedef uint64_t abi_ptr;
#define TARGET_ABI_FMT_ptr "%"PRIx64
#endif
/* All direct uses of g2h and h2g need to go away for usermode softmmu. */
#define g2h(x) ((void *)((unsigned long)(abi_ptr)(x) + guest_base))
#define guest_addr_valid(x) ((x) <= GUEST_ADDR_MAX)
#define h2g_valid(x) guest_addr_valid((unsigned long)(x) - guest_base)
static inline int guest_range_valid(unsigned long start, unsigned long len)
{
return len - 1 <= GUEST_ADDR_MAX && start <= GUEST_ADDR_MAX - len + 1;
}
#define h2g_nocheck(x) ({ \
unsigned long __ret = (unsigned long)(x) - guest_base; \
(abi_ptr)__ret; \
})
#define h2g(x) ({ \
/* Check if given address fits target address space */ \
assert(h2g_valid(x)); \
h2g_nocheck(x); \
})
#else
typedef target_ulong abi_ptr;
#define TARGET_ABI_FMT_ptr TARGET_ABI_FMT_lx
#endif
#if defined(CONFIG_USER_ONLY)
extern __thread uintptr_t helper_retaddr;
/* In user-only mode we provide only the _code and _data accessors. */
#define MEMSUFFIX _data
#define DATA_SIZE 1
#include "exec/cpu_ldst_useronly_template.h"
#define DATA_SIZE 2
#include "exec/cpu_ldst_useronly_template.h"
#define DATA_SIZE 4
#include "exec/cpu_ldst_useronly_template.h"
#define DATA_SIZE 8
#include "exec/cpu_ldst_useronly_template.h"
#undef MEMSUFFIX
#define MEMSUFFIX _code
#define CODE_ACCESS
#define DATA_SIZE 1
#include "exec/cpu_ldst_useronly_template.h"
#define DATA_SIZE 2
#include "exec/cpu_ldst_useronly_template.h"
#define DATA_SIZE 4
#include "exec/cpu_ldst_useronly_template.h"
#define DATA_SIZE 8
#include "exec/cpu_ldst_useronly_template.h"
#undef MEMSUFFIX
#undef CODE_ACCESS
#else
/* The memory helpers for tcg-generated code need tcg_target_long etc. */
#include "tcg.h"
cputlb: read CPUTLBEntry.addr_write atomically Updates can come from other threads, so readers that do not take tlb_lock must use atomic_read to avoid undefined behaviour (UB). This completes the conversion to tlb_lock. This conversion results on average in no performance loss, as the following experiments (run on an Intel i7-6700K CPU @ 4.00GHz) show. 1. aarch64 bootup+shutdown test: - Before: Performance counter stats for 'taskset -c 0 ../img/aarch64/die.sh' (10 runs): 7487.087786 task-clock (msec) # 0.998 CPUs utilized ( +- 0.12% ) 31,574,905,303 cycles # 4.217 GHz ( +- 0.12% ) 57,097,908,812 instructions # 1.81 insns per cycle ( +- 0.08% ) 10,255,415,367 branches # 1369.747 M/sec ( +- 0.08% ) 173,278,962 branch-misses # 1.69% of all branches ( +- 0.18% ) 7.504481349 seconds time elapsed ( +- 0.14% ) - After: Performance counter stats for 'taskset -c 0 ../img/aarch64/die.sh' (10 runs): 7462.441328 task-clock (msec) # 0.998 CPUs utilized ( +- 0.07% ) 31,478,476,520 cycles # 4.218 GHz ( +- 0.07% ) 57,017,330,084 instructions # 1.81 insns per cycle ( +- 0.05% ) 10,251,929,667 branches # 1373.804 M/sec ( +- 0.05% ) 173,023,787 branch-misses # 1.69% of all branches ( +- 0.11% ) 7.474970463 seconds time elapsed ( +- 0.07% ) 2. SPEC06int: SPEC06int (test set) [Y axis: Speedup over master] 1.15 +-+----+------+------+------+------+------+-------+------+------+------+------+------+------+----+-+ | | 1.1 +-+.................................+++.............................+ tlb-lock-v2 (m+++x) +-+ | +++ | +++ tlb-lock-v3 (spinl|ck) | | +++ | | +++ +++ | | | 1.05 +-+....+++...........####.........|####.+++.|......|.....###....+++...........+++....###.........+-+ | ### ++#| # |# |# ***### +++### +++#+# | +++ | #|# ### | 1 +-+++***+#++++####+++#++#++++++++++#++#+*+*++#++++#+#+****+#++++###++++###++++###++++#+#++++#+#+++-+ | *+* # #++# *** # #### *** # * *++# ****+# *| * # ****|# |# # #|# #+# # # | 0.95 +-+..*.*.#....#..#.*|*..#...#..#.*|*..#.*.*..#.*|.*.#.*++*.#.*++*+#.****.#....#+#....#.#..++#.#..+-+ | * * # # # *|* # # # *|* # * * # *++* # * * # * * # * |* # ++# # # # *** # | | * * # ++# # *+* # # # *|* # * * # * * # * * # * * # *++* # **** # ++# # * * # | 0.9 +-+..*.*.#...|#..#.*.*..#.++#..#.*|*..#.*.*..#.*..*.#.*..*.#.*..*.#.*..*.#.*.|*.#...|#.#..*.*.#..+-+ | * * # *** # * * # |# # *+* # * * # * * # * * # * * # * * # *++* # |# # * * # | 0.85 +-+..*.*.#..*|*..#.*.*..#.***..#.*.*..#.*.*..#.*..*.#.*..*.#.*..*.#.*..*.#.*..*.#.****.#..*.*.#..+-+ | * * # *+* # * * # *|* # * * # * * # * * # * * # * * # * * # * * # * |* # * * # | | * * # * * # * * # *+* # * * # * * # * * # * * # * * # * * # * * # * |* # * * # | 0.8 +-+..*.*.#..*.*..#.*.*..#.*.*..#.*.*..#.*.*..#.*..*.#.*..*.#.*..*.#.*..*.#.*..*.#.*++*.#..*.*.#..+-+ | * * # * * # * * # * * # * * # * * # * * # * * # * * # * * # * * # * * # * * # | 0.75 +-+--***##--***###-***###-***###-***###-***###-****##-****##-****##-****##-****##-****##--***##--+-+ 400.perlben401.bzip2403.gcc429.m445.gob456.hmme45462.libqua464.h26471.omnet473483.xalancbmkgeomean png: https://imgur.com/a/BHzpPTW Notes: - tlb-lock-v2 corresponds to an implementation with a mutex. - tlb-lock-v3 corresponds to the current implementation, i.e. a spinlock and a single lock acquisition in tlb_set_page_with_attrs. Signed-off-by: Emilio G. Cota <cota@braap.org> Message-Id: <20181016153840.25877-1-cota@braap.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2018-10-16 18:38:40 +03:00
static inline target_ulong tlb_addr_write(const CPUTLBEntry *entry)
{
#if TCG_OVERSIZED_GUEST
return entry->addr_write;
#else
return atomic_read(&entry->addr_write);
#endif
}
/* Find the TLB index corresponding to the mmu_idx + address pair. */
static inline uintptr_t tlb_index(CPUArchState *env, uintptr_t mmu_idx,
target_ulong addr)
{
uintptr_t size_mask = env->tlb_mask[mmu_idx] >> CPU_TLB_ENTRY_BITS;
return (addr >> TARGET_PAGE_BITS) & size_mask;
}
static inline size_t tlb_n_entries(CPUArchState *env, uintptr_t mmu_idx)
{
return (env->tlb_mask[mmu_idx] >> CPU_TLB_ENTRY_BITS) + 1;
}
/* Find the TLB entry corresponding to the mmu_idx + address pair. */
static inline CPUTLBEntry *tlb_entry(CPUArchState *env, uintptr_t mmu_idx,
target_ulong addr)
{
return &env->tlb_table[mmu_idx][tlb_index(env, mmu_idx, addr)];
}
#ifdef MMU_MODE0_SUFFIX
#define CPU_MMU_INDEX 0
#define MEMSUFFIX MMU_MODE0_SUFFIX
#define DATA_SIZE 1
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 2
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 4
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 8
#include "exec/cpu_ldst_template.h"
#undef CPU_MMU_INDEX
#undef MEMSUFFIX
#endif
#if (NB_MMU_MODES >= 2) && defined(MMU_MODE1_SUFFIX)
#define CPU_MMU_INDEX 1
#define MEMSUFFIX MMU_MODE1_SUFFIX
#define DATA_SIZE 1
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 2
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 4
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 8
#include "exec/cpu_ldst_template.h"
#undef CPU_MMU_INDEX
#undef MEMSUFFIX
#endif
#if (NB_MMU_MODES >= 3) && defined(MMU_MODE2_SUFFIX)
#define CPU_MMU_INDEX 2
#define MEMSUFFIX MMU_MODE2_SUFFIX
#define DATA_SIZE 1
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 2
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 4
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 8
#include "exec/cpu_ldst_template.h"
#undef CPU_MMU_INDEX
#undef MEMSUFFIX
#endif /* (NB_MMU_MODES >= 3) */
#if (NB_MMU_MODES >= 4) && defined(MMU_MODE3_SUFFIX)
#define CPU_MMU_INDEX 3
#define MEMSUFFIX MMU_MODE3_SUFFIX
#define DATA_SIZE 1
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 2
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 4
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 8
#include "exec/cpu_ldst_template.h"
#undef CPU_MMU_INDEX
#undef MEMSUFFIX
#endif /* (NB_MMU_MODES >= 4) */
#if (NB_MMU_MODES >= 5) && defined(MMU_MODE4_SUFFIX)
#define CPU_MMU_INDEX 4
#define MEMSUFFIX MMU_MODE4_SUFFIX
#define DATA_SIZE 1
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 2
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 4
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 8
#include "exec/cpu_ldst_template.h"
#undef CPU_MMU_INDEX
#undef MEMSUFFIX
#endif /* (NB_MMU_MODES >= 5) */
#if (NB_MMU_MODES >= 6) && defined(MMU_MODE5_SUFFIX)
#define CPU_MMU_INDEX 5
#define MEMSUFFIX MMU_MODE5_SUFFIX
#define DATA_SIZE 1
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 2
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 4
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 8
#include "exec/cpu_ldst_template.h"
#undef CPU_MMU_INDEX
#undef MEMSUFFIX
#endif /* (NB_MMU_MODES >= 6) */
#if (NB_MMU_MODES >= 7) && defined(MMU_MODE6_SUFFIX)
#define CPU_MMU_INDEX 6
#define MEMSUFFIX MMU_MODE6_SUFFIX
#define DATA_SIZE 1
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 2
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 4
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 8
#include "exec/cpu_ldst_template.h"
#undef CPU_MMU_INDEX
#undef MEMSUFFIX
#endif /* (NB_MMU_MODES >= 7) */
#if (NB_MMU_MODES >= 8) && defined(MMU_MODE7_SUFFIX)
#define CPU_MMU_INDEX 7
#define MEMSUFFIX MMU_MODE7_SUFFIX
#define DATA_SIZE 1
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 2
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 4
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 8
#include "exec/cpu_ldst_template.h"
#undef CPU_MMU_INDEX
#undef MEMSUFFIX
#endif /* (NB_MMU_MODES >= 8) */
#if (NB_MMU_MODES >= 9) && defined(MMU_MODE8_SUFFIX)
#define CPU_MMU_INDEX 8
#define MEMSUFFIX MMU_MODE8_SUFFIX
#define DATA_SIZE 1
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 2
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 4
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 8
#include "exec/cpu_ldst_template.h"
#undef CPU_MMU_INDEX
#undef MEMSUFFIX
#endif /* (NB_MMU_MODES >= 9) */
#if (NB_MMU_MODES >= 10) && defined(MMU_MODE9_SUFFIX)
#define CPU_MMU_INDEX 9
#define MEMSUFFIX MMU_MODE9_SUFFIX
#define DATA_SIZE 1
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 2
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 4
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 8
#include "exec/cpu_ldst_template.h"
#undef CPU_MMU_INDEX
#undef MEMSUFFIX
#endif /* (NB_MMU_MODES >= 10) */
#if (NB_MMU_MODES >= 11) && defined(MMU_MODE10_SUFFIX)
#define CPU_MMU_INDEX 10
#define MEMSUFFIX MMU_MODE10_SUFFIX
#define DATA_SIZE 1
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 2
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 4
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 8
#include "exec/cpu_ldst_template.h"
#undef CPU_MMU_INDEX
#undef MEMSUFFIX
#endif /* (NB_MMU_MODES >= 11) */
#if (NB_MMU_MODES >= 12) && defined(MMU_MODE11_SUFFIX)
#define CPU_MMU_INDEX 11
#define MEMSUFFIX MMU_MODE11_SUFFIX
#define DATA_SIZE 1
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 2
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 4
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 8
#include "exec/cpu_ldst_template.h"
#undef CPU_MMU_INDEX
#undef MEMSUFFIX
#endif /* (NB_MMU_MODES >= 12) */
#if (NB_MMU_MODES > 12)
#error "NB_MMU_MODES > 12 is not supported for now"
#endif /* (NB_MMU_MODES > 12) */
/* these access are slower, they must be as rare as possible */
#define CPU_MMU_INDEX (cpu_mmu_index(env, false))
#define MEMSUFFIX _data
#define DATA_SIZE 1
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 2
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 4
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 8
#include "exec/cpu_ldst_template.h"
#undef CPU_MMU_INDEX
#undef MEMSUFFIX
#define CPU_MMU_INDEX (cpu_mmu_index(env, true))
#define MEMSUFFIX _code
#define SOFTMMU_CODE_ACCESS
#define DATA_SIZE 1
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 2
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 4
#include "exec/cpu_ldst_template.h"
#define DATA_SIZE 8
#include "exec/cpu_ldst_template.h"
#undef CPU_MMU_INDEX
#undef MEMSUFFIX
#undef SOFTMMU_CODE_ACCESS
#endif /* defined(CONFIG_USER_ONLY) */
/**
* tlb_vaddr_to_host:
* @env: CPUArchState
* @addr: guest virtual address to look up
* @access_type: 0 for read, 1 for write, 2 for execute
* @mmu_idx: MMU index to use for lookup
*
* Look up the specified guest virtual index in the TCG softmmu TLB.
* If the TLB contains a host virtual address suitable for direct RAM
* access, then return it. Otherwise (TLB miss, TLB entry is for an
* I/O access, etc) return NULL.
*
* This is the equivalent of the initial fast-path code used by
* TCG backends for guest load and store accesses.
*/
static inline void *tlb_vaddr_to_host(CPUArchState *env, abi_ptr addr,
int access_type, int mmu_idx)
{
#if defined(CONFIG_USER_ONLY)
return g2h(addr);
#else
CPUTLBEntry *tlbentry = tlb_entry(env, mmu_idx, addr);
abi_ptr tlb_addr;
uintptr_t haddr;
switch (access_type) {
case 0:
tlb_addr = tlbentry->addr_read;
break;
case 1:
cputlb: read CPUTLBEntry.addr_write atomically Updates can come from other threads, so readers that do not take tlb_lock must use atomic_read to avoid undefined behaviour (UB). This completes the conversion to tlb_lock. This conversion results on average in no performance loss, as the following experiments (run on an Intel i7-6700K CPU @ 4.00GHz) show. 1. aarch64 bootup+shutdown test: - Before: Performance counter stats for 'taskset -c 0 ../img/aarch64/die.sh' (10 runs): 7487.087786 task-clock (msec) # 0.998 CPUs utilized ( +- 0.12% ) 31,574,905,303 cycles # 4.217 GHz ( +- 0.12% ) 57,097,908,812 instructions # 1.81 insns per cycle ( +- 0.08% ) 10,255,415,367 branches # 1369.747 M/sec ( +- 0.08% ) 173,278,962 branch-misses # 1.69% of all branches ( +- 0.18% ) 7.504481349 seconds time elapsed ( +- 0.14% ) - After: Performance counter stats for 'taskset -c 0 ../img/aarch64/die.sh' (10 runs): 7462.441328 task-clock (msec) # 0.998 CPUs utilized ( +- 0.07% ) 31,478,476,520 cycles # 4.218 GHz ( +- 0.07% ) 57,017,330,084 instructions # 1.81 insns per cycle ( +- 0.05% ) 10,251,929,667 branches # 1373.804 M/sec ( +- 0.05% ) 173,023,787 branch-misses # 1.69% of all branches ( +- 0.11% ) 7.474970463 seconds time elapsed ( +- 0.07% ) 2. SPEC06int: SPEC06int (test set) [Y axis: Speedup over master] 1.15 +-+----+------+------+------+------+------+-------+------+------+------+------+------+------+----+-+ | | 1.1 +-+.................................+++.............................+ tlb-lock-v2 (m+++x) +-+ | +++ | +++ tlb-lock-v3 (spinl|ck) | | +++ | | +++ +++ | | | 1.05 +-+....+++...........####.........|####.+++.|......|.....###....+++...........+++....###.........+-+ | ### ++#| # |# |# ***### +++### +++#+# | +++ | #|# ### | 1 +-+++***+#++++####+++#++#++++++++++#++#+*+*++#++++#+#+****+#++++###++++###++++###++++#+#++++#+#+++-+ | *+* # #++# *** # #### *** # * *++# ****+# *| * # ****|# |# # #|# #+# # # | 0.95 +-+..*.*.#....#..#.*|*..#...#..#.*|*..#.*.*..#.*|.*.#.*++*.#.*++*+#.****.#....#+#....#.#..++#.#..+-+ | * * # # # *|* # # # *|* # * * # *++* # * * # * * # * |* # ++# # # # *** # | | * * # ++# # *+* # # # *|* # * * # * * # * * # * * # *++* # **** # ++# # * * # | 0.9 +-+..*.*.#...|#..#.*.*..#.++#..#.*|*..#.*.*..#.*..*.#.*..*.#.*..*.#.*..*.#.*.|*.#...|#.#..*.*.#..+-+ | * * # *** # * * # |# # *+* # * * # * * # * * # * * # * * # *++* # |# # * * # | 0.85 +-+..*.*.#..*|*..#.*.*..#.***..#.*.*..#.*.*..#.*..*.#.*..*.#.*..*.#.*..*.#.*..*.#.****.#..*.*.#..+-+ | * * # *+* # * * # *|* # * * # * * # * * # * * # * * # * * # * * # * |* # * * # | | * * # * * # * * # *+* # * * # * * # * * # * * # * * # * * # * * # * |* # * * # | 0.8 +-+..*.*.#..*.*..#.*.*..#.*.*..#.*.*..#.*.*..#.*..*.#.*..*.#.*..*.#.*..*.#.*..*.#.*++*.#..*.*.#..+-+ | * * # * * # * * # * * # * * # * * # * * # * * # * * # * * # * * # * * # * * # | 0.75 +-+--***##--***###-***###-***###-***###-***###-****##-****##-****##-****##-****##-****##--***##--+-+ 400.perlben401.bzip2403.gcc429.m445.gob456.hmme45462.libqua464.h26471.omnet473483.xalancbmkgeomean png: https://imgur.com/a/BHzpPTW Notes: - tlb-lock-v2 corresponds to an implementation with a mutex. - tlb-lock-v3 corresponds to the current implementation, i.e. a spinlock and a single lock acquisition in tlb_set_page_with_attrs. Signed-off-by: Emilio G. Cota <cota@braap.org> Message-Id: <20181016153840.25877-1-cota@braap.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2018-10-16 18:38:40 +03:00
tlb_addr = tlb_addr_write(tlbentry);
break;
case 2:
tlb_addr = tlbentry->addr_code;
break;
default:
g_assert_not_reached();
}
if (!tlb_hit(tlb_addr, addr)) {
/* TLB entry is for a different page */
return NULL;
}
if (tlb_addr & ~TARGET_PAGE_MASK) {
/* IO access */
return NULL;
}
haddr = addr + tlbentry->addend;
return (void *)haddr;
#endif /* defined(CONFIG_USER_ONLY) */
}
#endif /* CPU_LDST_H */