37b995f6e7
It's been superseded by the atomic helpers. The use of the atomic helpers provides a significant performance and scalability improvement. Below is the result of running the atomic_add-test microbenchmark with: $ x86_64-linux-user/qemu-x86_64 tests/atomic_add-bench -o 5000000 -r $r -n $n , where $n is the number of threads and $r is the allowed range for the additions. The scenarios measured are: - atomic: implements x86' ADDL with the atomic_add helper (i.e. this patchset) - cmpxchg: implement x86' ADDL with a TCG loop using the cmpxchg helper - master: before this patchset Results sorted in ascending range, i.e. descending degree of contention. Y axis is Throughput in Mops/s. Tests are run on an AMD machine with 64 Opteron 6376 cores. atomic_add-bench: 5000000 ops/thread, [0,1] range 25 ++---------+----------+---------+----------+----------+----------+---++ + atomic +-E--+ + + + + + | |cmpxchg +-H--+ | 20 +Emaster +-N--+ ++ || | |++ | || | 15 +++ ++ |N| | |+| | 10 ++| ++ |+|+ | | | -+E+------ +++ ---+E+------+E+------+E+-----+E+------+E| |+E+E+- +++ +E+------+E+-- | 5 ++|+ ++ |+N+H+--- +++ | ++++N+--+H++----+++ + +++ --++H+------+H+------+H++----+H+---+--- | 0 ++---------+-----H----+---H-----+----------+----------+----------+---H+ 0 10 20 30 40 50 60 Number of threads atomic_add-bench: 5000000 ops/thread, [0,2] range 25 ++---------+----------+---------+----------+----------+----------+---++ ++atomic +-E--+ + + + + + | |cmpxchg +-H--+ | 20 ++master +-N--+ ++ |E| | |++ | ||E | 15 ++| ++ |N|| | |+|| ---+E+------+E+-----+E+------+E| 10 ++| | ---+E+------+E+-----+E+--- +++ +++ ||H+E+--+E+-- | |+++++ | | || | 5 ++|+H+-- +++ ++ |+N+ - ---+H+------+H+------ | + +N+--+H++----+H+---+--+H+----++H+--- + + +H+---+--+H| 0 ++---------+----------+---------+----------+----------+----------+---++ 0 10 20 30 40 50 60 Number of threads atomic_add-bench: 5000000 ops/thread, [0,8] range 40 ++---------+----------+---------+----------+----------+----------+---++ ++atomic +-E--+ + + + + + 
| 35 +cmpxchg +-H--+ ++ | master +-N--+ ---+E+------+E+------+E+-----+E+------+E| 30 ++| ---+E+-- +++ ++ | | -+E+--- | 25 ++E ---- +++ ++ |+++++ -+E+ | 20 +E+ E-- +++ ++ |H|+++ | |+| +H+------- | 15 ++H+ ---+++ +H+------ ++ |N++H+-- +++--- +H+------++| 10 ++ +++ - +++ ---+H+ +++ +H+ | | +H+-----+H+------+H+-- | 5 ++| +++ ++ ++N+N+--+N++ + + + + + | 0 ++---------+----------+---------+----------+----------+----------+---++ 0 10 20 30 40 50 60 Number of threads atomic_add-bench: 5000000 ops/thread, [0,128] range 160 ++---------+---------+----------+---------+----------+----------+---++ + atomic +-E--+ + + + + + | 140 +cmpxchg +-H--+ +++ +++ ++ | master +-N--+ E--------E------+E+------++| 120 ++ --| | +++ E+ | -- +++ +++ ++| 100 ++ - ++ | +++- +++ ++| 80 ++ -+E+ -+H+------+H+------H--------++ | ---- ---- +++ H| | ---+E+-----+E+- ---+H+ ++| 60 ++ +E+--- +++ ---+H+--- ++ | --+++ ---+H+-- | 40 ++ +E+-+H+--- ++ | +H+ | 20 +EE+ ++ +N+ + + + + + + | 0 ++N-N---N--+---------+----------+---------+----------+----------+---++ 0 10 20 30 40 50 60 Number of threads atomic_add-bench: 5000000 ops/thread, [0,1024] range 350 ++---------+---------+----------+---------+----------+----------+---++ + atomic +-E--+ + + + + + | 300 +cmpxchg +-H--+ +++ | master +-N--+ +++ || | +++ | ----E| 250 ++ | ----E---- ++ | ----E--- | ---+H| 200 ++ -+E+--- +++ ---+H+--- ++ | ---- -+H+-- | | +E+ +++ ---- +++ | 150 ++ ---+++ ---+H+- ++ | --- -+H+-- | 100 ++ ---+E+ ---- +++ ++ | +++ ---+E+-----+H+- | | -+E+------+H+-- | 50 ++ +E+ ++ +EE+ + + + + + + | 0 ++N-N---N--+---------+----------+---------+----------+----------+---++ 0 10 20 30 40 50 60 Number of threads hi-res: http://imgur.com/a/fMRmq For master I stopped measuring master after 8 threads, because there is little point in measuring the well-known performance collapse of a contended lock. Reviewed-by: Alex Bennée <alex.bennee@linaro.org> Signed-off-by: Emilio G. 
Cota <cota@braap.org> Message-Id: <1467054136-10430-21-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson <rth@twiddle.net>
216 lines
5.9 KiB
C
216 lines
5.9 KiB
C
/*
|
|
* x86 memory access helpers
|
|
*
|
|
* Copyright (c) 2003 Fabrice Bellard
|
|
*
|
|
* This library is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2 of the License, or (at your option) any later version.
|
|
*
|
|
* This library is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Lesser General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
#include "qemu/osdep.h"
|
|
#include "cpu.h"
|
|
#include "exec/helper-proto.h"
|
|
#include "exec/exec-all.h"
|
|
#include "exec/cpu_ldst.h"
|
|
#include "qemu/int128.h"
|
|
#include "tcg.h"
|
|
|
|
/*
 * Non-atomic CMPXCHG8B: the 64-bit value at a0 is read, compared and
 * written back with separate guest memory accesses.
 *
 * The comparand is built from EAX (low half) and EDX (high half), the
 * replacement from EBX (low half) and ECX (high half).  A store is issued
 * unconditionally: the replacement on a match, otherwise the value that
 * was just read.  On a mismatch the value read is returned in EDX:EAX.
 * ZF in the computed flags reflects the outcome; the result is left in
 * CC_SRC.
 */
void helper_cmpxchg8b_unlocked(CPUX86State *env, target_ulong a0)
{
    const uintptr_t retaddr = GETPC();
    int flags = cpu_cc_compute_all(env, CC_OP);
    const uint64_t expected =
        deposit64(env->regs[R_EAX], 32, 32, env->regs[R_EDX]);
    const uint64_t desired =
        deposit64(env->regs[R_EBX], 32, 32, env->regs[R_ECX]);
    const uint64_t current = cpu_ldq_data_ra(env, a0, retaddr);
    const bool matched = (current == expected);

    /* The write happens whether or not the comparison succeeded. */
    cpu_stq_data_ra(env, a0, matched ? desired : current, retaddr);

    if (!matched) {
        /* Hand the observed memory value back to the guest in EDX:EAX. */
        env->regs[R_EAX] = (uint32_t)current;
        env->regs[R_EDX] = (uint32_t)(current >> 32);
        flags &= ~CC_Z;
    } else {
        flags |= CC_Z;
    }
    CC_SRC = flags;
}
|
|
|
|
/*
 * CMPXCHG8B implemented with a 64-bit atomic compare-and-swap.
 *
 * The comparand is built from EAX (low half) and EDX (high half), the
 * replacement from EBX (low half) and ECX (high half).  If the value
 * returned by the compare-and-swap equals the comparand, ZF is set;
 * otherwise ZF is cleared and the returned value is written to EDX:EAX.
 * The resulting flags are left in CC_SRC.
 *
 * When CONFIG_ATOMIC64 is not defined the helper only calls
 * cpu_loop_exit_atomic().
 * NOTE(review): presumably that re-executes the instruction in a context
 * where atomicity is not needed -- confirm against its definition.
 */
void helper_cmpxchg8b(CPUX86State *env, target_ulong a0)
{
#ifdef CONFIG_ATOMIC64
    uint64_t oldv, cmpv, newv;
    int eflags;

    eflags = cpu_cc_compute_all(env, CC_OP);

    cmpv = deposit64(env->regs[R_EAX], 32, 32, env->regs[R_EDX]);
    newv = deposit64(env->regs[R_EBX], 32, 32, env->regs[R_ECX]);

#ifdef CONFIG_USER_ONLY
    {
        uint64_t *haddr = g2h(a0);
        /*
         * Keep the little-endian copies in separate locals: cmpv must
         * stay in host byte order because it is compared against the
         * host-order oldv below.  Overwriting cmpv with the swapped value
         * makes that comparison fail on big-endian hosts even when the
         * exchange succeeded.
         */
        uint64_t cmpv_le = cpu_to_le64(cmpv);
        uint64_t newv_le = cpu_to_le64(newv);

        oldv = atomic_cmpxchg__nocheck(haddr, cmpv_le, newv_le);
        oldv = le64_to_cpu(oldv);
    }
#else
    {
        uintptr_t ra = GETPC();
        int mem_idx = cpu_mmu_index(env, false);
        TCGMemOpIdx oi = make_memop_idx(MO_TEQ, mem_idx);
        oldv = helper_atomic_cmpxchgq_le_mmu(env, a0, cmpv, newv, oi, ra);
    }
#endif

    if (oldv == cmpv) {
        eflags |= CC_Z;
    } else {
        /* Exchange failed: report the value found in memory in EDX:EAX. */
        env->regs[R_EAX] = (uint32_t)oldv;
        env->regs[R_EDX] = (uint32_t)(oldv >> 32);
        eflags &= ~CC_Z;
    }
    CC_SRC = eflags;
#else
    cpu_loop_exit_atomic(ENV_GET_CPU(env), GETPC());
#endif /* CONFIG_ATOMIC64 */
}
|
|
|
|
#ifdef TARGET_X86_64
|
|
/*
 * Non-atomic CMPXCHG16B: the 128-bit value at a0 is read, compared and
 * written back as two 64-bit guest memory accesses each way.
 *
 * An address that is not 16-byte aligned raises EXCP0D_GPF before any
 * memory access.  The comparand is built from EAX (low) and EDX (high),
 * the replacement from EBX (low) and ECX (high).  A store is issued
 * unconditionally: the replacement on a match, otherwise the value that
 * was just read.  On a mismatch the value read is returned in EDX:EAX.
 * ZF in the computed flags reflects the outcome; the result is left in
 * CC_SRC.
 */
void helper_cmpxchg16b_unlocked(CPUX86State *env, target_ulong a0)
{
    const uintptr_t retaddr = GETPC();
    Int128 expected, desired, current;
    uint64_t lo, hi;
    bool matched;
    int flags;

    if (a0 & 0xf) {
        raise_exception_ra(env, EXCP0D_GPF, retaddr);
    }
    flags = cpu_cc_compute_all(env, CC_OP);

    expected = int128_make128(env->regs[R_EAX], env->regs[R_EDX]);
    desired = int128_make128(env->regs[R_EBX], env->regs[R_ECX]);

    /* Low quadword first, then the high quadword. */
    lo = cpu_ldq_data_ra(env, a0 + 0, retaddr);
    hi = cpu_ldq_data_ra(env, a0 + 8, retaddr);
    current = int128_make128(lo, hi);

    matched = int128_eq(current, expected);
    if (!matched) {
        /* Write back what was read so the store still takes place. */
        desired = current;
    }

    cpu_stq_data_ra(env, a0 + 0, int128_getlo(desired), retaddr);
    cpu_stq_data_ra(env, a0 + 8, int128_gethi(desired), retaddr);

    if (!matched) {
        env->regs[R_EAX] = int128_getlo(current);
        env->regs[R_EDX] = int128_gethi(current);
        flags &= ~CC_Z;
    } else {
        flags |= CC_Z;
    }
    CC_SRC = flags;
}
|
|
|
|
/*
 * CMPXCHG16B implemented with a 128-bit atomic compare-and-swap helper.
 *
 * An address that is not 16-byte aligned raises EXCP0D_GPF.  Otherwise,
 * with CONFIG_ATOMIC128 defined, the comparand (EAX low, EDX high) and the
 * replacement (EBX low, ECX high) are passed to
 * helper_atomic_cmpxchgo_le_mmu(); if the returned value equals the
 * comparand ZF is set, else ZF is cleared and the returned value is
 * written to EDX:EAX.  The resulting flags are left in CC_SRC.
 *
 * Without CONFIG_ATOMIC128 the helper only calls cpu_loop_exit_atomic().
 */
void helper_cmpxchg16b(CPUX86State *env, target_ulong a0)
{
    const uintptr_t retaddr = GETPC();

    if (a0 & 0xf) {
        raise_exception_ra(env, EXCP0D_GPF, retaddr);
    } else {
#ifdef CONFIG_ATOMIC128
        int flags = cpu_cc_compute_all(env, CC_OP);

        Int128 expected = int128_make128(env->regs[R_EAX], env->regs[R_EDX]);
        Int128 desired = int128_make128(env->regs[R_EBX], env->regs[R_ECX]);

        int mmu_idx = cpu_mmu_index(env, false);
        TCGMemOpIdx memop = make_memop_idx(MO_TEQ | MO_ALIGN_16, mmu_idx);
        Int128 found = helper_atomic_cmpxchgo_le_mmu(env, a0, expected,
                                                     desired, memop, retaddr);

        if (!int128_eq(found, expected)) {
            /* Exchange failed: report the value found in EDX:EAX. */
            env->regs[R_EAX] = int128_getlo(found);
            env->regs[R_EDX] = int128_gethi(found);
            flags &= ~CC_Z;
        } else {
            flags |= CC_Z;
        }
        CC_SRC = flags;
#else
        cpu_loop_exit_atomic(ENV_GET_CPU(env), retaddr);
#endif
    }
}
|
|
#endif
|
|
|
|
/*
 * 16-bit BOUND check.
 *
 * Loads two sign-extended 16-bit limits from a0 (lower) and a0 + 2
 * (upper), truncates v to a signed 16-bit value and raises EXCP05_BOUND
 * when it lies outside [lower, upper].  If HF_MPX_EN_MASK is set in
 * hflags, bndcs_regs.sts is cleared before the exception is raised.
 */
void helper_boundw(CPUX86State *env, target_ulong a0, int v)
{
    const uintptr_t retaddr = GETPC();
    const int lower = cpu_ldsw_data_ra(env, a0, retaddr);
    const int upper = cpu_ldsw_data_ra(env, a0 + 2, retaddr);
    const int index = (int16_t)v;

    if (index >= lower && index <= upper) {
        return;
    }

    if (env->hflags & HF_MPX_EN_MASK) {
        env->bndcs_regs.sts = 0;
    }
    raise_exception_ra(env, EXCP05_BOUND, retaddr);
}
|
|
|
|
/*
 * 32-bit BOUND check.
 *
 * Loads two 32-bit limits from a0 (lower) and a0 + 4 (upper), stores them
 * in signed ints and raises EXCP05_BOUND when v lies outside
 * [lower, upper].  If HF_MPX_EN_MASK is set in hflags, bndcs_regs.sts is
 * cleared before the exception is raised.
 */
void helper_boundl(CPUX86State *env, target_ulong a0, int v)
{
    const uintptr_t retaddr = GETPC();
    const int lower = cpu_ldl_data_ra(env, a0, retaddr);
    const int upper = cpu_ldl_data_ra(env, a0 + 4, retaddr);

    if (v >= lower && v <= upper) {
        return;
    }

    if (env->hflags & HF_MPX_EN_MASK) {
        env->bndcs_regs.sts = 0;
    }
    raise_exception_ra(env, EXCP05_BOUND, retaddr);
}
|
|
|
|
#if !defined(CONFIG_USER_ONLY)
|
|
/* try to fill the TLB and return an exception if error. If retaddr is
|
|
* NULL, it means that the function was called in C code (i.e. not
|
|
* from generated code or from helper.c)
|
|
*/
|
|
/* XXX: fix it to restore all registers */
|
|
void tlb_fill(CPUState *cs, target_ulong addr, MMUAccessType access_type,
|
|
int mmu_idx, uintptr_t retaddr)
|
|
{
|
|
int ret;
|
|
|
|
ret = x86_cpu_handle_mmu_fault(cs, addr, access_type, mmu_idx);
|
|
if (ret) {
|
|
X86CPU *cpu = X86_CPU(cs);
|
|
CPUX86State *env = &cpu->env;
|
|
|
|
raise_exception_err_ra(env, cs->exception_index, env->error_code, retaddr);
|
|
}
|
|
}
|
|
#endif
|