target-arm: emulate aarch64's LL/SC using cmpxchg helpers

Emulating LL/SC with cmpxchg is not correct, since it can
suffer from the ABA problem. Portable parallel code, however,
is written assuming only cmpxchg--and not LL/SC--is available.
This means that in practice emulating LL/SC with cmpxchg is
a viable alternative.
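
For illustration (a hypothetical sketch, not part of the patch): an emulated
store-exclusive that only compares values, as below, still succeeds when the
location was modified and then restored between the load and the store,
whereas a real LL/SC pair would fail. Code written against cmpxchg already
has to tolerate exactly this, which is why the emulation is acceptable in
practice.

    /* Hypothetical sketch of the ABA problem under value-only emulation. */
    #include <stdatomic.h>
    #include <stdbool.h>

    static _Atomic unsigned long mem = 0xA;

    /* "Emulated STXR": succeeds whenever the current value still matches
     * the value returned by the emulated LDXR, i.e. a plain cmpxchg. */
    static bool emulated_store_exclusive(unsigned long loaded,
                                         unsigned long newval)
    {
        unsigned long expected = loaded;
        return atomic_compare_exchange_strong(&mem, &expected, newval);
    }

    int main(void)
    {
        unsigned long ll = atomic_load(&mem);   /* emulated LDXR reads 0xA */
        atomic_store(&mem, 0xB);                /* another thread: A -> B  */
        atomic_store(&mem, 0xA);                /*                 B -> A  */
        /* A real STXR would fail here; the cmpxchg emulation succeeds. */
        return emulated_store_exclusive(ll, 0xC) ? 0 : 1;
    }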

The appended patch emulates LL/SC pairs in aarch64 with cmpxchg helpers.
This works in both user and system mode. In user mode, it avoids
pausing all other CPUs to perform the LL/SC pair. The resulting
performance and scalability improvement is significant, as the
plots below show. They chart the throughput of atomic_add-bench
compiled for ARM and executed on a 64-core x86 machine.

Hi-res plots: http://imgur.com/a/JVc8Y
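
As a rough sketch (not taken from the benchmark source), each benchmark
thread essentially performs one atomic add on a shared counter per
operation; on an ARMv8.0 guest the compiler lowers this to an LDXR/STXR
retry loop, which is exactly the pattern the patch now maps onto a single
host cmpxchg:

    #include <stdint.h>

    /* Hypothetical per-operation work of an atomic_add-bench thread. */
    static inline void atomic_add(uint64_t *counter, uint64_t val)
    {
        __atomic_fetch_add(counter, val, __ATOMIC_SEQ_CST);
    }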

                atomic_add-bench: 1000000 ops/thread, [0,1] range

  18 ++---------+----------+---------+----------+----------+----------+---++
     +cmpxchg +-E--+       +         +          +          +          +    |
  16 ++master +-H--+                                                      ++
     ||                                                                    |
  14 ++                                                                   ++
     | |                                                                   |
  12 ++|                                                                  ++
     | |                                                                   |
  10 ++++                                                                 ++
   8 ++E                                                                  ++
     |+++                                                                  |
   6 ++ |                                                                 ++
     |  |                                                                  |
   4 ++ |                                                                 ++
     |   |                                                                 |
   2 +H++E+---                                                            ++
     + |     +E++----+E+---+--+E+----++E+------+E+------+E++----+E+---+--+E|
   0 ++H-H----H-+-----H----+---------+----------+----------+----------+---++
     0          10         20        30         40         50         60
                                Number of threads

                atomic_add-bench: 1000000 ops/thread, [0,2] range

  18 ++---------+----------+---------+----------+----------+----------+---++
     +cmpxchg +-E--+       +         +          +          +          +    |
  16 ++master +-H--+                                                      ++
     | |                                                                   |
  14 ++E                                                                  ++
     | |                                                                   |
  12 ++|                                                                  ++
     |+++                                                                  |
  10 ++ |                                                                 ++
   8 ++ |                                                                 ++
     |  |                                                                  |
   6 ++ |                                                                 ++
     |   |                                                                 |
   4 ++  |                                                                ++
     |  +E+---                                                             |
   2 +H+     +E+-----+++              +++      +++   ---+E+-----+E+------+++
     +++        +    +E+---+--+E+----++E+------+E+---   ++++    +++   +  +E|
   0 ++H-H----H-+-----H----+---------+----------+----------+----------+---++
     0          10         20        30         40         50         60
                                Number of threads

               atomic_add-bench: 1000000 ops/thread, [0,128] range

  70 ++---------+----------+---------+----------+----------+----------+---++
     +cmpxchg +-E--+       +         +          +          +          +    |
  60 ++master +-H--+                  +++            ---+E+-----+E+------+E+
     |                        +E+------E-------+E+---                      |
     |                     ---        +++                                  |
  50 ++              +++---                                               ++
     |              -+E+                                                   |
  40 ++      +++----                                                      ++
     |        E-                                                           |
     |      --|                                                            |
  30 ++   -- +++                                                          ++
     |  +E+                                                                |
  20 ++E+                                                                 ++
     |E+                                                                   |
     |                                                                     |
  10 ++                                                                   ++
     +          +          +         +          +          +          +    |
   0 +HH-H----H-+-----H----+---------+----------+----------+----------+---++
     0          10         20        30         40         50         60
                                Number of threads

              atomic_add-bench: 1000000 ops/thread, [0,1024] range

  160 ++---------+---------+----------+---------+----------+----------+---++
      +cmpxchg +-E--+      +          +         +          +          +    |
  140 ++master +-H--+                                           +++      +++
      |                                                -+E+-----+E+-------E|
  120 ++                                       +++ ----                  +++
      |                                +++  ----E--                        |
  100 ++                              --E---   +++                        ++
      |                       +++ ---- +++                                 |
   80 ++                     --E--                                        ++
      |                  ---- +++                                          |
      |              -+E+                                                  |
   60 ++         ---- +++                                                 ++
      |      +E+-                                                          |
   40 ++   --                                                             ++
      |  +E+                                                               |
   20 +EE+                                                                ++
      +++        +         +          +         +          +          +    |
    0 +HH-H---H--+-----H---+----------+---------+----------+----------+---++
      0          10        20         30        40         50         60
                                Number of threads

[rth: Rearrange 128-bit cmpxchg helper.  Enforce alignment on LL.]

Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1467054136-10430-28-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
commit 1dd089d0ee (parent cf12bce088)
Author:    Emilio G. Cota, 2016-06-27 15:02:13 -04:00
Committed: Richard Henderson
3 files changed, 163 insertions(+), 58 deletions(-)

--- a/target-arm/helper-a64.c
+++ b/target-arm/helper-a64.c

@@ -27,6 +27,10 @@
 #include "qemu/bitops.h"
 #include "internals.h"
 #include "qemu/crc32c.h"
+#include "exec/exec-all.h"
+#include "exec/cpu_ldst.h"
+#include "qemu/int128.h"
+#include "tcg.h"
 #include <zlib.h> /* For crc32 */

 /* C2.4.7 Multiply and divide */
@@ -444,3 +448,112 @@ uint64_t HELPER(crc32c_64)(uint64_t acc, uint64_t val, uint32_t bytes)
     /* Linux crc32c converts the output to one's complement.  */
     return crc32c(acc, buf, bytes) ^ 0xffffffff;
 }
+
+/* Returns 0 on success; 1 otherwise.  */
+uint64_t HELPER(paired_cmpxchg64_le)(CPUARMState *env, uint64_t addr,
+                                     uint64_t new_lo, uint64_t new_hi)
+{
+    uintptr_t ra = GETPC();
+    Int128 oldv, cmpv, newv;
+    bool success;
+
+    cmpv = int128_make128(env->exclusive_val, env->exclusive_high);
+    newv = int128_make128(new_lo, new_hi);
+
+    if (parallel_cpus) {
+#ifndef CONFIG_ATOMIC128
+        cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
+#else
+        int mem_idx = cpu_mmu_index(env, false);
+        TCGMemOpIdx oi = make_memop_idx(MO_LEQ | MO_ALIGN_16, mem_idx);
+        oldv = helper_atomic_cmpxchgo_le_mmu(env, addr, cmpv, newv, oi, ra);
+        success = int128_eq(oldv, cmpv);
+#endif
+    } else {
+        uint64_t o0, o1;
+
+#ifdef CONFIG_USER_ONLY
+        /* ??? Enforce alignment.  */
+        uint64_t *haddr = g2h(addr);
+        o0 = ldq_le_p(haddr + 0);
+        o1 = ldq_le_p(haddr + 1);
+        oldv = int128_make128(o0, o1);
+
+        success = int128_eq(oldv, cmpv);
+        if (success) {
+            stq_le_p(haddr + 0, int128_getlo(newv));
+            stq_le_p(haddr + 1, int128_gethi(newv));
+        }
+#else
+        int mem_idx = cpu_mmu_index(env, false);
+        TCGMemOpIdx oi0 = make_memop_idx(MO_LEQ | MO_ALIGN_16, mem_idx);
+        TCGMemOpIdx oi1 = make_memop_idx(MO_LEQ, mem_idx);
+
+        o0 = helper_le_ldq_mmu(env, addr + 0, oi0, ra);
+        o1 = helper_le_ldq_mmu(env, addr + 8, oi1, ra);
+        oldv = int128_make128(o0, o1);
+
+        success = int128_eq(oldv, cmpv);
+        if (success) {
+            helper_le_stq_mmu(env, addr + 0, int128_getlo(newv), oi1, ra);
+            helper_le_stq_mmu(env, addr + 8, int128_gethi(newv), oi1, ra);
+        }
+#endif
+    }
+
+    return !success;
+}
+
+uint64_t HELPER(paired_cmpxchg64_be)(CPUARMState *env, uint64_t addr,
+                                     uint64_t new_lo, uint64_t new_hi)
+{
+    uintptr_t ra = GETPC();
+    Int128 oldv, cmpv, newv;
+    bool success;
+
+    cmpv = int128_make128(env->exclusive_val, env->exclusive_high);
+    newv = int128_make128(new_lo, new_hi);
+
+    if (parallel_cpus) {
+#ifndef CONFIG_ATOMIC128
+        cpu_loop_exit_atomic(ENV_GET_CPU(env), ra);
+#else
+        int mem_idx = cpu_mmu_index(env, false);
+        TCGMemOpIdx oi = make_memop_idx(MO_BEQ | MO_ALIGN_16, mem_idx);
+        oldv = helper_atomic_cmpxchgo_be_mmu(env, addr, cmpv, newv, oi, ra);
+        success = int128_eq(oldv, cmpv);
+#endif
+    } else {
+        uint64_t o0, o1;
+
+#ifdef CONFIG_USER_ONLY
+        /* ??? Enforce alignment.  */
+        uint64_t *haddr = g2h(addr);
+        o1 = ldq_be_p(haddr + 0);
+        o0 = ldq_be_p(haddr + 1);
+        oldv = int128_make128(o0, o1);
+
+        success = int128_eq(oldv, cmpv);
+        if (success) {
+            stq_be_p(haddr + 0, int128_gethi(newv));
+            stq_be_p(haddr + 1, int128_getlo(newv));
+        }
+#else
+        int mem_idx = cpu_mmu_index(env, false);
+        TCGMemOpIdx oi0 = make_memop_idx(MO_BEQ | MO_ALIGN_16, mem_idx);
+        TCGMemOpIdx oi1 = make_memop_idx(MO_BEQ, mem_idx);
+
+        o1 = helper_be_ldq_mmu(env, addr + 0, oi0, ra);
+        o0 = helper_be_ldq_mmu(env, addr + 8, oi1, ra);
+        oldv = int128_make128(o0, o1);
+
+        success = int128_eq(oldv, cmpv);
+        if (success) {
+            helper_be_stq_mmu(env, addr + 0, int128_gethi(newv), oi1, ra);
+            helper_be_stq_mmu(env, addr + 8, int128_getlo(newv), oi1, ra);
+        }
+#endif
+    }
+
+    return !success;
+}
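
Aside (not part of the patch): when CONFIG_ATOMIC128 is available, the
parallel_cpus path above ends up performing a 16-byte compare-and-swap on
the host. Conceptually it reduces to something like the sketch below,
assuming a GCC/Clang toolchain whose target offers a 16-byte CAS (e.g.
x86-64 with -mcx16, possibly via libatomic); the real helpers additionally
perform guest address translation and raise alignment faults:

    #include <stdbool.h>

    /* Sketch only: shows the host primitive a guest STXP is mapped onto.
     * QEMU's actual implementation lives in helper_atomic_cmpxchgo_le_mmu()
     * and its big-endian counterpart. */
    static bool cas_128(unsigned __int128 *addr,
                        unsigned __int128 expected,
                        unsigned __int128 desired)
    {
        return __atomic_compare_exchange_n(addr, &expected, desired, false,
                                           __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
    }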

--- a/target-arm/helper-a64.h
+++ b/target-arm/helper-a64.h

@@ -46,3 +46,5 @@ DEF_HELPER_FLAGS_2(frecpx_f32, TCG_CALL_NO_RWG, f32, f32, ptr)
 DEF_HELPER_FLAGS_2(fcvtx_f64_to_f32, TCG_CALL_NO_RWG, f32, f64, env)
 DEF_HELPER_FLAGS_3(crc32_64, TCG_CALL_NO_RWG_SE, i64, i64, i64, i32)
 DEF_HELPER_FLAGS_3(crc32c_64, TCG_CALL_NO_RWG_SE, i64, i64, i64, i32)
+DEF_HELPER_FLAGS_4(paired_cmpxchg64_le, TCG_CALL_NO_WG, i64, env, i64, i64, i64)
+DEF_HELPER_FLAGS_4(paired_cmpxchg64_be, TCG_CALL_NO_WG, i64, env, i64, i64, i64)
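
For readers unfamiliar with QEMU's helper machinery: roughly speaking (a
sketch, not the literal macro expansion), each DEF_HELPER_FLAGS_4 line above
yields a C prototype for the helper plus a gen_helper_* emitter that the
translator calls, along these lines:

    /* Approximate interface derived from
     * DEF_HELPER_FLAGS_4(paired_cmpxchg64_le, TCG_CALL_NO_WG, ...);
     * the real macros also register the helper with TCG and record the
     * TCG_CALL_NO_WG flag. */
    uint64_t helper_paired_cmpxchg64_le(CPUARMState *env, uint64_t addr,
                                        uint64_t new_lo, uint64_t new_hi);

    /* Emitter used by translate-a64.c to generate a call to the helper. */
    void gen_helper_paired_cmpxchg64_le(TCGv_i64 ret, TCGv_ptr env,
                                        TCGv_i64 addr, TCGv_i64 new_lo,
                                        TCGv_i64 new_hi);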

--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c

@@ -1839,37 +1839,41 @@ static void disas_b_exc_sys(DisasContext *s, uint32_t insn)
     }
 }

-/*
- * Load/Store exclusive instructions are implemented by remembering
- * the value/address loaded, and seeing if these are the same
- * when the store is performed. This is not actually the architecturally
- * mandated semantics, but it works for typical guest code sequences
- * and avoids having to monitor regular stores.
- *
- * In system emulation mode only one CPU will be running at once, so
- * this sequence is effectively atomic. In user emulation mode we
- * throw an exception and handle the atomic operation elsewhere.
- */
 static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
                                TCGv_i64 addr, int size, bool is_pair)
 {
     TCGv_i64 tmp = tcg_temp_new_i64();
-    TCGMemOp memop = s->be_data + size;
+    TCGMemOp be = s->be_data;

     g_assert(size <= 3);
-    tcg_gen_qemu_ld_i64(tmp, addr, get_mem_index(s), memop);
-
     if (is_pair) {
-        TCGv_i64 addr2 = tcg_temp_new_i64();
         TCGv_i64 hitmp = tcg_temp_new_i64();

-        g_assert(size >= 2);
-        tcg_gen_addi_i64(addr2, addr, 1 << size);
-        tcg_gen_qemu_ld_i64(hitmp, addr2, get_mem_index(s), memop);
-        tcg_temp_free_i64(addr2);
+        if (size == 3) {
+            TCGv_i64 addr2 = tcg_temp_new_i64();
+
+            tcg_gen_qemu_ld_i64(tmp, addr, get_mem_index(s),
+                                MO_64 | MO_ALIGN_16 | be);
+            tcg_gen_addi_i64(addr2, addr, 8);
+            tcg_gen_qemu_ld_i64(hitmp, addr2, get_mem_index(s),
+                                MO_64 | MO_ALIGN | be);
+            tcg_temp_free_i64(addr2);
+        } else {
+            g_assert(size == 2);
+            tcg_gen_qemu_ld_i64(tmp, addr, get_mem_index(s),
+                                MO_64 | MO_ALIGN | be);
+            if (be == MO_LE) {
+                tcg_gen_extr32_i64(tmp, hitmp, tmp);
+            } else {
+                tcg_gen_extr32_i64(hitmp, tmp, tmp);
+            }
+        }
+
         tcg_gen_mov_i64(cpu_exclusive_high, hitmp);
         tcg_gen_mov_i64(cpu_reg(s, rt2), hitmp);
         tcg_temp_free_i64(hitmp);
+    } else {
+        tcg_gen_qemu_ld_i64(tmp, addr, get_mem_index(s), size | MO_ALIGN | be);
     }

     tcg_gen_mov_i64(cpu_exclusive_val, tmp);
@@ -1879,16 +1883,6 @@ static void gen_load_exclusive(DisasContext *s, int rt, int rt2,
     tcg_gen_mov_i64(cpu_exclusive_addr, addr);
 }

-#ifdef CONFIG_USER_ONLY
-static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
-                                TCGv_i64 addr, int size, int is_pair)
-{
-    tcg_gen_mov_i64(cpu_exclusive_test, addr);
-    tcg_gen_movi_i32(cpu_exclusive_info,
-                     size | is_pair << 2 | (rd << 4) | (rt << 9) | (rt2 << 14));
-    gen_exception_internal_insn(s, 4, EXCP_STREX);
-}
-#else
 static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
                                 TCGv_i64 inaddr, int size, int is_pair)
 {
@@ -1916,46 +1910,42 @@ static void gen_store_exclusive(DisasContext *s, int rd, int rt, int rt2,
     tcg_gen_brcond_i64(TCG_COND_NE, addr, cpu_exclusive_addr, fail_label);

     tmp = tcg_temp_new_i64();
-    tcg_gen_qemu_ld_i64(tmp, addr, get_mem_index(s), s->be_data + size);
-    tcg_gen_brcond_i64(TCG_COND_NE, tmp, cpu_exclusive_val, fail_label);
-    tcg_temp_free_i64(tmp);
-
     if (is_pair) {
-        TCGv_i64 addrhi = tcg_temp_new_i64();
-        TCGv_i64 tmphi = tcg_temp_new_i64();
+        if (size == 2) {
+            TCGv_i64 val = tcg_temp_new_i64();

-        tcg_gen_addi_i64(addrhi, addr, 1 << size);
-        tcg_gen_qemu_ld_i64(tmphi, addrhi, get_mem_index(s),
-                            s->be_data + size);
-        tcg_gen_brcond_i64(TCG_COND_NE, tmphi, cpu_exclusive_high, fail_label);
-
-        tcg_temp_free_i64(tmphi);
-        tcg_temp_free_i64(addrhi);
-    }
-
-    /* We seem to still have the exclusive monitor, so do the store */
-    tcg_gen_qemu_st_i64(cpu_reg(s, rt), addr, get_mem_index(s),
-                        s->be_data + size);
-
-    if (is_pair) {
-        TCGv_i64 addrhi = tcg_temp_new_i64();
-
-        tcg_gen_addi_i64(addrhi, addr, 1 << size);
-        tcg_gen_qemu_st_i64(cpu_reg(s, rt2), addrhi, get_mem_index(s),
-                            s->be_data + size);
-        tcg_temp_free_i64(addrhi);
+            tcg_gen_concat32_i64(tmp, cpu_reg(s, rt), cpu_reg(s, rt2));
+            tcg_gen_concat32_i64(val, cpu_exclusive_val, cpu_exclusive_high);
+            tcg_gen_atomic_cmpxchg_i64(tmp, addr, val, tmp,
+                                       get_mem_index(s),
+                                       size | MO_ALIGN | s->be_data);
+            tcg_gen_setcond_i64(TCG_COND_NE, tmp, tmp, val);
+            tcg_temp_free_i64(val);
+        } else if (s->be_data == MO_LE) {
+            gen_helper_paired_cmpxchg64_le(tmp, cpu_env, addr, cpu_reg(s, rt),
+                                           cpu_reg(s, rt2));
+        } else {
+            gen_helper_paired_cmpxchg64_be(tmp, cpu_env, addr, cpu_reg(s, rt),
+                                           cpu_reg(s, rt2));
+        }
+    } else {
+        TCGv_i64 val = cpu_reg(s, rt);
+        tcg_gen_atomic_cmpxchg_i64(tmp, addr, cpu_exclusive_val, val,
+                                   get_mem_index(s),
+                                   size | MO_ALIGN | s->be_data);
+        tcg_gen_setcond_i64(TCG_COND_NE, tmp, tmp, cpu_exclusive_val);
     }

     tcg_temp_free_i64(addr);
-    tcg_gen_movi_i64(cpu_reg(s, rd), 0);
+    tcg_gen_mov_i64(cpu_reg(s, rd), tmp);
+    tcg_temp_free_i64(tmp);
     tcg_gen_br(done_label);

     gen_set_label(fail_label);
     tcg_gen_movi_i64(cpu_reg(s, rd), 1);
     gen_set_label(done_label);
     tcg_gen_movi_i64(cpu_exclusive_addr, -1);
 }
-#endif

 /* Update the Sixty-Four bit (SF) registersize. This logic is derived
  * from the ARMv8 specs for LDR (Shared decode for all encodings).