qemu/accel/tcg/translate-all.c

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

778 lines
25 KiB
C
Raw Normal View History

/*
* Host code generation
*
* Copyright (c) 2003 Fabrice Bellard
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "qemu/osdep.h"
#include "trace.h"
#include "disas/disas.h"
#include "exec/exec-all.h"
#include "tcg/tcg.h"
#if defined(CONFIG_USER_ONLY)
#include "qemu.h"
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
#include <sys/param.h>
#if __FreeBSD_version >= 700104
#define HAVE_KINFO_GETVMMAP
#define sigqueue sigqueue_freebsd /* avoid redefinition */
#include <sys/proc.h>
#include <machine/profile.h>
#define _KERNEL
#include <sys/user.h>
#undef _KERNEL
#undef sigqueue
#include <libutil.h>
#endif
#endif
#else
#include "exec/ram_addr.h"
#endif
#include "exec/cputlb.h"
#include "exec/translate-all.h"
#include "exec/translator.h"
#include "exec/tb-flush.h"
#include "qemu/bitmap.h"
#include "qemu/qemu-print.h"
tcg: drop global lock during TCG code execution This finally allows TCG to benefit from the iothread introduction: Drop the global mutex while running pure TCG CPU code. Reacquire the lock when entering MMIO or PIO emulation, or when leaving the TCG loop. We have to revert a few optimization for the current TCG threading model, namely kicking the TCG thread in qemu_mutex_lock_iothread and not kicking it in qemu_cpu_kick. We also need to disable RAM block reordering until we have a more efficient locking mechanism at hand. Still, a Linux x86 UP guest and my Musicpal ARM model boot fine here. These numbers demonstrate where we gain something: 20338 jan 20 0 331m 75m 6904 R 99 0.9 0:50.95 qemu-system-arm 20337 jan 20 0 331m 75m 6904 S 20 0.9 0:26.50 qemu-system-arm The guest CPU was fully loaded, but the iothread could still run mostly independent on a second core. Without the patch we don't get beyond 32206 jan 20 0 330m 73m 7036 R 82 0.9 1:06.00 qemu-system-arm 32204 jan 20 0 330m 73m 7036 S 21 0.9 0:17.03 qemu-system-arm We don't benefit significantly, though, when the guest is not fully loading a host CPU. Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> Message-Id: <1439220437-23957-10-git-send-email-fred.konrad@greensocs.com> [FK: Rebase, fix qemu_devices_reset deadlock, rm address_space_* mutex] Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com> [EGC: fixed iothread lock for cpu-exec IRQ handling] Signed-off-by: Emilio G. Cota <cota@braap.org> [AJB: -smp single-threaded fix, clean commit msg, BQL fixes] Signed-off-by: Alex Bennée <alex.bennee@linaro.org> Reviewed-by: Richard Henderson <rth@twiddle.net> Reviewed-by: Pranith Kumar <bobby.prani@gmail.com> [PM: target-arm changes] Acked-by: Peter Maydell <peter.maydell@linaro.org>
2017-02-23 21:29:11 +03:00
#include "qemu/main-loop.h"
#include "qemu/cacheinfo.h"
#include "qemu/timer.h"
#include "exec/log.h"
#include "sysemu/cpus.h"
#include "sysemu/cpu-timers.h"
#include "sysemu/tcg.h"
#include "qapi/error.h"
#include "hw/core/tcg-cpu-ops.h"
#include "tb-jmp-cache.h"
#include "tb-hash.h"
#include "tb-context.h"
#include "internal.h"
#include "perf.h"
#include "tcg/insn-start-words.h"
TBContext tb_ctx;
/*
* Encode VAL as a signed leb128 sequence at P.
* Return P incremented past the encoded value.
*/
static uint8_t *encode_sleb128(uint8_t *p, int64_t val)
{
int more, byte;
do {
byte = val & 0x7f;
val >>= 7;
more = !((val == 0 && (byte & 0x40) == 0)
|| (val == -1 && (byte & 0x40) != 0));
if (more) {
byte |= 0x80;
}
*p++ = byte;
} while (more);
return p;
}
/*
* Decode a signed leb128 sequence at *PP; increment *PP past the
* decoded value. Return the decoded value.
*/
static int64_t decode_sleb128(const uint8_t **pp)
{
const uint8_t *p = *pp;
int64_t val = 0;
int byte, shift = 0;
do {
byte = *p++;
val |= (int64_t)(byte & 0x7f) << shift;
shift += 7;
} while (byte & 0x80);
if (shift < TARGET_LONG_BITS && (byte & 0x40)) {
val |= -(int64_t)1 << shift;
}
*pp = p;
return val;
}
/* Encode the data collected about the instructions while compiling TB.
Place the data at BLOCK, and return the number of bytes consumed.
The logical table consists of TARGET_INSN_START_WORDS target_ulong's,
which come from the target's insn_start data, followed by a uintptr_t
which comes from the host pc of the end of the code implementing the insn.
Each line of the table is encoded as sleb128 deltas from the previous
line. The seed for the first line is { tb->pc, 0..., tb->tc.ptr }.
That is, the first column is seeded with the guest pc, the last column
with the host pc, and the middle columns with zeros. */
static int encode_search(TranslationBlock *tb, uint8_t *block)
{
uint8_t *highwater = tcg_ctx->code_gen_highwater;
uint64_t *insn_data = tcg_ctx->gen_insn_data;
uint16_t *insn_end_off = tcg_ctx->gen_insn_end_off;
uint8_t *p = block;
int i, j, n;
for (i = 0, n = tb->icount; i < n; ++i) {
uint64_t prev, curr;
for (j = 0; j < TARGET_INSN_START_WORDS; ++j) {
if (i == 0) {
prev = (!(tb_cflags(tb) & CF_PCREL) && j == 0 ? tb->pc : 0);
} else {
prev = insn_data[(i - 1) * TARGET_INSN_START_WORDS + j];
}
curr = insn_data[i * TARGET_INSN_START_WORDS + j];
p = encode_sleb128(p, curr - prev);
}
prev = (i == 0 ? 0 : insn_end_off[i - 1]);
curr = insn_end_off[i];
p = encode_sleb128(p, curr - prev);
/* Test for (pending) buffer overflow. The assumption is that any
one row beginning below the high water mark cannot overrun
the buffer completely. Thus we can test for overflow after
encoding a row without having to check during encoding. */
if (unlikely(p > highwater)) {
return -1;
}
}
return p - block;
}
static int cpu_unwind_data_from_tb(TranslationBlock *tb, uintptr_t host_pc,
uint64_t *data)
{
uintptr_t iter_pc = (uintptr_t)tb->tc.ptr;
const uint8_t *p = tb->tc.ptr + tb->tc.size;
int i, j, num_insns = tb->icount;
host_pc -= GETPC_ADJ;
if (host_pc < iter_pc) {
return -1;
}
memset(data, 0, sizeof(uint64_t) * TARGET_INSN_START_WORDS);
if (!(tb_cflags(tb) & CF_PCREL)) {
data[0] = tb->pc;
}
/*
* Reconstruct the stored insn data while looking for the point
* at which the end of the insn exceeds host_pc.
*/
for (i = 0; i < num_insns; ++i) {
for (j = 0; j < TARGET_INSN_START_WORDS; ++j) {
data[j] += decode_sleb128(&p);
}
iter_pc += decode_sleb128(&p);
if (iter_pc > host_pc) {
return num_insns - i;
}
}
return -1;
}
/*
* The cpu state corresponding to 'host_pc' is restored in
* preparation for exiting the TB.
*/
void cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
uintptr_t host_pc)
{
uint64_t data[TARGET_INSN_START_WORDS];
int insns_left = cpu_unwind_data_from_tb(tb, host_pc, data);
if (insns_left < 0) {
return;
}
if (tb_cflags(tb) & CF_USE_ICOUNT) {
assert(icount_enabled());
/*
* Reset the cycle counter to the start of the block and
* shift if to the number of actually executed instructions.
*/
cpu_neg(cpu)->icount_decr.u16.low += insns_left;
}
cpu->cc->tcg_ops->restore_state_to_opc(cpu, tb, data);
}
bool cpu_restore_state(CPUState *cpu, uintptr_t host_pc)
{
/*
* The host_pc has to be in the rx region of the code buffer.
* If it is not we will not be able to resolve it here.
* The two cases where host_pc will not be correct are:
*
* - fault during translation (instruction fetch)
* - fault from helper (not using GETPC() macro)
*
tcg: remove tb_lock Use mmap_lock in user-mode to protect TCG state and the page descriptors. In !user-mode, each vCPU has its own TCG state, so no locks needed. Per-page locks are used to protect the page descriptors. Per-TB locks are used in both modes to protect TB jumps. Some notes: - tb_lock is removed from notdirty_mem_write by passing a locked page_collection to tb_invalidate_phys_page_fast. - tcg_tb_lookup/remove/insert/etc have their own internal lock(s), so there is no need to further serialize access to them. - do_tb_flush is run in a safe async context, meaning no other vCPU threads are running. Therefore acquiring mmap_lock there is just to please tools such as thread sanitizer. - Not visible in the diff, but tb_invalidate_phys_page already has an assert_memory_lock. - cpu_io_recompile is !user-only, so no mmap_lock there. - Added mmap_unlock()'s before all siglongjmp's that could be called in user-mode while mmap_lock is held. + Added an assert for !have_mmap_lock() after returning from the longjmp in cpu_exec, just like we do in cpu_exec_step_atomic. Performance numbers before/after: Host: AMD Opteron(tm) Processor 6376 ubuntu 17.04 ppc64 bootup+shutdown time 700 +-+--+----+------+------------+-----------+------------*--+-+ | + + + + + *B | | before ***B*** ** * | |tb lock removal ###D### *** | 600 +-+ *** +-+ | ** # | | *B* #D | | *** * ## | 500 +-+ *** ### +-+ | * *** ### | | *B* # ## | | ** * #D# | 400 +-+ ** ## +-+ | ** ### | | ** ## | | ** # ## | 300 +-+ * B* #D# +-+ | B *** ### | | * ** #### | | * *** ### | 200 +-+ B *B #D# +-+ | #B* * ## # | | #* ## | | + D##D# + + + + | 100 +-+--+----+------+------------+-----------+------------+--+-+ 1 8 16 Guest CPUs 48 64 png: https://imgur.com/HwmBHXe debian jessie aarch64 bootup+shutdown time 90 +-+--+-----+-----+------------+------------+------------+--+-+ | + + + + + + | | before ***B*** B | 80 +tb lock removal ###D### **D +-+ | **### | | **## | 70 +-+ ** # +-+ | ** ## | | ** # | 60 +-+ *B ## +-+ | ** ## | | *** #D | 50 +-+ *** ## +-+ | * ** ### | | **B* ### | 40 +-+ **** # ## +-+ | **** #D# | | ***B** ### | 30 +-+ B***B** #### +-+ | B * * # ### | | B ###D# | 20 +-+ D ##D## +-+ | D# | | + + + + + + | 10 +-+--+-----+-----+------------+------------+------------+--+-+ 1 8 16 Guest CPUs 48 64 png: https://imgur.com/iGpGFtv The gains are high for 4-8 CPUs. Beyond that point, however, unrelated lock contention significantly hurts scalability. Reviewed-by: Richard Henderson <richard.henderson@linaro.org> Reviewed-by: Alex Bennée <alex.bennee@linaro.org> Signed-off-by: Emilio G. Cota <cota@braap.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-08-05 06:46:31 +03:00
* Either way we need return early as we can't resolve it here.
*/
if (in_code_gen_buffer((const void *)(host_pc - tcg_splitwx_diff))) {
TranslationBlock *tb = tcg_tb_lookup(host_pc);
if (tb) {
cpu_restore_state_from_tb(cpu, tb, host_pc);
return true;
}
}
return false;
}
bool cpu_unwind_state_data(CPUState *cpu, uintptr_t host_pc, uint64_t *data)
{
if (in_code_gen_buffer((const void *)(host_pc - tcg_splitwx_diff))) {
TranslationBlock *tb = tcg_tb_lookup(host_pc);
if (tb) {
return cpu_unwind_data_from_tb(tb, host_pc, data) >= 0;
}
}
return false;
}
void page_init(void)
{
page_size_init();
page_table_config_init();
}
/*
* Isolate the portion of code gen which can setjmp/longjmp.
* Return the size of the generated code, or negative on error.
*/
static int setjmp_gen_code(CPUArchState *env, TranslationBlock *tb,
vaddr pc, void *host_pc,
int *max_insns, int64_t *ti)
{
int ret = sigsetjmp(tcg_ctx->jmp_trans, 0);
if (unlikely(ret != 0)) {
return ret;
}
tcg_func_start(tcg_ctx);
tcg_ctx->cpu = env_cpu(env);
gen_intermediate_code(env_cpu(env), tb, max_insns, pc, host_pc);
assert(tb->size != 0);
tcg_ctx->cpu = NULL;
*max_insns = tb->icount;
return tcg_gen_code(tcg_ctx, tb, pc);
}
/* Called with mmap_lock held for user mode emulation. */
TranslationBlock *tb_gen_code(CPUState *cpu,
vaddr pc, uint64_t cs_base,
uint32_t flags, int cflags)
{
CPUArchState *env = cpu->env_ptr;
TranslationBlock *tb, *existing_tb;
tb_page_addr_t phys_pc;
tcg_insn_unit *gen_code_buf;
int gen_code_size, search_size, max_insns;
int64_t ti;
void *host_pc;
assert_memory_lock();
qemu_thread_jit_write();
phys_pc = get_page_addr_code_hostp(env, pc, &host_pc);
if (phys_pc == -1) {
/* Generate a one-shot TB with 1 insn in it */
cflags = (cflags & ~CF_COUNT_MASK) | CF_LAST_IO | 1;
}
max_insns = cflags & CF_COUNT_MASK;
if (max_insns == 0) {
max_insns = TCG_MAX_INSNS;
}
QEMU_BUILD_BUG_ON(CF_COUNT_MASK + 1 != TCG_MAX_INSNS);
tcg: introduce regions to split code_gen_buffer This is groundwork for supporting multiple TCG contexts. The naive solution here is to split code_gen_buffer statically among the TCG threads; this however results in poor utilization if translation needs are different across TCG threads. What we do here is to add an extra layer of indirection, assigning regions that act just like pages do in virtual memory allocation. (BTW if you are wondering about the chosen naming, I did not want to use blocks or pages because those are already heavily used in QEMU). We use a global lock to serialize allocations as well as statistics reporting (we now export the size of the used code_gen_buffer with tcg_code_size()). Note that for the allocator we could just use a counter and atomic_inc; however, that would complicate the gathering of tcg_code_size()-like stats. So given that the region operations are not a fast path, a lock seems the most reasonable choice. The effectiveness of this approach is clear after seeing some numbers. I used the bootup+shutdown of debian-arm with '-tb-size 80' as a benchmark. Note that I'm evaluating this after enabling per-thread TCG (which is done by a subsequent commit). * -smp 1, 1 region (entire buffer): qemu: flush code_size=83885014 nb_tbs=154739 avg_tb_size=357 qemu: flush code_size=83884902 nb_tbs=153136 avg_tb_size=363 qemu: flush code_size=83885014 nb_tbs=152777 avg_tb_size=364 qemu: flush code_size=83884950 nb_tbs=150057 avg_tb_size=373 qemu: flush code_size=83884998 nb_tbs=150234 avg_tb_size=373 qemu: flush code_size=83885014 nb_tbs=154009 avg_tb_size=360 qemu: flush code_size=83885014 nb_tbs=151007 avg_tb_size=370 qemu: flush code_size=83885014 nb_tbs=151816 avg_tb_size=367 That is, 8 flushes. * -smp 8, 32 regions (80/32 MB per region) [i.e. this patch]: qemu: flush code_size=76328008 nb_tbs=141040 avg_tb_size=356 qemu: flush code_size=75366534 nb_tbs=138000 avg_tb_size=361 qemu: flush code_size=76864546 nb_tbs=140653 avg_tb_size=361 qemu: flush code_size=76309084 nb_tbs=135945 avg_tb_size=375 qemu: flush code_size=74581856 nb_tbs=132909 avg_tb_size=375 qemu: flush code_size=73927256 nb_tbs=135616 avg_tb_size=360 qemu: flush code_size=78629426 nb_tbs=142896 avg_tb_size=365 qemu: flush code_size=76667052 nb_tbs=138508 avg_tb_size=368 Again, 8 flushes. Note how buffer utilization is not 100%, but it is close. Smaller region sizes would yield higher utilization, but we want region allocation to be rare (it acquires a lock), so we do not want to go too small. * -smp 8, static partitioning of 8 regions (10 MB per region): qemu: flush code_size=21936504 nb_tbs=40570 avg_tb_size=354 qemu: flush code_size=11472174 nb_tbs=20633 avg_tb_size=370 qemu: flush code_size=11603976 nb_tbs=21059 avg_tb_size=365 qemu: flush code_size=23254872 nb_tbs=41243 avg_tb_size=377 qemu: flush code_size=28289496 nb_tbs=52057 avg_tb_size=358 qemu: flush code_size=43605160 nb_tbs=78896 avg_tb_size=367 qemu: flush code_size=45166552 nb_tbs=82158 avg_tb_size=364 qemu: flush code_size=63289640 nb_tbs=116494 avg_tb_size=358 qemu: flush code_size=51389960 nb_tbs=93937 avg_tb_size=362 qemu: flush code_size=59665928 nb_tbs=107063 avg_tb_size=372 qemu: flush code_size=38380824 nb_tbs=68597 avg_tb_size=374 qemu: flush code_size=44884568 nb_tbs=79901 avg_tb_size=376 qemu: flush code_size=50782632 nb_tbs=90681 avg_tb_size=374 qemu: flush code_size=39848888 nb_tbs=71433 avg_tb_size=372 qemu: flush code_size=64708840 nb_tbs=119052 avg_tb_size=359 qemu: flush code_size=49830008 nb_tbs=90992 avg_tb_size=362 qemu: flush code_size=68372408 nb_tbs=123442 avg_tb_size=368 qemu: flush code_size=33555560 nb_tbs=59514 avg_tb_size=378 qemu: flush code_size=44748344 nb_tbs=80974 avg_tb_size=367 qemu: flush code_size=37104248 nb_tbs=67609 avg_tb_size=364 That is, 20 flushes. Note how a static partitioning approach uses the code buffer poorly, leading to many unnecessary flushes. Reviewed-by: Richard Henderson <richard.henderson@linaro.org> Signed-off-by: Emilio G. Cota <cota@braap.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-07-08 02:24:20 +03:00
buffer_overflow:
tb = tcg_tb_alloc(tcg_ctx);
if (unlikely(!tb)) {
/* flush must be done */
tb_flush(cpu);
mmap_unlock();
/* Make the execution loop process the flush as soon as possible. */
cpu->exception_index = EXCP_INTERRUPT;
cpu_loop_exit(cpu);
}
gen_code_buf = tcg_ctx->code_gen_ptr;
tb->tc.ptr = tcg_splitwx_to_rx(gen_code_buf);
if (!(cflags & CF_PCREL)) {
tb->pc = pc;
}
tb->cs_base = cs_base;
tb->flags = flags;
tb->cflags = cflags;
tb_set_page_addr0(tb, phys_pc);
tb_set_page_addr1(tb, -1);
tcg_ctx->gen_tb = tb;
tcg_ctx->addr_type = TARGET_LONG_BITS == 32 ? TCG_TYPE_I32 : TCG_TYPE_I64;
#ifdef CONFIG_SOFTMMU
tcg_ctx->page_bits = TARGET_PAGE_BITS;
tcg_ctx->page_mask = TARGET_PAGE_MASK;
tcg_ctx->tlb_dyn_max_bits = CPU_TLB_DYN_MAX_BITS;
tcg_ctx->tlb_fast_offset =
(int)offsetof(ArchCPU, neg.tlb.f) - (int)offsetof(ArchCPU, env);
#endif
tcg_ctx->insn_start_words = TARGET_INSN_START_WORDS;
#ifdef TCG_GUEST_DEFAULT_MO
tcg_ctx->guest_mo = TCG_GUEST_DEFAULT_MO;
#else
tcg_ctx->guest_mo = TCG_MO_ALL;
#endif
tb_overflow:
trace_translate_block(tb, pc, tb->tc.ptr);
gen_code_size = setjmp_gen_code(env, tb, pc, host_pc, &max_insns, &ti);
if (unlikely(gen_code_size < 0)) {
switch (gen_code_size) {
case -1:
/*
* Overflow of code_gen_buffer, or the current slice of it.
*
* TODO: We don't need to re-do gen_intermediate_code, nor
* should we re-do the tcg optimization currently hidden
* inside tcg_gen_code. All that should be required is to
* flush the TBs, allocate a new TB, re-initialize it per
* above, and re-do the actual code generation.
*/
qemu_log_mask(CPU_LOG_TB_OP | CPU_LOG_TB_OP_OPT,
"Restarting code generation for "
"code_gen_buffer overflow\n");
goto buffer_overflow;
case -2:
/*
* The code generated for the TranslationBlock is too large.
* The maximum size allowed by the unwind info is 64k.
* There may be stricter constraints from relocations
* in the tcg backend.
*
* Try again with half as many insns as we attempted this time.
* If a single insn overflows, there's a bug somewhere...
*/
assert(max_insns > 1);
max_insns /= 2;
qemu_log_mask(CPU_LOG_TB_OP | CPU_LOG_TB_OP_OPT,
"Restarting code generation with "
"smaller translation block (max %d insns)\n",
max_insns);
goto tb_overflow;
default:
g_assert_not_reached();
}
}
search_size = encode_search(tb, (void *)gen_code_buf + gen_code_size);
if (unlikely(search_size < 0)) {
goto buffer_overflow;
}
translate-all: use a binary search tree to track TBs in TBContext This is a prerequisite for supporting multiple TCG contexts, since we will have threads generating code in separate regions of code_gen_buffer. For this we need a new field (.size) in struct tb_tc to keep track of the size of the translated code. This field uses a size_t to avoid adding a hole to the struct, although really an unsigned int would have been enough. The comparison function we use is optimized for the common case: insertions. Profiling shows that upon booting debian-arm, 98% of comparisons are between existing tb's (i.e. a->size and b->size are both !0), which happens during insertions (and removals, but those are rare). The remaining cases are lookups. From reading the glib sources we see that the first key is always the lookup key. However, the code does not assume this to always be the case because this behaviour is not guaranteed in the glib docs. However, we embed this knowledge in the code as a branch hint for the compiler. Note that tb_free does not free space in the code_gen_buffer anymore, since we cannot easily know whether the tb is the last one inserted in code_gen_buffer. The next patch in this series renames tb_free to tb_remove to reflect this. Performance-wise, lookups in tb_find_pc are the same as before: O(log n). However, insertions are O(log n) instead of O(1), which results in a small slowdown when booting debian-arm: Performance counter stats for 'build/arm-softmmu/qemu-system-arm \ -machine type=virt -nographic -smp 1 -m 4096 \ -netdev user,id=unet,hostfwd=tcp::2222-:22 \ -device virtio-net-device,netdev=unet \ -drive file=img/arm/jessie-arm32.qcow2,id=myblock,index=0,if=none \ -device virtio-blk-device,drive=myblock \ -kernel img/arm/aarch32-current-linux-kernel-only.img \ -append console=ttyAMA0 root=/dev/vda1 \ -name arm,debug-threads=on -smp 1' (10 runs): - Before: 8048.598422 task-clock (msec) # 0.931 CPUs utilized ( +- 0.28% ) 16,974 context-switches # 0.002 M/sec ( +- 0.12% ) 0 cpu-migrations # 0.000 K/sec 10,125 page-faults # 0.001 M/sec ( +- 1.23% ) 35,144,901,879 cycles # 4.367 GHz ( +- 0.14% ) <not supported> stalled-cycles-frontend <not supported> stalled-cycles-backend 65,758,252,643 instructions # 1.87 insns per cycle ( +- 0.33% ) 10,871,298,668 branches # 1350.707 M/sec ( +- 0.41% ) 192,322,212 branch-misses # 1.77% of all branches ( +- 0.32% ) 8.640869419 seconds time elapsed ( +- 0.57% ) - After: 8146.242027 task-clock (msec) # 0.923 CPUs utilized ( +- 1.23% ) 17,016 context-switches # 0.002 M/sec ( +- 0.40% ) 0 cpu-migrations # 0.000 K/sec 18,769 page-faults # 0.002 M/sec ( +- 0.45% ) 35,660,956,120 cycles # 4.378 GHz ( +- 1.22% ) <not supported> stalled-cycles-frontend <not supported> stalled-cycles-backend 65,095,366,607 instructions # 1.83 insns per cycle ( +- 1.73% ) 10,803,480,261 branches # 1326.192 M/sec ( +- 1.95% ) 195,601,289 branch-misses # 1.81% of all branches ( +- 0.39% ) 8.828660235 seconds time elapsed ( +- 0.38% ) Reviewed-by: Richard Henderson <rth@twiddle.net> Signed-off-by: Emilio G. Cota <cota@braap.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-06-24 02:00:11 +03:00
tb->tc.size = gen_code_size;
/*
* For CF_PCREL, attribute all executions of the generated code
* to its first mapping.
*/
perf_report_code(pc, tb, tcg_splitwx_to_rx(gen_code_buf));
if (qemu_loglevel_mask(CPU_LOG_TB_OUT_ASM) &&
qemu_log_in_addr_range(pc)) {
FILE *logfile = qemu_log_trylock();
if (logfile) {
int code_size, data_size;
const tcg_target_ulong *rx_data_gen_ptr;
size_t chunk_start;
int insn = 0;
if (tcg_ctx->data_gen_ptr) {
rx_data_gen_ptr = tcg_splitwx_to_rx(tcg_ctx->data_gen_ptr);
code_size = (const void *)rx_data_gen_ptr - tb->tc.ptr;
data_size = gen_code_size - code_size;
} else {
rx_data_gen_ptr = 0;
code_size = gen_code_size;
data_size = 0;
}
/* Dump header and the first instruction */
fprintf(logfile, "OUT: [size=%d]\n", gen_code_size);
fprintf(logfile,
" -- guest addr 0x%016" PRIx64 " + tb prologue\n",
tcg_ctx->gen_insn_data[insn * TARGET_INSN_START_WORDS]);
chunk_start = tcg_ctx->gen_insn_end_off[insn];
disas(logfile, tb->tc.ptr, chunk_start);
/*
* Dump each instruction chunk, wrapping up empty chunks into
* the next instruction. The whole array is offset so the
* first entry is the beginning of the 2nd instruction.
*/
while (insn < tb->icount) {
size_t chunk_end = tcg_ctx->gen_insn_end_off[insn];
if (chunk_end > chunk_start) {
fprintf(logfile, " -- guest addr 0x%016" PRIx64 "\n",
tcg_ctx->gen_insn_data[insn * TARGET_INSN_START_WORDS]);
disas(logfile, tb->tc.ptr + chunk_start,
chunk_end - chunk_start);
chunk_start = chunk_end;
}
insn++;
}
if (chunk_start < code_size) {
fprintf(logfile, " -- tb slow paths + alignment\n");
disas(logfile, tb->tc.ptr + chunk_start,
code_size - chunk_start);
}
/* Finally dump any data we may have after the block */
if (data_size) {
int i;
fprintf(logfile, " data: [size=%d]\n", data_size);
for (i = 0; i < data_size / sizeof(tcg_target_ulong); i++) {
if (sizeof(tcg_target_ulong) == 8) {
fprintf(logfile,
"0x%08" PRIxPTR ": .quad 0x%016" TCG_PRIlx "\n",
(uintptr_t)&rx_data_gen_ptr[i], rx_data_gen_ptr[i]);
} else if (sizeof(tcg_target_ulong) == 4) {
fprintf(logfile,
"0x%08" PRIxPTR ": .long 0x%08" TCG_PRIlx "\n",
(uintptr_t)&rx_data_gen_ptr[i], rx_data_gen_ptr[i]);
} else {
qemu_build_not_reached();
}
}
}
fprintf(logfile, "\n");
qemu_log_unlock(logfile);
}
}
qatomic_set(&tcg_ctx->code_gen_ptr, (void *)
ROUND_UP((uintptr_t)gen_code_buf + gen_code_size + search_size,
tcg: introduce regions to split code_gen_buffer This is groundwork for supporting multiple TCG contexts. The naive solution here is to split code_gen_buffer statically among the TCG threads; this however results in poor utilization if translation needs are different across TCG threads. What we do here is to add an extra layer of indirection, assigning regions that act just like pages do in virtual memory allocation. (BTW if you are wondering about the chosen naming, I did not want to use blocks or pages because those are already heavily used in QEMU). We use a global lock to serialize allocations as well as statistics reporting (we now export the size of the used code_gen_buffer with tcg_code_size()). Note that for the allocator we could just use a counter and atomic_inc; however, that would complicate the gathering of tcg_code_size()-like stats. So given that the region operations are not a fast path, a lock seems the most reasonable choice. The effectiveness of this approach is clear after seeing some numbers. I used the bootup+shutdown of debian-arm with '-tb-size 80' as a benchmark. Note that I'm evaluating this after enabling per-thread TCG (which is done by a subsequent commit). * -smp 1, 1 region (entire buffer): qemu: flush code_size=83885014 nb_tbs=154739 avg_tb_size=357 qemu: flush code_size=83884902 nb_tbs=153136 avg_tb_size=363 qemu: flush code_size=83885014 nb_tbs=152777 avg_tb_size=364 qemu: flush code_size=83884950 nb_tbs=150057 avg_tb_size=373 qemu: flush code_size=83884998 nb_tbs=150234 avg_tb_size=373 qemu: flush code_size=83885014 nb_tbs=154009 avg_tb_size=360 qemu: flush code_size=83885014 nb_tbs=151007 avg_tb_size=370 qemu: flush code_size=83885014 nb_tbs=151816 avg_tb_size=367 That is, 8 flushes. * -smp 8, 32 regions (80/32 MB per region) [i.e. this patch]: qemu: flush code_size=76328008 nb_tbs=141040 avg_tb_size=356 qemu: flush code_size=75366534 nb_tbs=138000 avg_tb_size=361 qemu: flush code_size=76864546 nb_tbs=140653 avg_tb_size=361 qemu: flush code_size=76309084 nb_tbs=135945 avg_tb_size=375 qemu: flush code_size=74581856 nb_tbs=132909 avg_tb_size=375 qemu: flush code_size=73927256 nb_tbs=135616 avg_tb_size=360 qemu: flush code_size=78629426 nb_tbs=142896 avg_tb_size=365 qemu: flush code_size=76667052 nb_tbs=138508 avg_tb_size=368 Again, 8 flushes. Note how buffer utilization is not 100%, but it is close. Smaller region sizes would yield higher utilization, but we want region allocation to be rare (it acquires a lock), so we do not want to go too small. * -smp 8, static partitioning of 8 regions (10 MB per region): qemu: flush code_size=21936504 nb_tbs=40570 avg_tb_size=354 qemu: flush code_size=11472174 nb_tbs=20633 avg_tb_size=370 qemu: flush code_size=11603976 nb_tbs=21059 avg_tb_size=365 qemu: flush code_size=23254872 nb_tbs=41243 avg_tb_size=377 qemu: flush code_size=28289496 nb_tbs=52057 avg_tb_size=358 qemu: flush code_size=43605160 nb_tbs=78896 avg_tb_size=367 qemu: flush code_size=45166552 nb_tbs=82158 avg_tb_size=364 qemu: flush code_size=63289640 nb_tbs=116494 avg_tb_size=358 qemu: flush code_size=51389960 nb_tbs=93937 avg_tb_size=362 qemu: flush code_size=59665928 nb_tbs=107063 avg_tb_size=372 qemu: flush code_size=38380824 nb_tbs=68597 avg_tb_size=374 qemu: flush code_size=44884568 nb_tbs=79901 avg_tb_size=376 qemu: flush code_size=50782632 nb_tbs=90681 avg_tb_size=374 qemu: flush code_size=39848888 nb_tbs=71433 avg_tb_size=372 qemu: flush code_size=64708840 nb_tbs=119052 avg_tb_size=359 qemu: flush code_size=49830008 nb_tbs=90992 avg_tb_size=362 qemu: flush code_size=68372408 nb_tbs=123442 avg_tb_size=368 qemu: flush code_size=33555560 nb_tbs=59514 avg_tb_size=378 qemu: flush code_size=44748344 nb_tbs=80974 avg_tb_size=367 qemu: flush code_size=37104248 nb_tbs=67609 avg_tb_size=364 That is, 20 flushes. Note how a static partitioning approach uses the code buffer poorly, leading to many unnecessary flushes. Reviewed-by: Richard Henderson <richard.henderson@linaro.org> Signed-off-by: Emilio G. Cota <cota@braap.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-07-08 02:24:20 +03:00
CODE_GEN_ALIGN));
/* init jump list */
translate-all: protect TB jumps with a per-destination-TB lock This applies to both user-mode and !user-mode emulation. Instead of relying on a global lock, protect the list of incoming jumps with tb->jmp_lock. This lock also protects tb->cflags, so update all tb->cflags readers outside tb->jmp_lock to use atomic reads via tb_cflags(). In order to find the destination TB (and therefore its jmp_lock) from the origin TB, we introduce tb->jmp_dest[]. I considered not using a linked list of jumps, which simplifies code and makes the struct smaller. However, it unnecessarily increases memory usage, which results in a performance decrease. See for instance these numbers booting+shutting down debian-arm: Time (s) Rel. err (%) Abs. err (s) Rel. slowdown (%) ------------------------------------------------------------------------------ before 20.88 0.74 0.154512 0. after 20.81 0.38 0.079078 -0.33524904 GTree 21.02 0.28 0.058856 0.67049808 GHashTable + xxhash 21.63 1.08 0.233604 3.5919540 Using a hash table or a binary tree to keep track of the jumps doesn't really pay off, not only due to the increased memory usage, but also because most TBs have only 0 or 1 jumps to them. The maximum number of jumps when booting debian-arm that I measured is 35, but as we can see in the histogram below a TB with that many incoming jumps is extremely rare; the average TB has 0.80 incoming jumps. n_jumps: 379208; avg jumps/tb: 0.801099 dist: [0.0,1.0)|▄█▁▁▁▁▁▁▁▁▁▁▁ ▁▁▁▁▁▁ ▁▁▁ ▁▁▁ ▁|[34.0,35.0] Reviewed-by: Richard Henderson <richard.henderson@linaro.org> Signed-off-by: Emilio G. Cota <cota@braap.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-08-03 03:34:06 +03:00
qemu_spin_init(&tb->jmp_lock);
tb->jmp_list_head = (uintptr_t)NULL;
tb->jmp_list_next[0] = (uintptr_t)NULL;
tb->jmp_list_next[1] = (uintptr_t)NULL;
translate-all: protect TB jumps with a per-destination-TB lock This applies to both user-mode and !user-mode emulation. Instead of relying on a global lock, protect the list of incoming jumps with tb->jmp_lock. This lock also protects tb->cflags, so update all tb->cflags readers outside tb->jmp_lock to use atomic reads via tb_cflags(). In order to find the destination TB (and therefore its jmp_lock) from the origin TB, we introduce tb->jmp_dest[]. I considered not using a linked list of jumps, which simplifies code and makes the struct smaller. However, it unnecessarily increases memory usage, which results in a performance decrease. See for instance these numbers booting+shutting down debian-arm: Time (s) Rel. err (%) Abs. err (s) Rel. slowdown (%) ------------------------------------------------------------------------------ before 20.88 0.74 0.154512 0. after 20.81 0.38 0.079078 -0.33524904 GTree 21.02 0.28 0.058856 0.67049808 GHashTable + xxhash 21.63 1.08 0.233604 3.5919540 Using a hash table or a binary tree to keep track of the jumps doesn't really pay off, not only due to the increased memory usage, but also because most TBs have only 0 or 1 jumps to them. The maximum number of jumps when booting debian-arm that I measured is 35, but as we can see in the histogram below a TB with that many incoming jumps is extremely rare; the average TB has 0.80 incoming jumps. n_jumps: 379208; avg jumps/tb: 0.801099 dist: [0.0,1.0)|▄█▁▁▁▁▁▁▁▁▁▁▁ ▁▁▁▁▁▁ ▁▁▁ ▁▁▁ ▁|[34.0,35.0] Reviewed-by: Richard Henderson <richard.henderson@linaro.org> Signed-off-by: Emilio G. Cota <cota@braap.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-08-03 03:34:06 +03:00
tb->jmp_dest[0] = (uintptr_t)NULL;
tb->jmp_dest[1] = (uintptr_t)NULL;
/* init original jump addresses which have been set during tcg_gen_code() */
if (tb->jmp_reset_offset[0] != TB_JMP_OFFSET_INVALID) {
tb_reset_jump(tb, 0);
}
if (tb->jmp_reset_offset[1] != TB_JMP_OFFSET_INVALID) {
tb_reset_jump(tb, 1);
}
/*
* If the TB is not associated with a physical RAM page then it must be
* a temporary one-insn TB, and we have nothing left to do. Return early
* before attempting to link to other TBs or add to the lookup table.
*/
if (tb_page_addr0(tb) == -1) {
return tb;
}
/*
* Insert TB into the corresponding region tree before publishing it
* through QHT. Otherwise rewinding happened in the TB might fail to
* lookup itself using host PC.
*/
tcg_tb_insert(tb);
tcg: remove tb_lock Use mmap_lock in user-mode to protect TCG state and the page descriptors. In !user-mode, each vCPU has its own TCG state, so no locks needed. Per-page locks are used to protect the page descriptors. Per-TB locks are used in both modes to protect TB jumps. Some notes: - tb_lock is removed from notdirty_mem_write by passing a locked page_collection to tb_invalidate_phys_page_fast. - tcg_tb_lookup/remove/insert/etc have their own internal lock(s), so there is no need to further serialize access to them. - do_tb_flush is run in a safe async context, meaning no other vCPU threads are running. Therefore acquiring mmap_lock there is just to please tools such as thread sanitizer. - Not visible in the diff, but tb_invalidate_phys_page already has an assert_memory_lock. - cpu_io_recompile is !user-only, so no mmap_lock there. - Added mmap_unlock()'s before all siglongjmp's that could be called in user-mode while mmap_lock is held. + Added an assert for !have_mmap_lock() after returning from the longjmp in cpu_exec, just like we do in cpu_exec_step_atomic. Performance numbers before/after: Host: AMD Opteron(tm) Processor 6376 ubuntu 17.04 ppc64 bootup+shutdown time 700 +-+--+----+------+------------+-----------+------------*--+-+ | + + + + + *B | | before ***B*** ** * | |tb lock removal ###D### *** | 600 +-+ *** +-+ | ** # | | *B* #D | | *** * ## | 500 +-+ *** ### +-+ | * *** ### | | *B* # ## | | ** * #D# | 400 +-+ ** ## +-+ | ** ### | | ** ## | | ** # ## | 300 +-+ * B* #D# +-+ | B *** ### | | * ** #### | | * *** ### | 200 +-+ B *B #D# +-+ | #B* * ## # | | #* ## | | + D##D# + + + + | 100 +-+--+----+------+------------+-----------+------------+--+-+ 1 8 16 Guest CPUs 48 64 png: https://imgur.com/HwmBHXe debian jessie aarch64 bootup+shutdown time 90 +-+--+-----+-----+------------+------------+------------+--+-+ | + + + + + + | | before ***B*** B | 80 +tb lock removal ###D### **D +-+ | **### | | **## | 70 +-+ ** # +-+ | ** ## | | ** # | 60 +-+ *B ## +-+ | ** ## | | *** #D | 50 +-+ *** ## +-+ | * ** ### | | **B* ### | 40 +-+ **** # ## +-+ | **** #D# | | ***B** ### | 30 +-+ B***B** #### +-+ | B * * # ### | | B ###D# | 20 +-+ D ##D## +-+ | D# | | + + + + + + | 10 +-+--+-----+-----+------------+------------+------------+--+-+ 1 8 16 Guest CPUs 48 64 png: https://imgur.com/iGpGFtv The gains are high for 4-8 CPUs. Beyond that point, however, unrelated lock contention significantly hurts scalability. Reviewed-by: Richard Henderson <richard.henderson@linaro.org> Reviewed-by: Alex Bennée <alex.bennee@linaro.org> Signed-off-by: Emilio G. Cota <cota@braap.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-08-05 06:46:31 +03:00
/*
* No explicit memory barrier is required -- tb_link_page() makes the
* TB visible in a consistent state.
*/
existing_tb = tb_link_page(tb, tb_page_addr0(tb), tb_page_addr1(tb));
/* if the TB already exists, discard what we just translated */
if (unlikely(existing_tb != tb)) {
uintptr_t orig_aligned = (uintptr_t)gen_code_buf;
orig_aligned -= ROUND_UP(sizeof(*tb), qemu_icache_linesize);
qatomic_set(&tcg_ctx->code_gen_ptr, (void *)orig_aligned);
tcg_tb_remove(tb);
return existing_tb;
}
return tb;
}
tcg: remove tb_lock Use mmap_lock in user-mode to protect TCG state and the page descriptors. In !user-mode, each vCPU has its own TCG state, so no locks needed. Per-page locks are used to protect the page descriptors. Per-TB locks are used in both modes to protect TB jumps. Some notes: - tb_lock is removed from notdirty_mem_write by passing a locked page_collection to tb_invalidate_phys_page_fast. - tcg_tb_lookup/remove/insert/etc have their own internal lock(s), so there is no need to further serialize access to them. - do_tb_flush is run in a safe async context, meaning no other vCPU threads are running. Therefore acquiring mmap_lock there is just to please tools such as thread sanitizer. - Not visible in the diff, but tb_invalidate_phys_page already has an assert_memory_lock. - cpu_io_recompile is !user-only, so no mmap_lock there. - Added mmap_unlock()'s before all siglongjmp's that could be called in user-mode while mmap_lock is held. + Added an assert for !have_mmap_lock() after returning from the longjmp in cpu_exec, just like we do in cpu_exec_step_atomic. Performance numbers before/after: Host: AMD Opteron(tm) Processor 6376 ubuntu 17.04 ppc64 bootup+shutdown time 700 +-+--+----+------+------------+-----------+------------*--+-+ | + + + + + *B | | before ***B*** ** * | |tb lock removal ###D### *** | 600 +-+ *** +-+ | ** # | | *B* #D | | *** * ## | 500 +-+ *** ### +-+ | * *** ### | | *B* # ## | | ** * #D# | 400 +-+ ** ## +-+ | ** ### | | ** ## | | ** # ## | 300 +-+ * B* #D# +-+ | B *** ### | | * ** #### | | * *** ### | 200 +-+ B *B #D# +-+ | #B* * ## # | | #* ## | | + D##D# + + + + | 100 +-+--+----+------+------------+-----------+------------+--+-+ 1 8 16 Guest CPUs 48 64 png: https://imgur.com/HwmBHXe debian jessie aarch64 bootup+shutdown time 90 +-+--+-----+-----+------------+------------+------------+--+-+ | + + + + + + | | before ***B*** B | 80 +tb lock removal ###D### **D +-+ | **### | | **## | 70 +-+ ** # +-+ | ** ## | | ** # | 60 +-+ *B ## +-+ | ** ## | | *** #D | 50 +-+ *** ## +-+ | * ** ### | | **B* ### | 40 +-+ **** # ## +-+ | **** #D# | | ***B** ### | 30 +-+ B***B** #### +-+ | B * * # ### | | B ###D# | 20 +-+ D ##D## +-+ | D# | | + + + + + + | 10 +-+--+-----+-----+------------+------------+------------+--+-+ 1 8 16 Guest CPUs 48 64 png: https://imgur.com/iGpGFtv The gains are high for 4-8 CPUs. Beyond that point, however, unrelated lock contention significantly hurts scalability. Reviewed-by: Richard Henderson <richard.henderson@linaro.org> Reviewed-by: Alex Bennée <alex.bennee@linaro.org> Signed-off-by: Emilio G. Cota <cota@braap.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-08-05 06:46:31 +03:00
/* user-mode: call with mmap_lock held */
void tb_check_watchpoint(CPUState *cpu, uintptr_t retaddr)
{
TranslationBlock *tb;
tcg: remove tb_lock Use mmap_lock in user-mode to protect TCG state and the page descriptors. In !user-mode, each vCPU has its own TCG state, so no locks needed. Per-page locks are used to protect the page descriptors. Per-TB locks are used in both modes to protect TB jumps. Some notes: - tb_lock is removed from notdirty_mem_write by passing a locked page_collection to tb_invalidate_phys_page_fast. - tcg_tb_lookup/remove/insert/etc have their own internal lock(s), so there is no need to further serialize access to them. - do_tb_flush is run in a safe async context, meaning no other vCPU threads are running. Therefore acquiring mmap_lock there is just to please tools such as thread sanitizer. - Not visible in the diff, but tb_invalidate_phys_page already has an assert_memory_lock. - cpu_io_recompile is !user-only, so no mmap_lock there. - Added mmap_unlock()'s before all siglongjmp's that could be called in user-mode while mmap_lock is held. + Added an assert for !have_mmap_lock() after returning from the longjmp in cpu_exec, just like we do in cpu_exec_step_atomic. Performance numbers before/after: Host: AMD Opteron(tm) Processor 6376 ubuntu 17.04 ppc64 bootup+shutdown time 700 +-+--+----+------+------------+-----------+------------*--+-+ | + + + + + *B | | before ***B*** ** * | |tb lock removal ###D### *** | 600 +-+ *** +-+ | ** # | | *B* #D | | *** * ## | 500 +-+ *** ### +-+ | * *** ### | | *B* # ## | | ** * #D# | 400 +-+ ** ## +-+ | ** ### | | ** ## | | ** # ## | 300 +-+ * B* #D# +-+ | B *** ### | | * ** #### | | * *** ### | 200 +-+ B *B #D# +-+ | #B* * ## # | | #* ## | | + D##D# + + + + | 100 +-+--+----+------+------------+-----------+------------+--+-+ 1 8 16 Guest CPUs 48 64 png: https://imgur.com/HwmBHXe debian jessie aarch64 bootup+shutdown time 90 +-+--+-----+-----+------------+------------+------------+--+-+ | + + + + + + | | before ***B*** B | 80 +tb lock removal ###D### **D +-+ | **### | | **## | 70 +-+ ** # +-+ | ** ## | | ** # | 60 +-+ *B ## +-+ | ** ## | | *** #D | 50 +-+ *** ## +-+ | * ** ### | | **B* ### | 40 +-+ **** # ## +-+ | **** #D# | | ***B** ### | 30 +-+ B***B** #### +-+ | B * * # ### | | B ###D# | 20 +-+ D ##D## +-+ | D# | | + + + + + + | 10 +-+--+-----+-----+------------+------------+------------+--+-+ 1 8 16 Guest CPUs 48 64 png: https://imgur.com/iGpGFtv The gains are high for 4-8 CPUs. Beyond that point, however, unrelated lock contention significantly hurts scalability. Reviewed-by: Richard Henderson <richard.henderson@linaro.org> Reviewed-by: Alex Bennée <alex.bennee@linaro.org> Signed-off-by: Emilio G. Cota <cota@braap.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-08-05 06:46:31 +03:00
assert_memory_lock();
tb = tcg_tb_lookup(retaddr);
if (tb) {
/* We can use retranslation to find the PC. */
cpu_restore_state_from_tb(cpu, tb, retaddr);
tb_phys_invalidate(tb, -1);
} else {
/* The exception probably happened in a helper. The CPU state should
have been saved before calling it. Fetch the PC from there. */
CPUArchState *env = cpu->env_ptr;
vaddr pc;
uint64_t cs_base;
tb_page_addr_t addr;
uint32_t flags;
cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
addr = get_page_addr_code(env, pc);
if (addr != -1) {
tb_invalidate_phys_range(addr, addr);
}
}
}
#ifndef CONFIG_USER_ONLY
/*
* In deterministic execution mode, instructions doing device I/Os
tcg: drop global lock during TCG code execution This finally allows TCG to benefit from the iothread introduction: Drop the global mutex while running pure TCG CPU code. Reacquire the lock when entering MMIO or PIO emulation, or when leaving the TCG loop. We have to revert a few optimization for the current TCG threading model, namely kicking the TCG thread in qemu_mutex_lock_iothread and not kicking it in qemu_cpu_kick. We also need to disable RAM block reordering until we have a more efficient locking mechanism at hand. Still, a Linux x86 UP guest and my Musicpal ARM model boot fine here. These numbers demonstrate where we gain something: 20338 jan 20 0 331m 75m 6904 R 99 0.9 0:50.95 qemu-system-arm 20337 jan 20 0 331m 75m 6904 S 20 0.9 0:26.50 qemu-system-arm The guest CPU was fully loaded, but the iothread could still run mostly independent on a second core. Without the patch we don't get beyond 32206 jan 20 0 330m 73m 7036 R 82 0.9 1:06.00 qemu-system-arm 32204 jan 20 0 330m 73m 7036 S 21 0.9 0:17.03 qemu-system-arm We don't benefit significantly, though, when the guest is not fully loading a host CPU. Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> Message-Id: <1439220437-23957-10-git-send-email-fred.konrad@greensocs.com> [FK: Rebase, fix qemu_devices_reset deadlock, rm address_space_* mutex] Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com> [EGC: fixed iothread lock for cpu-exec IRQ handling] Signed-off-by: Emilio G. Cota <cota@braap.org> [AJB: -smp single-threaded fix, clean commit msg, BQL fixes] Signed-off-by: Alex Bennée <alex.bennee@linaro.org> Reviewed-by: Richard Henderson <rth@twiddle.net> Reviewed-by: Pranith Kumar <bobby.prani@gmail.com> [PM: target-arm changes] Acked-by: Peter Maydell <peter.maydell@linaro.org>
2017-02-23 21:29:11 +03:00
* must be at the end of the TB.
*
* Called by softmmu_template.h, with iothread mutex not held.
*/
void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr)
{
TranslationBlock *tb;
CPUClass *cc;
uint32_t n;
tcg: track TBs with per-region BST's This paves the way for enabling scalable parallel generation of TCG code. Instead of tracking TBs with a single binary search tree (BST), use a BST for each TCG region, protecting it with a lock. This is as scalable as it gets, since each TCG thread operates on a separate region. The core of this change is the introduction of struct tcg_region_tree, which contains a pointer to a GTree and an associated lock to serialize accesses to it. We then allocate an array of tcg_region_tree's, adding the appropriate padding to avoid false sharing based on qemu_dcache_linesize. Given a tc_ptr, we first find the corresponding region_tree. This is done by special-casing the first and last regions first, since they might be of size != region.size; otherwise we just divide the offset by region.stride. I was worried about this division (several dozen cycles of latency), but profiling shows that this is not a fast path. Note that region.stride is not required to be a power of two; it is only required to be a multiple of the host's page size. Note that with this design we can also provide consistent snapshots about all region trees at once; for instance, tcg_tb_foreach acquires/releases all region_tree locks before/after iterating over them. For this reason we now drop tb_lock in dump_exec_info(). As an alternative I considered implementing a concurrent BST, but this can be tricky to get right, offers no consistent snapshots of the BST, and performance and scalability-wise I don't think it could ever beat having separate GTrees, given that our workload is insert-mostly (all concurrent BST designs I've seen focus, understandably, on making lookups fast, which comes at the expense of convoluted, non-wait-free insertions/removals). Reviewed-by: Richard Henderson <richard.henderson@linaro.org> Reviewed-by: Alex Bennée <alex.bennee@linaro.org> Signed-off-by: Emilio G. Cota <cota@braap.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-07-26 23:58:05 +03:00
tb = tcg_tb_lookup(retaddr);
if (!tb) {
cpu_abort(cpu, "cpu_io_recompile: could not find TB for pc=%p",
(void *)retaddr);
}
cpu_restore_state_from_tb(cpu, tb, retaddr);
/*
* Some guests must re-execute the branch when re-executing a delay
* slot instruction. When this is the case, adjust icount and N
* to account for the re-execution of the branch.
*/
n = 1;
cc = CPU_GET_CLASS(cpu);
if (cc->tcg_ops->io_recompile_replay_branch &&
cc->tcg_ops->io_recompile_replay_branch(cpu, tb)) {
cpu_neg(cpu)->icount_decr.u16.low++;
n = 2;
}
/*
* Exit the loop and potentially generate a new TB executing the
* just the I/O insns. We also limit instrumentation to memory
* operations only (which execute after completion) so we don't
* double instrument the instruction.
*/
cpu->cflags_next_tb = curr_cflags(cpu) | CF_MEMI_ONLY | CF_LAST_IO | n;
if (qemu_loglevel_mask(CPU_LOG_EXEC)) {
vaddr pc = log_pc(cpu, tb);
if (qemu_log_in_addr_range(pc)) {
qemu_log("cpu_io_recompile: rewound execution of TB to %"
VADDR_PRIx "\n", pc);
}
}
cpu_loop_exit_noexc(cpu);
}
static void print_qht_statistics(struct qht_stats hst, GString *buf)
{
uint32_t hgram_opts;
size_t hgram_bins;
char *hgram;
if (!hst.head_buckets) {
return;
}
g_string_append_printf(buf, "TB hash buckets %zu/%zu "
"(%0.2f%% head buckets used)\n",
hst.used_head_buckets, hst.head_buckets,
(double)hst.used_head_buckets /
hst.head_buckets * 100);
hgram_opts = QDIST_PR_BORDER | QDIST_PR_LABELS;
hgram_opts |= QDIST_PR_100X | QDIST_PR_PERCENT;
if (qdist_xmax(&hst.occupancy) - qdist_xmin(&hst.occupancy) == 1) {
hgram_opts |= QDIST_PR_NODECIMAL;
}
hgram = qdist_pr(&hst.occupancy, 10, hgram_opts);
g_string_append_printf(buf, "TB hash occupancy %0.2f%% avg chain occ. "
"Histogram: %s\n",
qdist_avg(&hst.occupancy) * 100, hgram);
g_free(hgram);
hgram_opts = QDIST_PR_BORDER | QDIST_PR_LABELS;
hgram_bins = qdist_xmax(&hst.chain) - qdist_xmin(&hst.chain);
if (hgram_bins > 10) {
hgram_bins = 10;
} else {
hgram_bins = 0;
hgram_opts |= QDIST_PR_NODECIMAL | QDIST_PR_NOBINRANGE;
}
hgram = qdist_pr(&hst.chain, hgram_bins, hgram_opts);
g_string_append_printf(buf, "TB hash avg chain %0.3f buckets. "
"Histogram: %s\n",
qdist_avg(&hst.chain), hgram);
g_free(hgram);
}
translate-all: use a binary search tree to track TBs in TBContext This is a prerequisite for supporting multiple TCG contexts, since we will have threads generating code in separate regions of code_gen_buffer. For this we need a new field (.size) in struct tb_tc to keep track of the size of the translated code. This field uses a size_t to avoid adding a hole to the struct, although really an unsigned int would have been enough. The comparison function we use is optimized for the common case: insertions. Profiling shows that upon booting debian-arm, 98% of comparisons are between existing tb's (i.e. a->size and b->size are both !0), which happens during insertions (and removals, but those are rare). The remaining cases are lookups. From reading the glib sources we see that the first key is always the lookup key. However, the code does not assume this to always be the case because this behaviour is not guaranteed in the glib docs. However, we embed this knowledge in the code as a branch hint for the compiler. Note that tb_free does not free space in the code_gen_buffer anymore, since we cannot easily know whether the tb is the last one inserted in code_gen_buffer. The next patch in this series renames tb_free to tb_remove to reflect this. Performance-wise, lookups in tb_find_pc are the same as before: O(log n). However, insertions are O(log n) instead of O(1), which results in a small slowdown when booting debian-arm: Performance counter stats for 'build/arm-softmmu/qemu-system-arm \ -machine type=virt -nographic -smp 1 -m 4096 \ -netdev user,id=unet,hostfwd=tcp::2222-:22 \ -device virtio-net-device,netdev=unet \ -drive file=img/arm/jessie-arm32.qcow2,id=myblock,index=0,if=none \ -device virtio-blk-device,drive=myblock \ -kernel img/arm/aarch32-current-linux-kernel-only.img \ -append console=ttyAMA0 root=/dev/vda1 \ -name arm,debug-threads=on -smp 1' (10 runs): - Before: 8048.598422 task-clock (msec) # 0.931 CPUs utilized ( +- 0.28% ) 16,974 context-switches # 0.002 M/sec ( +- 0.12% ) 0 cpu-migrations # 0.000 K/sec 10,125 page-faults # 0.001 M/sec ( +- 1.23% ) 35,144,901,879 cycles # 4.367 GHz ( +- 0.14% ) <not supported> stalled-cycles-frontend <not supported> stalled-cycles-backend 65,758,252,643 instructions # 1.87 insns per cycle ( +- 0.33% ) 10,871,298,668 branches # 1350.707 M/sec ( +- 0.41% ) 192,322,212 branch-misses # 1.77% of all branches ( +- 0.32% ) 8.640869419 seconds time elapsed ( +- 0.57% ) - After: 8146.242027 task-clock (msec) # 0.923 CPUs utilized ( +- 1.23% ) 17,016 context-switches # 0.002 M/sec ( +- 0.40% ) 0 cpu-migrations # 0.000 K/sec 18,769 page-faults # 0.002 M/sec ( +- 0.45% ) 35,660,956,120 cycles # 4.378 GHz ( +- 1.22% ) <not supported> stalled-cycles-frontend <not supported> stalled-cycles-backend 65,095,366,607 instructions # 1.83 insns per cycle ( +- 1.73% ) 10,803,480,261 branches # 1326.192 M/sec ( +- 1.95% ) 195,601,289 branch-misses # 1.81% of all branches ( +- 0.39% ) 8.828660235 seconds time elapsed ( +- 0.38% ) Reviewed-by: Richard Henderson <rth@twiddle.net> Signed-off-by: Emilio G. Cota <cota@braap.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-06-24 02:00:11 +03:00
struct tb_tree_stats {
tcg: track TBs with per-region BST's This paves the way for enabling scalable parallel generation of TCG code. Instead of tracking TBs with a single binary search tree (BST), use a BST for each TCG region, protecting it with a lock. This is as scalable as it gets, since each TCG thread operates on a separate region. The core of this change is the introduction of struct tcg_region_tree, which contains a pointer to a GTree and an associated lock to serialize accesses to it. We then allocate an array of tcg_region_tree's, adding the appropriate padding to avoid false sharing based on qemu_dcache_linesize. Given a tc_ptr, we first find the corresponding region_tree. This is done by special-casing the first and last regions first, since they might be of size != region.size; otherwise we just divide the offset by region.stride. I was worried about this division (several dozen cycles of latency), but profiling shows that this is not a fast path. Note that region.stride is not required to be a power of two; it is only required to be a multiple of the host's page size. Note that with this design we can also provide consistent snapshots about all region trees at once; for instance, tcg_tb_foreach acquires/releases all region_tree locks before/after iterating over them. For this reason we now drop tb_lock in dump_exec_info(). As an alternative I considered implementing a concurrent BST, but this can be tricky to get right, offers no consistent snapshots of the BST, and performance and scalability-wise I don't think it could ever beat having separate GTrees, given that our workload is insert-mostly (all concurrent BST designs I've seen focus, understandably, on making lookups fast, which comes at the expense of convoluted, non-wait-free insertions/removals). Reviewed-by: Richard Henderson <richard.henderson@linaro.org> Reviewed-by: Alex Bennée <alex.bennee@linaro.org> Signed-off-by: Emilio G. Cota <cota@braap.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-07-26 23:58:05 +03:00
size_t nb_tbs;
size_t host_size;
translate-all: use a binary search tree to track TBs in TBContext This is a prerequisite for supporting multiple TCG contexts, since we will have threads generating code in separate regions of code_gen_buffer. For this we need a new field (.size) in struct tb_tc to keep track of the size of the translated code. This field uses a size_t to avoid adding a hole to the struct, although really an unsigned int would have been enough. The comparison function we use is optimized for the common case: insertions. Profiling shows that upon booting debian-arm, 98% of comparisons are between existing tb's (i.e. a->size and b->size are both !0), which happens during insertions (and removals, but those are rare). The remaining cases are lookups. From reading the glib sources we see that the first key is always the lookup key. However, the code does not assume this to always be the case because this behaviour is not guaranteed in the glib docs. However, we embed this knowledge in the code as a branch hint for the compiler. Note that tb_free does not free space in the code_gen_buffer anymore, since we cannot easily know whether the tb is the last one inserted in code_gen_buffer. The next patch in this series renames tb_free to tb_remove to reflect this. Performance-wise, lookups in tb_find_pc are the same as before: O(log n). However, insertions are O(log n) instead of O(1), which results in a small slowdown when booting debian-arm: Performance counter stats for 'build/arm-softmmu/qemu-system-arm \ -machine type=virt -nographic -smp 1 -m 4096 \ -netdev user,id=unet,hostfwd=tcp::2222-:22 \ -device virtio-net-device,netdev=unet \ -drive file=img/arm/jessie-arm32.qcow2,id=myblock,index=0,if=none \ -device virtio-blk-device,drive=myblock \ -kernel img/arm/aarch32-current-linux-kernel-only.img \ -append console=ttyAMA0 root=/dev/vda1 \ -name arm,debug-threads=on -smp 1' (10 runs): - Before: 8048.598422 task-clock (msec) # 0.931 CPUs utilized ( +- 0.28% ) 16,974 context-switches # 0.002 M/sec ( +- 0.12% ) 0 cpu-migrations # 0.000 K/sec 10,125 page-faults # 0.001 M/sec ( +- 1.23% ) 35,144,901,879 cycles # 4.367 GHz ( +- 0.14% ) <not supported> stalled-cycles-frontend <not supported> stalled-cycles-backend 65,758,252,643 instructions # 1.87 insns per cycle ( +- 0.33% ) 10,871,298,668 branches # 1350.707 M/sec ( +- 0.41% ) 192,322,212 branch-misses # 1.77% of all branches ( +- 0.32% ) 8.640869419 seconds time elapsed ( +- 0.57% ) - After: 8146.242027 task-clock (msec) # 0.923 CPUs utilized ( +- 1.23% ) 17,016 context-switches # 0.002 M/sec ( +- 0.40% ) 0 cpu-migrations # 0.000 K/sec 18,769 page-faults # 0.002 M/sec ( +- 0.45% ) 35,660,956,120 cycles # 4.378 GHz ( +- 1.22% ) <not supported> stalled-cycles-frontend <not supported> stalled-cycles-backend 65,095,366,607 instructions # 1.83 insns per cycle ( +- 1.73% ) 10,803,480,261 branches # 1326.192 M/sec ( +- 1.95% ) 195,601,289 branch-misses # 1.81% of all branches ( +- 0.39% ) 8.828660235 seconds time elapsed ( +- 0.38% ) Reviewed-by: Richard Henderson <rth@twiddle.net> Signed-off-by: Emilio G. Cota <cota@braap.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-06-24 02:00:11 +03:00
size_t target_size;
size_t max_target_size;
size_t direct_jmp_count;
size_t direct_jmp2_count;
size_t cross_page;
};
static gboolean tb_tree_stats_iter(gpointer key, gpointer value, gpointer data)
{
const TranslationBlock *tb = value;
struct tb_tree_stats *tst = data;
tcg: track TBs with per-region BST's This paves the way for enabling scalable parallel generation of TCG code. Instead of tracking TBs with a single binary search tree (BST), use a BST for each TCG region, protecting it with a lock. This is as scalable as it gets, since each TCG thread operates on a separate region. The core of this change is the introduction of struct tcg_region_tree, which contains a pointer to a GTree and an associated lock to serialize accesses to it. We then allocate an array of tcg_region_tree's, adding the appropriate padding to avoid false sharing based on qemu_dcache_linesize. Given a tc_ptr, we first find the corresponding region_tree. This is done by special-casing the first and last regions first, since they might be of size != region.size; otherwise we just divide the offset by region.stride. I was worried about this division (several dozen cycles of latency), but profiling shows that this is not a fast path. Note that region.stride is not required to be a power of two; it is only required to be a multiple of the host's page size. Note that with this design we can also provide consistent snapshots about all region trees at once; for instance, tcg_tb_foreach acquires/releases all region_tree locks before/after iterating over them. For this reason we now drop tb_lock in dump_exec_info(). As an alternative I considered implementing a concurrent BST, but this can be tricky to get right, offers no consistent snapshots of the BST, and performance and scalability-wise I don't think it could ever beat having separate GTrees, given that our workload is insert-mostly (all concurrent BST designs I've seen focus, understandably, on making lookups fast, which comes at the expense of convoluted, non-wait-free insertions/removals). Reviewed-by: Richard Henderson <richard.henderson@linaro.org> Reviewed-by: Alex Bennée <alex.bennee@linaro.org> Signed-off-by: Emilio G. Cota <cota@braap.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-07-26 23:58:05 +03:00
tst->nb_tbs++;
tst->host_size += tb->tc.size;
translate-all: use a binary search tree to track TBs in TBContext This is a prerequisite for supporting multiple TCG contexts, since we will have threads generating code in separate regions of code_gen_buffer. For this we need a new field (.size) in struct tb_tc to keep track of the size of the translated code. This field uses a size_t to avoid adding a hole to the struct, although really an unsigned int would have been enough. The comparison function we use is optimized for the common case: insertions. Profiling shows that upon booting debian-arm, 98% of comparisons are between existing tb's (i.e. a->size and b->size are both !0), which happens during insertions (and removals, but those are rare). The remaining cases are lookups. From reading the glib sources we see that the first key is always the lookup key. However, the code does not assume this to always be the case because this behaviour is not guaranteed in the glib docs. However, we embed this knowledge in the code as a branch hint for the compiler. Note that tb_free does not free space in the code_gen_buffer anymore, since we cannot easily know whether the tb is the last one inserted in code_gen_buffer. The next patch in this series renames tb_free to tb_remove to reflect this. Performance-wise, lookups in tb_find_pc are the same as before: O(log n). However, insertions are O(log n) instead of O(1), which results in a small slowdown when booting debian-arm: Performance counter stats for 'build/arm-softmmu/qemu-system-arm \ -machine type=virt -nographic -smp 1 -m 4096 \ -netdev user,id=unet,hostfwd=tcp::2222-:22 \ -device virtio-net-device,netdev=unet \ -drive file=img/arm/jessie-arm32.qcow2,id=myblock,index=0,if=none \ -device virtio-blk-device,drive=myblock \ -kernel img/arm/aarch32-current-linux-kernel-only.img \ -append console=ttyAMA0 root=/dev/vda1 \ -name arm,debug-threads=on -smp 1' (10 runs): - Before: 8048.598422 task-clock (msec) # 0.931 CPUs utilized ( +- 0.28% ) 16,974 context-switches # 0.002 M/sec ( +- 0.12% ) 0 cpu-migrations # 0.000 K/sec 10,125 page-faults # 0.001 M/sec ( +- 1.23% ) 35,144,901,879 cycles # 4.367 GHz ( +- 0.14% ) <not supported> stalled-cycles-frontend <not supported> stalled-cycles-backend 65,758,252,643 instructions # 1.87 insns per cycle ( +- 0.33% ) 10,871,298,668 branches # 1350.707 M/sec ( +- 0.41% ) 192,322,212 branch-misses # 1.77% of all branches ( +- 0.32% ) 8.640869419 seconds time elapsed ( +- 0.57% ) - After: 8146.242027 task-clock (msec) # 0.923 CPUs utilized ( +- 1.23% ) 17,016 context-switches # 0.002 M/sec ( +- 0.40% ) 0 cpu-migrations # 0.000 K/sec 18,769 page-faults # 0.002 M/sec ( +- 0.45% ) 35,660,956,120 cycles # 4.378 GHz ( +- 1.22% ) <not supported> stalled-cycles-frontend <not supported> stalled-cycles-backend 65,095,366,607 instructions # 1.83 insns per cycle ( +- 1.73% ) 10,803,480,261 branches # 1326.192 M/sec ( +- 1.95% ) 195,601,289 branch-misses # 1.81% of all branches ( +- 0.39% ) 8.828660235 seconds time elapsed ( +- 0.38% ) Reviewed-by: Richard Henderson <rth@twiddle.net> Signed-off-by: Emilio G. Cota <cota@braap.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-06-24 02:00:11 +03:00
tst->target_size += tb->size;
if (tb->size > tst->max_target_size) {
tst->max_target_size = tb->size;
}
if (tb_page_addr1(tb) != -1) {
translate-all: use a binary search tree to track TBs in TBContext This is a prerequisite for supporting multiple TCG contexts, since we will have threads generating code in separate regions of code_gen_buffer. For this we need a new field (.size) in struct tb_tc to keep track of the size of the translated code. This field uses a size_t to avoid adding a hole to the struct, although really an unsigned int would have been enough. The comparison function we use is optimized for the common case: insertions. Profiling shows that upon booting debian-arm, 98% of comparisons are between existing tb's (i.e. a->size and b->size are both !0), which happens during insertions (and removals, but those are rare). The remaining cases are lookups. From reading the glib sources we see that the first key is always the lookup key. However, the code does not assume this to always be the case because this behaviour is not guaranteed in the glib docs. However, we embed this knowledge in the code as a branch hint for the compiler. Note that tb_free does not free space in the code_gen_buffer anymore, since we cannot easily know whether the tb is the last one inserted in code_gen_buffer. The next patch in this series renames tb_free to tb_remove to reflect this. Performance-wise, lookups in tb_find_pc are the same as before: O(log n). However, insertions are O(log n) instead of O(1), which results in a small slowdown when booting debian-arm: Performance counter stats for 'build/arm-softmmu/qemu-system-arm \ -machine type=virt -nographic -smp 1 -m 4096 \ -netdev user,id=unet,hostfwd=tcp::2222-:22 \ -device virtio-net-device,netdev=unet \ -drive file=img/arm/jessie-arm32.qcow2,id=myblock,index=0,if=none \ -device virtio-blk-device,drive=myblock \ -kernel img/arm/aarch32-current-linux-kernel-only.img \ -append console=ttyAMA0 root=/dev/vda1 \ -name arm,debug-threads=on -smp 1' (10 runs): - Before: 8048.598422 task-clock (msec) # 0.931 CPUs utilized ( +- 0.28% ) 16,974 context-switches # 0.002 M/sec ( +- 0.12% ) 0 cpu-migrations # 0.000 K/sec 10,125 page-faults # 0.001 M/sec ( +- 1.23% ) 35,144,901,879 cycles # 4.367 GHz ( +- 0.14% ) <not supported> stalled-cycles-frontend <not supported> stalled-cycles-backend 65,758,252,643 instructions # 1.87 insns per cycle ( +- 0.33% ) 10,871,298,668 branches # 1350.707 M/sec ( +- 0.41% ) 192,322,212 branch-misses # 1.77% of all branches ( +- 0.32% ) 8.640869419 seconds time elapsed ( +- 0.57% ) - After: 8146.242027 task-clock (msec) # 0.923 CPUs utilized ( +- 1.23% ) 17,016 context-switches # 0.002 M/sec ( +- 0.40% ) 0 cpu-migrations # 0.000 K/sec 18,769 page-faults # 0.002 M/sec ( +- 0.45% ) 35,660,956,120 cycles # 4.378 GHz ( +- 1.22% ) <not supported> stalled-cycles-frontend <not supported> stalled-cycles-backend 65,095,366,607 instructions # 1.83 insns per cycle ( +- 1.73% ) 10,803,480,261 branches # 1326.192 M/sec ( +- 1.95% ) 195,601,289 branch-misses # 1.81% of all branches ( +- 0.39% ) 8.828660235 seconds time elapsed ( +- 0.38% ) Reviewed-by: Richard Henderson <rth@twiddle.net> Signed-off-by: Emilio G. Cota <cota@braap.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-06-24 02:00:11 +03:00
tst->cross_page++;
}
if (tb->jmp_reset_offset[0] != TB_JMP_OFFSET_INVALID) {
translate-all: use a binary search tree to track TBs in TBContext This is a prerequisite for supporting multiple TCG contexts, since we will have threads generating code in separate regions of code_gen_buffer. For this we need a new field (.size) in struct tb_tc to keep track of the size of the translated code. This field uses a size_t to avoid adding a hole to the struct, although really an unsigned int would have been enough. The comparison function we use is optimized for the common case: insertions. Profiling shows that upon booting debian-arm, 98% of comparisons are between existing tb's (i.e. a->size and b->size are both !0), which happens during insertions (and removals, but those are rare). The remaining cases are lookups. From reading the glib sources we see that the first key is always the lookup key. However, the code does not assume this to always be the case because this behaviour is not guaranteed in the glib docs. However, we embed this knowledge in the code as a branch hint for the compiler. Note that tb_free does not free space in the code_gen_buffer anymore, since we cannot easily know whether the tb is the last one inserted in code_gen_buffer. The next patch in this series renames tb_free to tb_remove to reflect this. Performance-wise, lookups in tb_find_pc are the same as before: O(log n). However, insertions are O(log n) instead of O(1), which results in a small slowdown when booting debian-arm: Performance counter stats for 'build/arm-softmmu/qemu-system-arm \ -machine type=virt -nographic -smp 1 -m 4096 \ -netdev user,id=unet,hostfwd=tcp::2222-:22 \ -device virtio-net-device,netdev=unet \ -drive file=img/arm/jessie-arm32.qcow2,id=myblock,index=0,if=none \ -device virtio-blk-device,drive=myblock \ -kernel img/arm/aarch32-current-linux-kernel-only.img \ -append console=ttyAMA0 root=/dev/vda1 \ -name arm,debug-threads=on -smp 1' (10 runs): - Before: 8048.598422 task-clock (msec) # 0.931 CPUs utilized ( +- 0.28% ) 16,974 context-switches # 0.002 M/sec ( +- 0.12% ) 0 cpu-migrations # 0.000 K/sec 10,125 page-faults # 0.001 M/sec ( +- 1.23% ) 35,144,901,879 cycles # 4.367 GHz ( +- 0.14% ) <not supported> stalled-cycles-frontend <not supported> stalled-cycles-backend 65,758,252,643 instructions # 1.87 insns per cycle ( +- 0.33% ) 10,871,298,668 branches # 1350.707 M/sec ( +- 0.41% ) 192,322,212 branch-misses # 1.77% of all branches ( +- 0.32% ) 8.640869419 seconds time elapsed ( +- 0.57% ) - After: 8146.242027 task-clock (msec) # 0.923 CPUs utilized ( +- 1.23% ) 17,016 context-switches # 0.002 M/sec ( +- 0.40% ) 0 cpu-migrations # 0.000 K/sec 18,769 page-faults # 0.002 M/sec ( +- 0.45% ) 35,660,956,120 cycles # 4.378 GHz ( +- 1.22% ) <not supported> stalled-cycles-frontend <not supported> stalled-cycles-backend 65,095,366,607 instructions # 1.83 insns per cycle ( +- 1.73% ) 10,803,480,261 branches # 1326.192 M/sec ( +- 1.95% ) 195,601,289 branch-misses # 1.81% of all branches ( +- 0.39% ) 8.828660235 seconds time elapsed ( +- 0.38% ) Reviewed-by: Richard Henderson <rth@twiddle.net> Signed-off-by: Emilio G. Cota <cota@braap.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-06-24 02:00:11 +03:00
tst->direct_jmp_count++;
if (tb->jmp_reset_offset[1] != TB_JMP_OFFSET_INVALID) {
translate-all: use a binary search tree to track TBs in TBContext This is a prerequisite for supporting multiple TCG contexts, since we will have threads generating code in separate regions of code_gen_buffer. For this we need a new field (.size) in struct tb_tc to keep track of the size of the translated code. This field uses a size_t to avoid adding a hole to the struct, although really an unsigned int would have been enough. The comparison function we use is optimized for the common case: insertions. Profiling shows that upon booting debian-arm, 98% of comparisons are between existing tb's (i.e. a->size and b->size are both !0), which happens during insertions (and removals, but those are rare). The remaining cases are lookups. From reading the glib sources we see that the first key is always the lookup key. However, the code does not assume this to always be the case because this behaviour is not guaranteed in the glib docs. However, we embed this knowledge in the code as a branch hint for the compiler. Note that tb_free does not free space in the code_gen_buffer anymore, since we cannot easily know whether the tb is the last one inserted in code_gen_buffer. The next patch in this series renames tb_free to tb_remove to reflect this. Performance-wise, lookups in tb_find_pc are the same as before: O(log n). However, insertions are O(log n) instead of O(1), which results in a small slowdown when booting debian-arm: Performance counter stats for 'build/arm-softmmu/qemu-system-arm \ -machine type=virt -nographic -smp 1 -m 4096 \ -netdev user,id=unet,hostfwd=tcp::2222-:22 \ -device virtio-net-device,netdev=unet \ -drive file=img/arm/jessie-arm32.qcow2,id=myblock,index=0,if=none \ -device virtio-blk-device,drive=myblock \ -kernel img/arm/aarch32-current-linux-kernel-only.img \ -append console=ttyAMA0 root=/dev/vda1 \ -name arm,debug-threads=on -smp 1' (10 runs): - Before: 8048.598422 task-clock (msec) # 0.931 CPUs utilized ( +- 0.28% ) 16,974 context-switches # 0.002 M/sec ( +- 0.12% ) 0 cpu-migrations # 0.000 K/sec 10,125 page-faults # 0.001 M/sec ( +- 1.23% ) 35,144,901,879 cycles # 4.367 GHz ( +- 0.14% ) <not supported> stalled-cycles-frontend <not supported> stalled-cycles-backend 65,758,252,643 instructions # 1.87 insns per cycle ( +- 0.33% ) 10,871,298,668 branches # 1350.707 M/sec ( +- 0.41% ) 192,322,212 branch-misses # 1.77% of all branches ( +- 0.32% ) 8.640869419 seconds time elapsed ( +- 0.57% ) - After: 8146.242027 task-clock (msec) # 0.923 CPUs utilized ( +- 1.23% ) 17,016 context-switches # 0.002 M/sec ( +- 0.40% ) 0 cpu-migrations # 0.000 K/sec 18,769 page-faults # 0.002 M/sec ( +- 0.45% ) 35,660,956,120 cycles # 4.378 GHz ( +- 1.22% ) <not supported> stalled-cycles-frontend <not supported> stalled-cycles-backend 65,095,366,607 instructions # 1.83 insns per cycle ( +- 1.73% ) 10,803,480,261 branches # 1326.192 M/sec ( +- 1.95% ) 195,601,289 branch-misses # 1.81% of all branches ( +- 0.39% ) 8.828660235 seconds time elapsed ( +- 0.38% ) Reviewed-by: Richard Henderson <rth@twiddle.net> Signed-off-by: Emilio G. Cota <cota@braap.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-06-24 02:00:11 +03:00
tst->direct_jmp2_count++;
}
}
return false;
}
void dump_exec_info(GString *buf)
{
translate-all: use a binary search tree to track TBs in TBContext This is a prerequisite for supporting multiple TCG contexts, since we will have threads generating code in separate regions of code_gen_buffer. For this we need a new field (.size) in struct tb_tc to keep track of the size of the translated code. This field uses a size_t to avoid adding a hole to the struct, although really an unsigned int would have been enough. The comparison function we use is optimized for the common case: insertions. Profiling shows that upon booting debian-arm, 98% of comparisons are between existing tb's (i.e. a->size and b->size are both !0), which happens during insertions (and removals, but those are rare). The remaining cases are lookups. From reading the glib sources we see that the first key is always the lookup key. However, the code does not assume this to always be the case because this behaviour is not guaranteed in the glib docs. However, we embed this knowledge in the code as a branch hint for the compiler. Note that tb_free does not free space in the code_gen_buffer anymore, since we cannot easily know whether the tb is the last one inserted in code_gen_buffer. The next patch in this series renames tb_free to tb_remove to reflect this. Performance-wise, lookups in tb_find_pc are the same as before: O(log n). However, insertions are O(log n) instead of O(1), which results in a small slowdown when booting debian-arm: Performance counter stats for 'build/arm-softmmu/qemu-system-arm \ -machine type=virt -nographic -smp 1 -m 4096 \ -netdev user,id=unet,hostfwd=tcp::2222-:22 \ -device virtio-net-device,netdev=unet \ -drive file=img/arm/jessie-arm32.qcow2,id=myblock,index=0,if=none \ -device virtio-blk-device,drive=myblock \ -kernel img/arm/aarch32-current-linux-kernel-only.img \ -append console=ttyAMA0 root=/dev/vda1 \ -name arm,debug-threads=on -smp 1' (10 runs): - Before: 8048.598422 task-clock (msec) # 0.931 CPUs utilized ( +- 0.28% ) 16,974 context-switches # 0.002 M/sec ( +- 0.12% ) 0 cpu-migrations # 0.000 K/sec 10,125 page-faults # 0.001 M/sec ( +- 1.23% ) 35,144,901,879 cycles # 4.367 GHz ( +- 0.14% ) <not supported> stalled-cycles-frontend <not supported> stalled-cycles-backend 65,758,252,643 instructions # 1.87 insns per cycle ( +- 0.33% ) 10,871,298,668 branches # 1350.707 M/sec ( +- 0.41% ) 192,322,212 branch-misses # 1.77% of all branches ( +- 0.32% ) 8.640869419 seconds time elapsed ( +- 0.57% ) - After: 8146.242027 task-clock (msec) # 0.923 CPUs utilized ( +- 1.23% ) 17,016 context-switches # 0.002 M/sec ( +- 0.40% ) 0 cpu-migrations # 0.000 K/sec 18,769 page-faults # 0.002 M/sec ( +- 0.45% ) 35,660,956,120 cycles # 4.378 GHz ( +- 1.22% ) <not supported> stalled-cycles-frontend <not supported> stalled-cycles-backend 65,095,366,607 instructions # 1.83 insns per cycle ( +- 1.73% ) 10,803,480,261 branches # 1326.192 M/sec ( +- 1.95% ) 195,601,289 branch-misses # 1.81% of all branches ( +- 0.39% ) 8.828660235 seconds time elapsed ( +- 0.38% ) Reviewed-by: Richard Henderson <rth@twiddle.net> Signed-off-by: Emilio G. Cota <cota@braap.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-06-24 02:00:11 +03:00
struct tb_tree_stats tst = {};
translate-all: add tb hash bucket info to 'info jit' dump Examples: - Good hashing, i.e. tb_hash_func5(phys_pc, pc, flags): TB count 715135/2684354 [...] TB hash buckets 388775/524288 (74.15% head buckets used) TB hash occupancy 33.04% avg chain occ. Histogram: [0,10)%|▆ █ ▅▁▃▁▁|[90,100]% TB hash avg chain 1.017 buckets. Histogram: 1|█▁▁|3 - Not-so-good hashing, i.e. tb_hash_func5(phys_pc, pc, 0): TB count 712636/2684354 [...] TB hash buckets 344924/524288 (65.79% head buckets used) TB hash occupancy 31.64% avg chain occ. Histogram: [0,10)%|█ ▆ ▅▁▃▁▂|[90,100]% TB hash avg chain 1.047 buckets. Histogram: 1|█▁▁▁|4 - Bad hashing, i.e. tb_hash_func5(phys_pc, 0, 0): TB count 702818/2684354 [...] TB hash buckets 112741/524288 (21.50% head buckets used) TB hash occupancy 10.15% avg chain occ. Histogram: [0,10)%|█ ▁ ▁▁▁▁▁|[90,100]% TB hash avg chain 2.107 buckets. Histogram: [1.0,10.2)|█▁▁▁▁▁▁▁▁▁|[83.8,93.0] - Good hashing, but no auto-resize: TB count 715634/2684354 TB hash buckets 8192/8192 (100.00% head buckets used) TB hash occupancy 98.30% avg chain occ. Histogram: [95.3,95.8)%|▁▁▃▄▃▄▁▇▁█|[99.5,100.0]% TB hash avg chain 22.070 buckets. Histogram: [15.0,16.7)|▁▂▅▄█▅▁▁▁▁|[30.3,32.0] Acked-by: Sergey Fedorov <sergey.fedorov@linaro.org> Suggested-by: Richard Henderson <rth@twiddle.net> Reviewed-by: Richard Henderson <rth@twiddle.net> Signed-off-by: Emilio G. Cota <cota@braap.org> Message-Id: <1465412133-3029-16-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-06-08 21:55:33 +03:00
struct qht_stats hst;
size_t nb_tbs, flush_full, flush_part, flush_elide;
tcg: track TBs with per-region BST's This paves the way for enabling scalable parallel generation of TCG code. Instead of tracking TBs with a single binary search tree (BST), use a BST for each TCG region, protecting it with a lock. This is as scalable as it gets, since each TCG thread operates on a separate region. The core of this change is the introduction of struct tcg_region_tree, which contains a pointer to a GTree and an associated lock to serialize accesses to it. We then allocate an array of tcg_region_tree's, adding the appropriate padding to avoid false sharing based on qemu_dcache_linesize. Given a tc_ptr, we first find the corresponding region_tree. This is done by special-casing the first and last regions first, since they might be of size != region.size; otherwise we just divide the offset by region.stride. I was worried about this division (several dozen cycles of latency), but profiling shows that this is not a fast path. Note that region.stride is not required to be a power of two; it is only required to be a multiple of the host's page size. Note that with this design we can also provide consistent snapshots about all region trees at once; for instance, tcg_tb_foreach acquires/releases all region_tree locks before/after iterating over them. For this reason we now drop tb_lock in dump_exec_info(). As an alternative I considered implementing a concurrent BST, but this can be tricky to get right, offers no consistent snapshots of the BST, and performance and scalability-wise I don't think it could ever beat having separate GTrees, given that our workload is insert-mostly (all concurrent BST designs I've seen focus, understandably, on making lookups fast, which comes at the expense of convoluted, non-wait-free insertions/removals). Reviewed-by: Richard Henderson <richard.henderson@linaro.org> Reviewed-by: Alex Bennée <alex.bennee@linaro.org> Signed-off-by: Emilio G. Cota <cota@braap.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-07-26 23:58:05 +03:00
tcg_tb_foreach(tb_tree_stats_iter, &tst);
nb_tbs = tst.nb_tbs;
/* XXX: avoid using doubles ? */
g_string_append_printf(buf, "Translation buffer state:\n");
/*
* Report total code size including the padding and TB structs;
* otherwise users might think "-accel tcg,tb-size" is not honoured.
* For avg host size we use the precise numbers from tb_tree_stats though.
*/
g_string_append_printf(buf, "gen code size %zu/%zu\n",
tcg_code_size(), tcg_code_capacity());
g_string_append_printf(buf, "TB count %zu\n", nb_tbs);
g_string_append_printf(buf, "TB avg target size %zu max=%zu bytes\n",
nb_tbs ? tst.target_size / nb_tbs : 0,
tst.max_target_size);
g_string_append_printf(buf, "TB avg host size %zu bytes "
"(expansion ratio: %0.1f)\n",
nb_tbs ? tst.host_size / nb_tbs : 0,
tst.target_size ?
(double)tst.host_size / tst.target_size : 0);
g_string_append_printf(buf, "cross page TB count %zu (%zu%%)\n",
tst.cross_page,
nb_tbs ? (tst.cross_page * 100) / nb_tbs : 0);
g_string_append_printf(buf, "direct jump count %zu (%zu%%) "
"(2 jumps=%zu %zu%%)\n",
tst.direct_jmp_count,
nb_tbs ? (tst.direct_jmp_count * 100) / nb_tbs : 0,
tst.direct_jmp2_count,
nb_tbs ? (tst.direct_jmp2_count * 100) / nb_tbs : 0);
translate-all: add tb hash bucket info to 'info jit' dump Examples: - Good hashing, i.e. tb_hash_func5(phys_pc, pc, flags): TB count 715135/2684354 [...] TB hash buckets 388775/524288 (74.15% head buckets used) TB hash occupancy 33.04% avg chain occ. Histogram: [0,10)%|▆ █ ▅▁▃▁▁|[90,100]% TB hash avg chain 1.017 buckets. Histogram: 1|█▁▁|3 - Not-so-good hashing, i.e. tb_hash_func5(phys_pc, pc, 0): TB count 712636/2684354 [...] TB hash buckets 344924/524288 (65.79% head buckets used) TB hash occupancy 31.64% avg chain occ. Histogram: [0,10)%|█ ▆ ▅▁▃▁▂|[90,100]% TB hash avg chain 1.047 buckets. Histogram: 1|█▁▁▁|4 - Bad hashing, i.e. tb_hash_func5(phys_pc, 0, 0): TB count 702818/2684354 [...] TB hash buckets 112741/524288 (21.50% head buckets used) TB hash occupancy 10.15% avg chain occ. Histogram: [0,10)%|█ ▁ ▁▁▁▁▁|[90,100]% TB hash avg chain 2.107 buckets. Histogram: [1.0,10.2)|█▁▁▁▁▁▁▁▁▁|[83.8,93.0] - Good hashing, but no auto-resize: TB count 715634/2684354 TB hash buckets 8192/8192 (100.00% head buckets used) TB hash occupancy 98.30% avg chain occ. Histogram: [95.3,95.8)%|▁▁▃▄▃▄▁▇▁█|[99.5,100.0]% TB hash avg chain 22.070 buckets. Histogram: [15.0,16.7)|▁▂▅▄█▅▁▁▁▁|[30.3,32.0] Acked-by: Sergey Fedorov <sergey.fedorov@linaro.org> Suggested-by: Richard Henderson <rth@twiddle.net> Reviewed-by: Richard Henderson <rth@twiddle.net> Signed-off-by: Emilio G. Cota <cota@braap.org> Message-Id: <1465412133-3029-16-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-06-08 21:55:33 +03:00
qht_statistics_init(&tb_ctx.htable, &hst);
print_qht_statistics(hst, buf);
translate-all: add tb hash bucket info to 'info jit' dump Examples: - Good hashing, i.e. tb_hash_func5(phys_pc, pc, flags): TB count 715135/2684354 [...] TB hash buckets 388775/524288 (74.15% head buckets used) TB hash occupancy 33.04% avg chain occ. Histogram: [0,10)%|▆ █ ▅▁▃▁▁|[90,100]% TB hash avg chain 1.017 buckets. Histogram: 1|█▁▁|3 - Not-so-good hashing, i.e. tb_hash_func5(phys_pc, pc, 0): TB count 712636/2684354 [...] TB hash buckets 344924/524288 (65.79% head buckets used) TB hash occupancy 31.64% avg chain occ. Histogram: [0,10)%|█ ▆ ▅▁▃▁▂|[90,100]% TB hash avg chain 1.047 buckets. Histogram: 1|█▁▁▁|4 - Bad hashing, i.e. tb_hash_func5(phys_pc, 0, 0): TB count 702818/2684354 [...] TB hash buckets 112741/524288 (21.50% head buckets used) TB hash occupancy 10.15% avg chain occ. Histogram: [0,10)%|█ ▁ ▁▁▁▁▁|[90,100]% TB hash avg chain 2.107 buckets. Histogram: [1.0,10.2)|█▁▁▁▁▁▁▁▁▁|[83.8,93.0] - Good hashing, but no auto-resize: TB count 715634/2684354 TB hash buckets 8192/8192 (100.00% head buckets used) TB hash occupancy 98.30% avg chain occ. Histogram: [95.3,95.8)%|▁▁▃▄▃▄▁▇▁█|[99.5,100.0]% TB hash avg chain 22.070 buckets. Histogram: [15.0,16.7)|▁▂▅▄█▅▁▁▁▁|[30.3,32.0] Acked-by: Sergey Fedorov <sergey.fedorov@linaro.org> Suggested-by: Richard Henderson <rth@twiddle.net> Reviewed-by: Richard Henderson <rth@twiddle.net> Signed-off-by: Emilio G. Cota <cota@braap.org> Message-Id: <1465412133-3029-16-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-06-08 21:55:33 +03:00
qht_statistics_destroy(&hst);
g_string_append_printf(buf, "\nStatistics:\n");
g_string_append_printf(buf, "TB flush count %u\n",
qatomic_read(&tb_ctx.tb_flush_count));
g_string_append_printf(buf, "TB invalidate count %u\n",
qatomic_read(&tb_ctx.tb_phys_invalidate_count));
tlb_flush_counts(&flush_full, &flush_part, &flush_elide);
g_string_append_printf(buf, "TLB full flushes %zu\n", flush_full);
g_string_append_printf(buf, "TLB partial flushes %zu\n", flush_part);
g_string_append_printf(buf, "TLB elided flushes %zu\n", flush_elide);
tcg_dump_info(buf);
}
#else /* CONFIG_USER_ONLY */
void cpu_interrupt(CPUState *cpu, int mask)
{
tcg: drop global lock during TCG code execution This finally allows TCG to benefit from the iothread introduction: Drop the global mutex while running pure TCG CPU code. Reacquire the lock when entering MMIO or PIO emulation, or when leaving the TCG loop. We have to revert a few optimization for the current TCG threading model, namely kicking the TCG thread in qemu_mutex_lock_iothread and not kicking it in qemu_cpu_kick. We also need to disable RAM block reordering until we have a more efficient locking mechanism at hand. Still, a Linux x86 UP guest and my Musicpal ARM model boot fine here. These numbers demonstrate where we gain something: 20338 jan 20 0 331m 75m 6904 R 99 0.9 0:50.95 qemu-system-arm 20337 jan 20 0 331m 75m 6904 S 20 0.9 0:26.50 qemu-system-arm The guest CPU was fully loaded, but the iothread could still run mostly independent on a second core. Without the patch we don't get beyond 32206 jan 20 0 330m 73m 7036 R 82 0.9 1:06.00 qemu-system-arm 32204 jan 20 0 330m 73m 7036 S 21 0.9 0:17.03 qemu-system-arm We don't benefit significantly, though, when the guest is not fully loading a host CPU. Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> Message-Id: <1439220437-23957-10-git-send-email-fred.konrad@greensocs.com> [FK: Rebase, fix qemu_devices_reset deadlock, rm address_space_* mutex] Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com> [EGC: fixed iothread lock for cpu-exec IRQ handling] Signed-off-by: Emilio G. Cota <cota@braap.org> [AJB: -smp single-threaded fix, clean commit msg, BQL fixes] Signed-off-by: Alex Bennée <alex.bennee@linaro.org> Reviewed-by: Richard Henderson <rth@twiddle.net> Reviewed-by: Pranith Kumar <bobby.prani@gmail.com> [PM: target-arm changes] Acked-by: Peter Maydell <peter.maydell@linaro.org>
2017-02-23 21:29:11 +03:00
g_assert(qemu_mutex_iothread_locked());
cpu->interrupt_request |= mask;
qatomic_set(&cpu_neg(cpu)->icount_decr.u16.high, -1);
}
#endif /* CONFIG_USER_ONLY */
/*
* Called by generic code at e.g. cpu reset after cpu creation,
* therefore we must be prepared to allocate the jump cache.
*/
void tcg_flush_jmp_cache(CPUState *cpu)
{
CPUJumpCache *jc = cpu->tb_jmp_cache;
/* During early initialization, the cache may not yet be allocated. */
if (unlikely(jc == NULL)) {
return;
}
for (int i = 0; i < TB_JMP_CACHE_SIZE; i++) {
qatomic_set(&jc->array[i].tb, NULL);
}
}
/* This is a wrapper for common code that can not use CONFIG_SOFTMMU */
void tcg_flush_softmmu_tlb(CPUState *cs)
{
#ifdef CONFIG_SOFTMMU
tlb_flush(cs);
#endif
}