2003-06-15 23:58:51 +04:00
|
|
|
/*
|
|
|
|
* Host code generation
|
2007-09-17 01:08:06 +04:00
|
|
|
*
|
2003-06-15 23:58:51 +04:00
|
|
|
* Copyright (c) 2003 Fabrice Bellard
|
|
|
|
*
|
|
|
|
* This library is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
|
|
* License as published by the Free Software Foundation; either
|
2019-01-23 17:08:56 +03:00
|
|
|
* version 2.1 of the License, or (at your option) any later version.
|
2003-06-15 23:58:51 +04:00
|
|
|
*
|
|
|
|
* This library is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
* Lesser General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU Lesser General Public
|
2009-07-17 00:47:01 +04:00
|
|
|
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
|
2003-06-15 23:58:51 +04:00
|
|
|
*/
|
2019-05-23 17:35:05 +03:00
|
|
|
|
2016-01-26 21:16:56 +03:00
|
|
|
#include "qemu/osdep.h"
|
2003-06-15 23:58:51 +04:00
|
|
|
|
2004-01-05 02:28:12 +03:00
|
|
|
#define NO_CPU_IO_DEFS
|
2017-06-02 09:06:45 +03:00
|
|
|
#include "trace.h"
|
2012-10-24 13:12:21 +04:00
|
|
|
#include "disas/disas.h"
|
2016-03-15 15:18:37 +03:00
|
|
|
#include "exec/exec-all.h"
|
2020-01-01 14:23:00 +03:00
|
|
|
#include "tcg/tcg.h"
|
2012-12-02 20:04:43 +04:00
|
|
|
#if defined(CONFIG_USER_ONLY)
|
|
|
|
#include "qemu.h"
|
|
|
|
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
|
|
|
|
#include <sys/param.h>
|
|
|
|
#if __FreeBSD_version >= 700104
|
|
|
|
#define HAVE_KINFO_GETVMMAP
|
|
|
|
#define sigqueue sigqueue_freebsd /* avoid redefinition */
|
|
|
|
#include <sys/proc.h>
|
|
|
|
#include <machine/profile.h>
|
|
|
|
#define _KERNEL
|
|
|
|
#include <sys/user.h>
|
|
|
|
#undef _KERNEL
|
|
|
|
#undef sigqueue
|
|
|
|
#include <libutil.h>
|
|
|
|
#endif
|
|
|
|
#endif
|
2013-04-08 19:29:59 +04:00
|
|
|
#else
|
2018-05-30 12:58:36 +03:00
|
|
|
#include "exec/ram_addr.h"
|
2012-12-02 20:04:43 +04:00
|
|
|
#endif
|
|
|
|
|
2012-12-17 21:19:49 +04:00
|
|
|
#include "exec/cputlb.h"
|
2020-12-16 15:27:58 +03:00
|
|
|
#include "exec/translate-all.h"
|
2022-08-11 23:48:03 +03:00
|
|
|
#include "exec/translator.h"
|
2023-03-03 05:57:43 +03:00
|
|
|
#include "exec/tb-flush.h"
|
2015-04-23 00:50:52 +03:00
|
|
|
#include "qemu/bitmap.h"
|
2019-04-17 22:17:52 +03:00
|
|
|
#include "qemu/qemu-print.h"
|
tcg: drop global lock during TCG code execution
This finally allows TCG to benefit from the iothread introduction: Drop
the global mutex while running pure TCG CPU code. Reacquire the lock
when entering MMIO or PIO emulation, or when leaving the TCG loop.
We have to revert a few optimizations for the current TCG threading
model, namely kicking the TCG thread in qemu_mutex_lock_iothread and not
kicking it in qemu_cpu_kick. We also need to disable RAM block
reordering until we have a more efficient locking mechanism at hand.
Still, a Linux x86 UP guest and my Musicpal ARM model boot fine here.
These numbers demonstrate where we gain something:
20338 jan 20 0 331m 75m 6904 R 99 0.9 0:50.95 qemu-system-arm
20337 jan 20 0 331m 75m 6904 S 20 0.9 0:26.50 qemu-system-arm
The guest CPU was fully loaded, but the iothread could still run mostly
independently on a second core. Without the patch we don't get beyond
32206 jan 20 0 330m 73m 7036 R 82 0.9 1:06.00 qemu-system-arm
32204 jan 20 0 330m 73m 7036 S 21 0.9 0:17.03 qemu-system-arm
We don't benefit significantly, though, when the guest is not fully
loading a host CPU.
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Message-Id: <1439220437-23957-10-git-send-email-fred.konrad@greensocs.com>
[FK: Rebase, fix qemu_devices_reset deadlock, rm address_space_* mutex]
Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com>
[EGC: fixed iothread lock for cpu-exec IRQ handling]
Signed-off-by: Emilio G. Cota <cota@braap.org>
[AJB: -smp single-threaded fix, clean commit msg, BQL fixes]
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Reviewed-by: Pranith Kumar <bobby.prani@gmail.com>
[PM: target-arm changes]
Acked-by: Peter Maydell <peter.maydell@linaro.org>
2017-02-23 21:29:11 +03:00
|
|
|
#include "qemu/main-loop.h"
|
2022-02-08 23:08:55 +03:00
|
|
|
#include "qemu/cacheinfo.h"
|
2023-03-03 11:49:48 +03:00
|
|
|
#include "qemu/timer.h"
|
2016-01-07 16:55:28 +03:00
|
|
|
#include "exec/log.h"
|
2017-03-03 14:01:16 +03:00
|
|
|
#include "sysemu/cpus.h"
|
2020-08-19 14:17:19 +03:00
|
|
|
#include "sysemu/cpu-timers.h"
|
2019-05-23 17:35:05 +03:00
|
|
|
#include "sysemu/tcg.h"
|
2020-10-29 06:14:54 +03:00
|
|
|
#include "qapi/error.h"
|
2021-02-13 16:03:13 +03:00
|
|
|
#include "hw/core/tcg-cpu-ops.h"
|
2022-08-15 23:13:05 +03:00
|
|
|
#include "tb-jmp-cache.h"
|
2021-05-24 20:04:53 +03:00
|
|
|
#include "tb-hash.h"
|
|
|
|
#include "tb-context.h"
|
2021-01-21 09:15:06 +03:00
|
|
|
#include "internal.h"
|
2023-01-12 18:20:13 +03:00
|
|
|
#include "perf.h"
|
2012-12-02 20:04:43 +04:00
|
|
|
|
2017-07-04 11:42:32 +03:00
|
|
|
/* Make sure all possible CPU event bits fit in tb->trace_vcpu_dstate */
|
|
|
|
QEMU_BUILD_BUG_ON(CPU_TRACE_DSTATE_MAX_EVENTS >
|
2018-06-14 19:44:31 +03:00
|
|
|
sizeof_field(TranslationBlock, trace_vcpu_dstate)
|
2017-07-04 11:42:32 +03:00
|
|
|
* BITS_PER_BYTE);
|
|
|
|
|
2017-06-24 03:04:43 +03:00
|
|
|
TBContext tb_ctx;
|
2003-06-15 23:58:51 +04:00
|
|
|
|
2015-09-02 05:11:45 +03:00
|
|
|
/* Encode VAL as a signed leb128 sequence at P.
|
|
|
|
Return P incremented past the encoded value. */
|
|
|
|
static uint8_t *encode_sleb128(uint8_t *p, target_long val)
|
|
|
|
{
|
|
|
|
int more, byte;
|
|
|
|
|
|
|
|
do {
|
|
|
|
byte = val & 0x7f;
|
|
|
|
val >>= 7;
|
|
|
|
more = !((val == 0 && (byte & 0x40) == 0)
|
|
|
|
|| (val == -1 && (byte & 0x40) != 0));
|
|
|
|
if (more) {
|
|
|
|
byte |= 0x80;
|
|
|
|
}
|
|
|
|
*p++ = byte;
|
|
|
|
} while (more);
|
|
|
|
|
|
|
|
return p;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Decode a signed leb128 sequence at *PP; increment *PP past the
|
|
|
|
decoded value. Return the decoded value. */
|
2020-10-28 22:05:44 +03:00
|
|
|
static target_long decode_sleb128(const uint8_t **pp)
|
2015-09-02 05:11:45 +03:00
|
|
|
{
|
2020-10-28 22:05:44 +03:00
|
|
|
const uint8_t *p = *pp;
|
2015-09-02 05:11:45 +03:00
|
|
|
target_long val = 0;
|
|
|
|
int byte, shift = 0;
|
|
|
|
|
|
|
|
do {
|
|
|
|
byte = *p++;
|
|
|
|
val |= (target_ulong)(byte & 0x7f) << shift;
|
|
|
|
shift += 7;
|
|
|
|
} while (byte & 0x80);
|
|
|
|
if (shift < TARGET_LONG_BITS && (byte & 0x40)) {
|
|
|
|
val |= -(target_ulong)1 << shift;
|
|
|
|
}
|
|
|
|
|
|
|
|
*pp = p;
|
|
|
|
return val;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Encode the data collected about the instructions while compiling TB.
|
|
|
|
Place the data at BLOCK, and return the number of bytes consumed.
|
|
|
|
|
2017-10-19 01:01:42 +03:00
|
|
|
The logical table consists of TARGET_INSN_START_WORDS target_ulong's,
|
2015-09-02 05:11:45 +03:00
|
|
|
which come from the target's insn_start data, followed by a uintptr_t
|
|
|
|
which comes from the host pc of the end of the code implementing the insn.
|
|
|
|
|
|
|
|
Each line of the table is encoded as sleb128 deltas from the previous
|
2017-07-12 07:08:21 +03:00
|
|
|
line. The seed for the first line is { tb->pc, 0..., tb->tc.ptr }.
|
2015-09-02 05:11:45 +03:00
|
|
|
That is, the first column is seeded with the guest pc, the last column
|
|
|
|
with the host pc, and the middle columns with zeros. */
|
|
|
|
|
|
|
|
static int encode_search(TranslationBlock *tb, uint8_t *block)
|
|
|
|
{
|
2017-07-13 00:15:52 +03:00
|
|
|
uint8_t *highwater = tcg_ctx->code_gen_highwater;
|
2015-09-02 05:11:45 +03:00
|
|
|
uint8_t *p = block;
|
|
|
|
int i, j, n;
|
|
|
|
|
|
|
|
for (i = 0, n = tb->icount; i < n; ++i) {
|
|
|
|
target_ulong prev;
|
|
|
|
|
|
|
|
for (j = 0; j < TARGET_INSN_START_WORDS; ++j) {
|
|
|
|
if (i == 0) {
|
2023-02-27 16:51:47 +03:00
|
|
|
prev = (!(tb_cflags(tb) & CF_PCREL) && j == 0 ? tb->pc : 0);
|
2015-09-02 05:11:45 +03:00
|
|
|
} else {
|
2017-07-13 00:15:52 +03:00
|
|
|
prev = tcg_ctx->gen_insn_data[i - 1][j];
|
2015-09-02 05:11:45 +03:00
|
|
|
}
|
2017-07-13 00:15:52 +03:00
|
|
|
p = encode_sleb128(p, tcg_ctx->gen_insn_data[i][j] - prev);
|
2015-09-02 05:11:45 +03:00
|
|
|
}
|
2017-07-13 00:15:52 +03:00
|
|
|
prev = (i == 0 ? 0 : tcg_ctx->gen_insn_end_off[i - 1]);
|
|
|
|
p = encode_sleb128(p, tcg_ctx->gen_insn_end_off[i] - prev);
|
2015-09-22 23:01:15 +03:00
|
|
|
|
|
|
|
/* Test for (pending) buffer overflow. The assumption is that any
|
|
|
|
one row beginning below the high water mark cannot overrun
|
|
|
|
the buffer completely. Thus we can test for overflow after
|
|
|
|
encoding a row without having to check during encoding. */
|
|
|
|
if (unlikely(p > highwater)) {
|
|
|
|
return -1;
|
|
|
|
}
|
2015-09-02 05:11:45 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
return p - block;
|
|
|
|
}
|
|
|
|
|
2022-10-24 15:15:04 +03:00
|
|
|
/*
 * Recover the insn-start words for the guest instruction whose host
 * code contains HOST_PC (after adjusting by GETPC_ADJ).
 *
 * The sleb128 delta table stored directly after the TB's host code
 * (at tb->tc.ptr + tb->tc.size) is replayed row by row into DATA,
 * which must have room for TARGET_INSN_START_WORDS entries.
 *
 * Returns the number of table rows from the matching one to the end of
 * the TB (the matching row included), or -1 if the adjusted HOST_PC
 * lies before the TB's code or no row ends beyond it.
 */
static int cpu_unwind_data_from_tb(TranslationBlock *tb, uintptr_t host_pc,
                                   uint64_t *data)
{
    uintptr_t insn_end = (uintptr_t)tb->tc.ptr;
    const uint8_t *stream = tb->tc.ptr + tb->tc.size;
    int remaining = tb->icount;
    int col;

    host_pc -= GETPC_ADJ;
    if (host_pc < insn_end) {
        return -1;
    }

    /* Seed row: all zeros, except column 0 is tb->pc unless CF_PCREL. */
    memset(data, 0, TARGET_INSN_START_WORDS * sizeof(data[0]));
    if (!(tb_cflags(tb) & CF_PCREL)) {
        data[0] = tb->pc;
    }

    /*
     * Apply the deltas one row at a time, stopping at the first row
     * whose end-of-insn host address lies past host_pc.
     */
    while (remaining > 0) {
        for (col = 0; col < TARGET_INSN_START_WORDS; col++) {
            data[col] += decode_sleb128(&stream);
        }
        insn_end += decode_sleb128(&stream);
        if (insn_end > host_pc) {
            return remaining;
        }
        remaining--;
    }

    return -1;
}
|
|
|
|
|
|
|
|
/*
|
2022-10-24 16:12:56 +03:00
|
|
|
* The cpu state corresponding to 'host_pc' is restored in
|
|
|
|
* preparation for exiting the TB.
|
2022-10-24 15:15:04 +03:00
|
|
|
*/
|
|
|
|
void cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
|
2022-10-24 16:12:56 +03:00
|
|
|
uintptr_t host_pc)
|
2022-10-24 15:15:04 +03:00
|
|
|
{
|
|
|
|
uint64_t data[TARGET_INSN_START_WORDS];
|
|
|
|
#ifdef CONFIG_PROFILER
|
|
|
|
TCGProfile *prof = &tcg_ctx->prof;
|
|
|
|
int64_t ti = profile_getclock();
|
|
|
|
#endif
|
|
|
|
int insns_left = cpu_unwind_data_from_tb(tb, host_pc, data);
|
|
|
|
|
|
|
|
if (insns_left < 0) {
|
|
|
|
return;
|
|
|
|
}
|
2007-09-17 12:09:54 +04:00
|
|
|
|
2022-10-24 16:12:56 +03:00
|
|
|
if (tb_cflags(tb) & CF_USE_ICOUNT) {
|
2020-08-19 14:17:19 +03:00
|
|
|
assert(icount_enabled());
|
2022-10-24 15:15:04 +03:00
|
|
|
/*
|
|
|
|
* Reset the cycle counter to the start of the block and
|
|
|
|
* shift if to the number of actually executed instructions.
|
|
|
|
*/
|
|
|
|
cpu_neg(cpu)->icount_decr.u16.low += insns_left;
|
2008-06-29 05:03:05 +04:00
|
|
|
}
|
2022-10-24 12:43:40 +03:00
|
|
|
|
2022-10-24 14:17:39 +03:00
|
|
|
cpu->cc->tcg_ops->restore_state_to_opc(cpu, tb, data);
|
2008-02-01 13:50:11 +03:00
|
|
|
|
|
|
|
#ifdef CONFIG_PROFILER
|
2020-09-23 13:56:46 +03:00
|
|
|
qatomic_set(&prof->restore_time,
|
2017-07-06 02:35:06 +03:00
|
|
|
prof->restore_time + profile_getclock() - ti);
|
2020-09-23 13:56:46 +03:00
|
|
|
qatomic_set(&prof->restore_count, prof->restore_count + 1);
|
2008-02-01 13:50:11 +03:00
|
|
|
#endif
|
2003-06-15 23:58:51 +04:00
|
|
|
}
|
2012-12-02 20:04:43 +04:00
|
|
|
|
2022-10-24 16:09:57 +03:00
|
|
|
/*
 * Restore the cpu state for 'host_pc' if a TB containing it can be found.
 *
 * Returns true if a TB was found and cpu_restore_state_from_tb() was
 * called for it, false otherwise.
 */
bool cpu_restore_state(CPUState *cpu, uintptr_t host_pc)
{
    TranslationBlock *tb;

    /*
     * The host_pc has to be in the rx region of the code buffer.
     * If it is not, we will not be able to resolve it here.
     * The two cases where host_pc will not be correct are:
     *
     *  - fault during translation (instruction fetch)
     *  - fault from helper (not using GETPC() macro)
     *
     * Either way we need to return early, as we can't resolve it here.
     */
    if (!in_code_gen_buffer((const void *)(host_pc - tcg_splitwx_diff))) {
        return false;
    }

    tb = tcg_tb_lookup(host_pc);
    if (tb == NULL) {
        return false;
    }

    cpu_restore_state_from_tb(cpu, tb, host_pc);
    return true;
}
|
|
|
|
|
2022-10-24 15:15:04 +03:00
|
|
|
/*
 * Fill 'data' with the insn-start words for 'host_pc' without touching
 * any cpu state ('cpu' is not used by this function).
 *
 * Returns true if 'host_pc' lies in the code buffer, a TB is found for
 * it, and the unwind data could be reconstructed; false otherwise.
 */
bool cpu_unwind_state_data(CPUState *cpu, uintptr_t host_pc, uint64_t *data)
{
    TranslationBlock *tb;

    if (!in_code_gen_buffer((const void *)(host_pc - tcg_splitwx_diff))) {
        return false;
    }

    tb = tcg_tb_lookup(host_pc);
    return tb != NULL && cpu_unwind_data_from_tb(tb, host_pc, data) >= 0;
}
|
|
|
|
|
2021-03-10 02:42:16 +03:00
|
|
|
/*
 * Page-related start-up: runs page_size_init() and then
 * page_table_config_init(), in that order.
 */
void page_init(void)
{
    page_size_init();
    page_table_config_init();
}
|
|
|
|
|
2022-11-06 03:12:33 +03:00
|
|
|
/*
 * Isolate the portion of code gen which can setjmp/longjmp.
 * Return the size of the generated code, or negative on error.
 *
 * @env:       CPU architecture state; its CPUState is handed to the
 *             translator front end.
 * @tb:        translation block being filled in.
 * @pc:        guest pc passed through to gen_intermediate_code() and
 *             tcg_gen_code().
 * @host_pc:   passed through unchanged to gen_intermediate_code().
 * @max_insns: in: instruction limit given to the front end;
 *             out: overwritten with tb->icount once the front end is done.
 * @ti:        profiling timestamp; only read and updated when
 *             CONFIG_PROFILER is defined.
 */
static int setjmp_gen_code(CPUArchState *env, TranslationBlock *tb,
                           target_ulong pc, void *host_pc,
                           int *max_insns, int64_t *ti)
{
    /*
     * sigsetjmp() yields 0 when called directly.  A non-zero value means
     * control came back here through a jump to tcg_ctx->jmp_trans; that
     * value is handed straight back to the caller.
     * NOTE(review): presumably the jump is raised from inside the calls
     * below with a negative error code -- confirm against the TCG code.
     */
    int ret = sigsetjmp(tcg_ctx->jmp_trans, 0);
    if (unlikely(ret != 0)) {
        return ret;
    }

    tcg_func_start(tcg_ctx);

    /* tcg_ctx->cpu is set only for the duration of the front-end call. */
    tcg_ctx->cpu = env_cpu(env);
    gen_intermediate_code(env_cpu(env), tb, max_insns, pc, host_pc);
    assert(tb->size != 0);
    tcg_ctx->cpu = NULL;
    /* Report back how many instructions ended up in the TB. */
    *max_insns = tb->icount;

#ifdef CONFIG_PROFILER
    qatomic_set(&tcg_ctx->prof.tb_count, tcg_ctx->prof.tb_count + 1);
    /* Charge the time elapsed since *ti to interm_time ... */
    qatomic_set(&tcg_ctx->prof.interm_time,
                tcg_ctx->prof.interm_time + profile_getclock() - *ti);
    /* ... and restart the clock so later timing starts from this point. */
    *ti = profile_getclock();
#endif

    return tcg_gen_code(tcg_ctx, tb, pc);
}
|
|
|
|
|
2015-08-11 11:59:50 +03:00
|
|
|
/* Called with mmap_lock held for user mode emulation. */
|
2013-09-01 19:43:17 +04:00
|
|
|
TranslationBlock *tb_gen_code(CPUState *cpu,
|
2012-12-02 20:04:43 +04:00
|
|
|
target_ulong pc, target_ulong cs_base,
|
2016-04-07 20:19:22 +03:00
|
|
|
uint32_t flags, int cflags)
|
2012-12-02 20:04:43 +04:00
|
|
|
{
|
2013-09-01 19:43:17 +04:00
|
|
|
CPUArchState *env = cpu->env_ptr;
|
2017-08-01 22:40:16 +03:00
|
|
|
TranslationBlock *tb, *existing_tb;
|
2022-08-11 07:39:29 +03:00
|
|
|
tb_page_addr_t phys_pc;
|
2015-08-28 04:17:40 +03:00
|
|
|
tcg_insn_unit *gen_code_buf;
|
2019-04-16 09:54:54 +03:00
|
|
|
int gen_code_size, search_size, max_insns;
|
2015-08-28 04:17:40 +03:00
|
|
|
#ifdef CONFIG_PROFILER
|
2017-07-06 02:35:06 +03:00
|
|
|
TCGProfile *prof = &tcg_ctx->prof;
|
2015-08-28 04:17:40 +03:00
|
|
|
#endif
|
2022-11-06 03:12:33 +03:00
|
|
|
int64_t ti;
|
2022-08-11 23:48:03 +03:00
|
|
|
void *host_pc;
|
2019-10-23 19:20:47 +03:00
|
|
|
|
2016-10-27 18:10:05 +03:00
|
|
|
assert_memory_lock();
|
2021-01-13 06:28:07 +03:00
|
|
|
qemu_thread_jit_write();
|
2012-12-02 20:04:43 +04:00
|
|
|
|
2022-08-11 23:48:03 +03:00
|
|
|
phys_pc = get_page_addr_code_hostp(env, pc, &host_pc);
|
2015-09-22 23:01:15 +03:00
|
|
|
|
2018-08-14 19:17:19 +03:00
|
|
|
if (phys_pc == -1) {
|
2021-02-13 16:03:20 +03:00
|
|
|
/* Generate a one-shot TB with 1 insn in it */
|
2021-04-15 19:24:53 +03:00
|
|
|
cflags = (cflags & ~CF_COUNT_MASK) | CF_LAST_IO | 1;
|
2018-08-14 19:17:19 +03:00
|
|
|
}
|
|
|
|
|
2019-04-16 09:54:54 +03:00
|
|
|
max_insns = cflags & CF_COUNT_MASK;
|
|
|
|
if (max_insns == 0) {
|
|
|
|
max_insns = TCG_MAX_INSNS;
|
|
|
|
}
|
2021-07-18 01:18:39 +03:00
|
|
|
QEMU_BUILD_BUG_ON(CF_COUNT_MASK + 1 != TCG_MAX_INSNS);
|
|
|
|
|
tcg: introduce regions to split code_gen_buffer
This is groundwork for supporting multiple TCG contexts.
The naive solution here is to split code_gen_buffer statically
among the TCG threads; this however results in poor utilization
if translation needs are different across TCG threads.
What we do here is to add an extra layer of indirection, assigning
regions that act just like pages do in virtual memory allocation.
(BTW if you are wondering about the chosen naming, I did not want
to use blocks or pages because those are already heavily used in QEMU).
We use a global lock to serialize allocations as well as statistics
reporting (we now export the size of the used code_gen_buffer with
tcg_code_size()). Note that for the allocator we could just use
a counter and atomic_inc; however, that would complicate the gathering
of tcg_code_size()-like stats. So given that the region operations are
not a fast path, a lock seems the most reasonable choice.
The effectiveness of this approach is clear after seeing some numbers.
I used the bootup+shutdown of debian-arm with '-tb-size 80' as a benchmark.
Note that I'm evaluating this after enabling per-thread TCG (which
is done by a subsequent commit).
* -smp 1, 1 region (entire buffer):
qemu: flush code_size=83885014 nb_tbs=154739 avg_tb_size=357
qemu: flush code_size=83884902 nb_tbs=153136 avg_tb_size=363
qemu: flush code_size=83885014 nb_tbs=152777 avg_tb_size=364
qemu: flush code_size=83884950 nb_tbs=150057 avg_tb_size=373
qemu: flush code_size=83884998 nb_tbs=150234 avg_tb_size=373
qemu: flush code_size=83885014 nb_tbs=154009 avg_tb_size=360
qemu: flush code_size=83885014 nb_tbs=151007 avg_tb_size=370
qemu: flush code_size=83885014 nb_tbs=151816 avg_tb_size=367
That is, 8 flushes.
* -smp 8, 32 regions (80/32 MB per region) [i.e. this patch]:
qemu: flush code_size=76328008 nb_tbs=141040 avg_tb_size=356
qemu: flush code_size=75366534 nb_tbs=138000 avg_tb_size=361
qemu: flush code_size=76864546 nb_tbs=140653 avg_tb_size=361
qemu: flush code_size=76309084 nb_tbs=135945 avg_tb_size=375
qemu: flush code_size=74581856 nb_tbs=132909 avg_tb_size=375
qemu: flush code_size=73927256 nb_tbs=135616 avg_tb_size=360
qemu: flush code_size=78629426 nb_tbs=142896 avg_tb_size=365
qemu: flush code_size=76667052 nb_tbs=138508 avg_tb_size=368
Again, 8 flushes. Note how buffer utilization is not 100%, but it
is close. Smaller region sizes would yield higher utilization,
but we want region allocation to be rare (it acquires a lock), so
we do not want to go too small.
* -smp 8, static partitioning of 8 regions (10 MB per region):
qemu: flush code_size=21936504 nb_tbs=40570 avg_tb_size=354
qemu: flush code_size=11472174 nb_tbs=20633 avg_tb_size=370
qemu: flush code_size=11603976 nb_tbs=21059 avg_tb_size=365
qemu: flush code_size=23254872 nb_tbs=41243 avg_tb_size=377
qemu: flush code_size=28289496 nb_tbs=52057 avg_tb_size=358
qemu: flush code_size=43605160 nb_tbs=78896 avg_tb_size=367
qemu: flush code_size=45166552 nb_tbs=82158 avg_tb_size=364
qemu: flush code_size=63289640 nb_tbs=116494 avg_tb_size=358
qemu: flush code_size=51389960 nb_tbs=93937 avg_tb_size=362
qemu: flush code_size=59665928 nb_tbs=107063 avg_tb_size=372
qemu: flush code_size=38380824 nb_tbs=68597 avg_tb_size=374
qemu: flush code_size=44884568 nb_tbs=79901 avg_tb_size=376
qemu: flush code_size=50782632 nb_tbs=90681 avg_tb_size=374
qemu: flush code_size=39848888 nb_tbs=71433 avg_tb_size=372
qemu: flush code_size=64708840 nb_tbs=119052 avg_tb_size=359
qemu: flush code_size=49830008 nb_tbs=90992 avg_tb_size=362
qemu: flush code_size=68372408 nb_tbs=123442 avg_tb_size=368
qemu: flush code_size=33555560 nb_tbs=59514 avg_tb_size=378
qemu: flush code_size=44748344 nb_tbs=80974 avg_tb_size=367
qemu: flush code_size=37104248 nb_tbs=67609 avg_tb_size=364
That is, 20 flushes. Note how a static partitioning approach uses
the code buffer poorly, leading to many unnecessary flushes.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-07-08 02:24:20 +03:00
|
|
|
buffer_overflow:
|
2019-10-23 19:20:47 +03:00
|
|
|
tb = tcg_tb_alloc(tcg_ctx);
|
2015-09-22 23:01:15 +03:00
|
|
|
if (unlikely(!tb)) {
|
2012-12-02 20:04:43 +04:00
|
|
|
/* flush must be done */
|
2015-06-24 05:31:15 +03:00
|
|
|
tb_flush(cpu);
|
2016-08-02 20:27:43 +03:00
|
|
|
mmap_unlock();
|
2017-01-26 15:34:18 +03:00
|
|
|
/* Make the execution loop process the flush as soon as possible. */
|
|
|
|
cpu->exception_index = EXCP_INTERRUPT;
|
2016-08-02 20:27:43 +03:00
|
|
|
cpu_loop_exit(cpu);
|
2012-12-02 20:04:43 +04:00
|
|
|
}
|
2015-08-28 04:17:40 +03:00
|
|
|
|
2017-07-13 00:15:52 +03:00
|
|
|
gen_code_buf = tcg_ctx->code_gen_ptr;
|
2020-10-28 22:05:44 +03:00
|
|
|
tb->tc.ptr = tcg_splitwx_to_rx(gen_code_buf);
|
2023-02-27 16:51:39 +03:00
|
|
|
if (!(cflags & CF_PCREL)) {
|
|
|
|
tb->pc = pc;
|
|
|
|
}
|
2012-12-02 20:04:43 +04:00
|
|
|
tb->cs_base = cs_base;
|
|
|
|
tb->flags = flags;
|
|
|
|
tb->cflags = cflags;
|
2017-07-04 11:42:32 +03:00
|
|
|
tb->trace_vcpu_dstate = *cpu->trace_dstate;
|
2022-09-20 14:21:40 +03:00
|
|
|
tb_set_page_addr0(tb, phys_pc);
|
|
|
|
tb_set_page_addr1(tb, -1);
|
2022-11-27 05:39:55 +03:00
|
|
|
tcg_ctx->gen_tb = tb;
|
2019-04-16 11:06:39 +03:00
|
|
|
tb_overflow:
|
2015-08-28 04:17:40 +03:00
|
|
|
|
|
|
|
#ifdef CONFIG_PROFILER
|
2017-07-06 02:35:06 +03:00
|
|
|
/* includes aborted translations because of exceptions */
|
2020-09-23 13:56:46 +03:00
|
|
|
qatomic_set(&prof->tb_count1, prof->tb_count1 + 1);
|
2015-08-28 04:17:40 +03:00
|
|
|
ti = profile_getclock();
|
|
|
|
#endif
|
|
|
|
|
2022-08-15 23:16:06 +03:00
|
|
|
trace_translate_block(tb, pc, tb->tc.ptr);
|
2015-08-28 04:17:40 +03:00
|
|
|
|
2022-11-06 03:12:33 +03:00
|
|
|
gen_code_size = setjmp_gen_code(env, tb, pc, host_pc, &max_insns, &ti);
|
2015-09-22 23:01:15 +03:00
|
|
|
if (unlikely(gen_code_size < 0)) {
|
2019-04-16 11:06:39 +03:00
|
|
|
switch (gen_code_size) {
|
|
|
|
case -1:
|
|
|
|
/*
|
|
|
|
* Overflow of code_gen_buffer, or the current slice of it.
|
|
|
|
*
|
|
|
|
* TODO: We don't need to re-do gen_intermediate_code, nor
|
|
|
|
* should we re-do the tcg optimization currently hidden
|
|
|
|
* inside tcg_gen_code. All that should be required is to
|
|
|
|
* flush the TBs, allocate a new TB, re-initialize it per
|
|
|
|
* above, and re-do the actual code generation.
|
|
|
|
*/
|
2021-01-24 01:11:17 +03:00
|
|
|
qemu_log_mask(CPU_LOG_TB_OP | CPU_LOG_TB_OP_OPT,
|
|
|
|
"Restarting code generation for "
|
|
|
|
"code_gen_buffer overflow\n");
|
2019-04-16 11:06:39 +03:00
|
|
|
goto buffer_overflow;
|
|
|
|
|
|
|
|
case -2:
|
|
|
|
/*
|
|
|
|
* The code generated for the TranslationBlock is too large.
|
|
|
|
* The maximum size allowed by the unwind info is 64k.
|
|
|
|
* There may be stricter constraints from relocations
|
|
|
|
* in the tcg backend.
|
|
|
|
*
|
|
|
|
* Try again with half as many insns as we attempted this time.
|
|
|
|
* If a single insn overflows, there's a bug somewhere...
|
|
|
|
*/
|
|
|
|
assert(max_insns > 1);
|
|
|
|
max_insns /= 2;
|
2021-01-24 01:11:17 +03:00
|
|
|
qemu_log_mask(CPU_LOG_TB_OP | CPU_LOG_TB_OP_OPT,
|
|
|
|
"Restarting code generation with "
|
|
|
|
"smaller translation block (max %d insns)\n",
|
|
|
|
max_insns);
|
2019-04-16 11:06:39 +03:00
|
|
|
goto tb_overflow;
|
|
|
|
|
|
|
|
default:
|
|
|
|
g_assert_not_reached();
|
|
|
|
}
|
2015-09-22 23:01:15 +03:00
|
|
|
}
|
2015-09-02 05:11:45 +03:00
|
|
|
search_size = encode_search(tb, (void *)gen_code_buf + gen_code_size);
|
2015-09-22 23:01:15 +03:00
|
|
|
if (unlikely(search_size < 0)) {
|
|
|
|
goto buffer_overflow;
|
|
|
|
}
|
translate-all: use a binary search tree to track TBs in TBContext
This is a prerequisite for supporting multiple TCG contexts, since
we will have threads generating code in separate regions of
code_gen_buffer.
For this we need a new field (.size) in struct tb_tc to keep
track of the size of the translated code. This field uses a size_t
to avoid adding a hole to the struct, although really an unsigned
int would have been enough.
The comparison function we use is optimized for the common case:
insertions. Profiling shows that upon booting debian-arm, 98%
of comparisons are between existing tb's (i.e. a->size and b->size
are both !0), which happens during insertions (and removals, but
those are rare). The remaining cases are lookups. From reading the glib
sources we see that the first key is always the lookup key. However,
the code does not assume this to always be the case because this
behaviour is not guaranteed in the glib docs. However, we embed
this knowledge in the code as a branch hint for the compiler.
Note that tb_free does not free space in the code_gen_buffer anymore,
since we cannot easily know whether the tb is the last one inserted
in code_gen_buffer. The next patch in this series renames tb_free
to tb_remove to reflect this.
Performance-wise, lookups in tb_find_pc are the same as before:
O(log n). However, insertions are O(log n) instead of O(1), which
results in a small slowdown when booting debian-arm:
Performance counter stats for 'build/arm-softmmu/qemu-system-arm \
-machine type=virt -nographic -smp 1 -m 4096 \
-netdev user,id=unet,hostfwd=tcp::2222-:22 \
-device virtio-net-device,netdev=unet \
-drive file=img/arm/jessie-arm32.qcow2,id=myblock,index=0,if=none \
-device virtio-blk-device,drive=myblock \
-kernel img/arm/aarch32-current-linux-kernel-only.img \
-append console=ttyAMA0 root=/dev/vda1 \
-name arm,debug-threads=on -smp 1' (10 runs):
- Before:
8048.598422 task-clock (msec) # 0.931 CPUs utilized ( +- 0.28% )
16,974 context-switches # 0.002 M/sec ( +- 0.12% )
0 cpu-migrations # 0.000 K/sec
10,125 page-faults # 0.001 M/sec ( +- 1.23% )
35,144,901,879 cycles # 4.367 GHz ( +- 0.14% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
65,758,252,643 instructions # 1.87 insns per cycle ( +- 0.33% )
10,871,298,668 branches # 1350.707 M/sec ( +- 0.41% )
192,322,212 branch-misses # 1.77% of all branches ( +- 0.32% )
8.640869419 seconds time elapsed ( +- 0.57% )
- After:
8146.242027 task-clock (msec) # 0.923 CPUs utilized ( +- 1.23% )
17,016 context-switches # 0.002 M/sec ( +- 0.40% )
0 cpu-migrations # 0.000 K/sec
18,769 page-faults # 0.002 M/sec ( +- 0.45% )
35,660,956,120 cycles # 4.378 GHz ( +- 1.22% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
65,095,366,607 instructions # 1.83 insns per cycle ( +- 1.73% )
10,803,480,261 branches # 1326.192 M/sec ( +- 1.95% )
195,601,289 branch-misses # 1.81% of all branches ( +- 0.39% )
8.828660235 seconds time elapsed ( +- 0.38% )
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-06-24 02:00:11 +03:00
|
|
|
tb->tc.size = gen_code_size;
|
2015-08-28 04:17:40 +03:00
|
|
|
|
2023-01-12 18:20:13 +03:00
|
|
|
/*
|
2023-02-27 16:51:39 +03:00
|
|
|
* For CF_PCREL, attribute all executions of the generated code
|
|
|
|
* to its first mapping.
|
2023-01-12 18:20:13 +03:00
|
|
|
*/
|
|
|
|
perf_report_code(pc, tb, tcg_splitwx_to_rx(gen_code_buf));
|
|
|
|
|
2015-08-28 04:17:40 +03:00
|
|
|
#ifdef CONFIG_PROFILER
|
2020-09-23 13:56:46 +03:00
|
|
|
qatomic_set(&prof->code_time, prof->code_time + profile_getclock() - ti);
|
|
|
|
qatomic_set(&prof->code_in_len, prof->code_in_len + tb->size);
|
|
|
|
qatomic_set(&prof->code_out_len, prof->code_out_len + gen_code_size);
|
|
|
|
qatomic_set(&prof->search_out_len, prof->search_out_len + search_size);
|
2015-08-28 04:17:40 +03:00
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef DEBUG_DISAS
|
2016-03-15 17:30:21 +03:00
|
|
|
if (qemu_loglevel_mask(CPU_LOG_TB_OUT_ASM) &&
|
2022-08-15 23:16:06 +03:00
|
|
|
qemu_log_in_addr_range(pc)) {
|
2022-04-17 21:29:47 +03:00
|
|
|
FILE *logfile = qemu_log_trylock();
|
2022-04-17 21:29:49 +03:00
|
|
|
if (logfile) {
|
|
|
|
int code_size, data_size;
|
|
|
|
const tcg_target_ulong *rx_data_gen_ptr;
|
|
|
|
size_t chunk_start;
|
|
|
|
int insn = 0;
|
|
|
|
|
|
|
|
if (tcg_ctx->data_gen_ptr) {
|
|
|
|
rx_data_gen_ptr = tcg_splitwx_to_rx(tcg_ctx->data_gen_ptr);
|
|
|
|
code_size = (const void *)rx_data_gen_ptr - tb->tc.ptr;
|
|
|
|
data_size = gen_code_size - code_size;
|
|
|
|
} else {
|
|
|
|
rx_data_gen_ptr = 0;
|
|
|
|
code_size = gen_code_size;
|
|
|
|
data_size = 0;
|
|
|
|
}
|
2017-07-30 23:13:21 +03:00
|
|
|
|
2022-04-17 21:29:49 +03:00
|
|
|
/* Dump header and the first instruction */
|
|
|
|
fprintf(logfile, "OUT: [size=%d]\n", gen_code_size);
|
|
|
|
fprintf(logfile,
|
|
|
|
" -- guest addr 0x" TARGET_FMT_lx " + tb prologue\n",
|
|
|
|
tcg_ctx->gen_insn_data[insn][0]);
|
|
|
|
chunk_start = tcg_ctx->gen_insn_end_off[insn];
|
|
|
|
disas(logfile, tb->tc.ptr, chunk_start);
|
2017-07-30 23:13:21 +03:00
|
|
|
|
2022-04-17 21:29:49 +03:00
|
|
|
/*
|
|
|
|
* Dump each instruction chunk, wrapping up empty chunks into
|
|
|
|
* the next instruction. The whole array is offset so the
|
|
|
|
* first entry is the beginning of the 2nd instruction.
|
|
|
|
*/
|
|
|
|
while (insn < tb->icount) {
|
|
|
|
size_t chunk_end = tcg_ctx->gen_insn_end_off[insn];
|
|
|
|
if (chunk_end > chunk_start) {
|
|
|
|
fprintf(logfile, " -- guest addr 0x" TARGET_FMT_lx "\n",
|
|
|
|
tcg_ctx->gen_insn_data[insn][0]);
|
|
|
|
disas(logfile, tb->tc.ptr + chunk_start,
|
|
|
|
chunk_end - chunk_start);
|
|
|
|
chunk_start = chunk_end;
|
|
|
|
}
|
|
|
|
insn++;
|
2020-05-13 20:51:34 +03:00
|
|
|
}
|
|
|
|
|
2022-04-17 21:29:49 +03:00
|
|
|
if (chunk_start < code_size) {
|
|
|
|
fprintf(logfile, " -- tb slow paths + alignment\n");
|
|
|
|
disas(logfile, tb->tc.ptr + chunk_start,
|
|
|
|
code_size - chunk_start);
|
|
|
|
}
|
2020-09-10 22:15:04 +03:00
|
|
|
|
2022-04-17 21:29:49 +03:00
|
|
|
/* Finally dump any data we may have after the block */
|
|
|
|
if (data_size) {
|
|
|
|
int i;
|
|
|
|
fprintf(logfile, " data: [size=%d]\n", data_size);
|
|
|
|
for (i = 0; i < data_size / sizeof(tcg_target_ulong); i++) {
|
|
|
|
if (sizeof(tcg_target_ulong) == 8) {
|
|
|
|
fprintf(logfile,
|
|
|
|
"0x%08" PRIxPTR ": .quad 0x%016" TCG_PRIlx "\n",
|
|
|
|
(uintptr_t)&rx_data_gen_ptr[i], rx_data_gen_ptr[i]);
|
|
|
|
} else if (sizeof(tcg_target_ulong) == 4) {
|
|
|
|
fprintf(logfile,
|
|
|
|
"0x%08" PRIxPTR ": .long 0x%08" TCG_PRIlx "\n",
|
|
|
|
(uintptr_t)&rx_data_gen_ptr[i], rx_data_gen_ptr[i]);
|
|
|
|
} else {
|
|
|
|
qemu_build_not_reached();
|
|
|
|
}
|
2021-05-15 13:42:02 +03:00
|
|
|
}
|
2017-07-30 23:13:21 +03:00
|
|
|
}
|
2022-04-17 21:29:49 +03:00
|
|
|
fprintf(logfile, "\n");
|
|
|
|
qemu_log_unlock(logfile);
|
2017-07-30 23:13:21 +03:00
|
|
|
}
|
2015-08-28 04:17:40 +03:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2020-09-23 13:56:46 +03:00
|
|
|
qatomic_set(&tcg_ctx->code_gen_ptr, (void *)
|
2015-09-02 05:11:45 +03:00
|
|
|
ROUND_UP((uintptr_t)gen_code_buf + gen_code_size + search_size,
|
tcg: introduce regions to split code_gen_buffer
This is groundwork for supporting multiple TCG contexts.
The naive solution here is to split code_gen_buffer statically
among the TCG threads; this however results in poor utilization
if translation needs are different across TCG threads.
What we do here is to add an extra layer of indirection, assigning
regions that act just like pages do in virtual memory allocation.
(BTW if you are wondering about the chosen naming, I did not want
to use blocks or pages because those are already heavily used in QEMU).
We use a global lock to serialize allocations as well as statistics
reporting (we now export the size of the used code_gen_buffer with
tcg_code_size()). Note that for the allocator we could just use
a counter and atomic_inc; however, that would complicate the gathering
of tcg_code_size()-like stats. So given that the region operations are
not a fast path, a lock seems the most reasonable choice.
The effectiveness of this approach is clear after seeing some numbers.
I used the bootup+shutdown of debian-arm with '-tb-size 80' as a benchmark.
Note that I'm evaluating this after enabling per-thread TCG (which
is done by a subsequent commit).
* -smp 1, 1 region (entire buffer):
qemu: flush code_size=83885014 nb_tbs=154739 avg_tb_size=357
qemu: flush code_size=83884902 nb_tbs=153136 avg_tb_size=363
qemu: flush code_size=83885014 nb_tbs=152777 avg_tb_size=364
qemu: flush code_size=83884950 nb_tbs=150057 avg_tb_size=373
qemu: flush code_size=83884998 nb_tbs=150234 avg_tb_size=373
qemu: flush code_size=83885014 nb_tbs=154009 avg_tb_size=360
qemu: flush code_size=83885014 nb_tbs=151007 avg_tb_size=370
qemu: flush code_size=83885014 nb_tbs=151816 avg_tb_size=367
That is, 8 flushes.
* -smp 8, 32 regions (80/32 MB per region) [i.e. this patch]:
qemu: flush code_size=76328008 nb_tbs=141040 avg_tb_size=356
qemu: flush code_size=75366534 nb_tbs=138000 avg_tb_size=361
qemu: flush code_size=76864546 nb_tbs=140653 avg_tb_size=361
qemu: flush code_size=76309084 nb_tbs=135945 avg_tb_size=375
qemu: flush code_size=74581856 nb_tbs=132909 avg_tb_size=375
qemu: flush code_size=73927256 nb_tbs=135616 avg_tb_size=360
qemu: flush code_size=78629426 nb_tbs=142896 avg_tb_size=365
qemu: flush code_size=76667052 nb_tbs=138508 avg_tb_size=368
Again, 8 flushes. Note how buffer utilization is not 100%, but it
is close. Smaller region sizes would yield higher utilization,
but we want region allocation to be rare (it acquires a lock), so
we do not want to go too small.
* -smp 8, static partitioning of 8 regions (10 MB per region):
qemu: flush code_size=21936504 nb_tbs=40570 avg_tb_size=354
qemu: flush code_size=11472174 nb_tbs=20633 avg_tb_size=370
qemu: flush code_size=11603976 nb_tbs=21059 avg_tb_size=365
qemu: flush code_size=23254872 nb_tbs=41243 avg_tb_size=377
qemu: flush code_size=28289496 nb_tbs=52057 avg_tb_size=358
qemu: flush code_size=43605160 nb_tbs=78896 avg_tb_size=367
qemu: flush code_size=45166552 nb_tbs=82158 avg_tb_size=364
qemu: flush code_size=63289640 nb_tbs=116494 avg_tb_size=358
qemu: flush code_size=51389960 nb_tbs=93937 avg_tb_size=362
qemu: flush code_size=59665928 nb_tbs=107063 avg_tb_size=372
qemu: flush code_size=38380824 nb_tbs=68597 avg_tb_size=374
qemu: flush code_size=44884568 nb_tbs=79901 avg_tb_size=376
qemu: flush code_size=50782632 nb_tbs=90681 avg_tb_size=374
qemu: flush code_size=39848888 nb_tbs=71433 avg_tb_size=372
qemu: flush code_size=64708840 nb_tbs=119052 avg_tb_size=359
qemu: flush code_size=49830008 nb_tbs=90992 avg_tb_size=362
qemu: flush code_size=68372408 nb_tbs=123442 avg_tb_size=368
qemu: flush code_size=33555560 nb_tbs=59514 avg_tb_size=378
qemu: flush code_size=44748344 nb_tbs=80974 avg_tb_size=367
qemu: flush code_size=37104248 nb_tbs=67609 avg_tb_size=364
That is, 20 flushes. Note how a static partitioning approach uses
the code buffer poorly, leading to many unnecessary flushes.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-07-08 02:24:20 +03:00
|
|
|
CODE_GEN_ALIGN));
|
2012-12-02 20:04:43 +04:00
|
|
|
|
2016-03-22 19:00:12 +03:00
|
|
|
/* init jump list */
|
translate-all: protect TB jumps with a per-destination-TB lock
This applies to both user-mode and !user-mode emulation.
Instead of relying on a global lock, protect the list of incoming
jumps with tb->jmp_lock. This lock also protects tb->cflags,
so update all tb->cflags readers outside tb->jmp_lock to use
atomic reads via tb_cflags().
In order to find the destination TB (and therefore its jmp_lock)
from the origin TB, we introduce tb->jmp_dest[].
I considered not using a linked list of jumps, which simplifies
code and makes the struct smaller. However, it unnecessarily increases
memory usage, which results in a performance decrease. See for
instance these numbers booting+shutting down debian-arm:
Time (s) Rel. err (%) Abs. err (s) Rel. slowdown (%)
------------------------------------------------------------------------------
before 20.88 0.74 0.154512 0.
after 20.81 0.38 0.079078 -0.33524904
GTree 21.02 0.28 0.058856 0.67049808
GHashTable + xxhash 21.63 1.08 0.233604 3.5919540
Using a hash table or a binary tree to keep track of the jumps
doesn't really pay off, not only due to the increased memory usage,
but also because most TBs have only 0 or 1 jumps to them. The maximum
number of jumps when booting debian-arm that I measured is 35, but
as we can see in the histogram below a TB with that many incoming jumps
is extremely rare; the average TB has 0.80 incoming jumps.
n_jumps: 379208; avg jumps/tb: 0.801099
dist: [0.0,1.0)|▄█▁▁▁▁▁▁▁▁▁▁▁ ▁▁▁▁▁▁ ▁▁▁ ▁▁▁ ▁|[34.0,35.0]
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-08-03 03:34:06 +03:00
|
|
|
qemu_spin_init(&tb->jmp_lock);
|
|
|
|
tb->jmp_list_head = (uintptr_t)NULL;
|
2016-03-22 19:00:12 +03:00
|
|
|
tb->jmp_list_next[0] = (uintptr_t)NULL;
|
|
|
|
tb->jmp_list_next[1] = (uintptr_t)NULL;
|
translate-all: protect TB jumps with a per-destination-TB lock
This applies to both user-mode and !user-mode emulation.
Instead of relying on a global lock, protect the list of incoming
jumps with tb->jmp_lock. This lock also protects tb->cflags,
so update all tb->cflags readers outside tb->jmp_lock to use
atomic reads via tb_cflags().
In order to find the destination TB (and therefore its jmp_lock)
from the origin TB, we introduce tb->jmp_dest[].
I considered not using a linked list of jumps, which simplifies
code and makes the struct smaller. However, it unnecessarily increases
memory usage, which results in a performance decrease. See for
instance these numbers booting+shutting down debian-arm:
Time (s) Rel. err (%) Abs. err (s) Rel. slowdown (%)
------------------------------------------------------------------------------
before 20.88 0.74 0.154512 0.
after 20.81 0.38 0.079078 -0.33524904
GTree 21.02 0.28 0.058856 0.67049808
GHashTable + xxhash 21.63 1.08 0.233604 3.5919540
Using a hash table or a binary tree to keep track of the jumps
doesn't really pay off, not only due to the increased memory usage,
but also because most TBs have only 0 or 1 jumps to them. The maximum
number of jumps when booting debian-arm that I measured is 35, but
as we can see in the histogram below a TB with that many incoming jumps
is extremely rare; the average TB has 0.80 incoming jumps.
n_jumps: 379208; avg jumps/tb: 0.801099
dist: [0.0,1.0)|▄█▁▁▁▁▁▁▁▁▁▁▁ ▁▁▁▁▁▁ ▁▁▁ ▁▁▁ ▁|[34.0,35.0]
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-08-03 03:34:06 +03:00
|
|
|
tb->jmp_dest[0] = (uintptr_t)NULL;
|
|
|
|
tb->jmp_dest[1] = (uintptr_t)NULL;
|
2016-03-22 19:00:12 +03:00
|
|
|
|
2018-07-12 22:44:54 +03:00
|
|
|
/* init original jump addresses which have been set during tcg_gen_code() */
|
2022-11-27 05:20:57 +03:00
|
|
|
if (tb->jmp_reset_offset[0] != TB_JMP_OFFSET_INVALID) {
|
2016-03-22 19:00:12 +03:00
|
|
|
tb_reset_jump(tb, 0);
|
|
|
|
}
|
2022-11-27 05:20:57 +03:00
|
|
|
if (tb->jmp_reset_offset[1] != TB_JMP_OFFSET_INVALID) {
|
2016-03-22 19:00:12 +03:00
|
|
|
tb_reset_jump(tb, 1);
|
|
|
|
}
|
|
|
|
|
2021-02-13 16:03:20 +03:00
|
|
|
/*
|
2022-08-11 07:39:29 +03:00
|
|
|
* If the TB is not associated with a physical RAM page then it must be
|
|
|
|
* a temporary one-insn TB, and we have nothing left to do. Return early
|
|
|
|
* before attempting to link to other TBs or add to the lookup table.
|
2021-02-13 16:03:20 +03:00
|
|
|
*/
|
2022-09-20 14:21:40 +03:00
|
|
|
if (tb_page_addr0(tb) == -1) {
|
2021-02-13 16:03:20 +03:00
|
|
|
return tb;
|
|
|
|
}
|
|
|
|
|
2021-07-04 17:31:26 +03:00
|
|
|
/*
|
|
|
|
* Insert TB into the corresponding region tree before publishing it
|
|
|
|
* through QHT. Otherwise rewinding happened in the TB might fail to
|
|
|
|
* lookup itself using host PC.
|
|
|
|
*/
|
|
|
|
tcg_tb_insert(tb);
|
|
|
|
|
2017-08-05 06:46:31 +03:00
|
|
|
/*
|
|
|
|
* No explicit memory barrier is required -- tb_link_page() makes the
|
|
|
|
* TB visible in a consistent state.
|
2016-03-22 19:00:12 +03:00
|
|
|
*/
|
2022-09-20 14:21:40 +03:00
|
|
|
existing_tb = tb_link_page(tb, tb_page_addr0(tb), tb_page_addr1(tb));
|
2017-08-01 22:40:16 +03:00
|
|
|
/* if the TB already exists, discard what we just translated */
|
|
|
|
if (unlikely(existing_tb != tb)) {
|
|
|
|
uintptr_t orig_aligned = (uintptr_t)gen_code_buf;
|
|
|
|
|
|
|
|
orig_aligned -= ROUND_UP(sizeof(*tb), qemu_icache_linesize);
|
2020-09-23 13:56:46 +03:00
|
|
|
qatomic_set(&tcg_ctx->code_gen_ptr, (void *)orig_aligned);
|
2021-07-04 17:31:26 +03:00
|
|
|
tcg_tb_remove(tb);
|
2017-08-01 22:40:16 +03:00
|
|
|
return existing_tb;
|
|
|
|
}
|
2012-12-02 20:04:43 +04:00
|
|
|
return tb;
|
|
|
|
}
|
|
|
|
|
2017-08-05 06:46:31 +03:00
|
|
|
/* user-mode: call with mmap_lock held */
|
2019-09-22 06:24:12 +03:00
|
|
|
void tb_check_watchpoint(CPUState *cpu, uintptr_t retaddr)
|
2012-12-02 20:04:43 +04:00
|
|
|
{
|
|
|
|
TranslationBlock *tb;
|
|
|
|
|
2017-08-05 06:46:31 +03:00
|
|
|
assert_memory_lock();
|
|
|
|
|
2019-09-22 06:24:12 +03:00
|
|
|
tb = tcg_tb_lookup(retaddr);
|
2015-06-13 01:45:59 +03:00
|
|
|
if (tb) {
|
|
|
|
/* We can use retranslation to find the PC. */
|
2022-10-24 16:12:56 +03:00
|
|
|
cpu_restore_state_from_tb(cpu, tb, retaddr);
|
2015-06-13 01:45:59 +03:00
|
|
|
tb_phys_invalidate(tb, -1);
|
|
|
|
} else {
|
|
|
|
/* The exception probably happened in a helper. The CPU state should
|
|
|
|
have been saved before calling it. Fetch the PC from there. */
|
|
|
|
CPUArchState *env = cpu->env_ptr;
|
|
|
|
target_ulong pc, cs_base;
|
|
|
|
tb_page_addr_t addr;
|
2016-04-07 20:19:22 +03:00
|
|
|
uint32_t flags;
|
2015-06-13 01:45:59 +03:00
|
|
|
|
|
|
|
cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
|
|
|
|
addr = get_page_addr_code(env, pc);
|
2018-08-14 19:17:19 +03:00
|
|
|
if (addr != -1) {
|
|
|
|
tb_invalidate_phys_range(addr, addr + 1);
|
|
|
|
}
|
2012-12-02 20:04:43 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifndef CONFIG_USER_ONLY
|
2021-02-13 16:03:22 +03:00
|
|
|
/*
|
|
|
|
* In deterministic execution mode, instructions doing device I/Os
|
tcg: drop global lock during TCG code execution
This finally allows TCG to benefit from the iothread introduction: Drop
the global mutex while running pure TCG CPU code. Reacquire the lock
when entering MMIO or PIO emulation, or when leaving the TCG loop.
We have to revert a few optimization for the current TCG threading
model, namely kicking the TCG thread in qemu_mutex_lock_iothread and not
kicking it in qemu_cpu_kick. We also need to disable RAM block
reordering until we have a more efficient locking mechanism at hand.
Still, a Linux x86 UP guest and my Musicpal ARM model boot fine here.
These numbers demonstrate where we gain something:
20338 jan 20 0 331m 75m 6904 R 99 0.9 0:50.95 qemu-system-arm
20337 jan 20 0 331m 75m 6904 S 20 0.9 0:26.50 qemu-system-arm
The guest CPU was fully loaded, but the iothread could still run mostly
independent on a second core. Without the patch we don't get beyond
32206 jan 20 0 330m 73m 7036 R 82 0.9 1:06.00 qemu-system-arm
32204 jan 20 0 330m 73m 7036 S 21 0.9 0:17.03 qemu-system-arm
We don't benefit significantly, though, when the guest is not fully
loading a host CPU.
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Message-Id: <1439220437-23957-10-git-send-email-fred.konrad@greensocs.com>
[FK: Rebase, fix qemu_devices_reset deadlock, rm address_space_* mutex]
Signed-off-by: KONRAD Frederic <fred.konrad@greensocs.com>
[EGC: fixed iothread lock for cpu-exec IRQ handling]
Signed-off-by: Emilio G. Cota <cota@braap.org>
[AJB: -smp single-threaded fix, clean commit msg, BQL fixes]
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Reviewed-by: Pranith Kumar <bobby.prani@gmail.com>
[PM: target-arm changes]
Acked-by: Peter Maydell <peter.maydell@linaro.org>
2017-02-23 21:29:11 +03:00
|
|
|
* must be at the end of the TB.
|
|
|
|
*
|
|
|
|
* Called by softmmu_template.h, with iothread mutex not held.
|
|
|
|
*/
|
2013-09-01 19:21:47 +04:00
|
|
|
void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr)
|
2012-12-02 20:04:43 +04:00
|
|
|
{
|
|
|
|
TranslationBlock *tb;
|
2021-02-13 16:03:13 +03:00
|
|
|
CPUClass *cc;
|
2018-03-19 06:15:45 +03:00
|
|
|
uint32_t n;
|
2012-12-02 20:04:43 +04:00
|
|
|
|
tcg: track TBs with per-region BST's
This paves the way for enabling scalable parallel generation of TCG code.
Instead of tracking TBs with a single binary search tree (BST), use a
BST for each TCG region, protecting it with a lock. This is as scalable
as it gets, since each TCG thread operates on a separate region.
The core of this change is the introduction of struct tcg_region_tree,
which contains a pointer to a GTree and an associated lock to serialize
accesses to it. We then allocate an array of tcg_region_tree's, adding
the appropriate padding to avoid false sharing based on
qemu_dcache_linesize.
Given a tc_ptr, we first find the corresponding region_tree. This
is done by special-casing the first and last regions first, since they
might be of size != region.size; otherwise we just divide the offset
by region.stride. I was worried about this division (several dozen
cycles of latency), but profiling shows that this is not a fast path.
Note that region.stride is not required to be a power of two; it
is only required to be a multiple of the host's page size.
Note that with this design we can also provide consistent snapshots
about all region trees at once; for instance, tcg_tb_foreach
acquires/releases all region_tree locks before/after iterating over them.
For this reason we now drop tb_lock in dump_exec_info().
As an alternative I considered implementing a concurrent BST, but this
can be tricky to get right, offers no consistent snapshots of the BST,
and performance and scalability-wise I don't think it could ever beat
having separate GTrees, given that our workload is insert-mostly (all
concurrent BST designs I've seen focus, understandably, on making
lookups fast, which comes at the expense of convoluted, non-wait-free
insertions/removals).
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-07-26 23:58:05 +03:00
|
|
|
tb = tcg_tb_lookup(retaddr);
|
2012-12-02 20:04:43 +04:00
|
|
|
if (!tb) {
|
2013-09-03 19:38:47 +04:00
|
|
|
cpu_abort(cpu, "cpu_io_recompile: could not find TB for pc=%p",
|
2012-12-02 20:04:43 +04:00
|
|
|
(void *)retaddr);
|
|
|
|
}
|
2022-10-24 16:12:56 +03:00
|
|
|
cpu_restore_state_from_tb(cpu, tb, retaddr);
|
2018-03-19 06:15:45 +03:00
|
|
|
|
2021-02-13 16:03:13 +03:00
|
|
|
/*
|
|
|
|
* Some guests must re-execute the branch when re-executing a delay
|
|
|
|
* slot instruction. When this is the case, adjust icount and N
|
|
|
|
* to account for the re-execution of the branch.
|
|
|
|
*/
|
2018-03-19 06:15:45 +03:00
|
|
|
n = 1;
|
2021-02-13 16:03:13 +03:00
|
|
|
cc = CPU_GET_CLASS(cpu);
|
|
|
|
if (cc->tcg_ops->io_recompile_replay_branch &&
|
|
|
|
cc->tcg_ops->io_recompile_replay_branch(cpu, tb)) {
|
|
|
|
cpu_neg(cpu)->icount_decr.u16.low++;
|
|
|
|
n = 2;
|
|
|
|
}
|
2012-12-02 20:04:43 +04:00
|
|
|
|
2021-02-13 16:03:22 +03:00
|
|
|
/*
|
|
|
|
* Exit the loop and potentially generate a new TB executing the
|
|
|
|
* just the I/O insns. We also limit instrumentation to memory
|
|
|
|
* operations only (which execute after completion) so we don't
|
|
|
|
* double instrument the instruction.
|
|
|
|
*/
|
2021-02-24 19:58:08 +03:00
|
|
|
cpu->cflags_next_tb = curr_cflags(cpu) | CF_MEMI_ONLY | CF_LAST_IO | n;
|
2017-10-13 20:50:02 +03:00
|
|
|
|
2022-08-15 23:16:06 +03:00
|
|
|
if (qemu_loglevel_mask(CPU_LOG_EXEC)) {
|
|
|
|
target_ulong pc = log_pc(cpu, tb);
|
|
|
|
if (qemu_log_in_addr_range(pc)) {
|
|
|
|
qemu_log("cpu_io_recompile: rewound execution of TB to "
|
|
|
|
TARGET_FMT_lx "\n", pc);
|
|
|
|
}
|
|
|
|
}
|
2020-10-13 15:26:58 +03:00
|
|
|
|
2016-05-17 17:18:04 +03:00
|
|
|
cpu_loop_exit_noexc(cpu);
|
2012-12-02 20:04:43 +04:00
|
|
|
}
|
|
|
|
|
2021-09-08 12:35:43 +03:00
|
|
|
/*
 * Append a textual summary of the TB hash table statistics in @hst to @buf:
 * head-bucket usage, then the occupancy histogram, then the chain-length
 * histogram.  Nothing is appended when @hst reports zero head buckets.
 */
static void print_qht_statistics(struct qht_stats hst, GString *buf)
{
    uint32_t opts;
    size_t bins;
    char *text;

    if (hst.head_buckets == 0) {
        return;
    }

    g_string_append_printf(buf, "TB hash buckets %zu/%zu "
                           "(%0.2f%% head buckets used)\n",
                           hst.used_head_buckets, hst.head_buckets,
                           (double)hst.used_head_buckets /
                           hst.head_buckets * 100);

    /* Occupancy histogram: always rendered with 10 bins, as a percentage. */
    opts = QDIST_PR_BORDER | QDIST_PR_LABELS |
           QDIST_PR_100X | QDIST_PR_PERCENT;
    if (qdist_xmax(&hst.occupancy) - qdist_xmin(&hst.occupancy) == 1) {
        opts |= QDIST_PR_NODECIMAL;
    }
    text = qdist_pr(&hst.occupancy, 10, opts);
    g_string_append_printf(buf, "TB hash occupancy %0.2f%% avg chain occ. "
                           "Histogram: %s\n",
                           qdist_avg(&hst.occupancy) * 100, text);
    /* The string handed back by qdist_pr() is released here. */
    g_free(text);

    /*
     * Chain-length histogram: use 10 bins when the range exceeds 10,
     * otherwise pass 0 bins and drop decimals and bin ranges from the
     * labels.
     */
    opts = QDIST_PR_BORDER | QDIST_PR_LABELS;
    bins = qdist_xmax(&hst.chain) - qdist_xmin(&hst.chain);
    if (bins <= 10) {
        bins = 0;
        opts |= QDIST_PR_NODECIMAL | QDIST_PR_NOBINRANGE;
    } else {
        bins = 10;
    }
    text = qdist_pr(&hst.chain, bins, opts);
    g_string_append_printf(buf, "TB hash avg chain %0.3f buckets. "
                           "Histogram: %s\n",
                           qdist_avg(&hst.chain), text);
    g_free(text);
}
|
|
|
|
|
translate-all: use a binary search tree to track TBs in TBContext
This is a prerequisite for supporting multiple TCG contexts, since
we will have threads generating code in separate regions of
code_gen_buffer.
For this we need a new field (.size) in struct tb_tc to keep
track of the size of the translated code. This field uses a size_t
to avoid adding a hole to the struct, although really an unsigned
int would have been enough.
The comparison function we use is optimized for the common case:
insertions. Profiling shows that upon booting debian-arm, 98%
of comparisons are between existing tb's (i.e. a->size and b->size
are both !0), which happens during insertions (and removals, but
those are rare). The remaining cases are lookups. From reading the glib
sources we see that the first key is always the lookup key. However,
the code does not assume this to always be the case because this
behaviour is not guaranteed in the glib docs. However, we embed
this knowledge in the code as a branch hint for the compiler.
Note that tb_free does not free space in the code_gen_buffer anymore,
since we cannot easily know whether the tb is the last one inserted
in code_gen_buffer. The next patch in this series renames tb_free
to tb_remove to reflect this.
Performance-wise, lookups in tb_find_pc are the same as before:
O(log n). However, insertions are O(log n) instead of O(1), which
results in a small slowdown when booting debian-arm:
Performance counter stats for 'build/arm-softmmu/qemu-system-arm \
-machine type=virt -nographic -smp 1 -m 4096 \
-netdev user,id=unet,hostfwd=tcp::2222-:22 \
-device virtio-net-device,netdev=unet \
-drive file=img/arm/jessie-arm32.qcow2,id=myblock,index=0,if=none \
-device virtio-blk-device,drive=myblock \
-kernel img/arm/aarch32-current-linux-kernel-only.img \
-append console=ttyAMA0 root=/dev/vda1 \
-name arm,debug-threads=on -smp 1' (10 runs):
- Before:
8048.598422 task-clock (msec) # 0.931 CPUs utilized ( +- 0.28% )
16,974 context-switches # 0.002 M/sec ( +- 0.12% )
0 cpu-migrations # 0.000 K/sec
10,125 page-faults # 0.001 M/sec ( +- 1.23% )
35,144,901,879 cycles # 4.367 GHz ( +- 0.14% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
65,758,252,643 instructions # 1.87 insns per cycle ( +- 0.33% )
10,871,298,668 branches # 1350.707 M/sec ( +- 0.41% )
192,322,212 branch-misses # 1.77% of all branches ( +- 0.32% )
8.640869419 seconds time elapsed ( +- 0.57% )
- After:
8146.242027 task-clock (msec) # 0.923 CPUs utilized ( +- 1.23% )
17,016 context-switches # 0.002 M/sec ( +- 0.40% )
0 cpu-migrations # 0.000 K/sec
18,769 page-faults # 0.002 M/sec ( +- 0.45% )
35,660,956,120 cycles # 4.378 GHz ( +- 1.22% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
65,095,366,607 instructions # 1.83 insns per cycle ( +- 1.73% )
10,803,480,261 branches # 1326.192 M/sec ( +- 1.95% )
195,601,289 branch-misses # 1.81% of all branches ( +- 0.39% )
8.828660235 seconds time elapsed ( +- 0.38% )
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-06-24 02:00:11 +03:00
|
|
|
struct tb_tree_stats {
|
tcg: track TBs with per-region BST's
This paves the way for enabling scalable parallel generation of TCG code.
Instead of tracking TBs with a single binary search tree (BST), use a
BST for each TCG region, protecting it with a lock. This is as scalable
as it gets, since each TCG thread operates on a separate region.
The core of this change is the introduction of struct tcg_region_tree,
which contains a pointer to a GTree and an associated lock to serialize
accesses to it. We then allocate an array of tcg_region_tree's, adding
the appropriate padding to avoid false sharing based on
qemu_dcache_linesize.
Given a tc_ptr, we first find the corresponding region_tree. This
is done by special-casing the first and last regions first, since they
might be of size != region.size; otherwise we just divide the offset
by region.stride. I was worried about this division (several dozen
cycles of latency), but profiling shows that this is not a fast path.
Note that region.stride is not required to be a power of two; it
is only required to be a multiple of the host's page size.
Note that with this design we can also provide consistent snapshots
about all region trees at once; for instance, tcg_tb_foreach
acquires/releases all region_tree locks before/after iterating over them.
For this reason we now drop tb_lock in dump_exec_info().
As an alternative I considered implementing a concurrent BST, but this
can be tricky to get right, offers no consistent snapshots of the BST,
and performance and scalability-wise I don't think it could ever beat
having separate GTrees, given that our workload is insert-mostly (all
concurrent BST designs I've seen focus, understandably, on making
lookups fast, which comes at the expense of convoluted, non-wait-free
insertions/removals).
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-07-26 23:58:05 +03:00
|
|
|
size_t nb_tbs;
|
2017-06-24 03:57:44 +03:00
|
|
|
size_t host_size;
|
translate-all: use a binary search tree to track TBs in TBContext
This is a prerequisite for supporting multiple TCG contexts, since
we will have threads generating code in separate regions of
code_gen_buffer.
For this we need a new field (.size) in struct tb_tc to keep
track of the size of the translated code. This field uses a size_t
to avoid adding a hole to the struct, although really an unsigned
int would have been enough.
The comparison function we use is optimized for the common case:
insertions. Profiling shows that upon booting debian-arm, 98%
of comparisons are between existing tb's (i.e. a->size and b->size
are both !0), which happens during insertions (and removals, but
those are rare). The remaining cases are lookups. From reading the glib
sources we see that the first key is always the lookup key. However,
the code does not assume this to always be the case because this
behaviour is not guaranteed in the glib docs. However, we embed
this knowledge in the code as a branch hint for the compiler.
Note that tb_free does not free space in the code_gen_buffer anymore,
since we cannot easily know whether the tb is the last one inserted
in code_gen_buffer. The next patch in this series renames tb_free
to tb_remove to reflect this.
Performance-wise, lookups in tb_find_pc are the same as before:
O(log n). However, insertions are O(log n) instead of O(1), which
results in a small slowdown when booting debian-arm:
Performance counter stats for 'build/arm-softmmu/qemu-system-arm \
-machine type=virt -nographic -smp 1 -m 4096 \
-netdev user,id=unet,hostfwd=tcp::2222-:22 \
-device virtio-net-device,netdev=unet \
-drive file=img/arm/jessie-arm32.qcow2,id=myblock,index=0,if=none \
-device virtio-blk-device,drive=myblock \
-kernel img/arm/aarch32-current-linux-kernel-only.img \
-append console=ttyAMA0 root=/dev/vda1 \
-name arm,debug-threads=on -smp 1' (10 runs):
- Before:
8048.598422 task-clock (msec) # 0.931 CPUs utilized ( +- 0.28% )
16,974 context-switches # 0.002 M/sec ( +- 0.12% )
0 cpu-migrations # 0.000 K/sec
10,125 page-faults # 0.001 M/sec ( +- 1.23% )
35,144,901,879 cycles # 4.367 GHz ( +- 0.14% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
65,758,252,643 instructions # 1.87 insns per cycle ( +- 0.33% )
10,871,298,668 branches # 1350.707 M/sec ( +- 0.41% )
192,322,212 branch-misses # 1.77% of all branches ( +- 0.32% )
8.640869419 seconds time elapsed ( +- 0.57% )
- After:
8146.242027 task-clock (msec) # 0.923 CPUs utilized ( +- 1.23% )
17,016 context-switches # 0.002 M/sec ( +- 0.40% )
0 cpu-migrations # 0.000 K/sec
18,769 page-faults # 0.002 M/sec ( +- 0.45% )
35,660,956,120 cycles # 4.378 GHz ( +- 1.22% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
65,095,366,607 instructions # 1.83 insns per cycle ( +- 1.73% )
10,803,480,261 branches # 1326.192 M/sec ( +- 1.95% )
195,601,289 branch-misses # 1.81% of all branches ( +- 0.39% )
8.828660235 seconds time elapsed ( +- 0.38% )
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-06-24 02:00:11 +03:00
|
|
|
size_t target_size;
|
|
|
|
size_t max_target_size;
|
|
|
|
size_t direct_jmp_count;
|
|
|
|
size_t direct_jmp2_count;
|
|
|
|
size_t cross_page;
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Per-TB callback handed to tcg_tb_foreach() by dump_exec_info().
 *
 * @key:   unused.
 * @value: the TranslationBlock being visited.
 * @data:  the struct tb_tree_stats accumulator to update.
 *
 * Folds one TB into the running totals: TB count, host and target code
 * size, largest target size seen, TBs with a second page address, and
 * TBs with one / two valid direct-jump reset offsets.
 *
 * Always returns false; presumably this keeps the traversal going, as
 * with GLib's GTraverseFunc -- confirm against tcg_tb_foreach().
 */
static gboolean tb_tree_stats_iter(gpointer key, gpointer value, gpointer data)
{
    struct tb_tree_stats *stats = data;
    const TranslationBlock *block = value;

    stats->nb_tbs += 1;
    stats->host_size += block->tc.size;
    stats->target_size += block->size;

    /* Track the largest guest-side TB seen so far. */
    if (stats->max_target_size < block->size) {
        stats->max_target_size = block->size;
    }

    /* A second page address other than -1 means the TB spans two pages. */
    if (tb_page_addr1(block) != -1) {
        stats->cross_page += 1;
    }

    /* The second jump slot is only looked at when the first one is valid. */
    if (block->jmp_reset_offset[0] == TB_JMP_OFFSET_INVALID) {
        return false;
    }
    stats->direct_jmp_count += 1;

    if (block->jmp_reset_offset[1] == TB_JMP_OFFSET_INVALID) {
        return false;
    }
    stats->direct_jmp2_count += 1;

    return false;
}
|
|
|
|
|
2021-09-08 12:35:43 +03:00
|
|
|
/*
 * Append a textual report about the translation buffer to @buf.
 *
 * The report contains, in this order:
 *  - code buffer usage and per-TB statistics gathered by walking all TBs
 *    with tcg_tb_foreach();
 *  - statistics of the TB hash table (tb_ctx.htable);
 *  - TB flush / invalidate counters and TLB flush counters;
 *  - whatever tcg_dump_info() adds.
 *
 * Every average and percentage is reported as 0 when its divisor is 0.
 */
void dump_exec_info(GString *buf)
{
    struct tb_tree_stats stats = {};
    struct qht_stats ht_stats;
    size_t full_flushes, partial_flushes, elided_flushes;
    size_t tb_count;
    size_t avg_target_size = 0;
    size_t avg_host_size = 0;
    size_t cross_page_pct = 0;
    size_t direct_jmp_pct = 0;
    size_t direct_jmp2_pct = 0;
    /* XXX: avoid using doubles ? */
    double expansion_ratio = 0;

    tcg_tb_foreach(tb_tree_stats_iter, &stats);
    tb_count = stats.nb_tbs;

    /* Derived per-TB figures; left at 0 while there are no TBs. */
    if (tb_count) {
        avg_target_size = stats.target_size / tb_count;
        avg_host_size = stats.host_size / tb_count;
        cross_page_pct = (stats.cross_page * 100) / tb_count;
        direct_jmp_pct = (stats.direct_jmp_count * 100) / tb_count;
        direct_jmp2_pct = (stats.direct_jmp2_count * 100) / tb_count;
    }
    if (stats.target_size) {
        expansion_ratio = (double)stats.host_size / stats.target_size;
    }

    g_string_append_printf(buf, "Translation buffer state:\n");
    /*
     * The total below counts padding and TB structs as well, so that the
     * figure lines up with what "-accel tcg,tb-size" asked for.  The
     * average host size further down is based on the exact per-TB sizes
     * collected in tb_tree_stats instead.
     */
    g_string_append_printf(buf, "gen code size %zu/%zu\n",
                           tcg_code_size(), tcg_code_capacity());
    g_string_append_printf(buf, "TB count %zu\n", tb_count);
    g_string_append_printf(buf, "TB avg target size %zu max=%zu bytes\n",
                           avg_target_size, stats.max_target_size);
    g_string_append_printf(buf, "TB avg host size %zu bytes "
                           "(expansion ratio: %0.1f)\n",
                           avg_host_size, expansion_ratio);
    g_string_append_printf(buf, "cross page TB count %zu (%zu%%)\n",
                           stats.cross_page, cross_page_pct);
    g_string_append_printf(buf, "direct jump count %zu (%zu%%) "
                           "(2 jumps=%zu %zu%%)\n",
                           stats.direct_jmp_count, direct_jmp_pct,
                           stats.direct_jmp2_count, direct_jmp2_pct);

    /* Hash table statistics are initialised, printed and destroyed here. */
    qht_statistics_init(&tb_ctx.htable, &ht_stats);
    print_qht_statistics(ht_stats, buf);
    qht_statistics_destroy(&ht_stats);

    g_string_append_printf(buf, "\nStatistics:\n");
    g_string_append_printf(buf, "TB flush count %u\n",
                           qatomic_read(&tb_ctx.tb_flush_count));
    g_string_append_printf(buf, "TB invalidate count %u\n",
                           qatomic_read(&tb_ctx.tb_phys_invalidate_count));

    tlb_flush_counts(&full_flushes, &partial_flushes, &elided_flushes);
    g_string_append_printf(buf, "TLB full flushes %zu\n", full_flushes);
    g_string_append_printf(buf, "TLB partial flushes %zu\n", partial_flushes);
    g_string_append_printf(buf, "TLB elided flushes %zu\n", elided_flushes);
    tcg_dump_info(buf);
}
|
|
|
|
|
|
|
|
#else /* CONFIG_USER_ONLY */
|
|
|
|
|
2013-01-18 18:03:43 +04:00
|
|
|
/*
 * cpu_interrupt (CONFIG_USER_ONLY variant):
 * @cpu: the CPU on which to raise the interrupt
 * @mask: interrupt bits to OR into @cpu->interrupt_request
 *
 * The caller must hold the iothread lock; this is asserted on entry.
 */
void cpu_interrupt(CPUState *cpu, int mask)
{
    g_assert(qemu_mutex_iothread_locked());

    /* Record the newly pending interrupt bits. */
    cpu->interrupt_request |= mask;

    /*
     * Set the high half of icount_decr to -1.
     * NOTE(review): presumably this makes generated code leave the
     * current TB so the request is noticed -- confirm against cpu-exec.
     */
    qatomic_set(&cpu_neg(cpu)->icount_decr.u16.high, -1);
}
|
|
|
|
|
|
|
|
#endif /* CONFIG_USER_ONLY */
|
2017-06-26 08:22:55 +03:00
|
|
|
|
2022-08-15 23:13:05 +03:00
|
|
|
/*
|
|
|
|
* Called by generic code at e.g. cpu reset after cpu creation,
|
|
|
|
* therefore we must be prepared to allocate the jump cache.
|
|
|
|
*/
|
|
|
|
void tcg_flush_jmp_cache(CPUState *cpu)
|
|
|
|
{
|
|
|
|
CPUJumpCache *jc = cpu->tb_jmp_cache;
|
|
|
|
|
2022-10-31 05:26:36 +03:00
|
|
|
/* During early initialization, the cache may not yet be allocated. */
|
|
|
|
if (unlikely(jc == NULL)) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (int i = 0; i < TB_JMP_CACHE_SIZE; i++) {
|
|
|
|
qatomic_set(&jc->array[i].tb, NULL);
|
2022-08-15 23:13:05 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-06-26 08:22:55 +03:00
|
|
|
/* This is a wrapper for common code that can not use CONFIG_SOFTMMU */
|
|
|
|
void tcg_flush_softmmu_tlb(CPUState *cs)
|
|
|
|
{
|
|
|
|
#ifdef CONFIG_SOFTMMU
|
|
|
|
tlb_flush(cs);
|
|
|
|
#endif
|
|
|
|
}
|