qemu/include/exec/tb-lookup.h

52 lines
1.4 KiB
C
Raw Normal View History

tcg: consolidate TB lookups in tb_lookup__cpu_state This avoids duplicating code. cpu_exec_step will also use the new common function once we integrate parallel_cpus into tb->cflags. Note that in this commit we also fix a race, described by Richard Henderson during review. Think of this scenario with threads A and B: (A) Lookup succeeds for TB in hash without tb_lock (B) Sets the TB's tb->invalid flag (B) Removes the TB from tb_htable (B) Clears all CPU's tb_jmp_cache (A) Store TB into local tb_jmp_cache Given that order of events, (A) will keep executing that invalid TB until another flush of its tb_jmp_cache happens, which in theory might never happen. We can fix this by checking the tb->invalid flag every time we look up a TB from tb_jmp_cache, so that in the above scenario, next time we try to find that TB in tb_jmp_cache, we won't, and will therefore be forced to look it up in tb_htable. Performance-wise, I measured a small improvement when booting debian-arm. Note that inlining pays off: Performance counter stats for 'taskset -c 0 qemu-system-arm \ -machine type=virt -nographic -smp 1 -m 4096 \ -netdev user,id=unet,hostfwd=tcp::2222-:22 \ -device virtio-net-device,netdev=unet \ -drive file=jessie.qcow2,id=myblock,index=0,if=none \ -device virtio-blk-device,drive=myblock \ -kernel kernel.img -append console=ttyAMA0 root=/dev/vda1 \ -name arm,debug-threads=on -smp 1' (10 runs): Before: 18714.917392 task-clock # 0.952 CPUs utilized ( +- 0.95% ) 23,142 context-switches # 0.001 M/sec ( +- 0.50% ) 1 CPU-migrations # 0.000 M/sec 10,558 page-faults # 0.001 M/sec ( +- 0.95% ) 53,957,727,252 cycles # 2.883 GHz ( +- 0.91% ) [83.33%] 24,440,599,852 stalled-cycles-frontend # 45.30% frontend cycles idle ( +- 1.20% ) [83.33%] 16,495,714,424 stalled-cycles-backend # 30.57% backend cycles idle ( +- 0.95% ) [66.66%] 76,267,572,582 instructions # 1.41 insns per cycle # 0.32 stalled cycles per insn ( +- 0.87% ) [83.34%] 12,692,186,323 branches # 678.186 M/sec ( +- 0.92% ) [83.35%] 263,486,879 branch-misses # 2.08% of all branches ( +- 0.73% ) [83.34%] 19.648474449 seconds time elapsed ( +- 0.82% ) After, w/ inline (this patch): 18471.376627 task-clock # 0.955 CPUs utilized ( +- 0.96% ) 23,048 context-switches # 0.001 M/sec ( +- 0.48% ) 1 CPU-migrations # 0.000 M/sec 10,708 page-faults # 0.001 M/sec ( +- 0.81% ) 53,208,990,796 cycles # 2.881 GHz ( +- 0.98% ) [83.34%] 23,941,071,673 stalled-cycles-frontend # 44.99% frontend cycles idle ( +- 0.95% ) [83.34%] 16,161,773,848 stalled-cycles-backend # 30.37% backend cycles idle ( +- 0.76% ) [66.67%] 75,786,269,766 instructions # 1.42 insns per cycle # 0.32 stalled cycles per insn ( +- 1.24% ) [83.34%] 12,573,617,143 branches # 680.708 M/sec ( +- 1.34% ) [83.33%] 260,235,550 branch-misses # 2.07% of all branches ( +- 0.66% ) [83.33%] 19.340502161 seconds time elapsed ( +- 0.56% ) After, w/o inline: 18791.253967 task-clock # 0.954 CPUs utilized ( +- 0.78% ) 23,230 context-switches # 0.001 M/sec ( +- 0.42% ) 1 CPU-migrations # 0.000 M/sec 10,563 page-faults # 0.001 M/sec ( +- 1.27% ) 54,168,674,622 cycles # 2.883 GHz ( +- 0.80% ) [83.34%] 24,244,712,629 stalled-cycles-frontend # 44.76% frontend cycles idle ( +- 1.37% ) [83.33%] 16,288,648,572 stalled-cycles-backend # 30.07% backend cycles idle ( +- 0.95% ) [66.66%] 77,659,755,503 instructions # 1.43 insns per cycle # 0.31 stalled cycles per insn ( +- 0.97% ) [83.34%] 12,922,780,045 branches # 687.702 M/sec ( +- 1.06% ) [83.34%] 261,962,386 branch-misses # 2.03% of all branches ( +- 0.71% ) [83.35%] 19.700174670 seconds time elapsed ( +- 0.56% ) Reviewed-by: Richard Henderson <rth@twiddle.net> Signed-off-by: Emilio G. Cota <cota@braap.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-07-12 00:33:33 +03:00
/*
* Copyright (C) 2017, Emilio G. Cota <cota@braap.org>
*
* License: GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*/
#ifndef EXEC_TB_LOOKUP_H
#define EXEC_TB_LOOKUP_H
#ifdef NEED_CPU_H
#include "cpu.h"
#else
#include "exec/poison.h"
#endif
#include "exec/exec-all.h"
#include "exec/tb-hash.h"
/* Might cause an exception, so have a longjmp destination ready */
static inline TranslationBlock *
tb_lookup__cpu_state(CPUState *cpu, target_ulong *pc, target_ulong *cs_base,
uint32_t *flags, uint32_t cf_mask)
tcg: consolidate TB lookups in tb_lookup__cpu_state This avoids duplicating code. cpu_exec_step will also use the new common function once we integrate parallel_cpus into tb->cflags. Note that in this commit we also fix a race, described by Richard Henderson during review. Think of this scenario with threads A and B: (A) Lookup succeeds for TB in hash without tb_lock (B) Sets the TB's tb->invalid flag (B) Removes the TB from tb_htable (B) Clears all CPU's tb_jmp_cache (A) Store TB into local tb_jmp_cache Given that order of events, (A) will keep executing that invalid TB until another flush of its tb_jmp_cache happens, which in theory might never happen. We can fix this by checking the tb->invalid flag every time we look up a TB from tb_jmp_cache, so that in the above scenario, next time we try to find that TB in tb_jmp_cache, we won't, and will therefore be forced to look it up in tb_htable. Performance-wise, I measured a small improvement when booting debian-arm. Note that inlining pays off: Performance counter stats for 'taskset -c 0 qemu-system-arm \ -machine type=virt -nographic -smp 1 -m 4096 \ -netdev user,id=unet,hostfwd=tcp::2222-:22 \ -device virtio-net-device,netdev=unet \ -drive file=jessie.qcow2,id=myblock,index=0,if=none \ -device virtio-blk-device,drive=myblock \ -kernel kernel.img -append console=ttyAMA0 root=/dev/vda1 \ -name arm,debug-threads=on -smp 1' (10 runs): Before: 18714.917392 task-clock # 0.952 CPUs utilized ( +- 0.95% ) 23,142 context-switches # 0.001 M/sec ( +- 0.50% ) 1 CPU-migrations # 0.000 M/sec 10,558 page-faults # 0.001 M/sec ( +- 0.95% ) 53,957,727,252 cycles # 2.883 GHz ( +- 0.91% ) [83.33%] 24,440,599,852 stalled-cycles-frontend # 45.30% frontend cycles idle ( +- 1.20% ) [83.33%] 16,495,714,424 stalled-cycles-backend # 30.57% backend cycles idle ( +- 0.95% ) [66.66%] 76,267,572,582 instructions # 1.41 insns per cycle # 0.32 stalled cycles per insn ( +- 0.87% ) [83.34%] 12,692,186,323 branches # 678.186 M/sec ( +- 0.92% ) [83.35%] 263,486,879 branch-misses # 2.08% of all branches ( +- 0.73% ) [83.34%] 19.648474449 seconds time elapsed ( +- 0.82% ) After, w/ inline (this patch): 18471.376627 task-clock # 0.955 CPUs utilized ( +- 0.96% ) 23,048 context-switches # 0.001 M/sec ( +- 0.48% ) 1 CPU-migrations # 0.000 M/sec 10,708 page-faults # 0.001 M/sec ( +- 0.81% ) 53,208,990,796 cycles # 2.881 GHz ( +- 0.98% ) [83.34%] 23,941,071,673 stalled-cycles-frontend # 44.99% frontend cycles idle ( +- 0.95% ) [83.34%] 16,161,773,848 stalled-cycles-backend # 30.37% backend cycles idle ( +- 0.76% ) [66.67%] 75,786,269,766 instructions # 1.42 insns per cycle # 0.32 stalled cycles per insn ( +- 1.24% ) [83.34%] 12,573,617,143 branches # 680.708 M/sec ( +- 1.34% ) [83.33%] 260,235,550 branch-misses # 2.07% of all branches ( +- 0.66% ) [83.33%] 19.340502161 seconds time elapsed ( +- 0.56% ) After, w/o inline: 18791.253967 task-clock # 0.954 CPUs utilized ( +- 0.78% ) 23,230 context-switches # 0.001 M/sec ( +- 0.42% ) 1 CPU-migrations # 0.000 M/sec 10,563 page-faults # 0.001 M/sec ( +- 1.27% ) 54,168,674,622 cycles # 2.883 GHz ( +- 0.80% ) [83.34%] 24,244,712,629 stalled-cycles-frontend # 44.76% frontend cycles idle ( +- 1.37% ) [83.33%] 16,288,648,572 stalled-cycles-backend # 30.07% backend cycles idle ( +- 0.95% ) [66.66%] 77,659,755,503 instructions # 1.43 insns per cycle # 0.31 stalled cycles per insn ( +- 0.97% ) [83.34%] 12,922,780,045 branches # 687.702 M/sec ( +- 1.06% ) [83.34%] 261,962,386 branch-misses # 2.03% of all branches ( +- 0.71% ) [83.35%] 19.700174670 seconds time elapsed ( +- 0.56% ) Reviewed-by: Richard Henderson <rth@twiddle.net> Signed-off-by: Emilio G. Cota <cota@braap.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-07-12 00:33:33 +03:00
{
CPUArchState *env = (CPUArchState *)cpu->env_ptr;
TranslationBlock *tb;
uint32_t hash;
cpu_get_tb_cpu_state(env, pc, cs_base, flags);
hash = tb_jmp_cache_hash_func(*pc);
tb = qatomic_rcu_read(&cpu->tb_jmp_cache[hash]);
accel/tcg: Consider cluster index in tb_lookup__cpu_state() In commit f7b78602fdc6c6e4be we added the CPU cluster number to the cflags field of the TB hash; this included adding it to the value kept in tb->cflags, since we pass that field directly into the hash calculation in some places. Unfortunately we forgot to check whether other parts of the code were doing comparisons against tb->cflags that would need to be updated. It turns out that there is exactly one such place: the tb_lookup__cpu_state() function checks whether the TB it has found in the tb_jmp_cache has a tb->cflags matching the cf_mask that is passed in. The tb->cflags has the cluster_index in it but the cf_mask does not. Hoist the "add cluster index to the cf_mask" code up from tb_htable_lookup() to tb_lookup__cpu_state() so it can be considered in the "did this TB match in the jmp cache" condition, as well as when we do the full hash lookup by physical PC, flags, etc. (tb_htable_lookup() is only called from tb_lookup__cpu_state(), so this change doesn't require any further knock-on changes.) Fixes: f7b78602fdc6c6e4be ("accel/tcg: Add cluster number to TCG TB hash") Tested-by: Cleber Rosa <crosa@redhat.com> Tested-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk> Reported-by: Howard Spoelstra <hsp.cat7@gmail.com> Reported-by: Cleber Rosa <crosa@redhat.com> Signed-off-by: Peter Maydell <peter.maydell@linaro.org> Message-Id: <20190205151810.571-1-peter.maydell@linaro.org> Reviewed-by: Richard Henderson <richard.henderson@linaro.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2019-02-05 18:18:10 +03:00
cf_mask &= ~CF_CLUSTER_MASK;
cf_mask |= cpu->cluster_index << CF_CLUSTER_SHIFT;
tcg: consolidate TB lookups in tb_lookup__cpu_state This avoids duplicating code. cpu_exec_step will also use the new common function once we integrate parallel_cpus into tb->cflags. Note that in this commit we also fix a race, described by Richard Henderson during review. Think of this scenario with threads A and B: (A) Lookup succeeds for TB in hash without tb_lock (B) Sets the TB's tb->invalid flag (B) Removes the TB from tb_htable (B) Clears all CPU's tb_jmp_cache (A) Store TB into local tb_jmp_cache Given that order of events, (A) will keep executing that invalid TB until another flush of its tb_jmp_cache happens, which in theory might never happen. We can fix this by checking the tb->invalid flag every time we look up a TB from tb_jmp_cache, so that in the above scenario, next time we try to find that TB in tb_jmp_cache, we won't, and will therefore be forced to look it up in tb_htable. Performance-wise, I measured a small improvement when booting debian-arm. Note that inlining pays off: Performance counter stats for 'taskset -c 0 qemu-system-arm \ -machine type=virt -nographic -smp 1 -m 4096 \ -netdev user,id=unet,hostfwd=tcp::2222-:22 \ -device virtio-net-device,netdev=unet \ -drive file=jessie.qcow2,id=myblock,index=0,if=none \ -device virtio-blk-device,drive=myblock \ -kernel kernel.img -append console=ttyAMA0 root=/dev/vda1 \ -name arm,debug-threads=on -smp 1' (10 runs): Before: 18714.917392 task-clock # 0.952 CPUs utilized ( +- 0.95% ) 23,142 context-switches # 0.001 M/sec ( +- 0.50% ) 1 CPU-migrations # 0.000 M/sec 10,558 page-faults # 0.001 M/sec ( +- 0.95% ) 53,957,727,252 cycles # 2.883 GHz ( +- 0.91% ) [83.33%] 24,440,599,852 stalled-cycles-frontend # 45.30% frontend cycles idle ( +- 1.20% ) [83.33%] 16,495,714,424 stalled-cycles-backend # 30.57% backend cycles idle ( +- 0.95% ) [66.66%] 76,267,572,582 instructions # 1.41 insns per cycle # 0.32 stalled cycles per insn ( +- 0.87% ) [83.34%] 12,692,186,323 branches # 678.186 M/sec ( +- 0.92% ) [83.35%] 263,486,879 branch-misses # 2.08% of all branches ( +- 0.73% ) [83.34%] 19.648474449 seconds time elapsed ( +- 0.82% ) After, w/ inline (this patch): 18471.376627 task-clock # 0.955 CPUs utilized ( +- 0.96% ) 23,048 context-switches # 0.001 M/sec ( +- 0.48% ) 1 CPU-migrations # 0.000 M/sec 10,708 page-faults # 0.001 M/sec ( +- 0.81% ) 53,208,990,796 cycles # 2.881 GHz ( +- 0.98% ) [83.34%] 23,941,071,673 stalled-cycles-frontend # 44.99% frontend cycles idle ( +- 0.95% ) [83.34%] 16,161,773,848 stalled-cycles-backend # 30.37% backend cycles idle ( +- 0.76% ) [66.67%] 75,786,269,766 instructions # 1.42 insns per cycle # 0.32 stalled cycles per insn ( +- 1.24% ) [83.34%] 12,573,617,143 branches # 680.708 M/sec ( +- 1.34% ) [83.33%] 260,235,550 branch-misses # 2.07% of all branches ( +- 0.66% ) [83.33%] 19.340502161 seconds time elapsed ( +- 0.56% ) After, w/o inline: 18791.253967 task-clock # 0.954 CPUs utilized ( +- 0.78% ) 23,230 context-switches # 0.001 M/sec ( +- 0.42% ) 1 CPU-migrations # 0.000 M/sec 10,563 page-faults # 0.001 M/sec ( +- 1.27% ) 54,168,674,622 cycles # 2.883 GHz ( +- 0.80% ) [83.34%] 24,244,712,629 stalled-cycles-frontend # 44.76% frontend cycles idle ( +- 1.37% ) [83.33%] 16,288,648,572 stalled-cycles-backend # 30.07% backend cycles idle ( +- 0.95% ) [66.66%] 77,659,755,503 instructions # 1.43 insns per cycle # 0.31 stalled cycles per insn ( +- 0.97% ) [83.34%] 12,922,780,045 branches # 687.702 M/sec ( +- 1.06% ) [83.34%] 261,962,386 branch-misses # 2.03% of all branches ( +- 0.71% ) [83.35%] 19.700174670 seconds time elapsed ( +- 0.56% ) Reviewed-by: Richard Henderson <rth@twiddle.net> Signed-off-by: Emilio G. Cota <cota@braap.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-07-12 00:33:33 +03:00
if (likely(tb &&
tb->pc == *pc &&
tb->cs_base == *cs_base &&
tb->flags == *flags &&
tb->trace_vcpu_dstate == *cpu->trace_dstate &&
(tb_cflags(tb) & (CF_HASH_MASK | CF_INVALID)) == cf_mask)) {
tcg: consolidate TB lookups in tb_lookup__cpu_state This avoids duplicating code. cpu_exec_step will also use the new common function once we integrate parallel_cpus into tb->cflags. Note that in this commit we also fix a race, described by Richard Henderson during review. Think of this scenario with threads A and B: (A) Lookup succeeds for TB in hash without tb_lock (B) Sets the TB's tb->invalid flag (B) Removes the TB from tb_htable (B) Clears all CPU's tb_jmp_cache (A) Store TB into local tb_jmp_cache Given that order of events, (A) will keep executing that invalid TB until another flush of its tb_jmp_cache happens, which in theory might never happen. We can fix this by checking the tb->invalid flag every time we look up a TB from tb_jmp_cache, so that in the above scenario, next time we try to find that TB in tb_jmp_cache, we won't, and will therefore be forced to look it up in tb_htable. Performance-wise, I measured a small improvement when booting debian-arm. Note that inlining pays off: Performance counter stats for 'taskset -c 0 qemu-system-arm \ -machine type=virt -nographic -smp 1 -m 4096 \ -netdev user,id=unet,hostfwd=tcp::2222-:22 \ -device virtio-net-device,netdev=unet \ -drive file=jessie.qcow2,id=myblock,index=0,if=none \ -device virtio-blk-device,drive=myblock \ -kernel kernel.img -append console=ttyAMA0 root=/dev/vda1 \ -name arm,debug-threads=on -smp 1' (10 runs): Before: 18714.917392 task-clock # 0.952 CPUs utilized ( +- 0.95% ) 23,142 context-switches # 0.001 M/sec ( +- 0.50% ) 1 CPU-migrations # 0.000 M/sec 10,558 page-faults # 0.001 M/sec ( +- 0.95% ) 53,957,727,252 cycles # 2.883 GHz ( +- 0.91% ) [83.33%] 24,440,599,852 stalled-cycles-frontend # 45.30% frontend cycles idle ( +- 1.20% ) [83.33%] 16,495,714,424 stalled-cycles-backend # 30.57% backend cycles idle ( +- 0.95% ) [66.66%] 76,267,572,582 instructions # 1.41 insns per cycle # 0.32 stalled cycles per insn ( +- 0.87% ) [83.34%] 12,692,186,323 branches # 678.186 M/sec ( +- 0.92% ) [83.35%] 263,486,879 branch-misses # 2.08% of all branches ( +- 0.73% ) [83.34%] 19.648474449 seconds time elapsed ( +- 0.82% ) After, w/ inline (this patch): 18471.376627 task-clock # 0.955 CPUs utilized ( +- 0.96% ) 23,048 context-switches # 0.001 M/sec ( +- 0.48% ) 1 CPU-migrations # 0.000 M/sec 10,708 page-faults # 0.001 M/sec ( +- 0.81% ) 53,208,990,796 cycles # 2.881 GHz ( +- 0.98% ) [83.34%] 23,941,071,673 stalled-cycles-frontend # 44.99% frontend cycles idle ( +- 0.95% ) [83.34%] 16,161,773,848 stalled-cycles-backend # 30.37% backend cycles idle ( +- 0.76% ) [66.67%] 75,786,269,766 instructions # 1.42 insns per cycle # 0.32 stalled cycles per insn ( +- 1.24% ) [83.34%] 12,573,617,143 branches # 680.708 M/sec ( +- 1.34% ) [83.33%] 260,235,550 branch-misses # 2.07% of all branches ( +- 0.66% ) [83.33%] 19.340502161 seconds time elapsed ( +- 0.56% ) After, w/o inline: 18791.253967 task-clock # 0.954 CPUs utilized ( +- 0.78% ) 23,230 context-switches # 0.001 M/sec ( +- 0.42% ) 1 CPU-migrations # 0.000 M/sec 10,563 page-faults # 0.001 M/sec ( +- 1.27% ) 54,168,674,622 cycles # 2.883 GHz ( +- 0.80% ) [83.34%] 24,244,712,629 stalled-cycles-frontend # 44.76% frontend cycles idle ( +- 1.37% ) [83.33%] 16,288,648,572 stalled-cycles-backend # 30.07% backend cycles idle ( +- 0.95% ) [66.66%] 77,659,755,503 instructions # 1.43 insns per cycle # 0.31 stalled cycles per insn ( +- 0.97% ) [83.34%] 12,922,780,045 branches # 687.702 M/sec ( +- 1.06% ) [83.34%] 261,962,386 branch-misses # 2.03% of all branches ( +- 0.71% ) [83.35%] 19.700174670 seconds time elapsed ( +- 0.56% ) Reviewed-by: Richard Henderson <rth@twiddle.net> Signed-off-by: Emilio G. Cota <cota@braap.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-07-12 00:33:33 +03:00
return tb;
}
tb = tb_htable_lookup(cpu, *pc, *cs_base, *flags, cf_mask);
tcg: consolidate TB lookups in tb_lookup__cpu_state This avoids duplicating code. cpu_exec_step will also use the new common function once we integrate parallel_cpus into tb->cflags. Note that in this commit we also fix a race, described by Richard Henderson during review. Think of this scenario with threads A and B: (A) Lookup succeeds for TB in hash without tb_lock (B) Sets the TB's tb->invalid flag (B) Removes the TB from tb_htable (B) Clears all CPU's tb_jmp_cache (A) Store TB into local tb_jmp_cache Given that order of events, (A) will keep executing that invalid TB until another flush of its tb_jmp_cache happens, which in theory might never happen. We can fix this by checking the tb->invalid flag every time we look up a TB from tb_jmp_cache, so that in the above scenario, next time we try to find that TB in tb_jmp_cache, we won't, and will therefore be forced to look it up in tb_htable. Performance-wise, I measured a small improvement when booting debian-arm. Note that inlining pays off: Performance counter stats for 'taskset -c 0 qemu-system-arm \ -machine type=virt -nographic -smp 1 -m 4096 \ -netdev user,id=unet,hostfwd=tcp::2222-:22 \ -device virtio-net-device,netdev=unet \ -drive file=jessie.qcow2,id=myblock,index=0,if=none \ -device virtio-blk-device,drive=myblock \ -kernel kernel.img -append console=ttyAMA0 root=/dev/vda1 \ -name arm,debug-threads=on -smp 1' (10 runs): Before: 18714.917392 task-clock # 0.952 CPUs utilized ( +- 0.95% ) 23,142 context-switches # 0.001 M/sec ( +- 0.50% ) 1 CPU-migrations # 0.000 M/sec 10,558 page-faults # 0.001 M/sec ( +- 0.95% ) 53,957,727,252 cycles # 2.883 GHz ( +- 0.91% ) [83.33%] 24,440,599,852 stalled-cycles-frontend # 45.30% frontend cycles idle ( +- 1.20% ) [83.33%] 16,495,714,424 stalled-cycles-backend # 30.57% backend cycles idle ( +- 0.95% ) [66.66%] 76,267,572,582 instructions # 1.41 insns per cycle # 0.32 stalled cycles per insn ( +- 0.87% ) [83.34%] 12,692,186,323 branches # 678.186 M/sec ( +- 0.92% ) [83.35%] 263,486,879 branch-misses # 2.08% of all branches ( +- 0.73% ) [83.34%] 19.648474449 seconds time elapsed ( +- 0.82% ) After, w/ inline (this patch): 18471.376627 task-clock # 0.955 CPUs utilized ( +- 0.96% ) 23,048 context-switches # 0.001 M/sec ( +- 0.48% ) 1 CPU-migrations # 0.000 M/sec 10,708 page-faults # 0.001 M/sec ( +- 0.81% ) 53,208,990,796 cycles # 2.881 GHz ( +- 0.98% ) [83.34%] 23,941,071,673 stalled-cycles-frontend # 44.99% frontend cycles idle ( +- 0.95% ) [83.34%] 16,161,773,848 stalled-cycles-backend # 30.37% backend cycles idle ( +- 0.76% ) [66.67%] 75,786,269,766 instructions # 1.42 insns per cycle # 0.32 stalled cycles per insn ( +- 1.24% ) [83.34%] 12,573,617,143 branches # 680.708 M/sec ( +- 1.34% ) [83.33%] 260,235,550 branch-misses # 2.07% of all branches ( +- 0.66% ) [83.33%] 19.340502161 seconds time elapsed ( +- 0.56% ) After, w/o inline: 18791.253967 task-clock # 0.954 CPUs utilized ( +- 0.78% ) 23,230 context-switches # 0.001 M/sec ( +- 0.42% ) 1 CPU-migrations # 0.000 M/sec 10,563 page-faults # 0.001 M/sec ( +- 1.27% ) 54,168,674,622 cycles # 2.883 GHz ( +- 0.80% ) [83.34%] 24,244,712,629 stalled-cycles-frontend # 44.76% frontend cycles idle ( +- 1.37% ) [83.33%] 16,288,648,572 stalled-cycles-backend # 30.07% backend cycles idle ( +- 0.95% ) [66.66%] 77,659,755,503 instructions # 1.43 insns per cycle # 0.31 stalled cycles per insn ( +- 0.97% ) [83.34%] 12,922,780,045 branches # 687.702 M/sec ( +- 1.06% ) [83.34%] 261,962,386 branch-misses # 2.03% of all branches ( +- 0.71% ) [83.35%] 19.700174670 seconds time elapsed ( +- 0.56% ) Reviewed-by: Richard Henderson <rth@twiddle.net> Signed-off-by: Emilio G. Cota <cota@braap.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-07-12 00:33:33 +03:00
if (tb == NULL) {
return NULL;
}
qatomic_set(&cpu->tb_jmp_cache[hash], tb);
tcg: consolidate TB lookups in tb_lookup__cpu_state This avoids duplicating code. cpu_exec_step will also use the new common function once we integrate parallel_cpus into tb->cflags. Note that in this commit we also fix a race, described by Richard Henderson during review. Think of this scenario with threads A and B: (A) Lookup succeeds for TB in hash without tb_lock (B) Sets the TB's tb->invalid flag (B) Removes the TB from tb_htable (B) Clears all CPU's tb_jmp_cache (A) Store TB into local tb_jmp_cache Given that order of events, (A) will keep executing that invalid TB until another flush of its tb_jmp_cache happens, which in theory might never happen. We can fix this by checking the tb->invalid flag every time we look up a TB from tb_jmp_cache, so that in the above scenario, next time we try to find that TB in tb_jmp_cache, we won't, and will therefore be forced to look it up in tb_htable. Performance-wise, I measured a small improvement when booting debian-arm. Note that inlining pays off: Performance counter stats for 'taskset -c 0 qemu-system-arm \ -machine type=virt -nographic -smp 1 -m 4096 \ -netdev user,id=unet,hostfwd=tcp::2222-:22 \ -device virtio-net-device,netdev=unet \ -drive file=jessie.qcow2,id=myblock,index=0,if=none \ -device virtio-blk-device,drive=myblock \ -kernel kernel.img -append console=ttyAMA0 root=/dev/vda1 \ -name arm,debug-threads=on -smp 1' (10 runs): Before: 18714.917392 task-clock # 0.952 CPUs utilized ( +- 0.95% ) 23,142 context-switches # 0.001 M/sec ( +- 0.50% ) 1 CPU-migrations # 0.000 M/sec 10,558 page-faults # 0.001 M/sec ( +- 0.95% ) 53,957,727,252 cycles # 2.883 GHz ( +- 0.91% ) [83.33%] 24,440,599,852 stalled-cycles-frontend # 45.30% frontend cycles idle ( +- 1.20% ) [83.33%] 16,495,714,424 stalled-cycles-backend # 30.57% backend cycles idle ( +- 0.95% ) [66.66%] 76,267,572,582 instructions # 1.41 insns per cycle # 0.32 stalled cycles per insn ( +- 0.87% ) [83.34%] 12,692,186,323 branches # 678.186 M/sec ( +- 0.92% ) [83.35%] 263,486,879 branch-misses # 2.08% of all branches ( +- 0.73% ) [83.34%] 19.648474449 seconds time elapsed ( +- 0.82% ) After, w/ inline (this patch): 18471.376627 task-clock # 0.955 CPUs utilized ( +- 0.96% ) 23,048 context-switches # 0.001 M/sec ( +- 0.48% ) 1 CPU-migrations # 0.000 M/sec 10,708 page-faults # 0.001 M/sec ( +- 0.81% ) 53,208,990,796 cycles # 2.881 GHz ( +- 0.98% ) [83.34%] 23,941,071,673 stalled-cycles-frontend # 44.99% frontend cycles idle ( +- 0.95% ) [83.34%] 16,161,773,848 stalled-cycles-backend # 30.37% backend cycles idle ( +- 0.76% ) [66.67%] 75,786,269,766 instructions # 1.42 insns per cycle # 0.32 stalled cycles per insn ( +- 1.24% ) [83.34%] 12,573,617,143 branches # 680.708 M/sec ( +- 1.34% ) [83.33%] 260,235,550 branch-misses # 2.07% of all branches ( +- 0.66% ) [83.33%] 19.340502161 seconds time elapsed ( +- 0.56% ) After, w/o inline: 18791.253967 task-clock # 0.954 CPUs utilized ( +- 0.78% ) 23,230 context-switches # 0.001 M/sec ( +- 0.42% ) 1 CPU-migrations # 0.000 M/sec 10,563 page-faults # 0.001 M/sec ( +- 1.27% ) 54,168,674,622 cycles # 2.883 GHz ( +- 0.80% ) [83.34%] 24,244,712,629 stalled-cycles-frontend # 44.76% frontend cycles idle ( +- 1.37% ) [83.33%] 16,288,648,572 stalled-cycles-backend # 30.07% backend cycles idle ( +- 0.95% ) [66.66%] 77,659,755,503 instructions # 1.43 insns per cycle # 0.31 stalled cycles per insn ( +- 0.97% ) [83.34%] 12,922,780,045 branches # 687.702 M/sec ( +- 1.06% ) [83.34%] 261,962,386 branch-misses # 2.03% of all branches ( +- 0.71% ) [83.35%] 19.700174670 seconds time elapsed ( +- 0.56% ) Reviewed-by: Richard Henderson <rth@twiddle.net> Signed-off-by: Emilio G. Cota <cota@braap.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-07-12 00:33:33 +03:00
return tb;
}
#endif /* EXEC_TB_LOOKUP_H */