qemu/accel/tcg/tcg-runtime.c
Emilio G. Cota f6bb84d531 tcg: consolidate TB lookups in tb_lookup__cpu_state
This avoids duplicating code. cpu_exec_step will also use the
new common function once we integrate parallel_cpus into tb->cflags.

Note that in this commit we also fix a race, described by Richard Henderson
during review. Think of this scenario with threads A and B:

   (A) Lookup succeeds for TB in hash without tb_lock
        (B) Sets the TB's tb->invalid flag
        (B) Removes the TB from tb_htable
        (B) Clears all CPU's tb_jmp_cache
   (A) Store TB into local tb_jmp_cache

Given that order of events, (A) will keep executing that invalid TB until
another flush of its tb_jmp_cache happens, which in theory might never happen.
We can fix this by checking the tb->invalid flag every time we look up a TB
from tb_jmp_cache, so that in the above scenario, next time we try to find
that TB in tb_jmp_cache, we won't, and will therefore be forced to look it
up in tb_htable.

Performance-wise, I measured a small improvement when booting debian-arm.
Note that inlining pays off:

 Performance counter stats for 'taskset -c 0 qemu-system-arm \
	-machine type=virt -nographic -smp 1 -m 4096 \
	-netdev user,id=unet,hostfwd=tcp::2222-:22 \
	-device virtio-net-device,netdev=unet \
	-drive file=jessie.qcow2,id=myblock,index=0,if=none \
	-device virtio-blk-device,drive=myblock \
	-kernel kernel.img -append console=ttyAMA0 root=/dev/vda1 \
	-name arm,debug-threads=on -smp 1' (10 runs):

Before:
      18714.917392 task-clock                #    0.952 CPUs utilized            ( +-  0.95% )
            23,142 context-switches          #    0.001 M/sec                    ( +-  0.50% )
                 1 CPU-migrations            #    0.000 M/sec
            10,558 page-faults               #    0.001 M/sec                    ( +-  0.95% )
    53,957,727,252 cycles                    #    2.883 GHz                      ( +-  0.91% ) [83.33%]
    24,440,599,852 stalled-cycles-frontend   #   45.30% frontend cycles idle     ( +-  1.20% ) [83.33%]
    16,495,714,424 stalled-cycles-backend    #   30.57% backend  cycles idle     ( +-  0.95% ) [66.66%]
    76,267,572,582 instructions              #    1.41  insns per cycle
                                             #    0.32  stalled cycles per insn  ( +-  0.87% ) [83.34%]
    12,692,186,323 branches                  #  678.186 M/sec                    ( +-  0.92% ) [83.35%]
       263,486,879 branch-misses             #    2.08% of all branches          ( +-  0.73% ) [83.34%]

      19.648474449 seconds time elapsed                                          ( +-  0.82% )

After, w/ inline (this patch):
      18471.376627 task-clock                #    0.955 CPUs utilized            ( +-  0.96% )
            23,048 context-switches          #    0.001 M/sec                    ( +-  0.48% )
                 1 CPU-migrations            #    0.000 M/sec
            10,708 page-faults               #    0.001 M/sec                    ( +-  0.81% )
    53,208,990,796 cycles                    #    2.881 GHz                      ( +-  0.98% ) [83.34%]
    23,941,071,673 stalled-cycles-frontend   #   44.99% frontend cycles idle     ( +-  0.95% ) [83.34%]
    16,161,773,848 stalled-cycles-backend    #   30.37% backend  cycles idle     ( +-  0.76% ) [66.67%]
    75,786,269,766 instructions              #    1.42  insns per cycle
                                             #    0.32  stalled cycles per insn  ( +-  1.24% ) [83.34%]
    12,573,617,143 branches                  #  680.708 M/sec                    ( +-  1.34% ) [83.33%]
       260,235,550 branch-misses             #    2.07% of all branches          ( +-  0.66% ) [83.33%]

      19.340502161 seconds time elapsed                                          ( +-  0.56% )

After, w/o inline:
      18791.253967 task-clock                #    0.954 CPUs utilized            ( +-  0.78% )
            23,230 context-switches          #    0.001 M/sec                    ( +-  0.42% )
                 1 CPU-migrations            #    0.000 M/sec
            10,563 page-faults               #    0.001 M/sec                    ( +-  1.27% )
    54,168,674,622 cycles                    #    2.883 GHz                      ( +-  0.80% ) [83.34%]
    24,244,712,629 stalled-cycles-frontend   #   44.76% frontend cycles idle     ( +-  1.37% ) [83.33%]
    16,288,648,572 stalled-cycles-backend    #   30.07% backend  cycles idle     ( +-  0.95% ) [66.66%]
    77,659,755,503 instructions              #    1.43  insns per cycle
                                             #    0.31  stalled cycles per insn  ( +-  0.97% ) [83.34%]
    12,922,780,045 branches                  #  687.702 M/sec                    ( +-  1.06% ) [83.34%]
       261,962,386 branch-misses             #    2.03% of all branches          ( +-  0.71% ) [83.35%]

      19.700174670 seconds time elapsed                                          ( +-  0.56% )

Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-10-10 07:37:10 -07:00

169 lines
3.8 KiB
C

/*
* Tiny Code Generator for QEMU
*
* Copyright (c) 2008 Fabrice Bellard
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include "qemu/osdep.h"
#include "qemu/host-utils.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "exec/cpu_ldst.h"
#include "exec/exec-all.h"
#include "exec/tb-lookup.h"
#include "disas/disas.h"
#include "exec/log.h"
/* 32-bit helpers */
int32_t HELPER(div_i32)(int32_t arg1, int32_t arg2)
{
return arg1 / arg2;
}
int32_t HELPER(rem_i32)(int32_t arg1, int32_t arg2)
{
return arg1 % arg2;
}
uint32_t HELPER(divu_i32)(uint32_t arg1, uint32_t arg2)
{
return arg1 / arg2;
}
uint32_t HELPER(remu_i32)(uint32_t arg1, uint32_t arg2)
{
return arg1 % arg2;
}
/* 64-bit helpers */
uint64_t HELPER(shl_i64)(uint64_t arg1, uint64_t arg2)
{
return arg1 << arg2;
}
uint64_t HELPER(shr_i64)(uint64_t arg1, uint64_t arg2)
{
return arg1 >> arg2;
}
int64_t HELPER(sar_i64)(int64_t arg1, int64_t arg2)
{
return arg1 >> arg2;
}
int64_t HELPER(div_i64)(int64_t arg1, int64_t arg2)
{
return arg1 / arg2;
}
int64_t HELPER(rem_i64)(int64_t arg1, int64_t arg2)
{
return arg1 % arg2;
}
uint64_t HELPER(divu_i64)(uint64_t arg1, uint64_t arg2)
{
return arg1 / arg2;
}
uint64_t HELPER(remu_i64)(uint64_t arg1, uint64_t arg2)
{
return arg1 % arg2;
}
uint64_t HELPER(muluh_i64)(uint64_t arg1, uint64_t arg2)
{
uint64_t l, h;
mulu64(&l, &h, arg1, arg2);
return h;
}
int64_t HELPER(mulsh_i64)(int64_t arg1, int64_t arg2)
{
uint64_t l, h;
muls64(&l, &h, arg1, arg2);
return h;
}
uint32_t HELPER(clz_i32)(uint32_t arg, uint32_t zero_val)
{
return arg ? clz32(arg) : zero_val;
}
uint32_t HELPER(ctz_i32)(uint32_t arg, uint32_t zero_val)
{
return arg ? ctz32(arg) : zero_val;
}
uint64_t HELPER(clz_i64)(uint64_t arg, uint64_t zero_val)
{
return arg ? clz64(arg) : zero_val;
}
uint64_t HELPER(ctz_i64)(uint64_t arg, uint64_t zero_val)
{
return arg ? ctz64(arg) : zero_val;
}
uint32_t HELPER(clrsb_i32)(uint32_t arg)
{
return clrsb32(arg);
}
uint64_t HELPER(clrsb_i64)(uint64_t arg)
{
return clrsb64(arg);
}
uint32_t HELPER(ctpop_i32)(uint32_t arg)
{
return ctpop32(arg);
}
uint64_t HELPER(ctpop_i64)(uint64_t arg)
{
return ctpop64(arg);
}
void *HELPER(lookup_tb_ptr)(CPUArchState *env)
{
CPUState *cpu = ENV_GET_CPU(env);
TranslationBlock *tb;
target_ulong cs_base, pc;
uint32_t flags;
tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags);
if (tb == NULL) {
return tcg_ctx.code_gen_epilogue;
}
qemu_log_mask_and_addr(CPU_LOG_EXEC, pc,
"Chain %p [%d: " TARGET_FMT_lx "] %s\n",
tb->tc_ptr, cpu->cpu_index, pc,
lookup_symbol(pc));
return tb->tc_ptr;
}
void HELPER(exit_atomic)(CPUArchState *env)
{
cpu_loop_exit_atomic(ENV_GET_CPU(env), GETPC());
}