tcg/ppc32: Use trampolines to trim the code size for mmu slow path accessors
mmu access looks something like:

<check tlb>
if miss goto slow_path
<fast path>
done:
...

; end of the TB
slow_path:
 <pre process>
 mr r3, r27         ; move areg0 to r3
                    ; (r3 holds the first argument for all the PPC32 ABIs)
 <call mmu_helper>
 b $+8
 .long done
 <post process>
 b done

On ppc32 <call mmu_helper> is:

(SysV and Darwin)

mmu_helper is most likely not within direct branching distance from
the call site, necessitating

a. moving 32 bit offset of mmu_helper into a GPR ; 8 bytes
b. moving GPR to CTR/LR                          ; 4 bytes
c. (finally) branching to CTR/LR                 ; 4 bytes

r3 setting              - 4 bytes
call                    - 16 bytes
dummy jump over retaddr - 4 bytes
embedded retaddr        - 4 bytes
         Total overhead - 28 bytes

(PowerOpen (AIX))

a. moving 32 bit offset of mmu_helper's TOC into a GPR1 ; 8 bytes
b. loading 32 bit function pointer into GPR2            ; 4 bytes
c. moving GPR2 to CTR/LR                                ; 4 bytes
d. loading 32 bit small area pointer into R2            ; 4 bytes
e. (finally) branching to CTR/LR                        ; 4 bytes

r3 setting              - 4 bytes
call                    - 24 bytes
dummy jump over retaddr - 4 bytes
embedded retaddr        - 4 bytes
         Total overhead - 36 bytes

Following is done to trim the code size of slow path sections:

In tcg_target_qemu_prologue trampolines are emitted that look like this:

trampoline:
 mfspr r3, LR
 addi  r3, r3, 4
 mtspr LR, r3      ; fixup LR to point over embedded retaddr
 mr    r3, r27
 <jump mmu_helper> ; tail call of sorts

And slow path becomes:

slow_path:
 <pre process>
 <call trampoline>
 .long done
 <post process>
 b done

call                    - 4 bytes (trampoline is within code gen buffer
                                   and most likely accessible via
                                   direct branch)
embedded retaddr        - 4 bytes
         Total overhead - 8 bytes

In the end the icache pressure is decreased by 20/28 bytes at the cost
of an extra jump to trampoline and adjusting LR (to skip over embedded
retaddr) once inside.

Signed-off-by: malc <av1474@comtv.ru>
This commit is contained in:
parent
1cfd981ff1
commit
c878da3b27
@ -337,7 +337,7 @@ extern uintptr_t tci_tb_ptr;
|
||||
*(int32_t *)((void *)GETRA() + 3) - 1))
|
||||
# elif defined (_ARCH_PPC) && !defined (_ARCH_PPC64)
|
||||
# define GETRA() ((uintptr_t)__builtin_return_address(0))
|
||||
# define GETPC_LDST() ((uintptr_t) ((*(int32_t *)(GETRA() + 4)) - 1))
|
||||
# define GETPC_LDST() ((uintptr_t) ((*(int32_t *)(GETRA() - 4)) - 1))
|
||||
# else
|
||||
# error "CONFIG_QEMU_LDST_OPTIMIZATION needs GETPC_LDST() implementation!"
|
||||
# endif
|
||||
|
@ -569,6 +569,9 @@ static const void * const qemu_st_helpers[4] = {
|
||||
helper_stq_mmu,
|
||||
};
|
||||
|
||||
static void *ld_trampolines[4];
|
||||
static void *st_trampolines[4];
|
||||
|
||||
static void tcg_out_tlb_check (TCGContext *s, int r0, int r1, int r2,
|
||||
int addr_reg, int addr_reg2, int s_bits,
|
||||
int offset1, int offset2, uint8_t **label_ptr)
|
||||
@ -848,8 +851,7 @@ static void tcg_out_qemu_ld_slow_path (TCGContext *s, TCGLabelQemuLdst *label)
|
||||
reloc_pc14 (label_ptr[0], (tcg_target_long) s->code_ptr);
|
||||
|
||||
/* slow path */
|
||||
ir = 3;
|
||||
tcg_out_mov (s, TCG_TYPE_I32, ir++, TCG_AREG0);
|
||||
ir = 4;
|
||||
#if TARGET_LONG_BITS == 32
|
||||
tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg);
|
||||
#else
|
||||
@ -860,8 +862,7 @@ static void tcg_out_qemu_ld_slow_path (TCGContext *s, TCGLabelQemuLdst *label)
|
||||
tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg);
|
||||
#endif
|
||||
tcg_out_movi (s, TCG_TYPE_I32, ir, mem_index);
|
||||
tcg_out_call (s, (tcg_target_long) qemu_ld_helpers[s_bits], 1);
|
||||
tcg_out32 (s, B | 8);
|
||||
tcg_out_call (s, (tcg_target_long) ld_trampolines[s_bits], 1);
|
||||
tcg_out32 (s, (tcg_target_long) raddr);
|
||||
switch (opc) {
|
||||
case 0|4:
|
||||
@ -916,8 +917,7 @@ static void tcg_out_qemu_st_slow_path (TCGContext *s, TCGLabelQemuLdst *label)
|
||||
reloc_pc14 (label_ptr[0], (tcg_target_long) s->code_ptr);
|
||||
|
||||
/* slow path */
|
||||
ir = 3;
|
||||
tcg_out_mov (s, TCG_TYPE_I32, ir++, TCG_AREG0);
|
||||
ir = 4;
|
||||
#if TARGET_LONG_BITS == 32
|
||||
tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg);
|
||||
#else
|
||||
@ -959,8 +959,7 @@ static void tcg_out_qemu_st_slow_path (TCGContext *s, TCGLabelQemuLdst *label)
|
||||
ir++;
|
||||
|
||||
tcg_out_movi (s, TCG_TYPE_I32, ir, mem_index);
|
||||
tcg_out_call (s, (tcg_target_long) qemu_st_helpers[opc], 1);
|
||||
tcg_out32 (s, B | 8);
|
||||
tcg_out_call (s, (tcg_target_long) st_trampolines[opc], 1);
|
||||
tcg_out32 (s, (tcg_target_long) raddr);
|
||||
tcg_out_b (s, 0, (tcg_target_long) raddr);
|
||||
}
|
||||
@ -983,6 +982,15 @@ void tcg_out_tb_finalize(TCGContext *s)
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
 * Emit one load/store trampoline into the code stream of `s`.
 *
 * The emitted sequence bumps LR by 4, copies TCG_AREG0 into r3 and then
 * branches to `ptr` (one of the qemu_ld_helpers/qemu_st_helpers entries
 * when called from the prologue).  The slow paths call a trampoline and
 * place a 32-bit word (raddr) directly after the call instruction; the
 * LR adjustment makes the eventual return land past that word.
 */
static void emit_ldst_trampoline (TCGContext *s, const void *ptr)
|
||||
{
|
||||
/* r3 = LR (return address left by the slow-path call) */
tcg_out32 (s, MFSPR | RT (3) | LR);
|
||||
/* r3 += 4, i.e. step over the 32-bit word emitted after the call */
tcg_out32 (s, ADDI | RT (3) | RA (3) | 4);
|
||||
/* LR = r3, so the helper returns past the embedded word */
tcg_out32 (s, MTSPR | RS (3) | LR);
|
||||
/* r3 = TCG_AREG0; r3 carries the first helper argument */
tcg_out_mov (s, TCG_TYPE_I32, 3, TCG_AREG0);
|
||||
/* Branch to the helper at ptr; the 0 presumably selects a plain
   (non-linking) branch so LR is preserved -- confirm against tcg_out_b */
tcg_out_b (s, 0, (tcg_target_long) ptr);
|
||||
}
|
||||
|
||||
static void tcg_target_qemu_prologue (TCGContext *s)
|
||||
{
|
||||
int i, frame_size;
|
||||
@ -1043,6 +1051,14 @@ static void tcg_target_qemu_prologue (TCGContext *s)
|
||||
tcg_out32 (s, MTSPR | RS (0) | LR);
|
||||
tcg_out32 (s, ADDI | RT (1) | RA (1) | frame_size);
|
||||
tcg_out32 (s, BCLR | BO_ALWAYS);
|
||||
|
||||
for (i = 0; i < 4; ++i) {
|
||||
ld_trampolines[i] = s->code_ptr;
|
||||
emit_ldst_trampoline (s, qemu_ld_helpers[i]);
|
||||
|
||||
st_trampolines[i] = s->code_ptr;
|
||||
emit_ldst_trampoline (s, qemu_st_helpers[i]);
|
||||
}
|
||||
}
|
||||
|
||||
static void tcg_out_ld (TCGContext *s, TCGType type, TCGReg ret, TCGReg arg1,
|
||||
|
Loading…
Reference in New Issue
Block a user