From 096d118592caa03188db6a190988221b65a2bd81 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Fri, 4 Aug 2023 20:35:53 +0000 Subject: [PATCH 01/38] tcg/ppc: Untabify tcg-target.c.inc Signed-off-by: Richard Henderson --- tcg/ppc/tcg-target.c.inc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc index 5c873b2161..5cecc6ed95 100644 --- a/tcg/ppc/tcg-target.c.inc +++ b/tcg/ppc/tcg-target.c.inc @@ -221,7 +221,7 @@ static inline bool in_range_b(tcg_target_long target) } static uint32_t reloc_pc24_val(const tcg_insn_unit *pc, - const tcg_insn_unit *target) + const tcg_insn_unit *target) { ptrdiff_t disp = tcg_ptr_byte_diff(target, pc); tcg_debug_assert(in_range_b(disp)); @@ -241,7 +241,7 @@ static bool reloc_pc24(tcg_insn_unit *src_rw, const tcg_insn_unit *target) } static uint16_t reloc_pc14_val(const tcg_insn_unit *pc, - const tcg_insn_unit *target) + const tcg_insn_unit *target) { ptrdiff_t disp = tcg_ptr_byte_diff(target, pc); tcg_debug_assert(disp == (int16_t) disp); @@ -3645,7 +3645,7 @@ static void expand_vec_mul(TCGType type, unsigned vece, TCGv_vec v0, tcgv_vec_arg(t1), tcgv_vec_arg(t2)); vec_gen_3(INDEX_op_ppc_pkum_vec, type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v0), tcgv_vec_arg(t1)); - break; + break; case MO_32: tcg_debug_assert(!have_isa_2_07); From 03434d52024934620a001a875b540b06abd0b169 Mon Sep 17 00:00:00 2001 From: Jordan Niethe Date: Tue, 15 Aug 2023 16:47:11 +0000 Subject: [PATCH 02/38] tcg/ppc: Enable direct branching tcg_out_goto_tb with TCG_REG_TB Direct branch patching was disabled when using TCG_REG_TB in commit 736a1588c1 ("tcg/ppc: Fix race in goto_tb implementation"). The issue with direct branch patching with TCG_REG_TB is the lack of synchronization between the new TCG_REG_TB being established and the direct branch being patched in. If each translation block is responsible for establishing its own TCG_REG_TB then there can be no synchronization issue. Make each translation block begin by setting up its own TCG_REG_TB. Use the preferred 'bcl 20,31,$+4' sequence. Signed-off-by: Jordan Niethe [rth: Split out tcg_out_tb_start, power9 addpcis] Signed-off-by: Richard Henderson --- tcg/ppc/tcg-target.c.inc | 48 ++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 31 deletions(-) diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc index 5cecc6ed95..9197cfd6c6 100644 --- a/tcg/ppc/tcg-target.c.inc +++ b/tcg/ppc/tcg-target.c.inc @@ -2509,9 +2509,6 @@ static void tcg_target_qemu_prologue(TCGContext *s) tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]); tcg_out32(s, MTSPR | RS(tcg_target_call_iarg_regs[1]) | CTR); - if (USE_REG_TB) { - tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_TB, tcg_target_call_iarg_regs[1]); - } tcg_out32(s, BCCTR | BO_ALWAYS); /* Epilogue */ @@ -2529,7 +2526,13 @@ static void tcg_target_qemu_prologue(TCGContext *s) static void tcg_out_tb_start(TCGContext *s) { - /* nothing to do */ + /* Load TCG_REG_TB. */ + if (USE_REG_TB) { + /* bcl 20,31,$+4 (preferred form for getting nia) */ + tcg_out32(s, BC | BO_ALWAYS | BI(7, CR_SO) | 0x4 | LK); + tcg_out32(s, MFSPR | RT(TCG_REG_TB) | LR); + tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, -4)); + } } static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg) @@ -2542,32 +2545,22 @@ static void tcg_out_goto_tb(TCGContext *s, int which) { uintptr_t ptr = get_jmp_target_addr(s, which); + /* Direct branch will be patched by tb_target_set_jmp_target. 
*/ + set_jmp_insn_offset(s, which); + tcg_out32(s, NOP); + + /* When branch is out of range, fall through to indirect. */ if (USE_REG_TB) { ptrdiff_t offset = tcg_tbrel_diff(s, (void *)ptr); - tcg_out_mem_long(s, LD, LDX, TCG_REG_TB, TCG_REG_TB, offset); - - /* TODO: Use direct branches when possible. */ - set_jmp_insn_offset(s, which); - tcg_out32(s, MTSPR | RS(TCG_REG_TB) | CTR); - - tcg_out32(s, BCCTR | BO_ALWAYS); - - /* For the unlinked case, need to reset TCG_REG_TB. */ - set_jmp_reset_offset(s, which); - tcg_out_mem_long(s, ADDI, ADD, TCG_REG_TB, TCG_REG_TB, - -tcg_current_code_size(s)); + tcg_out_mem_long(s, LD, LDX, TCG_REG_TMP1, TCG_REG_TB, offset); } else { - /* Direct branch will be patched by tb_target_set_jmp_target. */ - set_jmp_insn_offset(s, which); - tcg_out32(s, NOP); - - /* When branch is out of range, fall through to indirect. */ tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP1, ptr - (int16_t)ptr); tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, (int16_t)ptr); - tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR); - tcg_out32(s, BCCTR | BO_ALWAYS); - set_jmp_reset_offset(s, which); } + + tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR); + tcg_out32(s, BCCTR | BO_ALWAYS); + set_jmp_reset_offset(s, which); } void tb_target_set_jmp_target(const TranslationBlock *tb, int n, @@ -2577,10 +2570,6 @@ void tb_target_set_jmp_target(const TranslationBlock *tb, int n, intptr_t diff = addr - jmp_rx; tcg_insn_unit insn; - if (USE_REG_TB) { - return; - } - if (in_range_b(diff)) { insn = B | (diff & 0x3fffffc); } else { @@ -2600,9 +2589,6 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, switch (opc) { case INDEX_op_goto_ptr: tcg_out32(s, MTSPR | RS(args[0]) | CTR); - if (USE_REG_TB) { - tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_TB, args[0]); - } tcg_out32(s, ADDI | TAI(TCG_REG_R3, 0, 0)); tcg_out32(s, BCCTR | BO_ALWAYS); break; From 4430b60c7f201eefac28c711c1bf93983e5966e9 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Tue, 15 Aug 2023 16:58:19 +0000 Subject: [PATCH 03/38] tcg/ppc: Reinterpret tb-relative to TB+4 It saves one insn to load the address of TB+4 instead of TB. Adjust all of the indexing to match. Signed-off-by: Richard Henderson --- tcg/ppc/tcg-target.c.inc | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc index 9197cfd6c6..aafbf2db4e 100644 --- a/tcg/ppc/tcg-target.c.inc +++ b/tcg/ppc/tcg-target.c.inc @@ -215,6 +215,12 @@ static const int tcg_target_callee_save_regs[] = { TCG_REG_R31 }; +/* For PPC, we use TB+4 instead of TB as the base. */ +static inline ptrdiff_t ppc_tbrel_diff(TCGContext *s, const void *target) +{ + return tcg_tbrel_diff(s, target) - 4; +} + static inline bool in_range_b(tcg_target_long target) { return target == sextract64(target, 0, 26); @@ -991,7 +997,7 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret, } /* Load addresses within the TB with one insn. */ - tb_diff = tcg_tbrel_diff(s, (void *)arg); + tb_diff = ppc_tbrel_diff(s, (void *)arg); if (!in_prologue && USE_REG_TB && tb_diff == (int16_t)tb_diff) { tcg_out32(s, ADDI | TAI(ret, TCG_REG_TB, tb_diff)); return; @@ -1044,7 +1050,7 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret, /* Use the constant pool, if possible. 
*/ if (!in_prologue && USE_REG_TB) { new_pool_label(s, arg, R_PPC_ADDR16, s->code_ptr, - tcg_tbrel_diff(s, NULL)); + ppc_tbrel_diff(s, NULL)); tcg_out32(s, LD | TAI(ret, TCG_REG_TB, 0)); return; } @@ -1104,7 +1110,7 @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece, */ if (USE_REG_TB) { rel = R_PPC_ADDR16; - add = tcg_tbrel_diff(s, NULL); + add = ppc_tbrel_diff(s, NULL); } else { rel = R_PPC_ADDR32; add = 0; @@ -2531,7 +2537,6 @@ static void tcg_out_tb_start(TCGContext *s) /* bcl 20,31,$+4 (preferred form for getting nia) */ tcg_out32(s, BC | BO_ALWAYS | BI(7, CR_SO) | 0x4 | LK); tcg_out32(s, MFSPR | RT(TCG_REG_TB) | LR); - tcg_out32(s, ADDI | TAI(TCG_REG_TB, TCG_REG_TB, -4)); } } @@ -2551,7 +2556,7 @@ static void tcg_out_goto_tb(TCGContext *s, int which) /* When branch is out of range, fall through to indirect. */ if (USE_REG_TB) { - ptrdiff_t offset = tcg_tbrel_diff(s, (void *)ptr); + ptrdiff_t offset = ppc_tbrel_diff(s, (void *)ptr); tcg_out_mem_long(s, LD, LDX, TCG_REG_TMP1, TCG_REG_TB, offset); } else { tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP1, ptr - (int16_t)ptr); From 3acd75b851759ccfa70a900c04d9ca030140de01 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Tue, 15 Aug 2023 17:04:42 +0000 Subject: [PATCH 04/38] tcg/ppc: Use ADDPCIS in tcg_out_tb_start With ISA v3.0, we can use ADDPCIS instead of BCL+MFLR to load NIA. Signed-off-by: Richard Henderson --- tcg/ppc/tcg-target.c.inc | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc index aafbf2db4e..b0b8cd2390 100644 --- a/tcg/ppc/tcg-target.c.inc +++ b/tcg/ppc/tcg-target.c.inc @@ -362,6 +362,7 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct, int vece) #define CRNAND XO19(225) #define CROR XO19(449) #define CRNOR XO19( 33) +#define ADDPCIS XO19( 2) #define EXTSB XO31(954) #define EXTSH XO31(922) @@ -859,6 +860,19 @@ static inline void tcg_out_sari64(TCGContext *s, TCGReg dst, TCGReg src, int c) tcg_out32(s, SRADI | RA(dst) | RS(src) | SH(c & 0x1f) | ((c >> 4) & 2)); } +static void tcg_out_addpcis(TCGContext *s, TCGReg dst, intptr_t imm) +{ + uint32_t d0, d1, d2; + + tcg_debug_assert((imm & 0xffff) == 0); + tcg_debug_assert(imm == (int32_t)imm); + + d2 = extract32(imm, 16, 1); + d1 = extract32(imm, 17, 5); + d0 = extract32(imm, 22, 10); + tcg_out32(s, ADDPCIS | RT(dst) | (d1 << 16) | (d0 << 6) | d2); +} + static void tcg_out_bswap16(TCGContext *s, TCGReg dst, TCGReg src, int flags) { TCGReg tmp = dst == src ? TCG_REG_R0 : dst; @@ -2534,9 +2548,14 @@ static void tcg_out_tb_start(TCGContext *s) { /* Load TCG_REG_TB. 
*/ if (USE_REG_TB) { - /* bcl 20,31,$+4 (preferred form for getting nia) */ - tcg_out32(s, BC | BO_ALWAYS | BI(7, CR_SO) | 0x4 | LK); - tcg_out32(s, MFSPR | RT(TCG_REG_TB) | LR); + if (have_isa_3_00) { + /* lnia REG_TB */ + tcg_out_addpcis(s, TCG_REG_TB, 0); + } else { + /* bcl 20,31,$+4 (preferred form for getting nia) */ + tcg_out32(s, BC | BO_ALWAYS | BI(7, CR_SO) | 0x4 | LK); + tcg_out32(s, MFSPR | RT(TCG_REG_TB) | LR); + } } } From 776cd9b64834d735db9c4cac7924497509739755 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Tue, 15 Aug 2023 17:19:37 +0000 Subject: [PATCH 05/38] tcg/ppc: Use ADDPCIS in tcg_out_movi_int Signed-off-by: Richard Henderson --- tcg/ppc/tcg-target.c.inc | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc index b0b8cd2390..226b5598ac 100644 --- a/tcg/ppc/tcg-target.c.inc +++ b/tcg/ppc/tcg-target.c.inc @@ -1055,6 +1055,19 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret, return; } + /* Load addresses within 2GB with 2 insns. */ + if (have_isa_3_00) { + intptr_t hi = tcg_pcrel_diff(s, (void *)arg) - 4; + int16_t lo = hi; + + hi -= lo; + if (hi == (int32_t)hi) { + tcg_out_addpcis(s, TCG_REG_TMP2, hi); + tcg_out32(s, ADDI | TAI(ret, TCG_REG_TMP2, lo)); + return; + } + } + /* Load addresses within 2GB of TB with 2 (or rarely 3) insns. */ if (!in_prologue && USE_REG_TB && tb_diff == (int32_t)tb_diff) { tcg_out_mem_long(s, ADDI, ADD, ret, TCG_REG_TB, tb_diff); From 7a379976d6f12d44d8c97dd4ae68dab5240475b3 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Tue, 15 Aug 2023 17:35:21 +0000 Subject: [PATCH 06/38] tcg/ppc: Use ADDPCIS for the constant pool Signed-off-by: Richard Henderson --- tcg/ppc/tcg-target.c.inc | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc index 226b5598ac..720f92ff33 100644 --- a/tcg/ppc/tcg-target.c.inc +++ b/tcg/ppc/tcg-target.c.inc @@ -1081,6 +1081,12 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret, tcg_out32(s, LD | TAI(ret, TCG_REG_TB, 0)); return; } + if (have_isa_3_00) { + tcg_out_addpcis(s, TCG_REG_TMP2, 0); + new_pool_label(s, arg, R_PPC_REL14, s->code_ptr, 0); + tcg_out32(s, LD | TAI(ret, TCG_REG_TMP2, 0)); + return; + } tmp = arg >> 31 >> 1; tcg_out_movi(s, TCG_TYPE_I32, ret, tmp); @@ -1138,6 +1144,10 @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece, if (USE_REG_TB) { rel = R_PPC_ADDR16; add = ppc_tbrel_diff(s, NULL); + } else if (have_isa_3_00) { + tcg_out_addpcis(s, TCG_REG_TMP1, 0); + rel = R_PPC_REL14; + add = 0; } else { rel = R_PPC_ADDR32; add = 0; @@ -1164,6 +1174,8 @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece, if (USE_REG_TB) { tcg_out32(s, ADDI | TAI(TCG_REG_TMP1, 0, 0)); load_insn |= RA(TCG_REG_TB); + } else if (have_isa_3_00) { + tcg_out32(s, ADDI | TAI(TCG_REG_TMP1, TCG_REG_TMP1, 0)); } else { tcg_out32(s, ADDIS | TAI(TCG_REG_TMP1, 0, 0)); tcg_out32(s, ADDI | TAI(TCG_REG_TMP1, TCG_REG_TMP1, 0)); From 4a388c7932322476e2395eed350ee6b23a08bf87 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Tue, 15 Aug 2023 17:47:47 +0000 Subject: [PATCH 07/38] tcg/ppc: Use ADDPCIS in tcg_out_goto_tb Signed-off-by: Richard Henderson --- tcg/ppc/tcg-target.c.inc | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc index 720f92ff33..6337b1e8be 100644 --- a/tcg/ppc/tcg-target.c.inc +++ b/tcg/ppc/tcg-target.c.inc @@ -2593,6 
+2593,7 @@ static void tcg_out_exit_tb(TCGContext *s, uintptr_t arg) static void tcg_out_goto_tb(TCGContext *s, int which) { uintptr_t ptr = get_jmp_target_addr(s, which); + int16_t lo; /* Direct branch will be patched by tb_target_set_jmp_target. */ set_jmp_insn_offset(s, which); @@ -2602,9 +2603,15 @@ static void tcg_out_goto_tb(TCGContext *s, int which) if (USE_REG_TB) { ptrdiff_t offset = ppc_tbrel_diff(s, (void *)ptr); tcg_out_mem_long(s, LD, LDX, TCG_REG_TMP1, TCG_REG_TB, offset); + } else if (have_isa_3_00) { + ptrdiff_t offset = tcg_pcrel_diff(s, (void *)ptr) - 4; + lo = offset; + tcg_out_addpcis(s, TCG_REG_TMP1, offset - lo); + tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, lo); } else { - tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP1, ptr - (int16_t)ptr); - tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, (int16_t)ptr); + lo = ptr; + tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_TMP1, ptr - lo); + tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, lo); } tcg_out32(s, MTSPR | RS(TCG_REG_TMP1) | CTR); From 103044e4062c55b956617971985c6baa8843ed6b Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Mon, 26 Jun 2023 18:02:18 +0000 Subject: [PATCH 08/38] tcg/ppc: Use PADDI in tcg_out_movi PADDI can load 34-bit immediates and 34-bit pc-relative addresses. Reviewed-by: Jordan Niethe Signed-off-by: Richard Henderson --- tcg/ppc/tcg-target.c.inc | 51 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc index 6337b1e8be..f4235383c6 100644 --- a/tcg/ppc/tcg-target.c.inc +++ b/tcg/ppc/tcg-target.c.inc @@ -719,6 +719,38 @@ static bool patch_reloc(tcg_insn_unit *code_ptr, int type, return true; } +/* Ensure that the prefixed instruction does not cross a 64-byte boundary. */ +static bool tcg_out_need_prefix_align(TCGContext *s) +{ + return ((uintptr_t)s->code_ptr & 0x3f) == 0x3c; +} + +static void tcg_out_prefix_align(TCGContext *s) +{ + if (tcg_out_need_prefix_align(s)) { + tcg_out32(s, NOP); + } +} + +static ptrdiff_t tcg_pcrel_diff_for_prefix(TCGContext *s, const void *target) +{ + return tcg_pcrel_diff(s, target) - (tcg_out_need_prefix_align(s) ? 4 : 0); +} + +/* Output Type 10 Prefix - Modified Load/Store Form (MLS:D) */ +static void tcg_out_mls_d(TCGContext *s, tcg_insn_unit opc, unsigned rt, + unsigned ra, tcg_target_long imm, bool r) +{ + tcg_insn_unit p, i; + + p = OPCD(1) | (2 << 24) | (r << 20) | ((imm >> 16) & 0x3ffff); + i = opc | TAI(rt, ra, imm); + + tcg_out_prefix_align(s); + tcg_out32(s, p); + tcg_out32(s, i); +} + static void tcg_out_mem_long(TCGContext *s, int opi, int opx, TCGReg rt, TCGReg base, tcg_target_long offset); @@ -1017,6 +1049,25 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret, return; } + /* + * Load values up to 34 bits, and pc-relative addresses, + * with one prefixed insn. + */ + if (have_isa_3_10) { + if (arg == sextract64(arg, 0, 34)) { + /* pli ret,value = paddi ret,0,value,0 */ + tcg_out_mls_d(s, ADDI, ret, 0, arg, 0); + return; + } + + tmp = tcg_pcrel_diff_for_prefix(s, (void *)arg); + if (tmp == sextract64(tmp, 0, 34)) { + /* pla ret,value = paddi ret,0,value,1 */ + tcg_out_mls_d(s, ADDI, ret, 0, tmp, 1); + return; + } + } + /* Load 32-bit immediates with two insns. Note that we've already eliminated bare ADDIS, so we know both insns are required. 
*/ if (TCG_TARGET_REG_BITS == 32 || arg == (int32_t)arg) { From 1f8e4dc182b7ee8240a4d016162847ff4b581399 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Fri, 4 Aug 2023 17:16:32 +0000 Subject: [PATCH 09/38] tcg/ppc: Use prefixed instructions in tcg_out_mem_long When the offset is out of range of the non-prefixed insn, but fits the 34-bit immediate of the prefixed insn, use that. Reviewed-by: Jordan Niethe Signed-off-by: Richard Henderson --- tcg/ppc/tcg-target.c.inc | 66 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc index f4235383c6..34df9144cc 100644 --- a/tcg/ppc/tcg-target.c.inc +++ b/tcg/ppc/tcg-target.c.inc @@ -329,6 +329,15 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct, int vece) #define STDX XO31(149) #define STQ XO62( 2) +#define PLWA OPCD( 41) +#define PLD OPCD( 57) +#define PLXSD OPCD( 42) +#define PLXV OPCD(25 * 2 + 1) /* force tx=1 */ + +#define PSTD OPCD( 61) +#define PSTXSD OPCD( 46) +#define PSTXV OPCD(27 * 2 + 1) /* force sx=1 */ + #define ADDIC OPCD( 12) #define ADDI OPCD( 14) #define ADDIS OPCD( 15) @@ -737,6 +746,20 @@ static ptrdiff_t tcg_pcrel_diff_for_prefix(TCGContext *s, const void *target) return tcg_pcrel_diff(s, target) - (tcg_out_need_prefix_align(s) ? 4 : 0); } +/* Output Type 00 Prefix - 8-Byte Load/Store Form (8LS:D) */ +static void tcg_out_8ls_d(TCGContext *s, tcg_insn_unit opc, unsigned rt, + unsigned ra, tcg_target_long imm, bool r) +{ + tcg_insn_unit p, i; + + p = OPCD(1) | (r << 20) | ((imm >> 16) & 0x3ffff); + i = opc | TAI(rt, ra, imm); + + tcg_out_prefix_align(s); + tcg_out32(s, p); + tcg_out32(s, i); +} + /* Output Type 10 Prefix - Modified Load/Store Form (MLS:D) */ static void tcg_out_mls_d(TCGContext *s, tcg_insn_unit opc, unsigned rt, unsigned ra, tcg_target_long imm, bool r) @@ -1418,6 +1441,49 @@ static void tcg_out_mem_long(TCGContext *s, int opi, int opx, TCGReg rt, break; } + /* For unaligned or large offsets, use the prefixed form. */ + if (have_isa_3_10 + && (offset != (int16_t)offset || (offset & align)) + && offset == sextract64(offset, 0, 34)) { + /* + * Note that the MLS:D insns retain their un-prefixed opcode, + * while the 8LS:D insns use a different opcode space. + */ + switch (opi) { + case LBZ: + case LHZ: + case LHA: + case LWZ: + case STB: + case STH: + case STW: + case ADDI: + tcg_out_mls_d(s, opi, rt, base, offset, 0); + return; + case LWA: + tcg_out_8ls_d(s, PLWA, rt, base, offset, 0); + return; + case LD: + tcg_out_8ls_d(s, PLD, rt, base, offset, 0); + return; + case STD: + tcg_out_8ls_d(s, PSTD, rt, base, offset, 0); + return; + case LXSD: + tcg_out_8ls_d(s, PLXSD, rt & 31, base, offset, 0); + return; + case STXSD: + tcg_out_8ls_d(s, PSTXSD, rt & 31, base, offset, 0); + return; + case LXV: + tcg_out_8ls_d(s, PLXV, rt & 31, base, offset, 0); + return; + case STXV: + tcg_out_8ls_d(s, PSTXV, rt & 31, base, offset, 0); + return; + } + } + /* For unaligned, or very large offsets, use the indexed form. */ if (offset & align || offset != (int32_t)offset || opi == 0) { if (rs == base) { From d90b23af6dedb93384c8964882d9fad4af0f3cab Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Fri, 4 Aug 2023 18:19:25 +0000 Subject: [PATCH 10/38] tcg/ppc: Use PLD in tcg_out_movi for constant pool The prefixed instruction has a pc-relative form to use here. 
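For reference, a minimal sketch of the pair this emits (it mirrors the
hunk below; the displacement is emitted as zero and fixed up later by
reloc_pc34() through the new R_PPC64_PCREL34 relocation, once the pool
entry's address is known):

    /* pld ret, <pool>(0), R=1 : pc-relative prefixed load */
    tcg_out_8ls_d(s, PLD, ret, 0, 0, 1);
    new_pool_label(s, arg, R_PPC64_PCREL34, s->code_ptr - 2, 0);

Note the 's->code_ptr - 2': the relocation must point at the prefix
word, because reloc_pc34() patches both halves of the two-word
instruction.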
Signed-off-by: Richard Henderson --- tcg/ppc/tcg-target.c.inc | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc index 34df9144cc..79e82d2f94 100644 --- a/tcg/ppc/tcg-target.c.inc +++ b/tcg/ppc/tcg-target.c.inc @@ -101,6 +101,10 @@ #define ALL_GENERAL_REGS 0xffffffffu #define ALL_VECTOR_REGS 0xffffffff00000000ull +#ifndef R_PPC64_PCREL34 +#define R_PPC64_PCREL34 132 +#endif + #define have_isel (cpuinfo & CPUINFO_ISEL) #ifndef CONFIG_SOFTMMU @@ -266,6 +270,19 @@ static bool reloc_pc14(tcg_insn_unit *src_rw, const tcg_insn_unit *target) return false; } +static bool reloc_pc34(tcg_insn_unit *src_rw, const tcg_insn_unit *target) +{ + const tcg_insn_unit *src_rx = tcg_splitwx_to_rx(src_rw); + ptrdiff_t disp = tcg_ptr_byte_diff(target, src_rx); + + if (disp == sextract64(disp, 0, 34)) { + src_rw[0] = (src_rw[0] & ~0x3ffff) | ((disp >> 16) & 0x3ffff); + src_rw[1] = (src_rw[1] & ~0xffff) | (disp & 0xffff); + return true; + } + return false; +} + /* test if a constant matches the constraint */ static bool tcg_target_const_match(int64_t val, TCGType type, int ct, int vece) { @@ -696,6 +713,8 @@ static bool patch_reloc(tcg_insn_unit *code_ptr, int type, return reloc_pc14(code_ptr, target); case R_PPC_REL24: return reloc_pc24(code_ptr, target); + case R_PPC64_PCREL34: + return reloc_pc34(code_ptr, target); case R_PPC_ADDR16: /* * We are (slightly) abusing this relocation type. In particular, @@ -1155,6 +1174,11 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret, tcg_out32(s, LD | TAI(ret, TCG_REG_TB, 0)); return; } + if (have_isa_3_10) { + tcg_out_8ls_d(s, PLD, ret, 0, 0, 1); + new_pool_label(s, arg, R_PPC64_PCREL34, s->code_ptr - 2, 0); + return; + } if (have_isa_3_00) { tcg_out_addpcis(s, TCG_REG_TMP2, 0); new_pool_label(s, arg, R_PPC_REL14, s->code_ptr, 0); From c90f897a541951821af1abdadca3b68c3f74c3ae Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Fri, 4 Aug 2023 18:32:57 +0000 Subject: [PATCH 11/38] tcg/ppc: Use prefixed instructions in tcg_out_dupi_vec The prefixed instructions have a pc-relative form to use here. 
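For reference, a sketch of the two cases added below (illustrative
only; as with PLD, the zero displacements are patched by reloc_pc34()):

    if (type == TCG_TYPE_V64) {
        /* plxsd vrt, <pool>(0), R=1 : load one 64-bit pool entry */
        tcg_out_8ls_d(s, PLXSD, ret & 31, 0, 0, 1);
        new_pool_label(s, val, R_PPC64_PCREL34, s->code_ptr - 2, 0);
    } else {
        /* plxv vrt, <pool>(0), R=1 : val replicated to 128 bits */
        tcg_out_8ls_d(s, PLXV, ret & 31, 0, 0, 1);
        new_pool_l2(s, R_PPC64_PCREL34, s->code_ptr - 2, 0, val, val);
    }

The 'ret & 31' converts TCG's vector register numbers, which start
at 32, into the 5-bit register field of the instruction.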
Signed-off-by: Richard Henderson --- tcg/ppc/tcg-target.c.inc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc index 79e82d2f94..db3212083b 100644 --- a/tcg/ppc/tcg-target.c.inc +++ b/tcg/ppc/tcg-target.c.inc @@ -1242,6 +1242,15 @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece, if (USE_REG_TB) { rel = R_PPC_ADDR16; add = ppc_tbrel_diff(s, NULL); + } else if (have_isa_3_10) { + if (type == TCG_TYPE_V64) { + tcg_out_8ls_d(s, PLXSD, ret & 31, 0, 0, 1); + new_pool_label(s, val, R_PPC64_PCREL34, s->code_ptr - 2, 0); + } else { + tcg_out_8ls_d(s, PLXV, ret & 31, 0, 0, 1); + new_pool_l2(s, R_PPC64_PCREL34, s->code_ptr - 2, 0, val, val); + } + return; } else if (have_isa_3_00) { tcg_out_addpcis(s, TCG_REG_TMP1, 0); rel = R_PPC_REL14; From 1958dbbd2a79b9fe7cae80da66e04e11b86f5455 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Tue, 15 Aug 2023 18:17:10 +0000 Subject: [PATCH 12/38] tcg/ppc: Use PLD in tcg_out_goto_tb Signed-off-by: Richard Henderson --- tcg/ppc/tcg-target.c.inc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc index db3212083b..6496f76e41 100644 --- a/tcg/ppc/tcg-target.c.inc +++ b/tcg/ppc/tcg-target.c.inc @@ -2753,6 +2753,9 @@ static void tcg_out_goto_tb(TCGContext *s, int which) if (USE_REG_TB) { ptrdiff_t offset = ppc_tbrel_diff(s, (void *)ptr); tcg_out_mem_long(s, LD, LDX, TCG_REG_TMP1, TCG_REG_TB, offset); + } else if (have_isa_3_10) { + ptrdiff_t offset = tcg_pcrel_diff_for_prefix(s, (void *)ptr); + tcg_out_8ls_d(s, PLD, TCG_REG_TMP1, 0, offset, 1); } else if (have_isa_3_00) { ptrdiff_t offset = tcg_pcrel_diff(s, (void *)ptr) - 4; lo = offset; From cc3f99aac41903362521fac551b835abe369659a Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Tue, 15 Aug 2023 17:48:19 +0000 Subject: [PATCH 13/38] tcg/ppc: Disable TCG_REG_TB for Power9/Power10 This appears to slightly improve performance on power9/10. Signed-off-by: Richard Henderson --- tcg/ppc/tcg-target.c.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc index 6496f76e41..c31da4da9d 100644 --- a/tcg/ppc/tcg-target.c.inc +++ b/tcg/ppc/tcg-target.c.inc @@ -83,7 +83,7 @@ #define TCG_VEC_TMP2 TCG_REG_V1 #define TCG_REG_TB TCG_REG_R31 -#define USE_REG_TB (TCG_TARGET_REG_BITS == 64) +#define USE_REG_TB (TCG_TARGET_REG_BITS == 64 && !have_isa_3_00) /* Shorthand for size of a pointer. Avoid promotion to unsigned. */ #define SZP ((int)sizeof(void *)) From 397cabaae035102afb547155757fd506620b2bb2 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Sun, 1 Oct 2023 07:53:03 -0700 Subject: [PATCH 14/38] tcg: Introduce tcg_use_softmmu MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Begin disconnecting CONFIG_SOFTMMU from !CONFIG_USER_ONLY. Introduce a variable which can be set at startup to select one method or another for user-only. 
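The shape of the change, taken from the include/tcg/tcg.h hunk below:
system-mode builds keep a compile-time constant, while user-only
builds get a variable that can be set at startup, so the former
#ifdef CONFIG_SOFTMMU blocks become ordinary C conditions that the
compiler can still fold away wherever the value is constant:

    #ifdef CONFIG_USER_ONLY
    extern bool tcg_use_softmmu;
    #else
    #define tcg_use_softmmu true
    #endif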
Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Richard Henderson --- include/tcg/tcg.h | 8 ++++++-- tcg/tcg-op-ldst.c | 14 +++++++------- tcg/tcg.c | 9 ++++++--- 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h index 680ff00722..a9282cdcc6 100644 --- a/include/tcg/tcg.h +++ b/include/tcg/tcg.h @@ -488,11 +488,9 @@ struct TCGContext { int nb_ops; TCGType addr_type; /* TCG_TYPE_I32 or TCG_TYPE_I64 */ -#ifdef CONFIG_SOFTMMU int page_mask; uint8_t page_bits; uint8_t tlb_dyn_max_bits; -#endif uint8_t insn_start_words; TCGBar guest_mo; @@ -573,6 +571,12 @@ static inline bool temp_readonly(TCGTemp *ts) return ts->kind >= TEMP_FIXED; } +#ifdef CONFIG_USER_ONLY +extern bool tcg_use_softmmu; +#else +#define tcg_use_softmmu true +#endif + extern __thread TCGContext *tcg_ctx; extern const void *tcg_code_gen_epilogue; extern uintptr_t tcg_splitwx_diff; diff --git a/tcg/tcg-op-ldst.c b/tcg/tcg-op-ldst.c index df4f22c427..2b96687699 100644 --- a/tcg/tcg-op-ldst.c +++ b/tcg/tcg-op-ldst.c @@ -34,13 +34,13 @@ static void check_max_alignment(unsigned a_bits) { -#if defined(CONFIG_SOFTMMU) /* * The requested alignment cannot overlap the TLB flags. * FIXME: Must keep the count up-to-date with "exec/cpu-all.h". */ - tcg_debug_assert(a_bits + 5 <= tcg_ctx->page_bits); -#endif + if (tcg_use_softmmu) { + tcg_debug_assert(a_bits + 5 <= tcg_ctx->page_bits); + } } static MemOp tcg_canonicalize_memop(MemOp op, bool is64, bool st) @@ -411,10 +411,11 @@ void tcg_gen_qemu_st_i64_chk(TCGv_i64 val, TCGTemp *addr, TCGArg idx, */ static bool use_two_i64_for_i128(MemOp mop) { -#ifdef CONFIG_SOFTMMU /* Two softmmu tlb lookups is larger than one function call. */ - return false; -#else + if (tcg_use_softmmu) { + return false; + } + /* * For user-only, two 64-bit operations may well be smaller than a call. * Determine if that would be legal for the requested atomicity. @@ -432,7 +433,6 @@ static bool use_two_i64_for_i128(MemOp mop) default: g_assert_not_reached(); } -#endif } static void canonicalize_memop_i128_as_i64(MemOp ret[2], MemOp orig) diff --git a/tcg/tcg.c b/tcg/tcg.c index 637b9e6870..d3a4a17ef2 100644 --- a/tcg/tcg.c +++ b/tcg/tcg.c @@ -226,6 +226,10 @@ static TCGAtomAlign atom_and_align_for_opc(TCGContext *s, MemOp opc, MemOp host_atom, bool allow_two_ops) __attribute__((unused)); +#ifdef CONFIG_USER_ONLY +bool tcg_use_softmmu; +#endif + TCGContext tcg_init_ctx; __thread TCGContext *tcg_ctx; @@ -404,13 +408,12 @@ static uintptr_t G_GNUC_UNUSED get_jmp_target_addr(TCGContext *s, int which) return (uintptr_t)tcg_splitwx_to_rx(&s->gen_tb->jmp_target_addr[which]); } -#if defined(CONFIG_SOFTMMU) && !defined(CONFIG_TCG_INTERPRETER) -static int tlb_mask_table_ofs(TCGContext *s, int which) +static int __attribute__((unused)) +tlb_mask_table_ofs(TCGContext *s, int which) { return (offsetof(CPUNegativeOffsetState, tlb.f[which]) - sizeof(CPUNegativeOffsetState)); } -#endif /* Signal overflow, starting over with fewer guest insns. */ static G_NORETURN From 23088ca0bcf491eaa3e99a2760cf85f4cd7a3bce Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Sun, 1 Oct 2023 17:12:32 +0000 Subject: [PATCH 15/38] tcg: Provide guest_base fallback for system mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Provide a define to allow !tcg_use_softmmu code paths to compile in system mode, but require elimination. 
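Concretely, this is the whole change: in system mode tcg_use_softmmu
is constant true, so a !tcg_use_softmmu path that mentions guest_base
is dead code. The define lets such paths compile, while
qemu_build_not_reached() turns any reference that survives
optimization into a build failure:

    #ifndef CONFIG_USER_ONLY
    #define guest_base ({ qemu_build_not_reached(); (uintptr_t)0; })
    #endif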
Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Richard Henderson --- tcg/tcg.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tcg/tcg.c b/tcg/tcg.c index d3a4a17ef2..35158a0846 100644 --- a/tcg/tcg.c +++ b/tcg/tcg.c @@ -178,6 +178,10 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct, int vece); static int tcg_out_ldst_finalize(TCGContext *s); #endif +#ifndef CONFIG_USER_ONLY +#define guest_base ({ qemu_build_not_reached(); (uintptr_t)0; }) +#endif + typedef struct TCGLdstHelperParam { TCGReg (*ra_gen)(TCGContext *s, const TCGLabelQemuLdst *l, int arg_reg); unsigned ntmp; From 2c53bdf110ef09422ceed89357a7dd8fc7d0b7a8 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Sun, 1 Oct 2023 17:13:00 +0000 Subject: [PATCH 16/38] tcg/arm: Use tcg_use_softmmu MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Richard Henderson --- tcg/arm/tcg-target.c.inc | 203 +++++++++++++++++++-------------------- 1 file changed, 97 insertions(+), 106 deletions(-) diff --git a/tcg/arm/tcg-target.c.inc b/tcg/arm/tcg-target.c.inc index 0d9c2d157b..fc78566494 100644 --- a/tcg/arm/tcg-target.c.inc +++ b/tcg/arm/tcg-target.c.inc @@ -89,9 +89,7 @@ static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot) #define TCG_REG_TMP TCG_REG_R12 #define TCG_VEC_TMP TCG_REG_Q15 -#ifndef CONFIG_SOFTMMU #define TCG_REG_GUEST_BASE TCG_REG_R11 -#endif typedef enum { COND_EQ = 0x0, @@ -356,14 +354,8 @@ static bool patch_reloc(tcg_insn_unit *code_ptr, int type, * r0-r3 will be overwritten when reading the tlb entry (system-mode only); * r14 will be overwritten by the BLNE branching to the slow path. */ -#ifdef CONFIG_SOFTMMU #define ALL_QLDST_REGS \ - (ALL_GENERAL_REGS & ~((1 << TCG_REG_R0) | (1 << TCG_REG_R1) | \ - (1 << TCG_REG_R2) | (1 << TCG_REG_R3) | \ - (1 << TCG_REG_R14))) -#else -#define ALL_QLDST_REGS (ALL_GENERAL_REGS & ~(1 << TCG_REG_R14)) -#endif + (ALL_GENERAL_REGS & ~((tcg_use_softmmu ? 0xf : 0) | (1 << TCG_REG_R14))) /* * ARM immediates for ALU instructions are made of an unsigned 8-bit @@ -1387,113 +1379,115 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, MemOp opc = get_memop(oi); unsigned a_mask; -#ifdef CONFIG_SOFTMMU - *h = (HostAddress){ - .cond = COND_AL, - .base = addrlo, - .index = TCG_REG_R1, - .index_scratch = true, - }; -#else - *h = (HostAddress){ - .cond = COND_AL, - .base = addrlo, - .index = guest_base ? TCG_REG_GUEST_BASE : -1, - .index_scratch = false, - }; -#endif + if (tcg_use_softmmu) { + *h = (HostAddress){ + .cond = COND_AL, + .base = addrlo, + .index = TCG_REG_R1, + .index_scratch = true, + }; + } else { + *h = (HostAddress){ + .cond = COND_AL, + .base = addrlo, + .index = guest_base ? TCG_REG_GUEST_BASE : -1, + .index_scratch = false, + }; + } h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, false); a_mask = (1 << h->aa.align) - 1; -#ifdef CONFIG_SOFTMMU - int mem_index = get_mmuidx(oi); - int cmp_off = is_ld ? offsetof(CPUTLBEntry, addr_read) - : offsetof(CPUTLBEntry, addr_write); - int fast_off = tlb_mask_table_ofs(s, mem_index); - unsigned s_mask = (1 << (opc & MO_SIZE)) - 1; - TCGReg t_addr; + if (tcg_use_softmmu) { + int mem_index = get_mmuidx(oi); + int cmp_off = is_ld ? 
offsetof(CPUTLBEntry, addr_read) + : offsetof(CPUTLBEntry, addr_write); + int fast_off = tlb_mask_table_ofs(s, mem_index); + unsigned s_mask = (1 << (opc & MO_SIZE)) - 1; + TCGReg t_addr; - ldst = new_ldst_label(s); - ldst->is_ld = is_ld; - ldst->oi = oi; - ldst->addrlo_reg = addrlo; - ldst->addrhi_reg = addrhi; + ldst = new_ldst_label(s); + ldst->is_ld = is_ld; + ldst->oi = oi; + ldst->addrlo_reg = addrlo; + ldst->addrhi_reg = addrhi; - /* Load cpu->neg.tlb.f[mmu_idx].{mask,table} into {r0,r1}. */ - QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, mask) != 0); - QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, table) != 4); - tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_AREG0, fast_off); + /* Load cpu->neg.tlb.f[mmu_idx].{mask,table} into {r0,r1}. */ + QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, mask) != 0); + QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, table) != 4); + tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_AREG0, fast_off); - /* Extract the tlb index from the address into R0. */ - tcg_out_dat_reg(s, COND_AL, ARITH_AND, TCG_REG_R0, TCG_REG_R0, addrlo, - SHIFT_IMM_LSR(s->page_bits - CPU_TLB_ENTRY_BITS)); + /* Extract the tlb index from the address into R0. */ + tcg_out_dat_reg(s, COND_AL, ARITH_AND, TCG_REG_R0, TCG_REG_R0, addrlo, + SHIFT_IMM_LSR(s->page_bits - CPU_TLB_ENTRY_BITS)); - /* - * Add the tlb_table pointer, creating the CPUTLBEntry address in R1. - * Load the tlb comparator into R2/R3 and the fast path addend into R1. - */ - QEMU_BUILD_BUG_ON(HOST_BIG_ENDIAN); - if (cmp_off == 0) { - if (s->addr_type == TCG_TYPE_I32) { - tcg_out_ld32_rwb(s, COND_AL, TCG_REG_R2, TCG_REG_R1, TCG_REG_R0); + /* + * Add the tlb_table pointer, creating the CPUTLBEntry address in R1. + * Load the tlb comparator into R2/R3 and the fast path addend into R1. + */ + QEMU_BUILD_BUG_ON(HOST_BIG_ENDIAN); + if (cmp_off == 0) { + if (s->addr_type == TCG_TYPE_I32) { + tcg_out_ld32_rwb(s, COND_AL, TCG_REG_R2, + TCG_REG_R1, TCG_REG_R0); + } else { + tcg_out_ldrd_rwb(s, COND_AL, TCG_REG_R2, + TCG_REG_R1, TCG_REG_R0); + } } else { - tcg_out_ldrd_rwb(s, COND_AL, TCG_REG_R2, TCG_REG_R1, TCG_REG_R0); + tcg_out_dat_reg(s, COND_AL, ARITH_ADD, + TCG_REG_R1, TCG_REG_R1, TCG_REG_R0, 0); + if (s->addr_type == TCG_TYPE_I32) { + tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off); + } else { + tcg_out_ldrd_8(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off); + } } - } else { - tcg_out_dat_reg(s, COND_AL, ARITH_ADD, - TCG_REG_R1, TCG_REG_R1, TCG_REG_R0, 0); - if (s->addr_type == TCG_TYPE_I32) { - tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off); + + /* Load the tlb addend. */ + tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R1, + offsetof(CPUTLBEntry, addend)); + + /* + * Check alignment, check comparators. + * Do this in 2-4 insns. Use MOVW for v7, if possible, + * to reduce the number of sequential conditional instructions. + * Almost all guests have at least 4k pages, which means that we need + * to clear at least 9 bits even for an 8-byte memory, which means it + * isn't worth checking for an immediate operand for BIC. + * + * For unaligned accesses, test the page of the last unit of alignment. + * This leaves the least significant alignment bits unchanged, and of + * course must be zero. 
+ */ + t_addr = addrlo; + if (a_mask < s_mask) { + t_addr = TCG_REG_R0; + tcg_out_dat_imm(s, COND_AL, ARITH_ADD, t_addr, + addrlo, s_mask - a_mask); + } + if (use_armv7_instructions && s->page_bits <= 16) { + tcg_out_movi32(s, COND_AL, TCG_REG_TMP, ~(s->page_mask | a_mask)); + tcg_out_dat_reg(s, COND_AL, ARITH_BIC, TCG_REG_TMP, + t_addr, TCG_REG_TMP, 0); + tcg_out_dat_reg(s, COND_AL, ARITH_CMP, 0, + TCG_REG_R2, TCG_REG_TMP, 0); } else { - tcg_out_ldrd_8(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off); + if (a_mask) { + tcg_debug_assert(a_mask <= 0xff); + tcg_out_dat_imm(s, COND_AL, ARITH_TST, 0, addrlo, a_mask); + } + tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP, 0, t_addr, + SHIFT_IMM_LSR(s->page_bits)); + tcg_out_dat_reg(s, (a_mask ? COND_EQ : COND_AL), ARITH_CMP, + 0, TCG_REG_R2, TCG_REG_TMP, + SHIFT_IMM_LSL(s->page_bits)); } - } - /* Load the tlb addend. */ - tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R1, - offsetof(CPUTLBEntry, addend)); - - /* - * Check alignment, check comparators. - * Do this in 2-4 insns. Use MOVW for v7, if possible, - * to reduce the number of sequential conditional instructions. - * Almost all guests have at least 4k pages, which means that we need - * to clear at least 9 bits even for an 8-byte memory, which means it - * isn't worth checking for an immediate operand for BIC. - * - * For unaligned accesses, test the page of the last unit of alignment. - * This leaves the least significant alignment bits unchanged, and of - * course must be zero. - */ - t_addr = addrlo; - if (a_mask < s_mask) { - t_addr = TCG_REG_R0; - tcg_out_dat_imm(s, COND_AL, ARITH_ADD, t_addr, - addrlo, s_mask - a_mask); - } - if (use_armv7_instructions && s->page_bits <= 16) { - tcg_out_movi32(s, COND_AL, TCG_REG_TMP, ~(s->page_mask | a_mask)); - tcg_out_dat_reg(s, COND_AL, ARITH_BIC, TCG_REG_TMP, - t_addr, TCG_REG_TMP, 0); - tcg_out_dat_reg(s, COND_AL, ARITH_CMP, 0, TCG_REG_R2, TCG_REG_TMP, 0); - } else { - if (a_mask) { - tcg_debug_assert(a_mask <= 0xff); - tcg_out_dat_imm(s, COND_AL, ARITH_TST, 0, addrlo, a_mask); + if (s->addr_type != TCG_TYPE_I32) { + tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0, TCG_REG_R3, addrhi, 0); } - tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP, 0, t_addr, - SHIFT_IMM_LSR(s->page_bits)); - tcg_out_dat_reg(s, (a_mask ? 
COND_EQ : COND_AL), ARITH_CMP, - 0, TCG_REG_R2, TCG_REG_TMP, - SHIFT_IMM_LSL(s->page_bits)); - } - - if (s->addr_type != TCG_TYPE_I32) { - tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0, TCG_REG_R3, addrhi, 0); - } -#else - if (a_mask) { + } else if (a_mask) { ldst = new_ldst_label(s); ldst->is_ld = is_ld; ldst->oi = oi; @@ -1505,7 +1499,6 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, /* tst addr, #mask */ tcg_out_dat_imm(s, COND_AL, ARITH_TST, 0, addrlo, a_mask); } -#endif return ldst; } @@ -2931,12 +2924,10 @@ static void tcg_target_qemu_prologue(TCGContext *s) tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]); -#ifndef CONFIG_SOFTMMU - if (guest_base) { + if (!tcg_use_softmmu && guest_base) { tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_GUEST_BASE, guest_base); tcg_regset_set_reg(s->reserved_regs, TCG_REG_GUEST_BASE); } -#endif tcg_out_b_reg(s, COND_AL, tcg_target_call_iarg_regs[1]); From e2b7a40d05c682e8a254d9f664b368af9fdb1e8f Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Sun, 1 Oct 2023 16:26:53 +0000 Subject: [PATCH 17/38] tcg/aarch64: Use tcg_use_softmmu MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Richard Henderson --- tcg/aarch64/tcg-target.c.inc | 177 +++++++++++++++++------------------ 1 file changed, 88 insertions(+), 89 deletions(-) diff --git a/tcg/aarch64/tcg-target.c.inc b/tcg/aarch64/tcg-target.c.inc index 3afb896a3a..a3efa1e67a 100644 --- a/tcg/aarch64/tcg-target.c.inc +++ b/tcg/aarch64/tcg-target.c.inc @@ -77,9 +77,7 @@ static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot) #define TCG_REG_TMP2 TCG_REG_X30 #define TCG_VEC_TMP0 TCG_REG_V31 -#ifndef CONFIG_SOFTMMU #define TCG_REG_GUEST_BASE TCG_REG_X28 -#endif static bool reloc_pc26(tcg_insn_unit *src_rw, const tcg_insn_unit *target) { @@ -1664,97 +1662,98 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, s_bits == MO_128); a_mask = (1 << h->aa.align) - 1; -#ifdef CONFIG_SOFTMMU - unsigned s_mask = (1u << s_bits) - 1; - unsigned mem_index = get_mmuidx(oi); - TCGReg addr_adj; - TCGType mask_type; - uint64_t compare_mask; + if (tcg_use_softmmu) { + unsigned s_mask = (1u << s_bits) - 1; + unsigned mem_index = get_mmuidx(oi); + TCGReg addr_adj; + TCGType mask_type; + uint64_t compare_mask; - ldst = new_ldst_label(s); - ldst->is_ld = is_ld; - ldst->oi = oi; - ldst->addrlo_reg = addr_reg; - - mask_type = (s->page_bits + s->tlb_dyn_max_bits > 32 - ? TCG_TYPE_I64 : TCG_TYPE_I32); - - /* Load cpu->neg.tlb.f[mmu_idx].{mask,table} into {tmp0,tmp1}. */ - QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, mask) != 0); - QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, table) != 8); - tcg_out_insn(s, 3314, LDP, TCG_REG_TMP0, TCG_REG_TMP1, TCG_AREG0, - tlb_mask_table_ofs(s, mem_index), 1, 0); - - /* Extract the TLB index from the address into X0. */ - tcg_out_insn(s, 3502S, AND_LSR, mask_type == TCG_TYPE_I64, - TCG_REG_TMP0, TCG_REG_TMP0, addr_reg, - s->page_bits - CPU_TLB_ENTRY_BITS); - - /* Add the tlb_table pointer, forming the CPUTLBEntry address in TMP1. */ - tcg_out_insn(s, 3502, ADD, 1, TCG_REG_TMP1, TCG_REG_TMP1, TCG_REG_TMP0); - - /* Load the tlb comparator into TMP0, and the fast path addend into TMP1. */ - QEMU_BUILD_BUG_ON(HOST_BIG_ENDIAN); - tcg_out_ld(s, addr_type, TCG_REG_TMP0, TCG_REG_TMP1, - is_ld ? 
offsetof(CPUTLBEntry, addr_read) - : offsetof(CPUTLBEntry, addr_write)); - tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, - offsetof(CPUTLBEntry, addend)); - - /* - * For aligned accesses, we check the first byte and include the alignment - * bits within the address. For unaligned access, we check that we don't - * cross pages using the address of the last byte of the access. - */ - if (a_mask >= s_mask) { - addr_adj = addr_reg; - } else { - addr_adj = TCG_REG_TMP2; - tcg_out_insn(s, 3401, ADDI, addr_type, - addr_adj, addr_reg, s_mask - a_mask); - } - compare_mask = (uint64_t)s->page_mask | a_mask; - - /* Store the page mask part of the address into TMP2. */ - tcg_out_logicali(s, I3404_ANDI, addr_type, TCG_REG_TMP2, - addr_adj, compare_mask); - - /* Perform the address comparison. */ - tcg_out_cmp(s, addr_type, TCG_REG_TMP0, TCG_REG_TMP2, 0); - - /* If not equal, we jump to the slow path. */ - ldst->label_ptr[0] = s->code_ptr; - tcg_out_insn(s, 3202, B_C, TCG_COND_NE, 0); - - h->base = TCG_REG_TMP1; - h->index = addr_reg; - h->index_ext = addr_type; -#else - if (a_mask) { ldst = new_ldst_label(s); - ldst->is_ld = is_ld; ldst->oi = oi; ldst->addrlo_reg = addr_reg; - /* tst addr, #mask */ - tcg_out_logicali(s, I3404_ANDSI, 0, TCG_REG_XZR, addr_reg, a_mask); + mask_type = (s->page_bits + s->tlb_dyn_max_bits > 32 + ? TCG_TYPE_I64 : TCG_TYPE_I32); - /* b.ne slow_path */ + /* Load cpu->neg.tlb.f[mmu_idx].{mask,table} into {tmp0,tmp1}. */ + QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, mask) != 0); + QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, table) != 8); + tcg_out_insn(s, 3314, LDP, TCG_REG_TMP0, TCG_REG_TMP1, TCG_AREG0, + tlb_mask_table_ofs(s, mem_index), 1, 0); + + /* Extract the TLB index from the address into X0. */ + tcg_out_insn(s, 3502S, AND_LSR, mask_type == TCG_TYPE_I64, + TCG_REG_TMP0, TCG_REG_TMP0, addr_reg, + s->page_bits - CPU_TLB_ENTRY_BITS); + + /* Add the tlb_table pointer, forming the CPUTLBEntry address. */ + tcg_out_insn(s, 3502, ADD, 1, TCG_REG_TMP1, TCG_REG_TMP1, TCG_REG_TMP0); + + /* Load the tlb comparator into TMP0, and the fast path addend. */ + QEMU_BUILD_BUG_ON(HOST_BIG_ENDIAN); + tcg_out_ld(s, addr_type, TCG_REG_TMP0, TCG_REG_TMP1, + is_ld ? offsetof(CPUTLBEntry, addr_read) + : offsetof(CPUTLBEntry, addr_write)); + tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, + offsetof(CPUTLBEntry, addend)); + + /* + * For aligned accesses, we check the first byte and include + * the alignment bits within the address. For unaligned access, + * we check that we don't cross pages using the address of the + * last byte of the access. + */ + if (a_mask >= s_mask) { + addr_adj = addr_reg; + } else { + addr_adj = TCG_REG_TMP2; + tcg_out_insn(s, 3401, ADDI, addr_type, + addr_adj, addr_reg, s_mask - a_mask); + } + compare_mask = (uint64_t)s->page_mask | a_mask; + + /* Store the page mask part of the address into TMP2. */ + tcg_out_logicali(s, I3404_ANDI, addr_type, TCG_REG_TMP2, + addr_adj, compare_mask); + + /* Perform the address comparison. */ + tcg_out_cmp(s, addr_type, TCG_REG_TMP0, TCG_REG_TMP2, 0); + + /* If not equal, we jump to the slow path. 
*/ ldst->label_ptr[0] = s->code_ptr; tcg_out_insn(s, 3202, B_C, TCG_COND_NE, 0); - } - if (guest_base || addr_type == TCG_TYPE_I32) { - h->base = TCG_REG_GUEST_BASE; + h->base = TCG_REG_TMP1; h->index = addr_reg; h->index_ext = addr_type; } else { - h->base = addr_reg; - h->index = TCG_REG_XZR; - h->index_ext = TCG_TYPE_I64; + if (a_mask) { + ldst = new_ldst_label(s); + + ldst->is_ld = is_ld; + ldst->oi = oi; + ldst->addrlo_reg = addr_reg; + + /* tst addr, #mask */ + tcg_out_logicali(s, I3404_ANDSI, 0, TCG_REG_XZR, addr_reg, a_mask); + + /* b.ne slow_path */ + ldst->label_ptr[0] = s->code_ptr; + tcg_out_insn(s, 3202, B_C, TCG_COND_NE, 0); + } + + if (guest_base || addr_type == TCG_TYPE_I32) { + h->base = TCG_REG_GUEST_BASE; + h->index = addr_reg; + h->index_ext = addr_type; + } else { + h->base = addr_reg; + h->index = TCG_REG_XZR; + h->index_ext = TCG_TYPE_I64; + } } -#endif return ldst; } @@ -3117,16 +3116,16 @@ static void tcg_target_qemu_prologue(TCGContext *s) tcg_set_frame(s, TCG_REG_SP, TCG_STATIC_CALL_ARGS_SIZE, CPU_TEMP_BUF_NLONGS * sizeof(long)); -#if !defined(CONFIG_SOFTMMU) - /* - * Note that XZR cannot be encoded in the address base register slot, - * as that actually encodes SP. Depending on the guest, we may need - * to zero-extend the guest address via the address index register slot, - * therefore we need to load even a zero guest base into a register. - */ - tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_GUEST_BASE, guest_base); - tcg_regset_set_reg(s->reserved_regs, TCG_REG_GUEST_BASE); -#endif + if (!tcg_use_softmmu) { + /* + * Note that XZR cannot be encoded in the address base register slot, + * as that actually encodes SP. Depending on the guest, we may need + * to zero-extend the guest address via the address index register slot, + * therefore we need to load even a zero guest base into a register. + */ + tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_GUEST_BASE, guest_base); + tcg_regset_set_reg(s->reserved_regs, TCG_REG_GUEST_BASE); + } tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]); tcg_out_insn(s, 3207, BR, tcg_target_call_iarg_regs[1]); From 915e1d52e205cdfa85cfc848e022b470f35a090e Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Sun, 1 Oct 2023 08:13:44 -0700 Subject: [PATCH 18/38] tcg/i386: Use tcg_use_softmmu MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Richard Henderson --- tcg/i386/tcg-target.c.inc | 198 +++++++++++++++++++------------------- 1 file changed, 98 insertions(+), 100 deletions(-) diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc index 788d608150..a83f8aab30 100644 --- a/tcg/i386/tcg-target.c.inc +++ b/tcg/i386/tcg-target.c.inc @@ -153,11 +153,8 @@ static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot) # define ALL_VECTOR_REGS 0x00ff0000u # define ALL_BYTEL_REGS 0x0000000fu #endif -#ifdef CONFIG_SOFTMMU -# define SOFTMMU_RESERVE_REGS ((1 << TCG_REG_L0) | (1 << TCG_REG_L1)) -#else -# define SOFTMMU_RESERVE_REGS 0 -#endif +#define SOFTMMU_RESERVE_REGS \ + (tcg_use_softmmu ? (1 << TCG_REG_L0) | (1 << TCG_REG_L1) : 0) /* For 64-bit, we always know that CMOV is available. 
*/ #if TCG_TARGET_REG_BITS == 64 @@ -1933,7 +1930,7 @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l) return true; } -#ifndef CONFIG_SOFTMMU +#ifdef CONFIG_USER_ONLY static HostAddress x86_guest_base = { .index = -1 }; @@ -1949,6 +1946,7 @@ static inline int setup_guest_base_seg(void) } return 0; } +#define setup_guest_base_seg setup_guest_base_seg #elif defined(__x86_64__) && \ (defined (__FreeBSD__) || defined (__FreeBSD_kernel__)) # include @@ -1959,13 +1957,14 @@ static inline int setup_guest_base_seg(void) } return 0; } +#define setup_guest_base_seg setup_guest_base_seg +#endif #else -static inline int setup_guest_base_seg(void) -{ - return 0; -} -#endif /* setup_guest_base_seg */ -#endif /* !SOFTMMU */ +# define x86_guest_base (*(HostAddress *)({ qemu_build_not_reached(); NULL; })) +#endif /* CONFIG_USER_ONLY */ +#ifndef setup_guest_base_seg +# define setup_guest_base_seg() 0 +#endif #define MIN_TLB_MASK_TABLE_OFS INT_MIN @@ -1984,94 +1983,94 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, MemOp s_bits = opc & MO_SIZE; unsigned a_mask; -#ifdef CONFIG_SOFTMMU - h->index = TCG_REG_L0; - h->ofs = 0; - h->seg = 0; -#else - *h = x86_guest_base; -#endif + if (tcg_use_softmmu) { + h->index = TCG_REG_L0; + h->ofs = 0; + h->seg = 0; + } else { + *h = x86_guest_base; + } h->base = addrlo; h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128); a_mask = (1 << h->aa.align) - 1; -#ifdef CONFIG_SOFTMMU - int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read) - : offsetof(CPUTLBEntry, addr_write); - TCGType ttype = TCG_TYPE_I32; - TCGType tlbtype = TCG_TYPE_I32; - int trexw = 0, hrexw = 0, tlbrexw = 0; - unsigned mem_index = get_mmuidx(oi); - unsigned s_mask = (1 << s_bits) - 1; - int fast_ofs = tlb_mask_table_ofs(s, mem_index); - int tlb_mask; + if (tcg_use_softmmu) { + int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read) + : offsetof(CPUTLBEntry, addr_write); + TCGType ttype = TCG_TYPE_I32; + TCGType tlbtype = TCG_TYPE_I32; + int trexw = 0, hrexw = 0, tlbrexw = 0; + unsigned mem_index = get_mmuidx(oi); + unsigned s_mask = (1 << s_bits) - 1; + int fast_ofs = tlb_mask_table_ofs(s, mem_index); + int tlb_mask; - ldst = new_ldst_label(s); - ldst->is_ld = is_ld; - ldst->oi = oi; - ldst->addrlo_reg = addrlo; - ldst->addrhi_reg = addrhi; + ldst = new_ldst_label(s); + ldst->is_ld = is_ld; + ldst->oi = oi; + ldst->addrlo_reg = addrlo; + ldst->addrhi_reg = addrhi; - if (TCG_TARGET_REG_BITS == 64) { - ttype = s->addr_type; - trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW); - if (TCG_TYPE_PTR == TCG_TYPE_I64) { - hrexw = P_REXW; - if (s->page_bits + s->tlb_dyn_max_bits > 32) { - tlbtype = TCG_TYPE_I64; - tlbrexw = P_REXW; + if (TCG_TARGET_REG_BITS == 64) { + ttype = s->addr_type; + trexw = (ttype == TCG_TYPE_I32 ? 
0 : P_REXW); + if (TCG_TYPE_PTR == TCG_TYPE_I64) { + hrexw = P_REXW; + if (s->page_bits + s->tlb_dyn_max_bits > 32) { + tlbtype = TCG_TYPE_I64; + tlbrexw = P_REXW; + } } } - } - tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo); - tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0, - s->page_bits - CPU_TLB_ENTRY_BITS); + tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo); + tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0, + s->page_bits - CPU_TLB_ENTRY_BITS); - tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0, - fast_ofs + offsetof(CPUTLBDescFast, mask)); + tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0, + fast_ofs + offsetof(CPUTLBDescFast, mask)); - tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0, - fast_ofs + offsetof(CPUTLBDescFast, table)); + tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0, + fast_ofs + offsetof(CPUTLBDescFast, table)); - /* - * If the required alignment is at least as large as the access, simply - * copy the address and mask. For lesser alignments, check that we don't - * cross pages for the complete access. - */ - if (a_mask >= s_mask) { - tcg_out_mov(s, ttype, TCG_REG_L1, addrlo); - } else { - tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1, - addrlo, s_mask - a_mask); - } - tlb_mask = s->page_mask | a_mask; - tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0); + /* + * If the required alignment is at least as large as the access, + * simply copy the address and mask. For lesser alignments, + * check that we don't cross pages for the complete access. + */ + if (a_mask >= s_mask) { + tcg_out_mov(s, ttype, TCG_REG_L1, addrlo); + } else { + tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1, + addrlo, s_mask - a_mask); + } + tlb_mask = s->page_mask | a_mask; + tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0); - /* cmp 0(TCG_REG_L0), TCG_REG_L1 */ - tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, - TCG_REG_L1, TCG_REG_L0, cmp_ofs); - - /* jne slow_path */ - tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); - ldst->label_ptr[0] = s->code_ptr; - s->code_ptr += 4; - - if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I64) { - /* cmp 4(TCG_REG_L0), addrhi */ - tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, TCG_REG_L0, cmp_ofs + 4); + /* cmp 0(TCG_REG_L0), TCG_REG_L1 */ + tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, + TCG_REG_L1, TCG_REG_L0, cmp_ofs); /* jne slow_path */ tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); - ldst->label_ptr[1] = s->code_ptr; + ldst->label_ptr[0] = s->code_ptr; s->code_ptr += 4; - } - /* TLB Hit. */ - tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0, - offsetof(CPUTLBEntry, addend)); -#else - if (a_mask) { + if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I64) { + /* cmp 4(TCG_REG_L0), addrhi */ + tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, + TCG_REG_L0, cmp_ofs + 4); + + /* jne slow_path */ + tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); + ldst->label_ptr[1] = s->code_ptr; + s->code_ptr += 4; + } + + /* TLB Hit. 
*/ + tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0, + offsetof(CPUTLBEntry, addend)); + } else if (a_mask) { ldst = new_ldst_label(s); ldst->is_ld = is_ld; @@ -2085,7 +2084,6 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, ldst->label_ptr[0] = s->code_ptr; s->code_ptr += 4; } -#endif return ldst; } @@ -4140,35 +4138,35 @@ static void tcg_target_qemu_prologue(TCGContext *s) tcg_out_push(s, tcg_target_callee_save_regs[i]); } -#if TCG_TARGET_REG_BITS == 32 - tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, - (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4); - tcg_out_addi(s, TCG_REG_ESP, -stack_addend); - /* jmp *tb. */ - tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP, - (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4 - + stack_addend); -#else -# if !defined(CONFIG_SOFTMMU) - if (guest_base) { + if (!tcg_use_softmmu && guest_base) { int seg = setup_guest_base_seg(); if (seg != 0) { x86_guest_base.seg = seg; } else if (guest_base == (int32_t)guest_base) { x86_guest_base.ofs = guest_base; } else { + assert(TCG_TARGET_REG_BITS == 64); /* Choose R12 because, as a base, it requires a SIB byte. */ x86_guest_base.index = TCG_REG_R12; tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base); tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index); } } -# endif - tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]); - tcg_out_addi(s, TCG_REG_ESP, -stack_addend); - /* jmp *tb. */ - tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]); -#endif + + if (TCG_TARGET_REG_BITS == 32) { + tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, + (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4); + tcg_out_addi(s, TCG_REG_ESP, -stack_addend); + /* jmp *tb. */ + tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP, + (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4 + + stack_addend); + } else { + tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]); + tcg_out_addi(s, TCG_REG_ESP, -stack_addend); + /* jmp *tb. */ + tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]); + } /* * Return path for goto_ptr. 
Set return value to 0, a-la exit_tb, From 10e1fd2784e59e1781ce09f65b0499784938e83b Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Sun, 1 Oct 2023 18:05:27 +0000 Subject: [PATCH 19/38] tcg/loongarch64: Use tcg_use_softmmu MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Richard Henderson --- tcg/loongarch64/tcg-target.c.inc | 126 +++++++++++++++---------------- 1 file changed, 61 insertions(+), 65 deletions(-) diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc index 801302d85d..ccf133db4b 100644 --- a/tcg/loongarch64/tcg-target.c.inc +++ b/tcg/loongarch64/tcg-target.c.inc @@ -165,10 +165,7 @@ static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot) return TCG_REG_A0 + slot; } -#ifndef CONFIG_SOFTMMU -#define USE_GUEST_BASE (guest_base != 0) #define TCG_GUEST_BASE_REG TCG_REG_S1 -#endif #define TCG_CT_CONST_ZERO 0x100 #define TCG_CT_CONST_S12 0x200 @@ -908,76 +905,77 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, false); a_bits = h->aa.align; -#ifdef CONFIG_SOFTMMU - unsigned s_bits = opc & MO_SIZE; - int mem_index = get_mmuidx(oi); - int fast_ofs = tlb_mask_table_ofs(s, mem_index); - int mask_ofs = fast_ofs + offsetof(CPUTLBDescFast, mask); - int table_ofs = fast_ofs + offsetof(CPUTLBDescFast, table); + if (tcg_use_softmmu) { + unsigned s_bits = opc & MO_SIZE; + int mem_index = get_mmuidx(oi); + int fast_ofs = tlb_mask_table_ofs(s, mem_index); + int mask_ofs = fast_ofs + offsetof(CPUTLBDescFast, mask); + int table_ofs = fast_ofs + offsetof(CPUTLBDescFast, table); - ldst = new_ldst_label(s); - ldst->is_ld = is_ld; - ldst->oi = oi; - ldst->addrlo_reg = addr_reg; - - tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_AREG0, mask_ofs); - tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_AREG0, table_ofs); - - tcg_out_opc_srli_d(s, TCG_REG_TMP2, addr_reg, - s->page_bits - CPU_TLB_ENTRY_BITS); - tcg_out_opc_and(s, TCG_REG_TMP2, TCG_REG_TMP2, TCG_REG_TMP0); - tcg_out_opc_add_d(s, TCG_REG_TMP2, TCG_REG_TMP2, TCG_REG_TMP1); - - /* Load the tlb comparator and the addend. */ - QEMU_BUILD_BUG_ON(HOST_BIG_ENDIAN); - tcg_out_ld(s, addr_type, TCG_REG_TMP0, TCG_REG_TMP2, - is_ld ? offsetof(CPUTLBEntry, addr_read) - : offsetof(CPUTLBEntry, addr_write)); - tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP2, TCG_REG_TMP2, - offsetof(CPUTLBEntry, addend)); - - /* - * For aligned accesses, we check the first byte and include the alignment - * bits within the address. For unaligned access, we check that we don't - * cross pages using the address of the last byte of the access. - */ - if (a_bits < s_bits) { - unsigned a_mask = (1u << a_bits) - 1; - unsigned s_mask = (1u << s_bits) - 1; - tcg_out_addi(s, addr_type, TCG_REG_TMP1, addr_reg, s_mask - a_mask); - } else { - tcg_out_mov(s, addr_type, TCG_REG_TMP1, addr_reg); - } - tcg_out_opc_bstrins_d(s, TCG_REG_TMP1, TCG_REG_ZERO, - a_bits, s->page_bits - 1); - - /* Compare masked address with the TLB entry. 
*/ - ldst->label_ptr[0] = s->code_ptr; - tcg_out_opc_bne(s, TCG_REG_TMP0, TCG_REG_TMP1, 0); - - h->index = TCG_REG_TMP2; -#else - if (a_bits) { ldst = new_ldst_label(s); - ldst->is_ld = is_ld; ldst->oi = oi; ldst->addrlo_reg = addr_reg; + tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_AREG0, mask_ofs); + tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_AREG0, table_ofs); + + tcg_out_opc_srli_d(s, TCG_REG_TMP2, addr_reg, + s->page_bits - CPU_TLB_ENTRY_BITS); + tcg_out_opc_and(s, TCG_REG_TMP2, TCG_REG_TMP2, TCG_REG_TMP0); + tcg_out_opc_add_d(s, TCG_REG_TMP2, TCG_REG_TMP2, TCG_REG_TMP1); + + /* Load the tlb comparator and the addend. */ + QEMU_BUILD_BUG_ON(HOST_BIG_ENDIAN); + tcg_out_ld(s, addr_type, TCG_REG_TMP0, TCG_REG_TMP2, + is_ld ? offsetof(CPUTLBEntry, addr_read) + : offsetof(CPUTLBEntry, addr_write)); + tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP2, TCG_REG_TMP2, + offsetof(CPUTLBEntry, addend)); + /* - * Without micro-architecture details, we don't know which of - * bstrpick or andi is faster, so use bstrpick as it's not - * constrained by imm field width. Not to say alignments >= 2^12 - * are going to happen any time soon. + * For aligned accesses, we check the first byte and include the + * alignment bits within the address. For unaligned access, we + * check that we don't cross pages using the address of the last + * byte of the access. */ - tcg_out_opc_bstrpick_d(s, TCG_REG_TMP1, addr_reg, 0, a_bits - 1); + if (a_bits < s_bits) { + unsigned a_mask = (1u << a_bits) - 1; + unsigned s_mask = (1u << s_bits) - 1; + tcg_out_addi(s, addr_type, TCG_REG_TMP1, addr_reg, s_mask - a_mask); + } else { + tcg_out_mov(s, addr_type, TCG_REG_TMP1, addr_reg); + } + tcg_out_opc_bstrins_d(s, TCG_REG_TMP1, TCG_REG_ZERO, + a_bits, s->page_bits - 1); + /* Compare masked address with the TLB entry. */ ldst->label_ptr[0] = s->code_ptr; - tcg_out_opc_bne(s, TCG_REG_TMP1, TCG_REG_ZERO, 0); - } + tcg_out_opc_bne(s, TCG_REG_TMP0, TCG_REG_TMP1, 0); - h->index = USE_GUEST_BASE ? TCG_GUEST_BASE_REG : TCG_REG_ZERO; -#endif + h->index = TCG_REG_TMP2; + } else { + if (a_bits) { + ldst = new_ldst_label(s); + + ldst->is_ld = is_ld; + ldst->oi = oi; + ldst->addrlo_reg = addr_reg; + + /* + * Without micro-architecture details, we don't know which of + * bstrpick or andi is faster, so use bstrpick as it's not + * constrained by imm field width. Not to say alignments >= 2^12 + * are going to happen any time soon. + */ + tcg_out_opc_bstrpick_d(s, TCG_REG_TMP1, addr_reg, 0, a_bits - 1); + + ldst->label_ptr[0] = s->code_ptr; + tcg_out_opc_bne(s, TCG_REG_TMP1, TCG_REG_ZERO, 0); + } + + h->index = guest_base ? 
TCG_GUEST_BASE_REG : TCG_REG_ZERO; + } if (addr_type == TCG_TYPE_I32) { h->base = TCG_REG_TMP0; @@ -2272,12 +2270,10 @@ static void tcg_target_qemu_prologue(TCGContext *s) TCG_REG_SP, SAVE_OFS + i * REG_SIZE); } -#if !defined(CONFIG_SOFTMMU) - if (USE_GUEST_BASE) { + if (!tcg_use_softmmu && guest_base) { tcg_out_movi(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, guest_base); tcg_regset_set_reg(s->reserved_regs, TCG_GUEST_BASE_REG); } -#endif /* Call generated code */ tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]); From e3a650cd9dca1d10df8ac426fc59736e009ebd7f Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Sun, 1 Oct 2023 11:49:03 -0700 Subject: [PATCH 20/38] tcg/mips: Use tcg_use_softmmu MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Richard Henderson --- tcg/mips/tcg-target.c.inc | 231 +++++++++++++++++++------------------- 1 file changed, 113 insertions(+), 118 deletions(-) diff --git a/tcg/mips/tcg-target.c.inc b/tcg/mips/tcg-target.c.inc index e2892edc6a..328984ccff 100644 --- a/tcg/mips/tcg-target.c.inc +++ b/tcg/mips/tcg-target.c.inc @@ -78,13 +78,11 @@ static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = { #define TCG_TMP2 TCG_REG_T8 #define TCG_TMP3 TCG_REG_T7 -#ifndef CONFIG_SOFTMMU #define TCG_GUEST_BASE_REG TCG_REG_S7 -#endif #if TCG_TARGET_REG_BITS == 64 #define TCG_REG_TB TCG_REG_S6 #else -#define TCG_REG_TB (qemu_build_not_reached(), TCG_REG_ZERO) +#define TCG_REG_TB ({ qemu_build_not_reached(); TCG_REG_ZERO; }) #endif /* check if we really need so many registers :P */ @@ -1279,130 +1277,129 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, a_bits = h->aa.align; a_mask = (1 << a_bits) - 1; -#ifdef CONFIG_SOFTMMU - unsigned s_mask = (1 << s_bits) - 1; - int mem_index = get_mmuidx(oi); - int fast_off = tlb_mask_table_ofs(s, mem_index); - int mask_off = fast_off + offsetof(CPUTLBDescFast, mask); - int table_off = fast_off + offsetof(CPUTLBDescFast, table); - int add_off = offsetof(CPUTLBEntry, addend); - int cmp_off = is_ld ? offsetof(CPUTLBEntry, addr_read) - : offsetof(CPUTLBEntry, addr_write); + if (tcg_use_softmmu) { + unsigned s_mask = (1 << s_bits) - 1; + int mem_index = get_mmuidx(oi); + int fast_off = tlb_mask_table_ofs(s, mem_index); + int mask_off = fast_off + offsetof(CPUTLBDescFast, mask); + int table_off = fast_off + offsetof(CPUTLBDescFast, table); + int add_off = offsetof(CPUTLBEntry, addend); + int cmp_off = is_ld ? offsetof(CPUTLBEntry, addr_read) + : offsetof(CPUTLBEntry, addr_write); - ldst = new_ldst_label(s); - ldst->is_ld = is_ld; - ldst->oi = oi; - ldst->addrlo_reg = addrlo; - ldst->addrhi_reg = addrhi; - - /* Load tlb_mask[mmu_idx] and tlb_table[mmu_idx]. */ - tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP0, TCG_AREG0, mask_off); - tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP1, TCG_AREG0, table_off); - - /* Extract the TLB index from the address into TMP3. */ - if (TCG_TARGET_REG_BITS == 32 || addr_type == TCG_TYPE_I32) { - tcg_out_opc_sa(s, OPC_SRL, TCG_TMP3, addrlo, - s->page_bits - CPU_TLB_ENTRY_BITS); - } else { - tcg_out_dsrl(s, TCG_TMP3, addrlo, - s->page_bits - CPU_TLB_ENTRY_BITS); - } - tcg_out_opc_reg(s, OPC_AND, TCG_TMP3, TCG_TMP3, TCG_TMP0); - - /* Add the tlb_table pointer, creating the CPUTLBEntry address in TMP3. */ - tcg_out_opc_reg(s, ALIAS_PADD, TCG_TMP3, TCG_TMP3, TCG_TMP1); - - if (TCG_TARGET_REG_BITS == 32 || addr_type == TCG_TYPE_I32) { - /* Load the (low half) tlb comparator. 
*/ - tcg_out_ld(s, TCG_TYPE_I32, TCG_TMP0, TCG_TMP3, - cmp_off + HOST_BIG_ENDIAN * 4); - } else { - tcg_out_ld(s, TCG_TYPE_I64, TCG_TMP0, TCG_TMP3, cmp_off); - } - - if (TCG_TARGET_REG_BITS == 64 || addr_type == TCG_TYPE_I32) { - /* Load the tlb addend for the fast path. */ - tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP3, TCG_TMP3, add_off); - } - - /* - * Mask the page bits, keeping the alignment bits to compare against. - * For unaligned accesses, compare against the end of the access to - * verify that it does not cross a page boundary. - */ - tcg_out_movi(s, addr_type, TCG_TMP1, s->page_mask | a_mask); - if (a_mask < s_mask) { - if (TCG_TARGET_REG_BITS == 32 || addr_type == TCG_TYPE_I32) { - tcg_out_opc_imm(s, OPC_ADDIU, TCG_TMP2, addrlo, s_mask - a_mask); - } else { - tcg_out_opc_imm(s, OPC_DADDIU, TCG_TMP2, addrlo, s_mask - a_mask); - } - tcg_out_opc_reg(s, OPC_AND, TCG_TMP1, TCG_TMP1, TCG_TMP2); - } else { - tcg_out_opc_reg(s, OPC_AND, TCG_TMP1, TCG_TMP1, addrlo); - } - - /* Zero extend a 32-bit guest address for a 64-bit host. */ - if (TCG_TARGET_REG_BITS == 64 && addr_type == TCG_TYPE_I32) { - tcg_out_ext32u(s, TCG_TMP2, addrlo); - addrlo = TCG_TMP2; - } - - ldst->label_ptr[0] = s->code_ptr; - tcg_out_opc_br(s, OPC_BNE, TCG_TMP1, TCG_TMP0); - - /* Load and test the high half tlb comparator. */ - if (TCG_TARGET_REG_BITS == 32 && addr_type != TCG_TYPE_I32) { - /* delay slot */ - tcg_out_ldst(s, OPC_LW, TCG_TMP0, TCG_TMP3, cmp_off + HI_OFF); - - /* Load the tlb addend for the fast path. */ - tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP3, TCG_TMP3, add_off); - - ldst->label_ptr[1] = s->code_ptr; - tcg_out_opc_br(s, OPC_BNE, addrhi, TCG_TMP0); - } - - /* delay slot */ - base = TCG_TMP3; - tcg_out_opc_reg(s, ALIAS_PADD, base, TCG_TMP3, addrlo); -#else - if (a_mask && (use_mips32r6_instructions || a_bits != s_bits)) { ldst = new_ldst_label(s); - ldst->is_ld = is_ld; ldst->oi = oi; ldst->addrlo_reg = addrlo; ldst->addrhi_reg = addrhi; - /* We are expecting a_bits to max out at 7, much lower than ANDI. */ - tcg_debug_assert(a_bits < 16); - tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP0, addrlo, a_mask); + /* Load tlb_mask[mmu_idx] and tlb_table[mmu_idx]. */ + tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP0, TCG_AREG0, mask_off); + tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP1, TCG_AREG0, table_off); + + /* Extract the TLB index from the address into TMP3. */ + if (TCG_TARGET_REG_BITS == 32 || addr_type == TCG_TYPE_I32) { + tcg_out_opc_sa(s, OPC_SRL, TCG_TMP3, addrlo, + s->page_bits - CPU_TLB_ENTRY_BITS); + } else { + tcg_out_dsrl(s, TCG_TMP3, addrlo, + s->page_bits - CPU_TLB_ENTRY_BITS); + } + tcg_out_opc_reg(s, OPC_AND, TCG_TMP3, TCG_TMP3, TCG_TMP0); + + /* Add the tlb_table pointer, creating the CPUTLBEntry address. */ + tcg_out_opc_reg(s, ALIAS_PADD, TCG_TMP3, TCG_TMP3, TCG_TMP1); + + if (TCG_TARGET_REG_BITS == 32 || addr_type == TCG_TYPE_I32) { + /* Load the (low half) tlb comparator. */ + tcg_out_ld(s, TCG_TYPE_I32, TCG_TMP0, TCG_TMP3, + cmp_off + HOST_BIG_ENDIAN * 4); + } else { + tcg_out_ld(s, TCG_TYPE_I64, TCG_TMP0, TCG_TMP3, cmp_off); + } + + if (TCG_TARGET_REG_BITS == 64 || addr_type == TCG_TYPE_I32) { + /* Load the tlb addend for the fast path. */ + tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP3, TCG_TMP3, add_off); + } + + /* + * Mask the page bits, keeping the alignment bits to compare against. + * For unaligned accesses, compare against the end of the access to + * verify that it does not cross a page boundary. 
+ */ + tcg_out_movi(s, addr_type, TCG_TMP1, s->page_mask | a_mask); + if (a_mask < s_mask) { + tcg_out_opc_imm(s, (TCG_TARGET_REG_BITS == 32 + || addr_type == TCG_TYPE_I32 + ? OPC_ADDIU : OPC_DADDIU), + TCG_TMP2, addrlo, s_mask - a_mask); + tcg_out_opc_reg(s, OPC_AND, TCG_TMP1, TCG_TMP1, TCG_TMP2); + } else { + tcg_out_opc_reg(s, OPC_AND, TCG_TMP1, TCG_TMP1, addrlo); + } + + /* Zero extend a 32-bit guest address for a 64-bit host. */ + if (TCG_TARGET_REG_BITS == 64 && addr_type == TCG_TYPE_I32) { + tcg_out_ext32u(s, TCG_TMP2, addrlo); + addrlo = TCG_TMP2; + } ldst->label_ptr[0] = s->code_ptr; - if (use_mips32r6_instructions) { - tcg_out_opc_br(s, OPC_BNEZALC_R6, TCG_REG_ZERO, TCG_TMP0); - } else { - tcg_out_opc_br(s, OPC_BNEL, TCG_TMP0, TCG_REG_ZERO); - tcg_out_nop(s); - } - } + tcg_out_opc_br(s, OPC_BNE, TCG_TMP1, TCG_TMP0); - base = addrlo; - if (TCG_TARGET_REG_BITS == 64 && addr_type == TCG_TYPE_I32) { - tcg_out_ext32u(s, TCG_REG_A0, base); - base = TCG_REG_A0; - } - if (guest_base) { - if (guest_base == (int16_t)guest_base) { - tcg_out_opc_imm(s, ALIAS_PADDI, TCG_REG_A0, base, guest_base); - } else { - tcg_out_opc_reg(s, ALIAS_PADD, TCG_REG_A0, base, - TCG_GUEST_BASE_REG); + /* Load and test the high half tlb comparator. */ + if (TCG_TARGET_REG_BITS == 32 && addr_type != TCG_TYPE_I32) { + /* delay slot */ + tcg_out_ldst(s, OPC_LW, TCG_TMP0, TCG_TMP3, cmp_off + HI_OFF); + + /* Load the tlb addend for the fast path. */ + tcg_out_ld(s, TCG_TYPE_PTR, TCG_TMP3, TCG_TMP3, add_off); + + ldst->label_ptr[1] = s->code_ptr; + tcg_out_opc_br(s, OPC_BNE, addrhi, TCG_TMP0); + } + + /* delay slot */ + base = TCG_TMP3; + tcg_out_opc_reg(s, ALIAS_PADD, base, TCG_TMP3, addrlo); + } else { + if (a_mask && (use_mips32r6_instructions || a_bits != s_bits)) { + ldst = new_ldst_label(s); + + ldst->is_ld = is_ld; + ldst->oi = oi; + ldst->addrlo_reg = addrlo; + ldst->addrhi_reg = addrhi; + + /* We are expecting a_bits to max out at 7, much lower than ANDI. */ + tcg_debug_assert(a_bits < 16); + tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP0, addrlo, a_mask); + + ldst->label_ptr[0] = s->code_ptr; + if (use_mips32r6_instructions) { + tcg_out_opc_br(s, OPC_BNEZALC_R6, TCG_REG_ZERO, TCG_TMP0); + } else { + tcg_out_opc_br(s, OPC_BNEL, TCG_TMP0, TCG_REG_ZERO); + tcg_out_nop(s); + } + } + + base = addrlo; + if (TCG_TARGET_REG_BITS == 64 && addr_type == TCG_TYPE_I32) { + tcg_out_ext32u(s, TCG_REG_A0, base); + base = TCG_REG_A0; + } + if (guest_base) { + if (guest_base == (int16_t)guest_base) { + tcg_out_opc_imm(s, ALIAS_PADDI, TCG_REG_A0, base, guest_base); + } else { + tcg_out_opc_reg(s, ALIAS_PADD, TCG_REG_A0, base, + TCG_GUEST_BASE_REG); + } + base = TCG_REG_A0; } - base = TCG_REG_A0; } -#endif h->base = base; return ldst; @@ -2465,8 +2462,7 @@ static void tcg_target_qemu_prologue(TCGContext *s) TCG_REG_SP, SAVE_OFS + i * REG_SIZE); } -#ifndef CONFIG_SOFTMMU - if (guest_base != (int16_t)guest_base) { + if (!tcg_use_softmmu && guest_base != (int16_t)guest_base) { /* * The function call abi for n32 and n64 will have loaded $25 (t9) * with the address of the prologue, so we can use that instead @@ -2479,7 +2475,6 @@ static void tcg_target_qemu_prologue(TCGContext *s) TCG_TARGET_REG_BITS == 64 ? 
TCG_REG_T9 : 0); tcg_regset_set_reg(s->reserved_regs, TCG_GUEST_BASE_REG); } -#endif if (TCG_TARGET_REG_BITS == 64) { tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_TB, tcg_target_call_iarg_regs[1]); From 5b5bd4a9b1ddd724b5f12725fa186e75047dbfe1 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Sun, 1 Oct 2023 19:09:52 +0000 Subject: [PATCH 21/38] tcg/ppc: Use tcg_use_softmmu MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix TCG_GUEST_BASE_REG to use 'TCG_REG_R30' instead of '30'. Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Richard Henderson --- tcg/ppc/tcg-target.c.inc | 284 ++++++++++++++++++++------------------- 1 file changed, 143 insertions(+), 141 deletions(-) diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc index c31da4da9d..856c3b18f5 100644 --- a/tcg/ppc/tcg-target.c.inc +++ b/tcg/ppc/tcg-target.c.inc @@ -107,9 +107,7 @@ #define have_isel (cpuinfo & CPUINFO_ISEL) -#ifndef CONFIG_SOFTMMU -#define TCG_GUEST_BASE_REG 30 -#endif +#define TCG_GUEST_BASE_REG TCG_REG_R30 #ifdef CONFIG_DEBUG_TCG static const char tcg_target_reg_names[TCG_TARGET_NB_REGS][4] = { @@ -2317,151 +2315,157 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, s_bits == MO_128); a_bits = h->aa.align; -#ifdef CONFIG_SOFTMMU - int mem_index = get_mmuidx(oi); - int cmp_off = is_ld ? offsetof(CPUTLBEntry, addr_read) - : offsetof(CPUTLBEntry, addr_write); - int fast_off = tlb_mask_table_ofs(s, mem_index); - int mask_off = fast_off + offsetof(CPUTLBDescFast, mask); - int table_off = fast_off + offsetof(CPUTLBDescFast, table); + if (tcg_use_softmmu) { + int mem_index = get_mmuidx(oi); + int cmp_off = is_ld ? offsetof(CPUTLBEntry, addr_read) + : offsetof(CPUTLBEntry, addr_write); + int fast_off = tlb_mask_table_ofs(s, mem_index); + int mask_off = fast_off + offsetof(CPUTLBDescFast, mask); + int table_off = fast_off + offsetof(CPUTLBDescFast, table); - ldst = new_ldst_label(s); - ldst->is_ld = is_ld; - ldst->oi = oi; - ldst->addrlo_reg = addrlo; - ldst->addrhi_reg = addrhi; - - /* Load tlb_mask[mmu_idx] and tlb_table[mmu_idx]. */ - tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_AREG0, mask_off); - tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP2, TCG_AREG0, table_off); - - /* Extract the page index, shifted into place for tlb index. */ - if (TCG_TARGET_REG_BITS == 32) { - tcg_out_shri32(s, TCG_REG_R0, addrlo, - s->page_bits - CPU_TLB_ENTRY_BITS); - } else { - tcg_out_shri64(s, TCG_REG_R0, addrlo, - s->page_bits - CPU_TLB_ENTRY_BITS); - } - tcg_out32(s, AND | SAB(TCG_REG_TMP1, TCG_REG_TMP1, TCG_REG_R0)); - - /* - * Load the (low part) TLB comparator into TMP2. - * For 64-bit host, always load the entire 64-bit slot for simplicity. - * We will ignore the high bits with tcg_out_cmp(..., addr_type). - */ - if (TCG_TARGET_REG_BITS == 64) { - if (cmp_off == 0) { - tcg_out32(s, LDUX | TAB(TCG_REG_TMP2, TCG_REG_TMP1, TCG_REG_TMP2)); - } else { - tcg_out32(s, ADD | TAB(TCG_REG_TMP1, TCG_REG_TMP1, TCG_REG_TMP2)); - tcg_out_ld(s, TCG_TYPE_I64, TCG_REG_TMP2, TCG_REG_TMP1, cmp_off); - } - } else if (cmp_off == 0 && !HOST_BIG_ENDIAN) { - tcg_out32(s, LWZUX | TAB(TCG_REG_TMP2, TCG_REG_TMP1, TCG_REG_TMP2)); - } else { - tcg_out32(s, ADD | TAB(TCG_REG_TMP1, TCG_REG_TMP1, TCG_REG_TMP2)); - tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_TMP2, TCG_REG_TMP1, - cmp_off + 4 * HOST_BIG_ENDIAN); - } - - /* - * Load the TLB addend for use on the fast path. - * Do this asap to minimize any load use delay. 
- */ - if (TCG_TARGET_REG_BITS == 64 || addr_type == TCG_TYPE_I32) { - tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, - offsetof(CPUTLBEntry, addend)); - } - - /* Clear the non-page, non-alignment bits from the address in R0. */ - if (TCG_TARGET_REG_BITS == 32) { - /* - * We don't support unaligned accesses on 32-bits. - * Preserve the bottom bits and thus trigger a comparison - * failure on unaligned accesses. - */ - if (a_bits < s_bits) { - a_bits = s_bits; - } - tcg_out_rlw(s, RLWINM, TCG_REG_R0, addrlo, 0, - (32 - a_bits) & 31, 31 - s->page_bits); - } else { - TCGReg t = addrlo; - - /* - * If the access is unaligned, we need to make sure we fail if we - * cross a page boundary. The trick is to add the access size-1 - * to the address before masking the low bits. That will make the - * address overflow to the next page if we cross a page boundary, - * which will then force a mismatch of the TLB compare. - */ - if (a_bits < s_bits) { - unsigned a_mask = (1 << a_bits) - 1; - unsigned s_mask = (1 << s_bits) - 1; - tcg_out32(s, ADDI | TAI(TCG_REG_R0, t, s_mask - a_mask)); - t = TCG_REG_R0; - } - - /* Mask the address for the requested alignment. */ - if (addr_type == TCG_TYPE_I32) { - tcg_out_rlw(s, RLWINM, TCG_REG_R0, t, 0, - (32 - a_bits) & 31, 31 - s->page_bits); - } else if (a_bits == 0) { - tcg_out_rld(s, RLDICR, TCG_REG_R0, t, 0, 63 - s->page_bits); - } else { - tcg_out_rld(s, RLDICL, TCG_REG_R0, t, - 64 - s->page_bits, s->page_bits - a_bits); - tcg_out_rld(s, RLDICL, TCG_REG_R0, TCG_REG_R0, s->page_bits, 0); - } - } - - if (TCG_TARGET_REG_BITS == 32 && addr_type != TCG_TYPE_I32) { - /* Low part comparison into cr7. */ - tcg_out_cmp(s, TCG_COND_EQ, TCG_REG_R0, TCG_REG_TMP2, - 0, 7, TCG_TYPE_I32); - - /* Load the high part TLB comparator into TMP2. */ - tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_TMP2, TCG_REG_TMP1, - cmp_off + 4 * !HOST_BIG_ENDIAN); - - /* Load addend, deferred for this case. */ - tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, - offsetof(CPUTLBEntry, addend)); - - /* High part comparison into cr6. */ - tcg_out_cmp(s, TCG_COND_EQ, addrhi, TCG_REG_TMP2, 0, 6, TCG_TYPE_I32); - - /* Combine comparisons into cr7. */ - tcg_out32(s, CRAND | BT(7, CR_EQ) | BA(6, CR_EQ) | BB(7, CR_EQ)); - } else { - /* Full comparison into cr7. */ - tcg_out_cmp(s, TCG_COND_EQ, TCG_REG_R0, TCG_REG_TMP2, 0, 7, addr_type); - } - - /* Load a pointer into the current opcode w/conditional branch-link. */ - ldst->label_ptr[0] = s->code_ptr; - tcg_out32(s, BC | BI(7, CR_EQ) | BO_COND_FALSE | LK); - - h->base = TCG_REG_TMP1; -#else - if (a_bits) { ldst = new_ldst_label(s); ldst->is_ld = is_ld; ldst->oi = oi; ldst->addrlo_reg = addrlo; ldst->addrhi_reg = addrhi; - /* We are expecting a_bits to max out at 7, much lower than ANDI. */ - tcg_debug_assert(a_bits < 16); - tcg_out32(s, ANDI | SAI(addrlo, TCG_REG_R0, (1 << a_bits) - 1)); + /* Load tlb_mask[mmu_idx] and tlb_table[mmu_idx]. */ + tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_AREG0, mask_off); + tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP2, TCG_AREG0, table_off); + /* Extract the page index, shifted into place for tlb index. */ + if (TCG_TARGET_REG_BITS == 32) { + tcg_out_shri32(s, TCG_REG_R0, addrlo, + s->page_bits - CPU_TLB_ENTRY_BITS); + } else { + tcg_out_shri64(s, TCG_REG_R0, addrlo, + s->page_bits - CPU_TLB_ENTRY_BITS); + } + tcg_out32(s, AND | SAB(TCG_REG_TMP1, TCG_REG_TMP1, TCG_REG_R0)); + + /* + * Load the (low part) TLB comparator into TMP2. + * For 64-bit host, always load the entire 64-bit slot for simplicity. 
+ * We will ignore the high bits with tcg_out_cmp(..., addr_type). + */ + if (TCG_TARGET_REG_BITS == 64) { + if (cmp_off == 0) { + tcg_out32(s, LDUX | TAB(TCG_REG_TMP2, + TCG_REG_TMP1, TCG_REG_TMP2)); + } else { + tcg_out32(s, ADD | TAB(TCG_REG_TMP1, + TCG_REG_TMP1, TCG_REG_TMP2)); + tcg_out_ld(s, TCG_TYPE_I64, TCG_REG_TMP2, + TCG_REG_TMP1, cmp_off); + } + } else if (cmp_off == 0 && !HOST_BIG_ENDIAN) { + tcg_out32(s, LWZUX | TAB(TCG_REG_TMP2, + TCG_REG_TMP1, TCG_REG_TMP2)); + } else { + tcg_out32(s, ADD | TAB(TCG_REG_TMP1, TCG_REG_TMP1, TCG_REG_TMP2)); + tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_TMP2, TCG_REG_TMP1, + cmp_off + 4 * HOST_BIG_ENDIAN); + } + + /* + * Load the TLB addend for use on the fast path. + * Do this asap to minimize any load use delay. + */ + if (TCG_TARGET_REG_BITS == 64 || addr_type == TCG_TYPE_I32) { + tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, + offsetof(CPUTLBEntry, addend)); + } + + /* Clear the non-page, non-alignment bits from the address in R0. */ + if (TCG_TARGET_REG_BITS == 32) { + /* + * We don't support unaligned accesses on 32-bits. + * Preserve the bottom bits and thus trigger a comparison + * failure on unaligned accesses. + */ + if (a_bits < s_bits) { + a_bits = s_bits; + } + tcg_out_rlw(s, RLWINM, TCG_REG_R0, addrlo, 0, + (32 - a_bits) & 31, 31 - s->page_bits); + } else { + TCGReg t = addrlo; + + /* + * If the access is unaligned, we need to make sure we fail if we + * cross a page boundary. The trick is to add the access size-1 + * to the address before masking the low bits. That will make the + * address overflow to the next page if we cross a page boundary, + * which will then force a mismatch of the TLB compare. + */ + if (a_bits < s_bits) { + unsigned a_mask = (1 << a_bits) - 1; + unsigned s_mask = (1 << s_bits) - 1; + tcg_out32(s, ADDI | TAI(TCG_REG_R0, t, s_mask - a_mask)); + t = TCG_REG_R0; + } + + /* Mask the address for the requested alignment. */ + if (addr_type == TCG_TYPE_I32) { + tcg_out_rlw(s, RLWINM, TCG_REG_R0, t, 0, + (32 - a_bits) & 31, 31 - s->page_bits); + } else if (a_bits == 0) { + tcg_out_rld(s, RLDICR, TCG_REG_R0, t, 0, 63 - s->page_bits); + } else { + tcg_out_rld(s, RLDICL, TCG_REG_R0, t, + 64 - s->page_bits, s->page_bits - a_bits); + tcg_out_rld(s, RLDICL, TCG_REG_R0, TCG_REG_R0, s->page_bits, 0); + } + } + + if (TCG_TARGET_REG_BITS == 32 && addr_type != TCG_TYPE_I32) { + /* Low part comparison into cr7. */ + tcg_out_cmp(s, TCG_COND_EQ, TCG_REG_R0, TCG_REG_TMP2, + 0, 7, TCG_TYPE_I32); + + /* Load the high part TLB comparator into TMP2. */ + tcg_out_ld(s, TCG_TYPE_I32, TCG_REG_TMP2, TCG_REG_TMP1, + cmp_off + 4 * !HOST_BIG_ENDIAN); + + /* Load addend, deferred for this case. */ + tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1, + offsetof(CPUTLBEntry, addend)); + + /* High part comparison into cr6. */ + tcg_out_cmp(s, TCG_COND_EQ, addrhi, TCG_REG_TMP2, + 0, 6, TCG_TYPE_I32); + + /* Combine comparisons into cr7. */ + tcg_out32(s, CRAND | BT(7, CR_EQ) | BA(6, CR_EQ) | BB(7, CR_EQ)); + } else { + /* Full comparison into cr7. */ + tcg_out_cmp(s, TCG_COND_EQ, TCG_REG_R0, TCG_REG_TMP2, + 0, 7, addr_type); + } + + /* Load a pointer into the current opcode w/conditional branch-link. */ ldst->label_ptr[0] = s->code_ptr; - tcg_out32(s, BC | BI(0, CR_EQ) | BO_COND_FALSE | LK); - } + tcg_out32(s, BC | BI(7, CR_EQ) | BO_COND_FALSE | LK); - h->base = guest_base ? 
TCG_GUEST_BASE_REG : 0; -#endif + h->base = TCG_REG_TMP1; + } else { + if (a_bits) { + ldst = new_ldst_label(s); + ldst->is_ld = is_ld; + ldst->oi = oi; + ldst->addrlo_reg = addrlo; + ldst->addrhi_reg = addrhi; + + /* We are expecting a_bits to max out at 7, much lower than ANDI. */ + tcg_debug_assert(a_bits < 16); + tcg_out32(s, ANDI | SAI(addrlo, TCG_REG_R0, (1 << a_bits) - 1)); + + ldst->label_ptr[0] = s->code_ptr; + tcg_out32(s, BC | BI(0, CR_EQ) | BO_COND_FALSE | LK); + } + + h->base = guest_base ? TCG_GUEST_BASE_REG : 0; + } if (TCG_TARGET_REG_BITS == 64 && addr_type == TCG_TYPE_I32) { /* Zero-extend the guest address for use in the host address. */ @@ -2695,12 +2699,10 @@ static void tcg_target_qemu_prologue(TCGContext *s) } tcg_out_st(s, TCG_TYPE_PTR, TCG_REG_R0, TCG_REG_R1, FRAME_SIZE+LR_OFFSET); -#ifndef CONFIG_SOFTMMU - if (guest_base) { + if (!tcg_use_softmmu && guest_base) { tcg_out_movi_int(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, guest_base, true); tcg_regset_set_reg(s->reserved_regs, TCG_GUEST_BASE_REG); } -#endif tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]); tcg_out32(s, MTSPR | RS(tcg_target_call_iarg_regs[1]) | CTR); From cf0ed30eb10c7341fd1f253446e3bbb6e0114c30 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Thu, 12 Oct 2023 20:45:36 -0700 Subject: [PATCH 22/38] tcg/riscv: Do not reserve TCG_GUEST_BASE_REG for guest_base zero Fixes: 92c041c59b ("tcg/riscv: Add the prologue generation and register the JIT") Signed-off-by: Richard Henderson --- tcg/riscv/tcg-target.c.inc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc index d6dbcaf3cb..dc71f829d1 100644 --- a/tcg/riscv/tcg-target.c.inc +++ b/tcg/riscv/tcg-target.c.inc @@ -2076,8 +2076,10 @@ static void tcg_target_qemu_prologue(TCGContext *s) } #if !defined(CONFIG_SOFTMMU) - tcg_out_movi(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, guest_base); - tcg_regset_set_reg(s->reserved_regs, TCG_GUEST_BASE_REG); + if (guest_base) { + tcg_out_movi(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, guest_base); + tcg_regset_set_reg(s->reserved_regs, TCG_GUEST_BASE_REG); + } #endif /* Call generated code */ From 4944d359107fa3e88200697daef28e3b5878daa6 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Sun, 1 Oct 2023 12:35:44 -0700 Subject: [PATCH 23/38] tcg/riscv: Use tcg_use_softmmu Signed-off-by: Richard Henderson --- tcg/riscv/tcg-target.c.inc | 185 +++++++++++++++++++------------------ 1 file changed, 94 insertions(+), 91 deletions(-) diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc index dc71f829d1..34e10e77d9 100644 --- a/tcg/riscv/tcg-target.c.inc +++ b/tcg/riscv/tcg-target.c.inc @@ -1245,105 +1245,110 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, TCGReg *pbase, aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, false); a_mask = (1u << aa.align) - 1; -#ifdef CONFIG_SOFTMMU - unsigned s_bits = opc & MO_SIZE; - unsigned s_mask = (1u << s_bits) - 1; - int mem_index = get_mmuidx(oi); - int fast_ofs = tlb_mask_table_ofs(s, mem_index); - int mask_ofs = fast_ofs + offsetof(CPUTLBDescFast, mask); - int table_ofs = fast_ofs + offsetof(CPUTLBDescFast, table); - int compare_mask; - TCGReg addr_adj; + if (tcg_use_softmmu) { + unsigned s_bits = opc & MO_SIZE; + unsigned s_mask = (1u << s_bits) - 1; + int mem_index = get_mmuidx(oi); + int fast_ofs = tlb_mask_table_ofs(s, mem_index); + int mask_ofs = fast_ofs + offsetof(CPUTLBDescFast, mask); + int table_ofs = fast_ofs + offsetof(CPUTLBDescFast, table); + int 
compare_mask; + TCGReg addr_adj; - ldst = new_ldst_label(s); - ldst->is_ld = is_ld; - ldst->oi = oi; - ldst->addrlo_reg = addr_reg; - - tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_AREG0, mask_ofs); - tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_AREG0, table_ofs); - - tcg_out_opc_imm(s, OPC_SRLI, TCG_REG_TMP2, addr_reg, - s->page_bits - CPU_TLB_ENTRY_BITS); - tcg_out_opc_reg(s, OPC_AND, TCG_REG_TMP2, TCG_REG_TMP2, TCG_REG_TMP0); - tcg_out_opc_reg(s, OPC_ADD, TCG_REG_TMP2, TCG_REG_TMP2, TCG_REG_TMP1); - - /* - * For aligned accesses, we check the first byte and include the alignment - * bits within the address. For unaligned access, we check that we don't - * cross pages using the address of the last byte of the access. - */ - addr_adj = addr_reg; - if (a_mask < s_mask) { - addr_adj = TCG_REG_TMP0; - tcg_out_opc_imm(s, addr_type == TCG_TYPE_I32 ? OPC_ADDIW : OPC_ADDI, - addr_adj, addr_reg, s_mask - a_mask); - } - compare_mask = s->page_mask | a_mask; - if (compare_mask == sextreg(compare_mask, 0, 12)) { - tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_TMP1, addr_adj, compare_mask); - } else { - tcg_out_movi(s, addr_type, TCG_REG_TMP1, compare_mask); - tcg_out_opc_reg(s, OPC_AND, TCG_REG_TMP1, TCG_REG_TMP1, addr_adj); - } - - /* Load the tlb comparator and the addend. */ - QEMU_BUILD_BUG_ON(HOST_BIG_ENDIAN); - tcg_out_ld(s, addr_type, TCG_REG_TMP0, TCG_REG_TMP2, - is_ld ? offsetof(CPUTLBEntry, addr_read) - : offsetof(CPUTLBEntry, addr_write)); - tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP2, TCG_REG_TMP2, - offsetof(CPUTLBEntry, addend)); - - /* Compare masked address with the TLB entry. */ - ldst->label_ptr[0] = s->code_ptr; - tcg_out_opc_branch(s, OPC_BNE, TCG_REG_TMP0, TCG_REG_TMP1, 0); - - /* TLB Hit - translate address using addend. */ - if (addr_type != TCG_TYPE_I32) { - tcg_out_opc_reg(s, OPC_ADD, TCG_REG_TMP0, addr_reg, TCG_REG_TMP2); - } else if (have_zba) { - tcg_out_opc_reg(s, OPC_ADD_UW, TCG_REG_TMP0, addr_reg, TCG_REG_TMP2); - } else { - tcg_out_ext32u(s, TCG_REG_TMP0, addr_reg); - tcg_out_opc_reg(s, OPC_ADD, TCG_REG_TMP0, TCG_REG_TMP0, TCG_REG_TMP2); - } - *pbase = TCG_REG_TMP0; -#else - TCGReg base; - - if (a_mask) { ldst = new_ldst_label(s); ldst->is_ld = is_ld; ldst->oi = oi; ldst->addrlo_reg = addr_reg; - /* We are expecting alignment max 7, so we can always use andi. */ - tcg_debug_assert(a_mask == sextreg(a_mask, 0, 12)); - tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_TMP1, addr_reg, a_mask); + tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP0, TCG_AREG0, mask_ofs); + tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_AREG0, table_ofs); - ldst->label_ptr[0] = s->code_ptr; - tcg_out_opc_branch(s, OPC_BNE, TCG_REG_TMP1, TCG_REG_ZERO, 0); - } + tcg_out_opc_imm(s, OPC_SRLI, TCG_REG_TMP2, addr_reg, + s->page_bits - CPU_TLB_ENTRY_BITS); + tcg_out_opc_reg(s, OPC_AND, TCG_REG_TMP2, TCG_REG_TMP2, TCG_REG_TMP0); + tcg_out_opc_reg(s, OPC_ADD, TCG_REG_TMP2, TCG_REG_TMP2, TCG_REG_TMP1); - if (guest_base != 0) { - base = TCG_REG_TMP0; - if (addr_type != TCG_TYPE_I32) { - tcg_out_opc_reg(s, OPC_ADD, base, addr_reg, TCG_GUEST_BASE_REG); - } else if (have_zba) { - tcg_out_opc_reg(s, OPC_ADD_UW, base, addr_reg, TCG_GUEST_BASE_REG); - } else { - tcg_out_ext32u(s, base, addr_reg); - tcg_out_opc_reg(s, OPC_ADD, base, base, TCG_GUEST_BASE_REG); + /* + * For aligned accesses, we check the first byte and include the + * alignment bits within the address. For unaligned access, we + * check that we don't cross pages using the address of the last + * byte of the access. 
+ */ + addr_adj = addr_reg; + if (a_mask < s_mask) { + addr_adj = TCG_REG_TMP0; + tcg_out_opc_imm(s, addr_type == TCG_TYPE_I32 ? OPC_ADDIW : OPC_ADDI, + addr_adj, addr_reg, s_mask - a_mask); } - } else if (addr_type != TCG_TYPE_I32) { - base = addr_reg; + compare_mask = s->page_mask | a_mask; + if (compare_mask == sextreg(compare_mask, 0, 12)) { + tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_TMP1, addr_adj, compare_mask); + } else { + tcg_out_movi(s, addr_type, TCG_REG_TMP1, compare_mask); + tcg_out_opc_reg(s, OPC_AND, TCG_REG_TMP1, TCG_REG_TMP1, addr_adj); + } + + /* Load the tlb comparator and the addend. */ + QEMU_BUILD_BUG_ON(HOST_BIG_ENDIAN); + tcg_out_ld(s, addr_type, TCG_REG_TMP0, TCG_REG_TMP2, + is_ld ? offsetof(CPUTLBEntry, addr_read) + : offsetof(CPUTLBEntry, addr_write)); + tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP2, TCG_REG_TMP2, + offsetof(CPUTLBEntry, addend)); + + /* Compare masked address with the TLB entry. */ + ldst->label_ptr[0] = s->code_ptr; + tcg_out_opc_branch(s, OPC_BNE, TCG_REG_TMP0, TCG_REG_TMP1, 0); + + /* TLB Hit - translate address using addend. */ + if (addr_type != TCG_TYPE_I32) { + tcg_out_opc_reg(s, OPC_ADD, TCG_REG_TMP0, addr_reg, TCG_REG_TMP2); + } else if (have_zba) { + tcg_out_opc_reg(s, OPC_ADD_UW, TCG_REG_TMP0, + addr_reg, TCG_REG_TMP2); + } else { + tcg_out_ext32u(s, TCG_REG_TMP0, addr_reg); + tcg_out_opc_reg(s, OPC_ADD, TCG_REG_TMP0, + TCG_REG_TMP0, TCG_REG_TMP2); + } + *pbase = TCG_REG_TMP0; } else { - base = TCG_REG_TMP0; - tcg_out_ext32u(s, base, addr_reg); + TCGReg base; + + if (a_mask) { + ldst = new_ldst_label(s); + ldst->is_ld = is_ld; + ldst->oi = oi; + ldst->addrlo_reg = addr_reg; + + /* We are expecting alignment max 7, so we can always use andi. */ + tcg_debug_assert(a_mask == sextreg(a_mask, 0, 12)); + tcg_out_opc_imm(s, OPC_ANDI, TCG_REG_TMP1, addr_reg, a_mask); + + ldst->label_ptr[0] = s->code_ptr; + tcg_out_opc_branch(s, OPC_BNE, TCG_REG_TMP1, TCG_REG_ZERO, 0); + } + + if (guest_base != 0) { + base = TCG_REG_TMP0; + if (addr_type != TCG_TYPE_I32) { + tcg_out_opc_reg(s, OPC_ADD, base, addr_reg, + TCG_GUEST_BASE_REG); + } else if (have_zba) { + tcg_out_opc_reg(s, OPC_ADD_UW, base, addr_reg, + TCG_GUEST_BASE_REG); + } else { + tcg_out_ext32u(s, base, addr_reg); + tcg_out_opc_reg(s, OPC_ADD, base, base, TCG_GUEST_BASE_REG); + } + } else if (addr_type != TCG_TYPE_I32) { + base = addr_reg; + } else { + base = TCG_REG_TMP0; + tcg_out_ext32u(s, base, addr_reg); + } + *pbase = base; } - *pbase = base; -#endif return ldst; } @@ -2075,12 +2080,10 @@ static void tcg_target_qemu_prologue(TCGContext *s) TCG_REG_SP, SAVE_OFS + i * REG_SIZE); } -#if !defined(CONFIG_SOFTMMU) - if (guest_base) { + if (!tcg_use_softmmu && guest_base) { tcg_out_movi(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, guest_base); tcg_regset_set_reg(s->reserved_regs, TCG_GUEST_BASE_REG); } -#endif /* Call generated code */ tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]); From 2e486b5901986134863487352df961331a2b56c2 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Sun, 1 Oct 2023 21:16:32 +0000 Subject: [PATCH 24/38] tcg/s390x: Use tcg_use_softmmu MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Richard Henderson --- tcg/s390x/tcg-target.c.inc | 161 ++++++++++++++++++------------------- 1 file changed, 79 insertions(+), 82 deletions(-) diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc index 4ef9ac3d5b..fbee43d3b0 100644 --- a/tcg/s390x/tcg-target.c.inc +++ 
b/tcg/s390x/tcg-target.c.inc @@ -46,9 +46,7 @@ /* A scratch register that may be be used throughout the backend. */ #define TCG_TMP0 TCG_REG_R1 -#ifndef CONFIG_SOFTMMU #define TCG_GUEST_BASE_REG TCG_REG_R13 -#endif /* All of the following instructions are prefixed with their instruction format, and are defined as 8- or 16-bit quantities, even when the two @@ -1768,94 +1766,95 @@ static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h, h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128); a_mask = (1 << h->aa.align) - 1; -#ifdef CONFIG_SOFTMMU - unsigned s_mask = (1 << s_bits) - 1; - int mem_index = get_mmuidx(oi); - int fast_off = tlb_mask_table_ofs(s, mem_index); - int mask_off = fast_off + offsetof(CPUTLBDescFast, mask); - int table_off = fast_off + offsetof(CPUTLBDescFast, table); - int ofs, a_off; - uint64_t tlb_mask; + if (tcg_use_softmmu) { + unsigned s_mask = (1 << s_bits) - 1; + int mem_index = get_mmuidx(oi); + int fast_off = tlb_mask_table_ofs(s, mem_index); + int mask_off = fast_off + offsetof(CPUTLBDescFast, mask); + int table_off = fast_off + offsetof(CPUTLBDescFast, table); + int ofs, a_off; + uint64_t tlb_mask; - ldst = new_ldst_label(s); - ldst->is_ld = is_ld; - ldst->oi = oi; - ldst->addrlo_reg = addr_reg; - - tcg_out_sh64(s, RSY_SRLG, TCG_TMP0, addr_reg, TCG_REG_NONE, - s->page_bits - CPU_TLB_ENTRY_BITS); - - tcg_out_insn(s, RXY, NG, TCG_TMP0, TCG_AREG0, TCG_REG_NONE, mask_off); - tcg_out_insn(s, RXY, AG, TCG_TMP0, TCG_AREG0, TCG_REG_NONE, table_off); - - /* - * For aligned accesses, we check the first byte and include the alignment - * bits within the address. For unaligned access, we check that we don't - * cross pages using the address of the last byte of the access. - */ - a_off = (a_mask >= s_mask ? 0 : s_mask - a_mask); - tlb_mask = (uint64_t)s->page_mask | a_mask; - if (a_off == 0) { - tgen_andi_risbg(s, TCG_REG_R0, addr_reg, tlb_mask); - } else { - tcg_out_insn(s, RX, LA, TCG_REG_R0, addr_reg, TCG_REG_NONE, a_off); - tgen_andi(s, addr_type, TCG_REG_R0, tlb_mask); - } - - if (is_ld) { - ofs = offsetof(CPUTLBEntry, addr_read); - } else { - ofs = offsetof(CPUTLBEntry, addr_write); - } - if (addr_type == TCG_TYPE_I32) { - ofs += HOST_BIG_ENDIAN * 4; - tcg_out_insn(s, RX, C, TCG_REG_R0, TCG_TMP0, TCG_REG_NONE, ofs); - } else { - tcg_out_insn(s, RXY, CG, TCG_REG_R0, TCG_TMP0, TCG_REG_NONE, ofs); - } - - tcg_out16(s, RI_BRC | (S390_CC_NE << 4)); - ldst->label_ptr[0] = s->code_ptr++; - - h->index = TCG_TMP0; - tcg_out_insn(s, RXY, LG, h->index, TCG_TMP0, TCG_REG_NONE, - offsetof(CPUTLBEntry, addend)); - - if (addr_type == TCG_TYPE_I32) { - tcg_out_insn(s, RRE, ALGFR, h->index, addr_reg); - h->base = TCG_REG_NONE; - } else { - h->base = addr_reg; - } - h->disp = 0; -#else - if (a_mask) { ldst = new_ldst_label(s); ldst->is_ld = is_ld; ldst->oi = oi; ldst->addrlo_reg = addr_reg; - /* We are expecting a_bits to max out at 7, much lower than TMLL. */ - tcg_debug_assert(a_mask <= 0xffff); - tcg_out_insn(s, RI, TMLL, addr_reg, a_mask); + tcg_out_sh64(s, RSY_SRLG, TCG_TMP0, addr_reg, TCG_REG_NONE, + s->page_bits - CPU_TLB_ENTRY_BITS); - tcg_out16(s, RI_BRC | (7 << 4)); /* CC in {1,2,3} */ + tcg_out_insn(s, RXY, NG, TCG_TMP0, TCG_AREG0, TCG_REG_NONE, mask_off); + tcg_out_insn(s, RXY, AG, TCG_TMP0, TCG_AREG0, TCG_REG_NONE, table_off); + + /* + * For aligned accesses, we check the first byte and include the + * alignment bits within the address. 
For unaligned access, we + * check that we don't cross pages using the address of the last + * byte of the access. + */ + a_off = (a_mask >= s_mask ? 0 : s_mask - a_mask); + tlb_mask = (uint64_t)s->page_mask | a_mask; + if (a_off == 0) { + tgen_andi_risbg(s, TCG_REG_R0, addr_reg, tlb_mask); + } else { + tcg_out_insn(s, RX, LA, TCG_REG_R0, addr_reg, TCG_REG_NONE, a_off); + tgen_andi(s, addr_type, TCG_REG_R0, tlb_mask); + } + + if (is_ld) { + ofs = offsetof(CPUTLBEntry, addr_read); + } else { + ofs = offsetof(CPUTLBEntry, addr_write); + } + if (addr_type == TCG_TYPE_I32) { + ofs += HOST_BIG_ENDIAN * 4; + tcg_out_insn(s, RX, C, TCG_REG_R0, TCG_TMP0, TCG_REG_NONE, ofs); + } else { + tcg_out_insn(s, RXY, CG, TCG_REG_R0, TCG_TMP0, TCG_REG_NONE, ofs); + } + + tcg_out16(s, RI_BRC | (S390_CC_NE << 4)); ldst->label_ptr[0] = s->code_ptr++; - } - h->base = addr_reg; - if (addr_type == TCG_TYPE_I32) { - tcg_out_ext32u(s, TCG_TMP0, addr_reg); - h->base = TCG_TMP0; - } - if (guest_base < 0x80000) { - h->index = TCG_REG_NONE; - h->disp = guest_base; - } else { - h->index = TCG_GUEST_BASE_REG; + h->index = TCG_TMP0; + tcg_out_insn(s, RXY, LG, h->index, TCG_TMP0, TCG_REG_NONE, + offsetof(CPUTLBEntry, addend)); + + if (addr_type == TCG_TYPE_I32) { + tcg_out_insn(s, RRE, ALGFR, h->index, addr_reg); + h->base = TCG_REG_NONE; + } else { + h->base = addr_reg; + } h->disp = 0; + } else { + if (a_mask) { + ldst = new_ldst_label(s); + ldst->is_ld = is_ld; + ldst->oi = oi; + ldst->addrlo_reg = addr_reg; + + /* We are expecting a_bits to max out at 7, much lower than TMLL. */ + tcg_debug_assert(a_mask <= 0xffff); + tcg_out_insn(s, RI, TMLL, addr_reg, a_mask); + + tcg_out16(s, RI_BRC | (7 << 4)); /* CC in {1,2,3} */ + ldst->label_ptr[0] = s->code_ptr++; + } + + h->base = addr_reg; + if (addr_type == TCG_TYPE_I32) { + tcg_out_ext32u(s, TCG_TMP0, addr_reg); + h->base = TCG_TMP0; + } + if (guest_base < 0x80000) { + h->index = TCG_REG_NONE; + h->disp = guest_base; + } else { + h->index = TCG_GUEST_BASE_REG; + h->disp = 0; + } } -#endif return ldst; } @@ -3453,12 +3452,10 @@ static void tcg_target_qemu_prologue(TCGContext *s) TCG_STATIC_CALL_ARGS_SIZE + TCG_TARGET_CALL_STACK_OFFSET, CPU_TEMP_BUF_NLONGS * sizeof(long)); -#ifndef CONFIG_SOFTMMU - if (guest_base >= 0x80000) { + if (!tcg_use_softmmu && guest_base >= 0x80000) { tcg_out_movi(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, guest_base); tcg_regset_set_reg(s->reserved_regs, TCG_GUEST_BASE_REG); } -#endif tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]); From 40f40fc8f09edd8db620786a16a927e711257ff2 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Sun, 15 Oct 2023 06:45:46 +0545 Subject: [PATCH 25/38] tcg: drop unused tcg_temp_free define MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use of the API was removed a while back, but the define wasn't. 
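For context, and not part of this change: the typed helpers remain in
use inside the TCG core, e.g. tcg/tcg-op.c still pairs

    TCGv_i32 t0 = tcg_temp_ebb_new_i32();
    ...
    tcg_temp_free_i32(t0);

so only the unused target-long alias goes away.
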
Signed-off-by: Mike Frysinger Reviewed-by: Philippe Mathieu-Daudé Message-Id: <20231015010046.16020-1-vapier@gentoo.org> Signed-off-by: Richard Henderson --- include/tcg/tcg-op.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h index 80cfcf8104..3ead59e459 100644 --- a/include/tcg/tcg-op.h +++ b/include/tcg/tcg-op.h @@ -52,7 +52,6 @@ static inline void tcg_gen_insn_start(target_ulong pc, target_ulong a1, typedef TCGv_i32 TCGv; #define tcg_temp_new() tcg_temp_new_i32() #define tcg_global_mem_new tcg_global_mem_new_i32 -#define tcg_temp_free tcg_temp_free_i32 #define tcgv_tl_temp tcgv_i32_temp #define tcg_gen_qemu_ld_tl tcg_gen_qemu_ld_i32 #define tcg_gen_qemu_st_tl tcg_gen_qemu_st_i32 @@ -60,7 +59,6 @@ typedef TCGv_i32 TCGv; typedef TCGv_i64 TCGv; #define tcg_temp_new() tcg_temp_new_i64() #define tcg_global_mem_new tcg_global_mem_new_i64 -#define tcg_temp_free tcg_temp_free_i64 #define tcgv_tl_temp tcgv_i64_temp #define tcg_gen_qemu_ld_tl tcg_gen_qemu_ld_i64 #define tcg_gen_qemu_st_tl tcg_gen_qemu_st_i64 From bfefdbea9ee491842d4e61e54f97508c040913a8 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Mon, 16 Oct 2023 14:45:12 -0700 Subject: [PATCH 26/38] tcg: Use constant zero when expanding with divu2 Signed-off-by: Richard Henderson --- tcg/tcg-op.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c index 393dbcd01c..c29355b67b 100644 --- a/tcg/tcg-op.c +++ b/tcg/tcg-op.c @@ -342,8 +342,8 @@ void tcg_gen_divu_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2) tcg_gen_op3_i32(INDEX_op_divu_i32, ret, arg1, arg2); } else if (TCG_TARGET_HAS_div2_i32) { TCGv_i32 t0 = tcg_temp_ebb_new_i32(); - tcg_gen_movi_i32(t0, 0); - tcg_gen_op5_i32(INDEX_op_divu2_i32, ret, t0, arg1, t0, arg2); + TCGv_i32 zero = tcg_constant_i32(0); + tcg_gen_op5_i32(INDEX_op_divu2_i32, ret, t0, arg1, zero, arg2); tcg_temp_free_i32(t0); } else { gen_helper_divu_i32(ret, arg1, arg2); @@ -362,8 +362,8 @@ void tcg_gen_remu_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2) tcg_temp_free_i32(t0); } else if (TCG_TARGET_HAS_div2_i32) { TCGv_i32 t0 = tcg_temp_ebb_new_i32(); - tcg_gen_movi_i32(t0, 0); - tcg_gen_op5_i32(INDEX_op_divu2_i32, t0, ret, arg1, t0, arg2); + TCGv_i32 zero = tcg_constant_i32(0); + tcg_gen_op5_i32(INDEX_op_divu2_i32, t0, ret, arg1, zero, arg2); tcg_temp_free_i32(t0); } else { gen_helper_remu_i32(ret, arg1, arg2); @@ -1674,8 +1674,8 @@ void tcg_gen_divu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) tcg_gen_op3_i64(INDEX_op_divu_i64, ret, arg1, arg2); } else if (TCG_TARGET_HAS_div2_i64) { TCGv_i64 t0 = tcg_temp_ebb_new_i64(); - tcg_gen_movi_i64(t0, 0); - tcg_gen_op5_i64(INDEX_op_divu2_i64, ret, t0, arg1, t0, arg2); + TCGv_i64 zero = tcg_constant_i64(0); + tcg_gen_op5_i64(INDEX_op_divu2_i64, ret, t0, arg1, zero, arg2); tcg_temp_free_i64(t0); } else { gen_helper_divu_i64(ret, arg1, arg2); @@ -1694,8 +1694,8 @@ void tcg_gen_remu_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) tcg_temp_free_i64(t0); } else if (TCG_TARGET_HAS_div2_i64) { TCGv_i64 t0 = tcg_temp_ebb_new_i64(); - tcg_gen_movi_i64(t0, 0); - tcg_gen_op5_i64(INDEX_op_divu2_i64, t0, ret, arg1, t0, arg2); + TCGv_i64 zero = tcg_constant_i64(0); + tcg_gen_op5_i64(INDEX_op_divu2_i64, t0, ret, arg1, zero, arg2); tcg_temp_free_i64(t0); } else { gen_helper_remu_i64(ret, arg1, arg2); From d97f8f394184403beb655f5f077bef8ef83ffa3a Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Mon, 16 Oct 2023 19:10:42 -0700 Subject: [PATCH 27/38] tcg: Optimize past 
conditional branches We already register allocate through extended basic blocks, optimize through extended basic blocks as well. Signed-off-by: Richard Henderson --- tcg/optimize.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tcg/optimize.c b/tcg/optimize.c index 3013eb04e6..2db5177c32 100644 --- a/tcg/optimize.c +++ b/tcg/optimize.c @@ -688,12 +688,14 @@ static void finish_folding(OptContext *ctx, TCGOp *op) int i, nb_oargs; /* - * For an opcode that ends a BB, reset all temp data. - * We do no cross-BB optimization. + * We only optimize extended basic blocks. If the opcode ends a BB + * and is not a conditional branch, reset all temp data. */ if (def->flags & TCG_OPF_BB_END) { - memset(&ctx->temps_used, 0, sizeof(ctx->temps_used)); ctx->prev_mb = NULL; + if (!(def->flags & TCG_OPF_COND_BRANCH)) { + memset(&ctx->temps_used, 0, sizeof(ctx->temps_used)); + } return; } From a01d9792a737f371a5050c483097e38eab9fa81c Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Thu, 24 Aug 2023 10:25:21 -0700 Subject: [PATCH 28/38] tcg: Add tcg_gen_{ld,st}_i128 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do not require the translators to jump through concat and extract of i64 in order to move values to and from env. Tested-by: Song Gao Reviewed-by: Song Gao Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Richard Henderson --- include/tcg/tcg-op-common.h | 3 +++ tcg/tcg-op.c | 22 ++++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/include/tcg/tcg-op-common.h b/include/tcg/tcg-op-common.h index 2048f92b5e..56d4e9cb9f 100644 --- a/include/tcg/tcg-op-common.h +++ b/include/tcg/tcg-op-common.h @@ -747,6 +747,9 @@ void tcg_gen_mov_i128(TCGv_i128 dst, TCGv_i128 src); void tcg_gen_extr_i128_i64(TCGv_i64 lo, TCGv_i64 hi, TCGv_i128 arg); void tcg_gen_concat_i64_i128(TCGv_i128 ret, TCGv_i64 lo, TCGv_i64 hi); +void tcg_gen_ld_i128(TCGv_i128 ret, TCGv_ptr base, tcg_target_long offset); +void tcg_gen_st_i128(TCGv_i128 val, TCGv_ptr base, tcg_target_long offset); + static inline void tcg_gen_concat32_i64(TCGv_i64 ret, TCGv_i64 lo, TCGv_i64 hi) { tcg_gen_deposit_i64(ret, lo, hi, 32, 32); diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c index c29355b67b..b4dbb2f2ba 100644 --- a/tcg/tcg-op.c +++ b/tcg/tcg-op.c @@ -2880,6 +2880,28 @@ void tcg_gen_mov_i128(TCGv_i128 dst, TCGv_i128 src) } } +void tcg_gen_ld_i128(TCGv_i128 ret, TCGv_ptr base, tcg_target_long offset) +{ + if (HOST_BIG_ENDIAN) { + tcg_gen_ld_i64(TCGV128_HIGH(ret), base, offset); + tcg_gen_ld_i64(TCGV128_LOW(ret), base, offset + 8); + } else { + tcg_gen_ld_i64(TCGV128_LOW(ret), base, offset); + tcg_gen_ld_i64(TCGV128_HIGH(ret), base, offset + 8); + } +} + +void tcg_gen_st_i128(TCGv_i128 val, TCGv_ptr base, tcg_target_long offset) +{ + if (HOST_BIG_ENDIAN) { + tcg_gen_st_i64(TCGV128_HIGH(val), base, offset); + tcg_gen_st_i64(TCGV128_LOW(val), base, offset + 8); + } else { + tcg_gen_st_i64(TCGV128_LOW(val), base, offset); + tcg_gen_st_i64(TCGV128_HIGH(val), base, offset + 8); + } +} + /* QEMU specific operations. 
*/ void tcg_gen_exit_tb(const TranslationBlock *tb, unsigned idx) From 46c684c86260489eef485311aa76bb7aa94cedfc Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Thu, 24 Aug 2023 11:08:44 -0700 Subject: [PATCH 29/38] target/i386: Use i128 for 128 and 256-bit loads and stores Signed-off-by: Richard Henderson --- target/i386/tcg/translate.c | 63 +++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 34 deletions(-) diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index 4f6f9fa7e5..18d06ab247 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -2918,59 +2918,54 @@ static inline void gen_stq_env_A0(DisasContext *s, int offset) static inline void gen_ldo_env_A0(DisasContext *s, int offset, bool align) { + MemOp atom = (s->cpuid_ext_features & CPUID_EXT_AVX + ? MO_ATOM_IFALIGN : MO_ATOM_IFALIGN_PAIR); + MemOp mop = MO_128 | MO_LE | atom | (align ? MO_ALIGN_16 : 0); int mem_index = s->mem_index; - tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, mem_index, - MO_LEUQ | (align ? MO_ALIGN_16 : 0)); - tcg_gen_st_i64(s->tmp1_i64, tcg_env, offset + offsetof(XMMReg, XMM_Q(0))); - tcg_gen_addi_tl(s->tmp0, s->A0, 8); - tcg_gen_qemu_ld_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ); - tcg_gen_st_i64(s->tmp1_i64, tcg_env, offset + offsetof(XMMReg, XMM_Q(1))); + TCGv_i128 t = tcg_temp_new_i128(); + + tcg_gen_qemu_ld_i128(t, s->A0, mem_index, mop); + tcg_gen_st_i128(t, tcg_env, offset); } static inline void gen_sto_env_A0(DisasContext *s, int offset, bool align) { + MemOp atom = (s->cpuid_ext_features & CPUID_EXT_AVX + ? MO_ATOM_IFALIGN : MO_ATOM_IFALIGN_PAIR); + MemOp mop = MO_128 | MO_LE | atom | (align ? MO_ALIGN_16 : 0); int mem_index = s->mem_index; - tcg_gen_ld_i64(s->tmp1_i64, tcg_env, offset + offsetof(XMMReg, XMM_Q(0))); - tcg_gen_qemu_st_i64(s->tmp1_i64, s->A0, mem_index, - MO_LEUQ | (align ? MO_ALIGN_16 : 0)); - tcg_gen_addi_tl(s->tmp0, s->A0, 8); - tcg_gen_ld_i64(s->tmp1_i64, tcg_env, offset + offsetof(XMMReg, XMM_Q(1))); - tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ); + TCGv_i128 t = tcg_temp_new_i128(); + + tcg_gen_ld_i128(t, tcg_env, offset); + tcg_gen_qemu_st_i128(t, s->A0, mem_index, mop); } static void gen_ldy_env_A0(DisasContext *s, int offset, bool align) { + MemOp mop = MO_128 | MO_LE | MO_ATOM_IFALIGN_PAIR; int mem_index = s->mem_index; - tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, mem_index, - MO_LEUQ | (align ? MO_ALIGN_32 : 0)); - tcg_gen_st_i64(s->tmp1_i64, tcg_env, offset + offsetof(YMMReg, YMM_Q(0))); - tcg_gen_addi_tl(s->tmp0, s->A0, 8); - tcg_gen_qemu_ld_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ); - tcg_gen_st_i64(s->tmp1_i64, tcg_env, offset + offsetof(YMMReg, YMM_Q(1))); + TCGv_i128 t0 = tcg_temp_new_i128(); + TCGv_i128 t1 = tcg_temp_new_i128(); + tcg_gen_qemu_ld_i128(t0, s->A0, mem_index, mop | (align ? 
MO_ALIGN_32 : 0)); tcg_gen_addi_tl(s->tmp0, s->A0, 16); - tcg_gen_qemu_ld_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ); - tcg_gen_st_i64(s->tmp1_i64, tcg_env, offset + offsetof(YMMReg, YMM_Q(2))); - tcg_gen_addi_tl(s->tmp0, s->A0, 24); - tcg_gen_qemu_ld_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ); - tcg_gen_st_i64(s->tmp1_i64, tcg_env, offset + offsetof(YMMReg, YMM_Q(3))); + tcg_gen_qemu_ld_i128(t1, s->tmp0, mem_index, mop); + + tcg_gen_st_i128(t0, tcg_env, offset + offsetof(YMMReg, YMM_X(0))); + tcg_gen_st_i128(t1, tcg_env, offset + offsetof(YMMReg, YMM_X(1))); } static void gen_sty_env_A0(DisasContext *s, int offset, bool align) { + MemOp mop = MO_128 | MO_LE | MO_ATOM_IFALIGN_PAIR; int mem_index = s->mem_index; - tcg_gen_ld_i64(s->tmp1_i64, tcg_env, offset + offsetof(YMMReg, YMM_Q(0))); - tcg_gen_qemu_st_i64(s->tmp1_i64, s->A0, mem_index, - MO_LEUQ | (align ? MO_ALIGN_32 : 0)); - tcg_gen_addi_tl(s->tmp0, s->A0, 8); - tcg_gen_ld_i64(s->tmp1_i64, tcg_env, offset + offsetof(YMMReg, YMM_Q(1))); - tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ); + TCGv_i128 t = tcg_temp_new_i128(); + + tcg_gen_ld_i128(t, tcg_env, offset + offsetof(YMMReg, YMM_X(0))); + tcg_gen_qemu_st_i128(t, s->A0, mem_index, mop | (align ? MO_ALIGN_32 : 0)); tcg_gen_addi_tl(s->tmp0, s->A0, 16); - tcg_gen_ld_i64(s->tmp1_i64, tcg_env, offset + offsetof(YMMReg, YMM_Q(2))); - tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ); - tcg_gen_addi_tl(s->tmp0, s->A0, 24); - tcg_gen_ld_i64(s->tmp1_i64, tcg_env, offset + offsetof(YMMReg, YMM_Q(3))); - tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ); + tcg_gen_ld_i128(t, tcg_env, offset + offsetof(YMMReg, YMM_X(1))); + tcg_gen_qemu_st_i128(t, s->tmp0, mem_index, mop); } #include "decode-new.h" From 93c86ecd77a308e023fe158936abc78ea01072d5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 19 Oct 2023 12:46:43 +0200 Subject: [PATCH 30/38] tcg: add negsetcondi This can be useful to write a shift bit extraction that does not depend on TARGET_LONG_BITS. 
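As a minimal sketch of such a use (hypothetical code, not part of this
patch; 'shift' is a TCGv holding a shift amount), one can materialize
an all-ones mask from a single bit of the count, identically for 32-
and 64-bit targets:

    /* mask = (shift & 32) ? -1 : 0, with no TARGET_LONG_BITS test */
    TCGv mask = tcg_temp_new();
    tcg_gen_andi_tl(mask, shift, 32);
    tcg_gen_negsetcondi_tl(TCG_COND_NE, mask, mask, 0);
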
Signed-off-by: Paolo Bonzini Message-Id: <20231019104648.389942-15-pbonzini@redhat.com> Signed-off-by: Richard Henderson --- include/tcg/tcg-op-common.h | 4 ++++ include/tcg/tcg-op.h | 2 ++ tcg/tcg-op.c | 12 ++++++++++++ 3 files changed, 18 insertions(+) diff --git a/include/tcg/tcg-op-common.h b/include/tcg/tcg-op-common.h index 56d4e9cb9f..a0bae5df01 100644 --- a/include/tcg/tcg-op-common.h +++ b/include/tcg/tcg-op-common.h @@ -346,6 +346,8 @@ void tcg_gen_setcondi_i32(TCGCond cond, TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2); void tcg_gen_negsetcond_i32(TCGCond cond, TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2); +void tcg_gen_negsetcondi_i32(TCGCond cond, TCGv_i32 ret, + TCGv_i32 arg1, int32_t arg2); void tcg_gen_movcond_i32(TCGCond cond, TCGv_i32 ret, TCGv_i32 c1, TCGv_i32 c2, TCGv_i32 v1, TCGv_i32 v2); void tcg_gen_add2_i32(TCGv_i32 rl, TCGv_i32 rh, TCGv_i32 al, @@ -544,6 +546,8 @@ void tcg_gen_setcondi_i64(TCGCond cond, TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2); void tcg_gen_negsetcond_i64(TCGCond cond, TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2); +void tcg_gen_negsetcondi_i64(TCGCond cond, TCGv_i64 ret, + TCGv_i64 arg1, int64_t arg2); void tcg_gen_movcond_i64(TCGCond cond, TCGv_i64 ret, TCGv_i64 c1, TCGv_i64 c2, TCGv_i64 v1, TCGv_i64 v2); void tcg_gen_add2_i64(TCGv_i64 rl, TCGv_i64 rh, TCGv_i64 al, diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h index 3ead59e459..e81dd7dd9e 100644 --- a/include/tcg/tcg-op.h +++ b/include/tcg/tcg-op.h @@ -199,6 +199,7 @@ DEF_ATOMIC2(tcg_gen_atomic_umax_fetch, i64) #define tcg_gen_setcond_tl tcg_gen_setcond_i64 #define tcg_gen_setcondi_tl tcg_gen_setcondi_i64 #define tcg_gen_negsetcond_tl tcg_gen_negsetcond_i64 +#define tcg_gen_negsetcondi_tl tcg_gen_negsetcondi_i64 #define tcg_gen_mul_tl tcg_gen_mul_i64 #define tcg_gen_muli_tl tcg_gen_muli_i64 #define tcg_gen_div_tl tcg_gen_div_i64 @@ -317,6 +318,7 @@ DEF_ATOMIC2(tcg_gen_atomic_umax_fetch, i64) #define tcg_gen_setcond_tl tcg_gen_setcond_i32 #define tcg_gen_setcondi_tl tcg_gen_setcondi_i32 #define tcg_gen_negsetcond_tl tcg_gen_negsetcond_i32 +#define tcg_gen_negsetcondi_tl tcg_gen_negsetcondi_i32 #define tcg_gen_mul_tl tcg_gen_mul_i32 #define tcg_gen_muli_tl tcg_gen_muli_i32 #define tcg_gen_div_tl tcg_gen_div_i32 diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c index b4dbb2f2ba..828eb9ee46 100644 --- a/tcg/tcg-op.c +++ b/tcg/tcg-op.c @@ -291,6 +291,12 @@ void tcg_gen_negsetcond_i32(TCGCond cond, TCGv_i32 ret, } } +void tcg_gen_negsetcondi_i32(TCGCond cond, TCGv_i32 ret, + TCGv_i32 arg1, int32_t arg2) +{ + tcg_gen_negsetcond_i32(cond, ret, arg1, tcg_constant_i32(arg2)); +} + void tcg_gen_muli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2) { if (arg2 == 0) { @@ -1602,6 +1608,12 @@ void tcg_gen_setcondi_i64(TCGCond cond, TCGv_i64 ret, } } +void tcg_gen_negsetcondi_i64(TCGCond cond, TCGv_i64 ret, + TCGv_i64 arg1, int64_t arg2) +{ + tcg_gen_negsetcond_i64(cond, ret, arg1, tcg_constant_i64(arg2)); +} + void tcg_gen_negsetcond_i64(TCGCond cond, TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2) { From f1c29532e7f85932d541fc733fda82e8cf887311 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Thu, 19 Oct 2023 09:15:22 -0700 Subject: [PATCH 31/38] tcg: Export tcg_gen_ext_{i32,i64,tl} MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The two concrete type functions already existed, merely needing a bit of hardening to invalid inputs. 
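As an illustration (hypothetical caller, not part of this patch), the
exported helper subsumes the open-coded switches over extension sizes:

    /* sign-extend the low 16 bits; same as tcg_gen_ext16s_tl(dst, src) */
    tcg_gen_ext_tl(dst, src, MO_SW);

With the hardening below, an invalid MemOp now hits
g_assert_not_reached() instead of silently degrading to a move.
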
Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Richard Henderson --- include/tcg/tcg-op-common.h | 2 ++ include/tcg/tcg-op.h | 2 ++ tcg/tcg-op-ldst.c | 14 ++++++++++---- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/include/tcg/tcg-op-common.h b/include/tcg/tcg-op-common.h index a0bae5df01..677aea6dd1 100644 --- a/include/tcg/tcg-op-common.h +++ b/include/tcg/tcg-op-common.h @@ -361,6 +361,7 @@ void tcg_gen_ext8s_i32(TCGv_i32 ret, TCGv_i32 arg); void tcg_gen_ext16s_i32(TCGv_i32 ret, TCGv_i32 arg); void tcg_gen_ext8u_i32(TCGv_i32 ret, TCGv_i32 arg); void tcg_gen_ext16u_i32(TCGv_i32 ret, TCGv_i32 arg); +void tcg_gen_ext_i32(TCGv_i32 ret, TCGv_i32 val, MemOp opc); void tcg_gen_bswap16_i32(TCGv_i32 ret, TCGv_i32 arg, int flags); void tcg_gen_bswap32_i32(TCGv_i32 ret, TCGv_i32 arg); void tcg_gen_hswap_i32(TCGv_i32 ret, TCGv_i32 arg); @@ -564,6 +565,7 @@ void tcg_gen_ext32s_i64(TCGv_i64 ret, TCGv_i64 arg); void tcg_gen_ext8u_i64(TCGv_i64 ret, TCGv_i64 arg); void tcg_gen_ext16u_i64(TCGv_i64 ret, TCGv_i64 arg); void tcg_gen_ext32u_i64(TCGv_i64 ret, TCGv_i64 arg); +void tcg_gen_ext_i64(TCGv_i64 ret, TCGv_i64 val, MemOp opc); void tcg_gen_bswap16_i64(TCGv_i64 ret, TCGv_i64 arg, int flags); void tcg_gen_bswap32_i64(TCGv_i64 ret, TCGv_i64 arg, int flags); void tcg_gen_bswap64_i64(TCGv_i64 ret, TCGv_i64 arg); diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h index e81dd7dd9e..a02850583b 100644 --- a/include/tcg/tcg-op.h +++ b/include/tcg/tcg-op.h @@ -219,6 +219,7 @@ DEF_ATOMIC2(tcg_gen_atomic_umax_fetch, i64) #define tcg_gen_ext16s_tl tcg_gen_ext16s_i64 #define tcg_gen_ext32u_tl tcg_gen_ext32u_i64 #define tcg_gen_ext32s_tl tcg_gen_ext32s_i64 +#define tcg_gen_ext_tl tcg_gen_ext_i64 #define tcg_gen_bswap16_tl tcg_gen_bswap16_i64 #define tcg_gen_bswap32_tl tcg_gen_bswap32_i64 #define tcg_gen_bswap64_tl tcg_gen_bswap64_i64 @@ -338,6 +339,7 @@ DEF_ATOMIC2(tcg_gen_atomic_umax_fetch, i64) #define tcg_gen_ext16s_tl tcg_gen_ext16s_i32 #define tcg_gen_ext32u_tl tcg_gen_mov_i32 #define tcg_gen_ext32s_tl tcg_gen_mov_i32 +#define tcg_gen_ext_tl tcg_gen_ext_i32 #define tcg_gen_bswap16_tl tcg_gen_bswap16_i32 #define tcg_gen_bswap32_tl(D, S, F) tcg_gen_bswap32_i32(D, S) #define tcg_gen_bswap_tl tcg_gen_bswap32_i32 diff --git a/tcg/tcg-op-ldst.c b/tcg/tcg-op-ldst.c index 2b96687699..e2c55df217 100644 --- a/tcg/tcg-op-ldst.c +++ b/tcg/tcg-op-ldst.c @@ -714,7 +714,7 @@ void tcg_gen_qemu_st_i128_chk(TCGv_i128 val, TCGTemp *addr, TCGArg idx, tcg_gen_qemu_st_i128_int(val, addr, idx, memop); } -static void tcg_gen_ext_i32(TCGv_i32 ret, TCGv_i32 val, MemOp opc) +void tcg_gen_ext_i32(TCGv_i32 ret, TCGv_i32 val, MemOp opc) { switch (opc & MO_SSIZE) { case MO_SB: @@ -729,13 +729,16 @@ static void tcg_gen_ext_i32(TCGv_i32 ret, TCGv_i32 val, MemOp opc) case MO_UW: tcg_gen_ext16u_i32(ret, val); break; - default: + case MO_UL: + case MO_SL: tcg_gen_mov_i32(ret, val); break; + default: + g_assert_not_reached(); } } -static void tcg_gen_ext_i64(TCGv_i64 ret, TCGv_i64 val, MemOp opc) +void tcg_gen_ext_i64(TCGv_i64 ret, TCGv_i64 val, MemOp opc) { switch (opc & MO_SSIZE) { case MO_SB: @@ -756,9 +759,12 @@ static void tcg_gen_ext_i64(TCGv_i64 ret, TCGv_i64 val, MemOp opc) case MO_UL: tcg_gen_ext32u_i64(ret, val); break; - default: + case MO_UQ: + case MO_SQ: tcg_gen_mov_i64(ret, val); break; + default: + g_assert_not_reached(); } } From c048b68385c2146931ea032d775479a8ab1cddf3 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 22 Oct 2023 16:34:21 -0700 Subject: [PATCH 32/38] tcg: Define MO_TL This will 
also come in handy later for "less than" comparisons. Signed-off-by: Paolo Bonzini Message-Id: <03ba02fd-fade-4409-be16-2f81a5690b4c@redhat.com> Reviewed-by: Richard Henderson Signed-off-by: Richard Henderson --- include/exec/target_long.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/exec/target_long.h b/include/exec/target_long.h index 93c9472971..3cd8e26a23 100644 --- a/include/exec/target_long.h +++ b/include/exec/target_long.h @@ -29,12 +29,14 @@ typedef uint32_t target_ulong; #define TARGET_FMT_lx "%08x" #define TARGET_FMT_ld "%d" #define TARGET_FMT_lu "%u" +#define MO_TL MO_32 #elif TARGET_LONG_SIZE == 8 typedef int64_t target_long; typedef uint64_t target_ulong; #define TARGET_FMT_lx "%016" PRIx64 #define TARGET_FMT_ld "%" PRId64 #define TARGET_FMT_lu "%" PRIu64 +#define MO_TL MO_64 #else #error TARGET_LONG_SIZE undefined #endif From 2f02c14b21cfe0954e203353af21cd41bce76822 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Thu, 19 Oct 2023 11:18:08 -0700 Subject: [PATCH 33/38] target/arm: Use tcg_gen_ext_i64 The ext_and_shift_reg helper does this plus a shift. The non-zero check for shift count is duplicate to the one done within tcg_gen_shli_i64. Signed-off-by: Richard Henderson --- target/arm/tcg/translate-a64.c | 37 ++-------------------------------- 1 file changed, 2 insertions(+), 35 deletions(-) diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c index 10e8dcf743..ad78b8b120 100644 --- a/target/arm/tcg/translate-a64.c +++ b/target/arm/tcg/translate-a64.c @@ -1324,41 +1324,8 @@ static void ext_and_shift_reg(TCGv_i64 tcg_out, TCGv_i64 tcg_in, int extsize = extract32(option, 0, 2); bool is_signed = extract32(option, 2, 1); - if (is_signed) { - switch (extsize) { - case 0: - tcg_gen_ext8s_i64(tcg_out, tcg_in); - break; - case 1: - tcg_gen_ext16s_i64(tcg_out, tcg_in); - break; - case 2: - tcg_gen_ext32s_i64(tcg_out, tcg_in); - break; - case 3: - tcg_gen_mov_i64(tcg_out, tcg_in); - break; - } - } else { - switch (extsize) { - case 0: - tcg_gen_ext8u_i64(tcg_out, tcg_in); - break; - case 1: - tcg_gen_ext16u_i64(tcg_out, tcg_in); - break; - case 2: - tcg_gen_ext32u_i64(tcg_out, tcg_in); - break; - case 3: - tcg_gen_mov_i64(tcg_out, tcg_in); - break; - } - } - - if (shift) { - tcg_gen_shli_i64(tcg_out, tcg_out, shift); - } + tcg_gen_ext_i64(tcg_out, tcg_in, extsize | (is_signed ? 
MO_SIGN : 0)); + tcg_gen_shli_i64(tcg_out, tcg_out, shift); } static inline void gen_check_sp_alignment(DisasContext *s) From 23f3d586e4da86b86df8ab38e8dacad8a804ed3f Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Thu, 19 Oct 2023 11:18:51 -0700 Subject: [PATCH 34/38] target/i386: Use tcg_gen_ext_tl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Richard Henderson --- target/i386/tcg/translate.c | 28 +++------------------------- 1 file changed, 3 insertions(+), 25 deletions(-) diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index 18d06ab247..587d88692a 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -701,33 +701,11 @@ static inline void gen_op_movl_T0_Dshift(DisasContext *s, MemOp ot) static TCGv gen_ext_tl(TCGv dst, TCGv src, MemOp size, bool sign) { - switch (size) { - case MO_8: - if (sign) { - tcg_gen_ext8s_tl(dst, src); - } else { - tcg_gen_ext8u_tl(dst, src); - } - return dst; - case MO_16: - if (sign) { - tcg_gen_ext16s_tl(dst, src); - } else { - tcg_gen_ext16u_tl(dst, src); - } - return dst; -#ifdef TARGET_X86_64 - case MO_32: - if (sign) { - tcg_gen_ext32s_tl(dst, src); - } else { - tcg_gen_ext32u_tl(dst, src); - } - return dst; -#endif - default: + if (size == MO_TL) { return src; } + tcg_gen_ext_tl(dst, src, size | (sign ? MO_SIGN : 0)); + return dst; } static void gen_extu(MemOp ot, TCGv reg) From 443025e4d0f71bb82fe80369032bf1a04f433352 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Thu, 19 Oct 2023 11:20:12 -0700 Subject: [PATCH 35/38] target/m68k: Use tcg_gen_ext_i32 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We still need to check OS_{BYTE,WORD,LONG}, because m68k includes floating point in OS_*. Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Richard Henderson --- target/m68k/translate.c | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/target/m68k/translate.c b/target/m68k/translate.c index 4d0110de95..4a0b0b2703 100644 --- a/target/m68k/translate.c +++ b/target/m68k/translate.c @@ -520,21 +520,9 @@ static inline void gen_ext(TCGv res, TCGv val, int opsize, int sign) { switch (opsize) { case OS_BYTE: - if (sign) { - tcg_gen_ext8s_i32(res, val); - } else { - tcg_gen_ext8u_i32(res, val); - } - break; case OS_WORD: - if (sign) { - tcg_gen_ext16s_i32(res, val); - } else { - tcg_gen_ext16u_i32(res, val); - } - break; case OS_LONG: - tcg_gen_mov_i32(res, val); + tcg_gen_ext_i32(res, val, opsize | (sign ? 
MO_SIGN : 0)); break; default: g_assert_not_reached(); @@ -1072,15 +1060,10 @@ static int gen_ea_mode_fp(CPUM68KState *env, DisasContext *s, int mode, tmp = tcg_temp_new(); switch (opsize) { case OS_BYTE: - tcg_gen_ext8s_i32(tmp, reg); - gen_helper_exts32(tcg_env, fp, tmp); - break; case OS_WORD: - tcg_gen_ext16s_i32(tmp, reg); - gen_helper_exts32(tcg_env, fp, tmp); - break; case OS_LONG: - gen_helper_exts32(tcg_env, fp, reg); + tcg_gen_ext_i32(tmp, reg, opsize | MO_SIGN); + gen_helper_exts32(tcg_env, fp, tmp); break; case OS_SINGLE: gen_helper_extf32(tcg_env, fp, reg); From 0d67249c6d30a626434815c4fc39ab6bc60708f6 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Thu, 19 Oct 2023 11:21:40 -0700 Subject: [PATCH 36/38] target/rx: Use tcg_gen_ext_i32 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Yoshinori Sato Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Richard Henderson --- target/rx/translate.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/target/rx/translate.c b/target/rx/translate.c index f8860830ae..c6ce717a95 100644 --- a/target/rx/translate.c +++ b/target/rx/translate.c @@ -492,13 +492,11 @@ static bool trans_MOV_ra(DisasContext *ctx, arg_MOV_ra *a) /* mov. rs,rd */ static bool trans_MOV_mm(DisasContext *ctx, arg_MOV_mm *a) { - static void (* const mov[])(TCGv ret, TCGv arg) = { - tcg_gen_ext8s_i32, tcg_gen_ext16s_i32, tcg_gen_mov_i32, - }; TCGv tmp, mem, addr; + if (a->lds == 3 && a->ldd == 3) { /* mov. rs,rd */ - mov[a->sz](cpu_regs[a->rd], cpu_regs[a->rs]); + tcg_gen_ext_i32(cpu_regs[a->rd], cpu_regs[a->rs], a->sz | MO_SIGN); return true; } @@ -570,10 +568,7 @@ static bool trans_MOVU_mr(DisasContext *ctx, arg_MOVU_mr *a) /* movu. rs,rd */ static bool trans_MOVU_rr(DisasContext *ctx, arg_MOVU_rr *a) { - static void (* const ext[])(TCGv ret, TCGv arg) = { - tcg_gen_ext8u_i32, tcg_gen_ext16u_i32, - }; - ext[a->sz](cpu_regs[a->rd], cpu_regs[a->rs]); + tcg_gen_ext_i32(cpu_regs[a->rd], cpu_regs[a->rs], a->sz); return true; } From 8b8bb295489fe8853a650aabf7349cd7aa6d1e9c Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Thu, 19 Oct 2023 11:23:19 -0700 Subject: [PATCH 37/38] target/tricore: Use tcg_gen_*extract_tl The EXTR instructions can use the extract opcodes. 
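As a sanity check, not part of the patch itself, the equivalence between
the removed shift pair and the extract opcodes can be demonstrated
standalone; the sample values are arbitrary, and an arithmetic right
shift of signed integers is assumed (true of GCC and Clang):

    #include <assert.h>
    #include <stdint.h>

    /* The old EXTR lowering: shift left, then arithmetic shift right. */
    static int32_t sextract32_ref(uint32_t x, int pos, int width)
    {
        return (int32_t)(x << (32 - pos - width)) >> (32 - width);
    }

    int main(void)
    {
        /* Bits [11:4] of 0x00000f50 are 0xf5, sign-extended to -11,
         * matching tcg_gen_sextract_tl(..., pos=4, width=8). */
        assert(sextract32_ref(0x00000f50u, 4, 8) == -11);
        /* EXTR.U's old shift-and-mask form agrees with extract too. */
        assert(((0x00000f50u >> 4) & (~0u >> (32 - 8))) == 0xf5);
        return 0;
    }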
Reviewed-by: Bastian Koppelmann Signed-off-by: Richard Henderson --- target/tricore/translate.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/target/tricore/translate.c b/target/tricore/translate.c index dd812ec0f0..66553d1be0 100644 --- a/target/tricore/translate.c +++ b/target/tricore/translate.c @@ -6542,28 +6542,16 @@ static void decode_rrpw_extract_insert(DisasContext *ctx) switch (op2) { case OPC2_32_RRPW_EXTR: if (width == 0) { - tcg_gen_movi_tl(cpu_gpr_d[r3], 0); - break; - } - - if (pos + width <= 32) { - /* optimize special cases */ - if ((pos == 0) && (width == 8)) { - tcg_gen_ext8s_tl(cpu_gpr_d[r3], cpu_gpr_d[r1]); - } else if ((pos == 0) && (width == 16)) { - tcg_gen_ext16s_tl(cpu_gpr_d[r3], cpu_gpr_d[r1]); - } else { - tcg_gen_shli_tl(cpu_gpr_d[r3], cpu_gpr_d[r1], 32 - pos - width); - tcg_gen_sari_tl(cpu_gpr_d[r3], cpu_gpr_d[r3], 32 - width); - } + tcg_gen_movi_tl(cpu_gpr_d[r3], 0); + } else if (pos + width <= 32) { + tcg_gen_sextract_tl(cpu_gpr_d[r3], cpu_gpr_d[r1], pos, width); } break; case OPC2_32_RRPW_EXTR_U: if (width == 0) { tcg_gen_movi_tl(cpu_gpr_d[r3], 0); } else { - tcg_gen_shri_tl(cpu_gpr_d[r3], cpu_gpr_d[r1], pos); - tcg_gen_andi_tl(cpu_gpr_d[r3], cpu_gpr_d[r3], ~0u >> (32-width)); + tcg_gen_extract_tl(cpu_gpr_d[r3], cpu_gpr_d[r1], pos, width); } break; case OPC2_32_RRPW_IMASK: From e40df3522b384d3b7dd38187d735bd6228b20e47 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Thu, 19 Oct 2023 11:25:32 -0700 Subject: [PATCH 38/38] target/xtensa: Use tcg_gen_sextract_i32 Reviewed-by: Max Filippov Signed-off-by: Richard Henderson --- target/xtensa/translate.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/target/xtensa/translate.c b/target/xtensa/translate.c index 54bee7ddba..de89940599 100644 --- a/target/xtensa/translate.c +++ b/target/xtensa/translate.c @@ -2262,17 +2262,7 @@ static void translate_salt(DisasContext *dc, const OpcodeArg arg[], static void translate_sext(DisasContext *dc, const OpcodeArg arg[], const uint32_t par[]) { - int shift = 31 - arg[2].imm; - - if (shift == 24) { - tcg_gen_ext8s_i32(arg[0].out, arg[1].in); - } else if (shift == 16) { - tcg_gen_ext16s_i32(arg[0].out, arg[1].in); - } else { - TCGv_i32 tmp = tcg_temp_new_i32(); - tcg_gen_shli_i32(tmp, arg[1].in, shift); - tcg_gen_sari_i32(arg[0].out, tmp, shift); - } + tcg_gen_sextract_i32(arg[0].out, arg[1].in, 0, arg[2].imm + 1); } static uint32_t test_exceptions_simcall(DisasContext *dc,