target/arm: Implement v8.1M low-overhead-loop instructions
v8.1M's "low-overhead-loop" extension has three instructions for looping:
 * DLS (start of a do-loop)
 * WLS (start of a while-loop)
 * LE (end of a loop)

The loop-start instructions are both simple operations to start a loop whose iteration count (if any) is in LR. The loop-end instruction handles "decrement iteration count and jump back to loop start"; it also caches the information about the branch back to the start of the loop to improve performance of the branch on subsequent iterations.

As with the branch-future instructions, the architecture permits an implementation to discard the LO_BRANCH_INFO cache at any time, and QEMU takes the IMPDEF option to never set it in the first place (equivalent to discarding it immediately), because for us a "real" implementation would be unnecessary complexity.

(This implementation only provides the simple looping constructs; the vector extension MVE (Helium) adds some extra variants to handle looping across vectors. We'll add those later when we implement MVE.)

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20201019151301.2046-8-peter.maydell@linaro.org
This commit is contained in:
parent
05903f036e
commit
b722636972
@ -659,4 +659,12 @@ BL 1111 0. .......... 11.1 ............ @branch24
|
||||
BF 1111 0 boff:4 10 ----- 1110 - ---------- 1 # BF
|
||||
BF 1111 0 boff:4 11 ----- 1110 0 0000000000 1 # BFX, BFLX
|
||||
]
|
||||
[
|
||||
# LE and WLS immediate
|
||||
%lob_imm 1:10 11:1 !function=times_2
|
||||
|
||||
DLS 1111 0 0000 100 rn:4 1110 0000 0000 0001
|
||||
WLS 1111 0 0000 100 rn:4 1100 . .......... 1 imm=%lob_imm
|
||||
LE 1111 0 0000 0 f:1 0 1111 1100 . .......... 1 imm=%lob_imm
|
||||
]
|
||||
}
|
||||
|
@ -2490,17 +2490,23 @@ static void gen_goto_tb(DisasContext *s, int n, target_ulong dest)
|
||||
s->base.is_jmp = DISAS_NORETURN;
|
||||
}
|
||||
|
||||
/*
 * Jump to @dest, specifying which TB number (@tbno) to use if we
 * gen_goto_tb().
 *
 * When single-stepping, no goto_tb is emitted at all: the PC is written
 * directly and the TB ends with DISAS_JUMP, so @tbno is unused on that
 * path.
 */
static inline void gen_jmp_tb(DisasContext *s, uint32_t dest, int tbno)
{
    if (unlikely(is_singlestepping(s))) {
        /* An indirect jump so that we still trigger the debug exception. */
        gen_set_pc_im(s, dest);
        s->base.is_jmp = DISAS_JUMP;
    } else {
        /*
         * Exactly one goto_tb, using the caller-selected TB number; a
         * hard-coded TB 0 here would defeat the purpose of @tbno.
         */
        gen_goto_tb(s, tbno, dest);
    }
}
|
||||
|
||||
/* Jump to @dest via gen_jmp_tb(), always selecting TB number 0. */
static inline void gen_jmp(DisasContext *s, uint32_t dest)
{
    const int default_tbno = 0;

    gen_jmp_tb(s, dest, default_tbno);
}
|
||||
|
||||
static inline void gen_mulxy(TCGv_i32 t0, TCGv_i32 t1, int x, int y)
|
||||
{
|
||||
if (x)
|
||||
@ -7991,6 +7997,89 @@ static bool trans_BF(DisasContext *s, arg_BF *a)
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool trans_DLS(DisasContext *s, arg_DLS *a)
{
    /*
     * M-profile low-overhead do-loop start. This is not a while loop
     * and there is no tail predication, so the whole effect is to copy
     * the iteration count from Rn into LR (r14).
     *
     * Returns false (so the insn is not handled here) when the LOB
     * feature is absent or Rn is r13/r15; returns true once the copy
     * has been emitted.
     */
    TCGv_i32 count;

    if (!dc_isar_feature(aa32_lob, s)) {
        return false;
    }

    switch (a->rn) {
    case 13:
    case 15:
        /* CONSTRAINED UNPREDICTABLE: we choose to UNDEF */
        return false;
    default:
        break;
    }

    count = load_reg(s, a->rn);
    store_reg(s, 14, count);

    return true;
}
|
||||
|
||||
static bool trans_WLS(DisasContext *s, arg_WLS *a)
{
    /*
     * M-profile low-overhead while-loop start.
     *
     * If Rn is zero the loop body is skipped by branching forward by
     * the immediate; otherwise Rn is copied into LR (r14) and execution
     * continues at the next instruction.
     */
    TCGLabel *skip_loop;
    TCGv_i32 count;

    if (!dc_isar_feature(aa32_lob, s)) {
        return false;
    }

    /* CONSTRAINED UNPREDICTABLE for SP and PC: we choose to UNDEF */
    if (a->rn == 13 || a->rn == 15) {
        return false;
    }

    /*
     * WLS in an IT block is CONSTRAINED UNPREDICTABLE; we choose to
     * UNDEF. Accepting it would make our gen_goto_tb(1) below clash
     * with the use of TB exit 1 in the dc->condjmp condition-failed
     * codepath in arm_tr_tb_stop(), and we'd get an assertion.
     */
    if (s->condexec_mask) {
        return false;
    }

    skip_loop = gen_new_label();

    /* Zero iterations requested: go straight to the loop-skip path. */
    tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_R[a->rn], 0, skip_loop);

    /* Non-zero count: LR = Rn, then fall into the loop body via TB 1. */
    count = load_reg(s, a->rn);
    store_reg(s, 14, count);
    gen_jmp_tb(s, s->base.pc_next, 1);

    /* Loop-skip path: branch forward past the loop. */
    gen_set_label(skip_loop);
    gen_jmp(s, read_pc(s) + a->imm);

    return true;
}
|
||||
|
||||
static bool trans_LE(DisasContext *s, arg_LE *a)
{
    /*
     * M-profile low-overhead loop end.
     *
     * The architecture lets an implementation throw away the
     * LO_BRANCH_INFO cache whenever it likes. We use the IMPDEF
     * freedom to never populate it at all (which is the same as
     * discarding it immediately): a "real" implementation would be
     * complicated for QEMU and wouldn't execute any faster.
     */
    if (!dc_isar_feature(aa32_lob, s)) {
        return false;
    }

    if (a->f == 0) {
        TCGv_i32 lr;

        /*
         * Not the loop-forever form. When LR <= 1 this was the final
         * iteration, so skip both the decrement and the backward jump.
         */
        arm_gen_condlabel(s);
        tcg_gen_brcondi_i32(TCG_COND_LEU, cpu_R[14], 1, s->condlabel);

        /* LR = LR - 1 */
        lr = load_reg(s, 14);
        tcg_gen_addi_i32(lr, lr, -1);
        store_reg(s, 14, lr);
    }

    /* Branch backwards to the start of the loop. */
    gen_jmp(s, read_pc(s) - a->imm);

    return true;
}
|
||||
|
||||
static bool op_tbranch(DisasContext *s, arg_tbranch *a, bool half)
|
||||
{
|
||||
TCGv_i32 addr, tmp;
|
||||
|
Loading…
Reference in New Issue
Block a user