target/arm: Promote consecutive memory ops for aa64
For a sequence of loads or stores from a single register, little-endian operations can be promoted to an 8-byte op. This can reduce the number of operations by up to a factor of 8.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20181011205206.3552-5-richard.henderson@linaro.org
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This commit is contained in:
parent
10e0b33c67
commit
87f9a7f0c8
@ -1201,25 +1201,23 @@ static void write_vec_element_i32(DisasContext *s, TCGv_i32 tcg_src,
|
||||
|
||||
/* Store from vector register to memory */
|
||||
static void do_vec_st(DisasContext *s, int srcidx, int element,
|
||||
TCGv_i64 tcg_addr, int size)
|
||||
TCGv_i64 tcg_addr, int size, TCGMemOp endian)
|
||||
{
|
||||
TCGMemOp memop = s->be_data + size;
|
||||
TCGv_i64 tcg_tmp = tcg_temp_new_i64();
|
||||
|
||||
read_vec_element(s, tcg_tmp, srcidx, element, size);
|
||||
tcg_gen_qemu_st_i64(tcg_tmp, tcg_addr, get_mem_index(s), memop);
|
||||
tcg_gen_qemu_st_i64(tcg_tmp, tcg_addr, get_mem_index(s), endian | size);
|
||||
|
||||
tcg_temp_free_i64(tcg_tmp);
|
||||
}
|
||||
|
||||
/* Load from memory to vector register */
|
||||
static void do_vec_ld(DisasContext *s, int destidx, int element,
|
||||
TCGv_i64 tcg_addr, int size)
|
||||
TCGv_i64 tcg_addr, int size, TCGMemOp endian)
|
||||
{
|
||||
TCGMemOp memop = s->be_data + size;
|
||||
TCGv_i64 tcg_tmp = tcg_temp_new_i64();
|
||||
|
||||
tcg_gen_qemu_ld_i64(tcg_tmp, tcg_addr, get_mem_index(s), memop);
|
||||
tcg_gen_qemu_ld_i64(tcg_tmp, tcg_addr, get_mem_index(s), endian | size);
|
||||
write_vec_element(s, tcg_tmp, destidx, element, size);
|
||||
|
||||
tcg_temp_free_i64(tcg_tmp);
|
||||
@ -3019,9 +3017,10 @@ static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
|
||||
bool is_postidx = extract32(insn, 23, 1);
|
||||
bool is_q = extract32(insn, 30, 1);
|
||||
TCGv_i64 tcg_addr, tcg_rn, tcg_ebytes;
|
||||
TCGMemOp endian = s->be_data;
|
||||
|
||||
int ebytes = 1 << size;
|
||||
int elements = (is_q ? 128 : 64) / (8 << size);
|
||||
int ebytes; /* bytes per element */
|
||||
int elements; /* elements per vector */
|
||||
int rpt; /* num iterations */
|
||||
int selem; /* structure elements */
|
||||
int r;
|
||||
@ -3080,6 +3079,20 @@ static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
|
||||
gen_check_sp_alignment(s);
|
||||
}
|
||||
|
||||
/* For our purposes, bytes are always little-endian. */
|
||||
if (size == 0) {
|
||||
endian = MO_LE;
|
||||
}
|
||||
|
||||
/* Consecutive little-endian elements from a single register
|
||||
* can be promoted to a larger little-endian operation.
|
||||
*/
|
||||
if (selem == 1 && endian == MO_LE) {
|
||||
size = 3;
|
||||
}
|
||||
ebytes = 1 << size;
|
||||
elements = (is_q ? 16 : 8) / ebytes;
|
||||
|
||||
tcg_rn = cpu_reg_sp(s, rn);
|
||||
tcg_addr = tcg_temp_new_i64();
|
||||
tcg_gen_mov_i64(tcg_addr, tcg_rn);
|
||||
@ -3088,32 +3101,33 @@ static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
|
||||
for (r = 0; r < rpt; r++) {
|
||||
int e;
|
||||
for (e = 0; e < elements; e++) {
|
||||
int tt = (rt + r) % 32;
|
||||
int xs;
|
||||
for (xs = 0; xs < selem; xs++) {
|
||||
int tt = (rt + r + xs) % 32;
|
||||
if (is_store) {
|
||||
do_vec_st(s, tt, e, tcg_addr, size);
|
||||
do_vec_st(s, tt, e, tcg_addr, size, endian);
|
||||
} else {
|
||||
do_vec_ld(s, tt, e, tcg_addr, size);
|
||||
|
||||
/* For non-quad operations, setting a slice of the low
|
||||
* 64 bits of the register clears the high 64 bits (in
|
||||
* the ARM ARM pseudocode this is implicit in the fact
|
||||
* that 'rval' is a 64 bit wide variable).
|
||||
* For quad operations, we might still need to zero the
|
||||
* high bits of SVE. We optimize by noticing that we only
|
||||
* need to do this the first time we touch a register.
|
||||
*/
|
||||
if (e == 0 && (r == 0 || xs == selem - 1)) {
|
||||
clear_vec_high(s, is_q, tt);
|
||||
}
|
||||
do_vec_ld(s, tt, e, tcg_addr, size, endian);
|
||||
}
|
||||
tcg_gen_add_i64(tcg_addr, tcg_addr, tcg_ebytes);
|
||||
tt = (tt + 1) % 32;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!is_store) {
|
||||
/* For non-quad operations, setting a slice of the low
|
||||
* 64 bits of the register clears the high 64 bits (in
|
||||
* the ARM ARM pseudocode this is implicit in the fact
|
||||
* that 'rval' is a 64 bit wide variable).
|
||||
* For quad operations, we might still need to zero the
|
||||
* high bits of SVE.
|
||||
*/
|
||||
for (r = 0; r < rpt * selem; r++) {
|
||||
int tt = (rt + r) % 32;
|
||||
clear_vec_high(s, is_q, tt);
|
||||
}
|
||||
}
|
||||
|
||||
if (is_postidx) {
|
||||
int rm = extract32(insn, 16, 5);
|
||||
if (rm == 31) {
|
||||
@ -3234,9 +3248,9 @@ static void disas_ldst_single_struct(DisasContext *s, uint32_t insn)
|
||||
} else {
|
||||
/* Load/store one element per register */
|
||||
if (is_load) {
|
||||
do_vec_ld(s, rt, index, tcg_addr, scale);
|
||||
do_vec_ld(s, rt, index, tcg_addr, scale, s->be_data);
|
||||
} else {
|
||||
do_vec_st(s, rt, index, tcg_addr, scale);
|
||||
do_vec_st(s, rt, index, tcg_addr, scale, s->be_data);
|
||||
}
|
||||
}
|
||||
tcg_gen_add_i64(tcg_addr, tcg_addr, tcg_ebytes);
|
||||
|
Loading…
Reference in New Issue
Block a user