diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c.inc index 9c2b076027..e706c37c80 100644 --- a/target/arm/translate-neon.c.inc +++ b/target/arm/translate-neon.c.inc @@ -429,7 +429,7 @@ static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a) { /* Neon load/store multiple structures */ int nregs, interleave, spacing, reg, n; - MemOp endian = s->be_data; + MemOp mop, align, endian; int mmu_idx = get_mem_index(s); int size = a->size; TCGv_i64 tmp64; @@ -473,20 +473,36 @@ static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a) } /* For our purposes, bytes are always little-endian. */ + endian = s->be_data; if (size == 0) { endian = MO_LE; } + + /* Enforce alignment requested by the instruction */ + if (a->align) { + align = pow2_align(a->align + 2); /* 4 ** a->align */ + } else { + align = s->align_mem ? MO_ALIGN : 0; + } + /* * Consecutive little-endian elements from a single register * can be promoted to a larger little-endian operation. */ if (interleave == 1 && endian == MO_LE) { + /* Retain any natural alignment. */ + if (align == MO_ALIGN) { + align = pow2_align(size); + } size = 3; } + tmp64 = tcg_temp_new_i64(); addr = tcg_temp_new_i32(); tmp = tcg_const_i32(1 << size); load_reg_var(s, addr, a->rn); + + mop = endian | size | align; for (reg = 0; reg < nregs; reg++) { for (n = 0; n < 8 >> size; n++) { int xs; @@ -494,15 +510,16 @@ static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a) int tt = a->vd + reg + spacing * xs; if (a->l) { - gen_aa32_ld_internal_i64(s, tmp64, addr, mmu_idx, - endian | size); + gen_aa32_ld_internal_i64(s, tmp64, addr, mmu_idx, mop); neon_store_element64(tt, n, size, tmp64); } else { neon_load_element64(tmp64, tt, n, size); - gen_aa32_st_internal_i64(s, tmp64, addr, mmu_idx, - endian | size); + gen_aa32_st_internal_i64(s, tmp64, addr, mmu_idx, mop); } tcg_gen_add_i32(addr, addr, tmp); + + /* Subsequent memory operations inherit alignment */ + mop &= ~MO_AMASK; } } }