Clean up the read_region_2 code so that it uses the fast version under more circumstances, and
always runs it with the destination buffer aligned on a 4-word (16-byte) boundary. The setup code feels like it could be sleeker, but I can't currently see how.
This commit is contained in:
parent
4171072ea1
commit
9dc368e353
@ -1,4 +1,4 @@
|
|||||||
/* $NetBSD: bus_asm.S,v 1.2 2006/10/01 22:47:18 bjh21 Exp $ */
|
/* $NetBSD: bus_asm.S,v 1.3 2006/10/03 22:27:02 bjh21 Exp $ */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2006 Ben Harris
|
* Copyright (c) 2006 Ben Harris
|
||||||
@ -225,34 +225,40 @@ ENTRY(iobus_bs_rr_2)
|
|||||||
mov r2, #1
|
mov r2, #1
|
||||||
mov r0, r2, lsl r0
|
mov r0, r2, lsl r0
|
||||||
ldr r10, [sp, #44]
|
ldr r10, [sp, #44]
|
||||||
tst r9, #3 /* If we're word-aligned */
|
|
||||||
teqeq r0, #4 /* and registers are every 4 bytes */
|
teq r0, #4 /* Is the step 4? */
|
||||||
bne 2f
|
movne r7, r10 /* If not, do the whole lot slowly. */
|
||||||
cmp r10, #8 /* and we're reading >=8 registers */
|
rsbeq r7, r9, #0 /* Otherwise, go slowly to a */
|
||||||
bge 3f /* do it the fast way. */
|
andeq r7, r7, #15 /* 16-byte boundary. */
|
||||||
|
moveq r7, r7, lsr #1 /* Convert to uint16_ts */
|
||||||
|
sub r10, r10, r7 /* Adjust fast transfer len to match */
|
||||||
|
cmp r10, #8
|
||||||
|
addlt r7, r7, r10 /* If remaining fast xfer is <8 */
|
||||||
|
movlt r10, #0 /* make it zero and correct r7. */
|
||||||
|
|
||||||
/* Make sure that we have a positive length */
|
/* Make sure that we have a positive length */
|
||||||
2: cmp r10, #0
|
teq r7, #0
|
||||||
ldmledb fp, {r4-r10, fp, sp, pc}
|
beq 2f
|
||||||
|
|
||||||
1: ldr r1, [r8], r0
|
1: ldr r1, [r8], r0
|
||||||
strb r1, [r9], #1
|
strb r1, [r9], #1
|
||||||
mov r1, r1, lsr #8
|
mov r1, r1, lsr #8
|
||||||
strb r1, [r9], #1
|
strb r1, [r9], #1
|
||||||
subs r10, r10, #1
|
subs r7, r7, #1
|
||||||
bgt 1b
|
bgt 1b
|
||||||
|
|
||||||
ldmdb fp, {r4-r10, fp, sp, pc}
|
2: teq r10, #0
|
||||||
|
ldmeqdb fp, {r4-r10, fp, sp, pc}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Fast read_region_2 code. This is at its best when dealing with
|
* Fast read_region_2 code. This is at its best when dealing with
|
||||||
* 16-byte-aligned blocks of memory, which should happen quite
|
* 16-byte-aligned blocks of memory, which is arranged by the code
|
||||||
* a lot anyway, but the above code could help.
|
* above.
|
||||||
*/
|
*/
|
||||||
3: mov r12, #0x00ff
|
mov r12, #0x00ff
|
||||||
orr r12, r12, #0xff00
|
orr r12, r12, #0xff00
|
||||||
sub r10, r10, #7
|
sub r10, r10, #7
|
||||||
1: ldmia r8!, {r0-r7}
|
2: ldmia r8!, {r0-r7}
|
||||||
subs r10, r10, #8
|
subs r10, r10, #8
|
||||||
and r0, r0, r12
|
and r0, r0, r12
|
||||||
and r2, r2, r12
|
and r2, r2, r12
|
||||||
@ -263,11 +269,12 @@ ENTRY(iobus_bs_rr_2)
|
|||||||
orr r4, r4, r5, lsl #16
|
orr r4, r4, r5, lsl #16
|
||||||
orr r6, r6, r7, lsl #16
|
orr r6, r6, r7, lsl #16
|
||||||
stmia r9!, {r0, r2, r4, r6}
|
stmia r9!, {r0, r2, r4, r6}
|
||||||
bgt 1b
|
bgt 2b
|
||||||
adds r10, r10, #7
|
adds r7, r10, #7
|
||||||
ldmeqdb fp, {r4-r10, fp, sp, pc}
|
ldmeqdb fp, {r4-r10, fp, sp, pc}
|
||||||
mov r0, #4
|
mov r0, #4
|
||||||
b 2b
|
mov r10, #0
|
||||||
|
b 1b
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* write region
|
* write region
|
||||||
|
Loading…
Reference in New Issue
Block a user