Slightly smaller code and tune for StrongARM.

This commit is contained in:
rearnsha 2001-03-19 22:51:51 +00:00
parent 3dfa223aa3
commit 51e6b460ab
1 changed files with 122 additions and 128 deletions

View File

@ -1,4 +1,4 @@
/* $NetBSD: blockio.S,v 1.9 1999/10/26 06:53:41 cgd Exp $ */
/* $NetBSD: blockio.S,v 1.10 2001/03/19 22:51:51 rearnsha Exp $ */
/*
* Copyright (c) 1994 Mark Brinicombe.
@ -41,6 +41,8 @@
* optimised block read/write from/to IO routines.
*
* Created : 08/10/94
* Modified : 22/01/99 -- R.Earnshaw
* Faster, and small tweaks for StrongARM
*/
#include <machine/asm.h>
@ -68,10 +70,10 @@ ENTRY(insw)
inswloop:
ldr r3, [r0]
subs r2, r2, #0x00000001 /* Loop test in load delay slot */
strb r3, [r1], #0x0001
mov r3, r3, lsr #8
strb r3, [r1], #0x0001
subs r2, r2, #0x00000001
bgt inswloop
mov pc, lr
@ -79,20 +81,17 @@ inswloop:
/* Word aligned insw */
fastinsw:
stmfd sp!, {r4}
fastinswloop:
ldr r3, [r0, #0x0002] /* take advantage of nonaligned
* word accesses */
ldr r4, [r0]
ldr ip, [r0]
mov r3, r3, lsr #16 /* Put the two shorts together */
orr r3, r3, r4, lsl #16
orr r3, r3, ip, lsl #16
str r3, [r1], #0x0004 /* Store */
subs r2, r2, #0x00000002 /* Next */
bgt fastinswloop
ldmfd sp!, {r4}
mov pc, lr
@ -117,42 +116,43 @@ ENTRY(outsw)
/* Non aligned outsw */
stmfd sp!, {r4}
outswloop:
ldrb r3, [r1], #0x0001
ldrb r4, [r1], #0x0001
orr r3, r3, r4, lsl #8
ldrb ip, [r1], #0x0001
subs r2, r2, #0x00000001 /* Loop test in load delay slot */
orr r3, r3, ip, lsl #8
orr r3, r3, r3, lsl #16
str r3, [r0]
subs r2, r2, #0x00000001
bgt outswloop
ldmfd sp!, {r4}
mov pc, lr
/* Word aligned outsw */
fastoutsw:
stmfd sp!, {r4}
fastoutswloop:
ldr r3, [r1], #0x0004
ldr r3, [r1], #0x0004 /* r3 = (H)(L) */
subs r2, r2, #0x00000002 /* Loop test in load delay slot */
mov r4, r3, lsl #16
orr r4, r4, r4, lsr #16
str r4, [r0]
eor ip, r3, r3, lsr #16 /* ip = (H)(H^L) */
eor r3, r3, ip, lsl #16 /* r3 = (H^H^L)(L) = (L)(L) */
eor ip, ip, r3, lsr #16 /* ip = (H)(H^L^L) = (H)(H) */
mov r4, r3, lsr #16
orr r4, r4, r4, lsl #16
str r4, [r0]
str r3, [r0]
str ip, [r0]
/* mov ip, r3, lsl #16
* orr ip, ip, ip, lsr #16
* str ip, [r0]
*
* mov ip, r3, lsr #16
* orr ip, ip, ip, lsl #16
* str ip, [r0]
*/
subs r2, r2, #0x00000002
bgt fastoutswloop
ldmfd sp!, {r4}
mov pc, lr
/*
@ -170,7 +170,8 @@ ENTRY(insw16)
cmp r2, #0x00000000
movle pc, lr
/* If the destination address is word aligned and the size suitably aligned, do it fast */
/* If the destination address is word aligned and the size suitably
aligned, do it fast */
tst r2, #0x00000007
tsteq r1, #0x00000003
@ -179,40 +180,38 @@ ENTRY(insw16)
/* Word aligned insw */
stmfd sp!, {r4-r7}
stmfd sp!, {r4,r5,lr}
insw16loop:
ldr r3, [r0, #0x0002] /* take advantage of nonaligned
* word accesses */
ldr r7, [r0]
ldr lr, [r0]
mov r3, r3, lsr #16 /* Put the two shorts together */
orr r3, r3, r7, lsl #16
orr r3, r3, lr, lsl #16
ldr r4, [r0, #0x0002] /* take advantage of nonaligned
* word accesses */
ldr r7, [r0]
ldr lr, [r0]
mov r4, r4, lsr #16 /* Put the two shorts together */
orr r4, r4, r7, lsl #16
orr r4, r4, lr, lsl #16
ldr r5, [r0, #0x0002] /* take advantage of nonaligned
* word accesses */
ldr r7, [r0]
ldr lr, [r0]
mov r5, r5, lsr #16 /* Put the two shorts together */
orr r5, r5, r7, lsl #16
orr r5, r5, lr, lsl #16
ldr r6, [r0, #0x0002] /* take advantage of nonaligned
ldr ip, [r0, #0x0002] /* take advantage of nonaligned
* word accesses */
ldr r7, [r0]
mov r6, r6, lsr #16 /* Put the two shorts together */
orr r6, r6, r7, lsl #16
ldr lr, [r0]
mov ip, ip, lsr #16 /* Put the two shorts together */
orr ip, ip, lr, lsl #16
stmia r1!, {r3-r6}
stmia r1!, {r3-r5,ip}
subs r2, r2, #0x00000008 /* Next */
bgt insw16loop
ldmfd sp!, {r4-r7}
mov pc, lr
ldmfd sp!, {r4,r5,pc} /* Restore regs and go home */
/*
@ -228,7 +227,8 @@ ENTRY(outsw16)
cmp r2, #0x00000000
movle pc, lr
/* If the destination address is word aligned and the size suitably aligned, do it fast */
/* If the destination address is word aligned and the size suitably
aligned, do it fast */
tst r2, #0x00000007
tsteq r1, #0x00000003
@ -237,49 +237,48 @@ ENTRY(outsw16)
/* Word aligned outsw */
stmfd sp!, {r4-r7}
stmfd sp!, {r4,r5,lr}
outsw16loop:
ldmia r1!, {r4-r7}
ldmia r1!, {r4,r5,ip,lr}
mov r3, r4, lsl #16
orr r3, r3, r3, lsr #16
eor r3, r4, r4, lsl #16 /* r3 = (A^B)(B) */
eor r4, r4, r3, lsr #16 /* r4 = (A)(B^A^B) = (A)(A) */
eor r3, r3, r4, lsl #16 /* r3 = (A^B^A)(B) = (B)(B) */
str r3, [r0]
str r4, [r0]
/* mov r3, r4, lsl #16
* orr r3, r3, r3, lsr #16
* str r3, [r0]
*
* mov r3, r4, lsr #16
* orr r3, r3, r3, lsl #16
* str r3, [r0]
*/
mov r3, r4, lsr #16
orr r3, r3, r3, lsl #16
/* Split r5 = (A)(B) into r3 = (B)(B) and r5 = (A)(A) using three EORs. */
eor r3, r5, r5, lsl #16 /* r3 = (A^B)(B) */
eor r5, r5, r3, lsr #16 /* r5 = (A)(B^A^B) = (A)(A) */
eor r3, r3, r5, lsl #16 /* r3 = (A^B^A)(B) = (B)(B) */
str r3, [r0] /* low halfword B, replicated in both halves */
str r5, [r0] /* high halfword A, replicated in both halves */
mov r3, r5, lsl #16
orr r3, r3, r3, lsr #16
/* Split ip = (A)(B) into r3 = (B)(B) and ip = (A)(A) using three EORs. */
eor r3, ip, ip, lsl #16 /* r3 = (A^B)(B) */
eor ip, ip, r3, lsr #16 /* ip = (A)(B^A^B) = (A)(A) */
eor r3, r3, ip, lsl #16 /* r3 = (A^B^A)(B) = (B)(B) */
str r3, [r0] /* low halfword B, replicated in both halves */
str ip, [r0] /* high halfword A, replicated in both halves */
mov r3, r5, lsr #16
orr r3, r3, r3, lsl #16
str r3, [r0]
mov r3, r6, lsl #16
orr r3, r3, r3, lsr #16
str r3, [r0]
mov r3, r6, lsr #16
orr r3, r3, r3, lsl #16
str r3, [r0]
mov r3, r7, lsl #16
orr r3, r3, r3, lsr #16
str r3, [r0]
mov r3, r7, lsr #16
orr r3, r3, r3, lsl #16
/* Split lr = (A)(B) into r3 = (B)(B) and lr = (A)(A) using three EORs. */
eor r3, lr, lr, lsl #16 /* r3 = (A^B)(B) */
eor lr, lr, r3, lsr #16 /* lr = (A)(B^A^B) = (A)(A) */
eor r3, r3, lr, lsl #16 /* r3 = (A^B^A)(B) = (B)(B) */
str r3, [r0] /* low halfword B, replicated in both halves */
str lr, [r0] /* high halfword A, replicated in both halves */
subs r2, r2, #0x00000008
bgt outsw16loop
ldmfd sp!, {r4-r7}
mov pc, lr
ldmfd sp!, {r4,r5,pc} /* and go home */
/*
* reads short ints (16 bits) from an I/O address into a block of memory
@ -297,7 +296,8 @@ ENTRY(inswm8)
cmp r2, #0x00000000
movle pc, lr
/* If the destination address is word aligned and the size suitably aligned, do it fast */
/* If the destination address is word aligned and the size suitably
aligned, do it fast */
tst r1, #0x00000003
@ -305,25 +305,25 @@ ENTRY(inswm8)
/* Word aligned insw */
stmfd sp!, {r4-r11}
stmfd sp!, {r4-r9,lr}
mov r11, #0xff000000
orr r11, r11, #0x00ff0000
mov lr, #0xff000000
orr lr, lr, #0x00ff0000
inswm8_loop8:
cmp r2, #8
bcc inswm8_l8
ldmia r0, {r3-r10}
ldmia r0, {r3-r9,ip}
bic r3, r3, r11
bic r3, r3, lr
orr r3, r3, r4, lsl #16
bic r5, r5, r11
bic r5, r5, lr
orr r4, r5, r6, lsl #16
bic r7, r7, r11
bic r7, r7, lr
orr r5, r7, r8, lsl #16
bic r9, r9, r11
orr r6, r9, r10, lsl #16
bic r9, r9, lr
orr r6, r9, ip, lsl #16
stmia r1!, {r3-r6}
@ -337,9 +337,9 @@ inswm8_l8:
ldmia r0, {r3-r6}
bic r3, r3, r11
bic r3, r3, lr
orr r3, r3, r4, lsl #16
bic r5, r5, r11
bic r5, r5, lr
orr r4, r5, r6, lsl #16
stmia r1!, {r3-r4}
@ -353,7 +353,7 @@ inswm8_l4:
ldmia r0, {r3-r4}
bic r3, r3, r11
bic r3, r3, lr
orr r3, r3, r4, lsl #16
str r3, [r1], #0x0004
@ -365,17 +365,16 @@ inswm8_l2:
bcc inswm8_l1
ldr r3, [r0]
subs r2, r2, #0x00000001 /* Test in load delay slot */
/* XXX, why don't we use result? */
strb r3, [r1], #0x0001
mov r3, r3, lsr #8
strb r3, [r1], #0x0001
subs r2, r2, #0x00000001
inswm8_l1:
ldmfd sp!, {r4-r11}
mov pc, lr
ldmfd sp!, {r4-r9,pc} /* And go home */
/*
* write short ints (16 bits) to an I/O address from a block of memory
@ -393,7 +392,8 @@ ENTRY(outswm8)
cmp r2, #0x00000000
movle pc, lr
/* If the destination address is word aligned and the size suitably aligned, do it fast */
/* If the destination address is word aligned and the size suitably
aligned, do it fast */
tst r1, #0x00000003
@ -401,32 +401,31 @@ ENTRY(outswm8)
/* Word aligned outsw */
stmfd sp!, {r4-r10}
stmfd sp!, {r4-r8,lr}
outswm8_loop8:
cmp r2, #8
bcc outswm8_l8
ldmia r1!, {r3,r5,r7,r9}
ldmia r1!, {r3,r5,r7,ip}
mov r4, r3, lsr #16
orr r4, r4, r4, lsl #16
mov r3, r3, lsl #16
orr r3, r3, r3, lsr #16
mov r6, r5, lsr #16
orr r6, r6, r6, lsl #16
mov r5, r5, lsl #16
orr r5, r5, r5, lsr #16
mov r8, r7, lsr #16
orr r8, r8, r8, lsl #16
mov r7, r7, lsl #16
orr r7, r7, r7, lsr #16
mov r10, r9, lsr #16
orr r10, r10, r10, lsl #16
mov r9, r9, lsl #16
orr r9, r9, r9, lsr #16
eor r4, r3, r3, lsr #16 /* r4 = (A)(A^B) */
eor r3, r3, r4, lsl #16 /* r3 = (A^A^B)(B) = (B)(B) */
eor r4, r4, r3, lsr #16 /* r4 = (A)(B^A^B) = (A)(A) */
stmia r0, {r3-r10}
eor r6, r5, r5, lsr #16 /* r6 = (A)(A^B) */
eor r5, r5, r6, lsl #16 /* r5 = (A^A^B)(B) = (B)(B) */
eor r6, r6, r5, lsr #16 /* r6 = (A)(B^A^B) = (A)(A) */
eor r8, r7, r7, lsr #16 /* r8 = (A)(A^B) */
eor r7, r7, r8, lsl #16 /* r7 = (A^A^B)(B) = (B)(B) */
eor r8, r8, r7, lsr #16 /* r8 = (A)(B^A^B) = (A)(A) */
eor lr, ip, ip, lsr #16 /* lr = (A)(A^B) */
eor ip, ip, lr, lsl #16 /* ip = (A^A^B)(B) = (B)(B) */
eor lr, lr, ip, lsr #16 /* lr = (A)(B^A^B) = (A)(A) */
stmia r0, {r3-r8,ip,lr}
subs r2, r2, #0x00000008 /* Next */
bne outswm8_loop8
@ -438,14 +437,13 @@ outswm8_l8:
ldmia r1!, {r3-r4}
mov r5, r3, lsl #16
orr r5, r5, r5, lsr #16
mov r6, r3, lsr #16
orr r6, r6, r6, lsl #16
mov r7, r4, lsl #16
orr r7, r7, r7, lsr #16
mov r8, r4, lsr #16
orr r8, r8, r8, lsl #16
eor r6, r3, r3, lsr #16 /* r6 = (A)(A^B) */
eor r5, r3, r6, lsl #16 /* r5 = (A^A^B)(B) = (B)(B) */
eor r6, r6, r5, lsr #16 /* r6 = (A)(B^A^B) = (A)(A) */
eor r8, r4, r4, lsr #16 /* r8 = (A)(A^B) */
eor r7, r4, r8, lsl #16 /* r7 = (A^A^B)(B) = (B)(B) */
eor r8, r8, r7, lsr #16 /* r8 = (A)(B^A^B) = (A)(A) */
stmia r0, {r5-r8}
@ -456,16 +454,15 @@ outswm8_l4:
cmp r2, #2
bcc outswm8_l2
ldr r3, [r1], #0x0004
ldr r3, [r1], #0x0004 /* r3 = (A)(B) */
subs r2, r2, #0x00000002 /* Done test in Load delay slot */
mov r4, r3, lsl #16
orr r4, r4, r4, lsr #16
mov r5, r3, lsr #16
orr r5, r5, r5, lsl #16
/* Split r3 = (A)(B) into r4 = (B)(B) and r5 = (A)(A) using three EORs. */
eor r5, r3, r3, lsr #16 /* r5 = (A)(A^B) */
eor r4, r3, r5, lsl #16 /* r4 = (A^A^B)(B) = (B)(B) */
eor r5, r5, r4, lsr #16 /* r5 = (A)(B^A^B) = (A)(A) */
stmia r0, {r4, r5} /* r4 = (B)(B) to [r0], r5 = (A)(A) to [r0, #4] */
subs r2, r2, #0x00000002
beq outswm8_l1
outswm8_l2:
@ -474,14 +471,11 @@ outswm8_l2:
ldrb r3, [r1], #0x0001
ldrb r4, [r1], #0x0001
subs r2, r2, #0x00000001 /* Done test in load delay slot */
/* XXX This test isn't used? */
orr r3, r3, r4, lsl #8
orr r3, r3, r3, lsl #16
str r3, [r0]
subs r2, r2, #0x00000001
outswm8_l1:
ldmfd sp!, {r4-r10}
mov pc, lr
ldmfd sp!, {r4-r8,pc} /* And go home */