Xscale-optimised mem* functions, contributed by Wasabi Systems.

(Note: memcmp/memset improvements also benefit non-Xscale).

memcmp()  - Compare 32-bits at a time if possible. Special-case 6-byte
            comparisons, for the benefit of the network stack.

memset()  - More loop unrolling, plus use of 'strd' instruction,
            results in > 100% speedup on Xscale.

memcpy()  - Big-endian support, unrolled loops, 'strd/ldrd/pld', plus
            special-cases for very common length/alignment combinations
            (at least in the kernel). Benchmarks show ~50% improvment on
            Xscale.

memmove() - Big-endian support. Use fast memcpy(), above, if the regions
            don't overlap. Otherwise unchanged.
This commit is contained in:
scw 2003-10-13 19:59:24 +00:00
parent 9d9b366ce0
commit 5e7e19ec12
6 changed files with 2993 additions and 662 deletions

View File

@ -1,5 +1,39 @@
/* $NetBSD: memcmp.S,v 1.2 2003/04/05 23:27:15 bjh21 Exp $ */
/* $NetBSD: memcmp.S,v 1.3 2003/10/13 19:59:24 scw Exp $ */
/*
* Copyright 2003 Wasabi Systems, Inc.
* All rights reserved.
*
* Written by Steve C. Woodford for Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed for the NetBSD Project by
* Wasabi Systems, Inc.
* 4. The name of Wasabi Systems, Inc. may not be used to endorse
* or promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 2002 ARM Ltd
* All rights reserved.
@ -30,21 +64,117 @@
#include <machine/asm.h>
RCSID("$NetBSD: memcmp.S,v 1.2 2003/04/05 23:27:15 bjh21 Exp $")
RCSID("$NetBSD: memcmp.S,v 1.3 2003/10/13 19:59:24 scw Exp $")
ENTRY(memcmp)
/* if ((len - 1) < 0) return 0 */
subs r2, r2, #1
movmi r0, #0
movmi pc, lr
mov ip, r0
#if defined(_KERNEL) && !defined(_STANDALONE)
cmp r2, #0x06
beq .Lmemcmp_6bytes
#endif
mov r0, #0x00
/* ip == last src address to compare */
add ip, r0, r2
1:
ldrb r2, [r0], #1
ldrb r3, [r1], #1
cmp ip, r0
cmpcs r2, r3
beq 1b
sub r0, r2, r3
/* Are both addresses aligned the same way? */
cmp r2, #0x00
eornes r3, ip, r1
moveq pc, lr /* len == 0, or same addresses! */
tst r3, #0x03
subne r2, r2, #0x01
bne .Lmemcmp_bytewise2 /* Badly aligned. Do it the slow way */
/* Word-align the addresses, if necessary */
and r3, r1, #0x03
rsbs r3, r3, #0x03
add r3, r3, r3, lsl #1
addne pc, pc, r3, lsl #3
nop
/* Compare up to 3 bytes */
ldrb r0, [ip], #0x01
ldrb r3, [r1], #0x01
subs r0, r0, r3
movne pc, lr
subs r2, r2, #0x01
moveq pc, lr
/* Compare up to 2 bytes */
ldrb r0, [ip], #0x01
ldrb r3, [r1], #0x01
subs r0, r0, r3
movne pc, lr
subs r2, r2, #0x01
moveq pc, lr
/* Compare 1 byte */
ldrb r0, [ip], #0x01
ldrb r3, [r1], #0x01
subs r0, r0, r3
movne pc, lr
subs r2, r2, #0x01
moveq pc, lr
/* Compare 4 bytes at a time, if possible */
subs r2, r2, #0x04
bcc .Lmemcmp_bytewise
.Lmemcmp_word_aligned:
ldr r0, [ip], #0x04
ldr r3, [r1], #0x04
subs r2, r2, #0x04
cmpcs r0, r3
beq .Lmemcmp_word_aligned
sub r0, r0, r3
/* Correct for extra subtraction, and check if done */
adds r2, r2, #0x04
cmpeq r0, #0x00 /* If done, did all bytes match? */
moveq pc, lr /* Yup. Just return */
/* Re-do the final word byte-wise */
sub ip, ip, #0x04
sub r1, r1, #0x04
.Lmemcmp_bytewise:
add r2, r2, #0x03
.Lmemcmp_bytewise2:
ldrb r0, [ip], #0x01
ldrb r3, [r1], #0x01
subs r2, r2, #0x01
cmpcs r0, r3
beq .Lmemcmp_bytewise2
sub r0, r0, r3
mov pc, lr
#if defined(_KERNEL) && !defined(_STANDALONE)
/*
* 6 byte compares are very common, thanks to the network stack.
* This code is hand-scheduled to reduce the number of stalls for
* load results. Everything else being equal, this will be ~32%
* faster than a byte-wise memcmp.
*/
.align 5
.Lmemcmp_6bytes:
ldrb r3, [r1, #0x00] /* r3 = b2#0 */
ldrb r0, [ip, #0x00] /* r0 = b1#0 */
ldrb r2, [r1, #0x01] /* r2 = b2#1 */
subs r0, r0, r3 /* r0 = b1#0 - b2#0 */
ldreqb r3, [ip, #0x01] /* r3 = b1#1 */
movne pc, lr /* Return if mismatch on #0 */
subs r0, r3, r2 /* r0 = b1#1 - b2#1 */
ldreqb r3, [r1, #0x02] /* r3 = b2#2 */
ldreqb r0, [ip, #0x02] /* r0 = b1#2 */
movne pc, lr /* Return if mismatch on #1 */
ldrb r2, [r1, #0x03] /* r2 = b2#3 */
subs r0, r0, r3 /* r0 = b1#2 - b2#2 */
ldreqb r3, [ip, #0x03] /* r3 = b1#3 */
movne pc, lr /* Return if mismatch on #2 */
subs r0, r3, r2 /* r0 = b1#3 - b2#3 */
ldreqb r3, [r1, #0x04] /* r3 = b2#4 */
ldreqb r0, [ip, #0x04] /* r0 = b1#4 */
movne pc, lr /* Return if mismatch on #3 */
ldrb r2, [r1, #0x05] /* r2 = b2#5 */
subs r0, r0, r3 /* r0 = b1#4 - b2#4 */
ldreqb r3, [ip, #0x05] /* r3 = b1#5 */
movne pc, lr /* Return if mismatch on #4 */
sub r0, r3, r2 /* r0 = b1#5 - b2#5 */
mov pc, lr
#endif

View File

@ -1,585 +1,7 @@
/* $NetBSD: memcpy.S,v 1.5 2003/10/09 08:54:54 ichiro Exp $ */
/* $NetBSD: memcpy.S,v 1.6 2003/10/13 19:59:24 scw Exp $ */
/*-
* Copyright (c) 1997 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Neil A. Carson and Mark Brinicombe
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the NetBSD
* Foundation, Inc. and its contributors.
* 4. Neither the name of The NetBSD Foundation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <machine/asm.h>
/*
* This is one fun bit of code ...
* Some easy listening music is suggested while trying to understand this
* code e.g. Iron Maiden
*
* For anyone attempting to understand it :
*
* The core code is implemented here with simple stubs for memcpy()
* memmove() and bcopy().
*
* All local labels are prefixed with Lmemcpy_
* Following the prefix a label starting f is used in the forward copy code
* while a label using b is used in the backwards copy code
* The source and destination addresses determine whether a forward or
* backward copy is performed.
* Separate bits of code are used to deal with the following situations
* for both the forward and backwards copy.
* unaligned source address
* unaligned destination address
* Separate copy routines are used to produce an optimised result for each
* of these cases.
* The copy code will use LDM/STM instructions to copy up to 32 bytes at
* a time where possible.
*
* Note: r12 (aka ip) can be trashed during the function along with
* r0-r3 although r0-r2 have defined uses i.e. src, dest, len through out.
* Additional registers are preserved prior to use i.e. r4, r5 & lr
*
* Apologies for the state of the comments ;-)
*/
ENTRY(memcpy)
ENTRY_NP(memmove)
/* Determine copy direction */
cmp r1, r0
moveq r0, #0 /* Quick abort for len=0 */
moveq pc, lr
/* save leaf functions having to store this away */
stmdb sp!, {r0, lr} /* memcpy() returns dest addr */
bcc .Lmemcpy_backwards
/* start of forwards copy */
subs r2, r2, #4
blt .Lmemcpy_fl4 /* less than 4 bytes */
ands r12, r0, #3
bne .Lmemcpy_fdestul /* oh unaligned destination addr */
ands r12, r1, #3
bne .Lmemcpy_fsrcul /* oh unaligned source addr */
.Lmemcpy_ft8:
/* We have aligned source and destination */
subs r2, r2, #8
blt .Lmemcpy_fl12 /* less than 12 bytes (4 from above) */
subs r2, r2, #0x14
blt .Lmemcpy_fl32 /* less than 32 bytes (12 from above) */
stmdb sp!, {r4} /* borrow r4 */
/* blat 32 bytes at a time */
/* XXX for really big copies perhaps we should use more registers */
.Lmemcpy_floop32:
ldmia r1!, {r3, r4, r12, lr}
stmia r0!, {r3, r4, r12, lr}
ldmia r1!, {r3, r4, r12, lr}
stmia r0!, {r3, r4, r12, lr}
subs r2, r2, #0x20
bge .Lmemcpy_floop32
cmn r2, #0x10
ldmgeia r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
stmgeia r0!, {r3, r4, r12, lr}
subge r2, r2, #0x10
ldmia sp!, {r4} /* return r4 */
.Lmemcpy_fl32:
adds r2, r2, #0x14
/* blat 12 bytes at a time */
.Lmemcpy_floop12:
ldmgeia r1!, {r3, r12, lr}
stmgeia r0!, {r3, r12, lr}
subges r2, r2, #0x0c
bge .Lmemcpy_floop12
.Lmemcpy_fl12:
adds r2, r2, #8
blt .Lmemcpy_fl4
subs r2, r2, #4
ldrlt r3, [r1], #4
strlt r3, [r0], #4
ldmgeia r1!, {r3, r12}
stmgeia r0!, {r3, r12}
subge r2, r2, #4
.Lmemcpy_fl4:
/* less than 4 bytes to go */
adds r2, r2, #4
#ifdef __APCS_26_
ldmeqia sp!, {r0, pc}^ /* done */
#if !defined(__XSCALE__) || defined(_STANDALONE)
#include "memcpy_arm.S"
#else
ldmeqia sp!, {r0, pc} /* done */
#include "memcpy_xscale.S"
#endif
/* copy the crud byte at a time */
cmp r2, #2
ldrb r3, [r1], #1
strb r3, [r0], #1
ldrgeb r3, [r1], #1
strgeb r3, [r0], #1
ldrgtb r3, [r1], #1
strgtb r3, [r0], #1
ldmia sp!, {r0, pc}
/* erg - unaligned destination */
.Lmemcpy_fdestul:
rsb r12, r12, #4
cmp r12, #2
/* align destination with byte copies */
ldrb r3, [r1], #1
strb r3, [r0], #1
ldrgeb r3, [r1], #1
strgeb r3, [r0], #1
ldrgtb r3, [r1], #1
strgtb r3, [r0], #1
subs r2, r2, r12
blt .Lmemcpy_fl4 /* less the 4 bytes */
ands r12, r1, #3
beq .Lmemcpy_ft8 /* we have an aligned source */
/* erg - unaligned source */
/* This is where it gets nasty ... */
.Lmemcpy_fsrcul:
bic r1, r1, #3
ldr lr, [r1], #4
cmp r12, #2
bgt .Lmemcpy_fsrcul3
beq .Lmemcpy_fsrcul2
cmp r2, #0x0c
blt .Lmemcpy_fsrcul1loop4
sub r2, r2, #0x0c
stmdb sp!, {r4, r5}
.Lmemcpy_fsrcul1loop16:
#ifdef __ARMEB__
mov r3, lr, lsl #8
ldmia r1!, {r4, r5, r12, lr}
orr r3, r3, r4, lsr #24
mov r4, r4, lsl #8
orr r4, r4, r5, lsr #24
mov r5, r5, lsl #8
orr r5, r5, r12, lsr #24
mov r12, r12, lsl #8
orr r12, r12, lr, lsr #24
#else
mov r3, lr, lsr #8
ldmia r1!, {r4, r5, r12, lr}
orr r3, r3, r4, lsl #24
mov r4, r4, lsr #8
orr r4, r4, r5, lsl #24
mov r5, r5, lsr #8
orr r5, r5, r12, lsl #24
mov r12, r12, lsr #8
orr r12, r12, lr, lsl #24
#endif
stmia r0!, {r3-r5, r12}
subs r2, r2, #0x10
bge .Lmemcpy_fsrcul1loop16
ldmia sp!, {r4, r5}
adds r2, r2, #0x0c
blt .Lmemcpy_fsrcul1l4
.Lmemcpy_fsrcul1loop4:
#ifdef __ARMEB__
mov r12, lr, lsl #8
ldr lr, [r1], #4
orr r12, r12, lr, lsr #24
#else
mov r12, lr, lsr #8
ldr lr, [r1], #4
orr r12, r12, lr, lsl #24
#endif
str r12, [r0], #4
subs r2, r2, #4
bge .Lmemcpy_fsrcul1loop4
.Lmemcpy_fsrcul1l4:
sub r1, r1, #3
b .Lmemcpy_fl4
.Lmemcpy_fsrcul2:
cmp r2, #0x0c
blt .Lmemcpy_fsrcul2loop4
sub r2, r2, #0x0c
stmdb sp!, {r4, r5}
.Lmemcpy_fsrcul2loop16:
#ifdef __ARMEB__
mov r3, lr, lsl #16
ldmia r1!, {r4, r5, r12, lr}
orr r3, r3, r4, lsr #16
mov r4, r4, lsl #16
orr r4, r4, r5, lsr #16
mov r5, r5, lsl #16
orr r5, r5, r12, lsr #16
mov r12, r12, lsl #16
orr r12, r12, lr, lsr #16
#else
mov r3, lr, lsr #16
ldmia r1!, {r4, r5, r12, lr}
orr r3, r3, r4, lsl #16
mov r4, r4, lsr #16
orr r4, r4, r5, lsl #16
mov r5, r5, lsr #16
orr r5, r5, r12, lsl #16
mov r12, r12, lsr #16
orr r12, r12, lr, lsl #16
#endif
stmia r0!, {r3-r5, r12}
subs r2, r2, #0x10
bge .Lmemcpy_fsrcul2loop16
ldmia sp!, {r4, r5}
adds r2, r2, #0x0c
blt .Lmemcpy_fsrcul2l4
.Lmemcpy_fsrcul2loop4:
#ifdef __ARMEB__
mov r12, lr, lsl #16
ldr lr, [r1], #4
orr r12, r12, lr, lsr #16
#else
mov r12, lr, lsr #16
ldr lr, [r1], #4
orr r12, r12, lr, lsl #16
#endif
str r12, [r0], #4
subs r2, r2, #4
bge .Lmemcpy_fsrcul2loop4
.Lmemcpy_fsrcul2l4:
sub r1, r1, #2
b .Lmemcpy_fl4
.Lmemcpy_fsrcul3:
cmp r2, #0x0c
blt .Lmemcpy_fsrcul3loop4
sub r2, r2, #0x0c
stmdb sp!, {r4, r5}
.Lmemcpy_fsrcul3loop16:
#ifdef __ARMEB__
mov r3, lr, lsl #24
ldmia r1!, {r4, r5, r12, lr}
orr r3, r3, r4, lsr #8
mov r4, r4, lsl #24
orr r4, r4, r5, lsr #8
mov r5, r5, lsl #24
orr r5, r5, r12, lsr #8
mov r12, r12, lsl #24
orr r12, r12, lr, lsr #8
#else
mov r3, lr, lsr #24
ldmia r1!, {r4, r5, r12, lr}
orr r3, r3, r4, lsl #8
mov r4, r4, lsr #24
orr r4, r4, r5, lsl #8
mov r5, r5, lsr #24
orr r5, r5, r12, lsl #8
mov r12, r12, lsr #24
orr r12, r12, lr, lsl #8
#endif
stmia r0!, {r3-r5, r12}
subs r2, r2, #0x10
bge .Lmemcpy_fsrcul3loop16
ldmia sp!, {r4, r5}
adds r2, r2, #0x0c
blt .Lmemcpy_fsrcul3l4
.Lmemcpy_fsrcul3loop4:
#ifdef __ARMEB__
mov r12, lr, lsl #24
ldr lr, [r1], #4
orr r12, r12, lr, lsr #8
#else
mov r12, lr, lsr #24
ldr lr, [r1], #4
orr r12, r12, lr, lsl #8
#endif
str r12, [r0], #4
subs r2, r2, #4
bge .Lmemcpy_fsrcul3loop4
.Lmemcpy_fsrcul3l4:
sub r1, r1, #1
b .Lmemcpy_fl4
.Lmemcpy_backwards:
add r1, r1, r2
add r0, r0, r2
subs r2, r2, #4
blt .Lmemcpy_bl4 /* less than 4 bytes */
ands r12, r0, #3
bne .Lmemcpy_bdestul /* oh unaligned destination addr */
ands r12, r1, #3
bne .Lmemcpy_bsrcul /* oh unaligned source addr */
.Lmemcpy_bt8:
/* We have aligned source and destination */
subs r2, r2, #8
blt .Lmemcpy_bl12 /* less than 12 bytes (4 from above) */
stmdb sp!, {r4}
subs r2, r2, #0x14 /* less than 32 bytes (12 from above) */
blt .Lmemcpy_bl32
/* blat 32 bytes at a time */
/* XXX for really big copies perhaps we should use more registers */
.Lmemcpy_bloop32:
ldmdb r1!, {r3, r4, r12, lr}
stmdb r0!, {r3, r4, r12, lr}
ldmdb r1!, {r3, r4, r12, lr}
stmdb r0!, {r3, r4, r12, lr}
subs r2, r2, #0x20
bge .Lmemcpy_bloop32
.Lmemcpy_bl32:
cmn r2, #0x10
ldmgedb r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
stmgedb r0!, {r3, r4, r12, lr}
subge r2, r2, #0x10
adds r2, r2, #0x14
ldmgedb r1!, {r3, r12, lr} /* blat a remaining 12 bytes */
stmgedb r0!, {r3, r12, lr}
subge r2, r2, #0x0c
ldmia sp!, {r4}
.Lmemcpy_bl12:
adds r2, r2, #8
blt .Lmemcpy_bl4
subs r2, r2, #4
ldrlt r3, [r1, #-4]!
strlt r3, [r0, #-4]!
ldmgedb r1!, {r3, r12}
stmgedb r0!, {r3, r12}
subge r2, r2, #4
.Lmemcpy_bl4:
/* less than 4 bytes to go */
adds r2, r2, #4
ldmeqia sp!, {r0, pc}
/* copy the crud byte at a time */
cmp r2, #2
ldrb r3, [r1, #-1]!
strb r3, [r0, #-1]!
ldrgeb r3, [r1, #-1]!
strgeb r3, [r0, #-1]!
ldrgtb r3, [r1, #-1]!
strgtb r3, [r0, #-1]!
ldmia sp!, {r0, pc}
/* erg - unaligned destination */
.Lmemcpy_bdestul:
cmp r12, #2
/* align destination with byte copies */
ldrb r3, [r1, #-1]!
strb r3, [r0, #-1]!
ldrgeb r3, [r1, #-1]!
strgeb r3, [r0, #-1]!
ldrgtb r3, [r1, #-1]!
strgtb r3, [r0, #-1]!
subs r2, r2, r12
blt .Lmemcpy_bl4 /* less than 4 bytes to go */
ands r12, r1, #3
beq .Lmemcpy_bt8 /* we have an aligned source */
/* erg - unaligned source */
/* This is where it gets nasty ... */
.Lmemcpy_bsrcul:
bic r1, r1, #3
ldr r3, [r1, #0]
cmp r12, #2
blt .Lmemcpy_bsrcul1
beq .Lmemcpy_bsrcul2
cmp r2, #0x0c
blt .Lmemcpy_bsrcul3loop4
sub r2, r2, #0x0c
stmdb sp!, {r4, r5}
.Lmemcpy_bsrcul3loop16:
#ifdef __ARMEB__
mov lr, r3, lsr #8
ldmdb r1!, {r3-r5, r12}
orr lr, lr, r12, lsl #24
mov r12, r12, lsr #8
orr r12, r12, r5, lsl #24
mov r5, r5, lsr #8
orr r5, r5, r4, lsl #24
mov r4, r4, lsr #8
orr r4, r4, r3, lsl #24
#else
mov lr, r3, lsl #8
ldmdb r1!, {r3-r5, r12}
orr lr, lr, r12, lsr #24
mov r12, r12, lsl #8
orr r12, r12, r5, lsr #24
mov r5, r5, lsl #8
orr r5, r5, r4, lsr #24
mov r4, r4, lsl #8
orr r4, r4, r3, lsr #24
#endif
stmdb r0!, {r4, r5, r12, lr}
subs r2, r2, #0x10
bge .Lmemcpy_bsrcul3loop16
ldmia sp!, {r4, r5}
adds r2, r2, #0x0c
blt .Lmemcpy_bsrcul3l4
.Lmemcpy_bsrcul3loop4:
#ifdef __ARMEB__
mov r12, r3, lsr #8
ldr r3, [r1, #-4]!
orr r12, r12, r3, lsl #24
#else
mov r12, r3, lsl #8
ldr r3, [r1, #-4]!
orr r12, r12, r3, lsr #24
#endif
str r12, [r0, #-4]!
subs r2, r2, #4
bge .Lmemcpy_bsrcul3loop4
.Lmemcpy_bsrcul3l4:
add r1, r1, #3
b .Lmemcpy_bl4
.Lmemcpy_bsrcul2:
cmp r2, #0x0c
blt .Lmemcpy_bsrcul2loop4
sub r2, r2, #0x0c
stmdb sp!, {r4, r5}
.Lmemcpy_bsrcul2loop16:
#ifdef __ARMEB__
mov lr, r3, lsr #16
ldmdb r1!, {r3-r5, r12}
orr lr, lr, r12, lsl #16
mov r12, r12, lsr #16
orr r12, r12, r5, lsl #16
mov r5, r5, lsr #16
orr r5, r5, r4, lsl #16
mov r4, r4, lsr #16
orr r4, r4, r3, lsl #16
#else
mov lr, r3, lsl #16
ldmdb r1!, {r3-r5, r12}
orr lr, lr, r12, lsr #16
mov r12, r12, lsl #16
orr r12, r12, r5, lsr #16
mov r5, r5, lsl #16
orr r5, r5, r4, lsr #16
mov r4, r4, lsl #16
orr r4, r4, r3, lsr #16
#endif
stmdb r0!, {r4, r5, r12, lr}
subs r2, r2, #0x10
bge .Lmemcpy_bsrcul2loop16
ldmia sp!, {r4, r5}
adds r2, r2, #0x0c
blt .Lmemcpy_bsrcul2l4
.Lmemcpy_bsrcul2loop4:
#ifdef __ARMEB__
mov r12, r3, lsr #16
ldr r3, [r1, #-4]!
orr r12, r12, r3, lsl #16
#else
mov r12, r3, lsl #16
ldr r3, [r1, #-4]!
orr r12, r12, r3, lsr #16
#endif
str r12, [r0, #-4]!
subs r2, r2, #4
bge .Lmemcpy_bsrcul2loop4
.Lmemcpy_bsrcul2l4:
add r1, r1, #2
b .Lmemcpy_bl4
.Lmemcpy_bsrcul1:
cmp r2, #0x0c
blt .Lmemcpy_bsrcul1loop4
sub r2, r2, #0x0c
stmdb sp!, {r4, r5}
.Lmemcpy_bsrcul1loop32:
#ifdef __ARMEB__
mov lr, r3, lsr #24
ldmdb r1!, {r3-r5, r12}
orr lr, lr, r12, lsl #8
mov r12, r12, lsr #24
orr r12, r12, r5, lsl #8
mov r5, r5, lsr #24
orr r5, r5, r4, lsl #8
mov r4, r4, lsr #24
orr r4, r4, r3, lsl #8
#else
mov lr, r3, lsl #24
ldmdb r1!, {r3-r5, r12}
orr lr, lr, r12, lsr #8
mov r12, r12, lsl #24
orr r12, r12, r5, lsr #8
mov r5, r5, lsl #24
orr r5, r5, r4, lsr #8
mov r4, r4, lsl #24
orr r4, r4, r3, lsr #8
#endif
stmdb r0!, {r4, r5, r12, lr}
subs r2, r2, #0x10
bge .Lmemcpy_bsrcul1loop32
ldmia sp!, {r4, r5}
adds r2, r2, #0x0c
blt .Lmemcpy_bsrcul1l4
.Lmemcpy_bsrcul1loop4:
#ifdef __ARMEB__
mov r12, r3, lsr #24
ldr r3, [r1, #-4]!
orr r12, r12, r3, lsl #8
#else
mov r12, r3, lsl #24
ldr r3, [r1, #-4]!
orr r12, r12, r3, lsr #8
#endif
str r12, [r0, #-4]!
subs r2, r2, #4
bge .Lmemcpy_bsrcul1loop4
.Lmemcpy_bsrcul1l4:
add r1, r1, #1
b .Lmemcpy_bl4

View File

@ -0,0 +1,273 @@
/* $NetBSD: memcpy_arm.S,v 1.1 2003/10/13 19:59:24 scw Exp $ */
/*-
* Copyright (c) 1997 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Neil A. Carson and Mark Brinicombe
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the NetBSD
* Foundation, Inc. and its contributors.
* 4. Neither the name of The NetBSD Foundation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <machine/asm.h>
/*
* This is one fun bit of code ...
* Some easy listening music is suggested while trying to understand this
* code e.g. Iron Maiden
*
* For anyone attempting to understand it :
*
* The core code is implemented here with simple stubs for memcpy().
*
* All local labels are prefixed with Lmemcpy_
* Following the prefix a label starting f is used in the forward copy code
* while a label using b is used in the backwards copy code
* The source and destination addresses determine whether a forward or
* backward copy is performed.
* Separate bits of code are used to deal with the following situations
* for both the forward and backwards copy.
* unaligned source address
* unaligned destination address
* Separate copy routines are used to produce an optimised result for each
* of these cases.
* The copy code will use LDM/STM instructions to copy up to 32 bytes at
* a time where possible.
*
* Note: r12 (aka ip) can be trashed during the function along with
* r0-r3 although r0-r2 have defined uses i.e. src, dest, len through out.
* Additional registers are preserved prior to use i.e. r4, r5 & lr
*
* Apologies for the state of the comments ;-)
*/
/* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
ENTRY(memcpy)
/* save leaf functions having to store this away */
stmdb sp!, {r0, lr} /* memcpy() returns dest addr */
subs r2, r2, #4
blt .Lmemcpy_l4 /* less than 4 bytes */
ands r12, r0, #3
bne .Lmemcpy_destul /* oh unaligned destination addr */
ands r12, r1, #3
bne .Lmemcpy_srcul /* oh unaligned source addr */
.Lmemcpy_t8:
/* We have aligned source and destination */
subs r2, r2, #8
blt .Lmemcpy_l12 /* less than 12 bytes (4 from above) */
subs r2, r2, #0x14
blt .Lmemcpy_l32 /* less than 32 bytes (12 from above) */
stmdb sp!, {r4} /* borrow r4 */
/* blat 32 bytes at a time */
/* XXX for really big copies perhaps we should use more registers */
.Lmemcpy_loop32:
ldmia r1!, {r3, r4, r12, lr}
stmia r0!, {r3, r4, r12, lr}
ldmia r1!, {r3, r4, r12, lr}
stmia r0!, {r3, r4, r12, lr}
subs r2, r2, #0x20
bge .Lmemcpy_loop32
cmn r2, #0x10
ldmgeia r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
stmgeia r0!, {r3, r4, r12, lr}
subge r2, r2, #0x10
ldmia sp!, {r4} /* return r4 */
.Lmemcpy_l32:
adds r2, r2, #0x14
/* blat 12 bytes at a time */
.Lmemcpy_loop12:
ldmgeia r1!, {r3, r12, lr}
stmgeia r0!, {r3, r12, lr}
subges r2, r2, #0x0c
bge .Lmemcpy_loop12
.Lmemcpy_l12:
adds r2, r2, #8
blt .Lmemcpy_l4
subs r2, r2, #4
ldrlt r3, [r1], #4
strlt r3, [r0], #4
ldmgeia r1!, {r3, r12}
stmgeia r0!, {r3, r12}
subge r2, r2, #4
.Lmemcpy_l4:
/* less than 4 bytes to go */
adds r2, r2, #4
#ifdef __APCS_26_
ldmeqia sp!, {r0, pc}^ /* done */
#else
ldmeqia sp!, {r0, pc} /* done */
#endif
/* copy the crud byte at a time */
cmp r2, #2
ldrb r3, [r1], #1
strb r3, [r0], #1
ldrgeb r3, [r1], #1
strgeb r3, [r0], #1
ldrgtb r3, [r1], #1
strgtb r3, [r0], #1
ldmia sp!, {r0, pc}
/* erg - unaligned destination */
.Lmemcpy_destul:
rsb r12, r12, #4
cmp r12, #2
/* align destination with byte copies */
ldrb r3, [r1], #1
strb r3, [r0], #1
ldrgeb r3, [r1], #1
strgeb r3, [r0], #1
ldrgtb r3, [r1], #1
strgtb r3, [r0], #1
subs r2, r2, r12
blt .Lmemcpy_l4 /* less the 4 bytes */
ands r12, r1, #3
beq .Lmemcpy_t8 /* we have an aligned source */
/* erg - unaligned source */
/* This is where it gets nasty ... */
.Lmemcpy_srcul:
bic r1, r1, #3
ldr lr, [r1], #4
cmp r12, #2
bgt .Lmemcpy_srcul3
beq .Lmemcpy_srcul2
cmp r2, #0x0c
blt .Lmemcpy_srcul1loop4
sub r2, r2, #0x0c
stmdb sp!, {r4, r5}
.Lmemcpy_srcul1loop16:
mov r3, lr, lsr #8
ldmia r1!, {r4, r5, r12, lr}
orr r3, r3, r4, lsl #24
mov r4, r4, lsr #8
orr r4, r4, r5, lsl #24
mov r5, r5, lsr #8
orr r5, r5, r12, lsl #24
mov r12, r12, lsr #8
orr r12, r12, lr, lsl #24
stmia r0!, {r3-r5, r12}
subs r2, r2, #0x10
bge .Lmemcpy_srcul1loop16
ldmia sp!, {r4, r5}
adds r2, r2, #0x0c
blt .Lmemcpy_srcul1l4
.Lmemcpy_srcul1loop4:
mov r12, lr, lsr #8
ldr lr, [r1], #4
orr r12, r12, lr, lsl #24
str r12, [r0], #4
subs r2, r2, #4
bge .Lmemcpy_srcul1loop4
.Lmemcpy_srcul1l4:
sub r1, r1, #3
b .Lmemcpy_l4
.Lmemcpy_srcul2:
cmp r2, #0x0c
blt .Lmemcpy_srcul2loop4
sub r2, r2, #0x0c
stmdb sp!, {r4, r5}
.Lmemcpy_srcul2loop16:
mov r3, lr, lsr #16
ldmia r1!, {r4, r5, r12, lr}
orr r3, r3, r4, lsl #16
mov r4, r4, lsr #16
orr r4, r4, r5, lsl #16
mov r5, r5, lsr #16
orr r5, r5, r12, lsl #16
mov r12, r12, lsr #16
orr r12, r12, lr, lsl #16
stmia r0!, {r3-r5, r12}
subs r2, r2, #0x10
bge .Lmemcpy_srcul2loop16
ldmia sp!, {r4, r5}
adds r2, r2, #0x0c
blt .Lmemcpy_srcul2l4
.Lmemcpy_srcul2loop4:
mov r12, lr, lsr #16
ldr lr, [r1], #4
orr r12, r12, lr, lsl #16
str r12, [r0], #4
subs r2, r2, #4
bge .Lmemcpy_srcul2loop4
.Lmemcpy_srcul2l4:
sub r1, r1, #2
b .Lmemcpy_l4
.Lmemcpy_srcul3:
cmp r2, #0x0c
blt .Lmemcpy_srcul3loop4
sub r2, r2, #0x0c
stmdb sp!, {r4, r5}
.Lmemcpy_srcul3loop16:
mov r3, lr, lsr #24
ldmia r1!, {r4, r5, r12, lr}
orr r3, r3, r4, lsl #8
mov r4, r4, lsr #24
orr r4, r4, r5, lsl #8
mov r5, r5, lsr #24
orr r5, r5, r12, lsl #8
mov r12, r12, lsr #24
orr r12, r12, lr, lsl #8
stmia r0!, {r3-r5, r12}
subs r2, r2, #0x10
bge .Lmemcpy_srcul3loop16
ldmia sp!, {r4, r5}
adds r2, r2, #0x0c
blt .Lmemcpy_srcul3l4
.Lmemcpy_srcul3loop4:
mov r12, lr, lsr #24
ldr lr, [r1], #4
orr r12, r12, lr, lsl #8
str r12, [r0], #4
subs r2, r2, #4
bge .Lmemcpy_srcul3loop4
.Lmemcpy_srcul3l4:
sub r1, r1, #1
b .Lmemcpy_l4

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,620 @@
/* $NetBSD: memmove.S,v 1.2 2001/11/20 00:29:20 chris Exp $ */
/* $NetBSD: memmove.S,v 1.3 2003/10/13 19:59:24 scw Exp $ */
/*-
* Copyright (c) 1997 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Neil A. Carson and Mark Brinicombe
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the NetBSD
* Foundation, Inc. and its contributors.
* 4. Neither the name of The NetBSD Foundation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <machine/asm.h>
/*
* placeholder to keep the make system happy, memove is actually in memcpy.S
* This is one fun bit of code ...
* Some easy listening music is suggested while trying to understand this
* code e.g. Iron Maiden
*
* For anyone attempting to understand it :
*
* The core code is implemented here with simple stubs for memmove()
*
* All local labels are prefixed with Lmemmove_
* Following the prefix a label starting f is used in the forward copy code
* while a label using b is used in the backwards copy code
* The source and destination addresses determine whether a forward or
* backward copy is performed.
* Separate bits of code are used to deal with the following situations
* for both the forward and backwards copy.
* unaligned source address
* unaligned destination address
* Separate copy routines are used to produce an optimised result for each
* of these cases.
* The copy code will use LDM/STM instructions to copy up to 32 bytes at
* a time where possible.
*
* Note: r12 (aka ip) can be trashed during the function along with
* r0-r3 although r0-r2 have defined uses i.e. src, dest, len through out.
* Additional registers are preserved prior to use i.e. r4, r5 & lr
*
* Apologies for the state of the comments ;-)
*/
ENTRY(memmove)
#ifdef __XSCALE__
/*
* The XSCALE version of memcpy() is *way* faster than
* this code, so see if we can use it.
*/
/* Do the buffers overlap? */
cmp r0, r1
moveq pc, lr /* Bail now if src/dst are the same */
subcc r3, r0, r1 /* if (dst > src) r3 = dst - src */
subcs r3, r1, r0 /* if (src > dsr) r3 = src - dst */
cmp r3, r2 /* if (r3 < len) we have an overlap */
bcc _C_LABEL(memcpy)
#endif
/* Determine copy direction */
cmp r1, r0
moveq r0, #0 /* Quick abort for len=0 */
moveq pc, lr
/* save leaf functions having to store this away */
stmdb sp!, {r0, lr} /* memmove() returns dest addr */
bcc .Lmemmove_backwards
/* start of forwards copy */
subs r2, r2, #4
blt .Lmemmove_fl4 /* less than 4 bytes */
ands r12, r0, #3
bne .Lmemmove_fdestul /* oh unaligned destination addr */
ands r12, r1, #3
bne .Lmemmove_fsrcul /* oh unaligned source addr */
.Lmemmove_ft8:
/* We have aligned source and destination */
subs r2, r2, #8
blt .Lmemmove_fl12 /* less than 12 bytes (4 from above) */
subs r2, r2, #0x14
blt .Lmemmove_fl32 /* less than 32 bytes (12 from above) */
stmdb sp!, {r4} /* borrow r4 */
/* blat 32 bytes at a time */
/* XXX for really big copies perhaps we should use more registers */
.Lmemmove_floop32:
ldmia r1!, {r3, r4, r12, lr}
stmia r0!, {r3, r4, r12, lr}
ldmia r1!, {r3, r4, r12, lr}
stmia r0!, {r3, r4, r12, lr}
subs r2, r2, #0x20
bge .Lmemmove_floop32
cmn r2, #0x10
ldmgeia r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
stmgeia r0!, {r3, r4, r12, lr}
subge r2, r2, #0x10
ldmia sp!, {r4} /* return r4 */
.Lmemmove_fl32:
adds r2, r2, #0x14
/* blat 12 bytes at a time */
.Lmemmove_floop12:
ldmgeia r1!, {r3, r12, lr}
stmgeia r0!, {r3, r12, lr}
subges r2, r2, #0x0c
bge .Lmemmove_floop12
.Lmemmove_fl12:
adds r2, r2, #8
blt .Lmemmove_fl4
subs r2, r2, #4
ldrlt r3, [r1], #4
strlt r3, [r0], #4
ldmgeia r1!, {r3, r12}
stmgeia r0!, {r3, r12}
subge r2, r2, #4
.Lmemmove_fl4:
/* less than 4 bytes to go */
adds r2, r2, #4
#ifdef __APCS_26_
ldmeqia sp!, {r0, pc}^ /* done */
#else
ldmeqia sp!, {r0, pc} /* done */
#endif
/* copy the crud byte at a time */
cmp r2, #2
ldrb r3, [r1], #1
strb r3, [r0], #1
ldrgeb r3, [r1], #1
strgeb r3, [r0], #1
ldrgtb r3, [r1], #1
strgtb r3, [r0], #1
ldmia sp!, {r0, pc}
/* erg - unaligned destination */
.Lmemmove_fdestul:
rsb r12, r12, #4
cmp r12, #2
/* align destination with byte copies */
ldrb r3, [r1], #1
strb r3, [r0], #1
ldrgeb r3, [r1], #1
strgeb r3, [r0], #1
ldrgtb r3, [r1], #1
strgtb r3, [r0], #1
subs r2, r2, r12
blt .Lmemmove_fl4 /* less the 4 bytes */
ands r12, r1, #3
beq .Lmemmove_ft8 /* we have an aligned source */
/* erg - unaligned source */
/* This is where it gets nasty ... */
.Lmemmove_fsrcul:
bic r1, r1, #3
ldr lr, [r1], #4
cmp r12, #2
bgt .Lmemmove_fsrcul3
beq .Lmemmove_fsrcul2
cmp r2, #0x0c
blt .Lmemmove_fsrcul1loop4
sub r2, r2, #0x0c
stmdb sp!, {r4, r5}
.Lmemmove_fsrcul1loop16:
#ifdef __ARMEB__
mov r3, lr, lsl #8
#else
mov r3, lr, lsr #8
#endif
ldmia r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
orr r3, r3, r4, lsl #24
mov r4, r4, lsr #8
orr r4, r4, r5, lsl #24
mov r5, r5, lsr #8
orr r5, r5, r12, lsl #24
mov r12, r12, lsr #8
orr r12, r12, lr, lsl #24
#else
orr r3, r3, r4, lsr #24
mov r4, r4, lsl #8
orr r4, r4, r5, lsr #24
mov r5, r5, lsl #8
orr r5, r5, r12, lsr #24
mov r12, r12, lsl #8
orr r12, r12, lr, lsr #24
#endif
stmia r0!, {r3-r5, r12}
subs r2, r2, #0x10
bge .Lmemmove_fsrcul1loop16
ldmia sp!, {r4, r5}
adds r2, r2, #0x0c
blt .Lmemmove_fsrcul1l4
.Lmemmove_fsrcul1loop4:
#ifdef __ARMEB__
mov r12, lr, lsl #8
#else
mov r12, lr, lsr #8
#endif
ldr lr, [r1], #4
#ifdef __ARMEB__
orr r12, r12, lr, lsr #24
#else
orr r12, r12, lr, lsl #24
#endif
str r12, [r0], #4
subs r2, r2, #4
bge .Lmemmove_fsrcul1loop4
.Lmemmove_fsrcul1l4:
sub r1, r1, #3
b .Lmemmove_fl4
.Lmemmove_fsrcul2:
cmp r2, #0x0c
blt .Lmemmove_fsrcul2loop4
sub r2, r2, #0x0c
stmdb sp!, {r4, r5}
.Lmemmove_fsrcul2loop16:
#ifdef __ARMEB__
mov r3, lr, lsl #16
#else
mov r3, lr, lsr #16
#endif
ldmia r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
orr r3, r3, r4, lsr #16
mov r4, r4, lsl #16
orr r4, r4, r5, lsr #16
mov r5, r5, lsl #16
orr r5, r5, r12, lsr #16
mov r12, r12, lsl #16
orr r12, r12, lr, lsr #16
#else
orr r3, r3, r4, lsl #16
mov r4, r4, lsr #16
orr r4, r4, r5, lsl #16
mov r5, r5, lsr #16
orr r5, r5, r12, lsl #16
mov r12, r12, lsr #16
orr r12, r12, lr, lsl #16
#endif
stmia r0!, {r3-r5, r12}
subs r2, r2, #0x10
bge .Lmemmove_fsrcul2loop16
ldmia sp!, {r4, r5}
adds r2, r2, #0x0c
blt .Lmemmove_fsrcul2l4
.Lmemmove_fsrcul2loop4:
#ifdef __ARMEB__
mov r12, lr, lsl #16
#else
mov r12, lr, lsr #16
#endif
ldr lr, [r1], #4
#ifdef __ARMEB__
orr r12, r12, lr, lsr #16
#else
orr r12, r12, lr, lsl #16
#endif
str r12, [r0], #4
subs r2, r2, #4
bge .Lmemmove_fsrcul2loop4
.Lmemmove_fsrcul2l4:
sub r1, r1, #2
b .Lmemmove_fl4
.Lmemmove_fsrcul3:
cmp r2, #0x0c
blt .Lmemmove_fsrcul3loop4
sub r2, r2, #0x0c
stmdb sp!, {r4, r5}
.Lmemmove_fsrcul3loop16:
#ifdef __ARMEB__
mov r3, lr, lsl #24
#else
mov r3, lr, lsr #24
#endif
ldmia r1!, {r4, r5, r12, lr}
#ifdef __ARMEB__
orr r3, r3, r4, lsr #8
mov r4, r4, lsl #24
orr r4, r4, r5, lsr #8
mov r5, r5, lsl #24
orr r5, r5, r12, lsr #8
mov r12, r12, lsl #24
orr r12, r12, lr, lsr #8
#else
orr r3, r3, r4, lsl #8
mov r4, r4, lsr #24
orr r4, r4, r5, lsl #8
mov r5, r5, lsr #24
orr r5, r5, r12, lsl #8
mov r12, r12, lsr #24
orr r12, r12, lr, lsl #8
#endif
stmia r0!, {r3-r5, r12}
subs r2, r2, #0x10
bge .Lmemmove_fsrcul3loop16
ldmia sp!, {r4, r5}
adds r2, r2, #0x0c
blt .Lmemmove_fsrcul3l4
.Lmemmove_fsrcul3loop4:
#ifdef __ARMEB__
mov r12, lr, lsl #24
#else
mov r12, lr, lsr #24
#endif
ldr lr, [r1], #4
#ifdef __ARMEB__
orr r12, r12, lr, lsr #8
#else
orr r12, r12, lr, lsl #8
#endif
str r12, [r0], #4
subs r2, r2, #4
bge .Lmemmove_fsrcul3loop4
.Lmemmove_fsrcul3l4:
sub r1, r1, #1
b .Lmemmove_fl4
.Lmemmove_backwards:
add r1, r1, r2
add r0, r0, r2
subs r2, r2, #4
blt .Lmemmove_bl4 /* less than 4 bytes */
ands r12, r0, #3
bne .Lmemmove_bdestul /* oh unaligned destination addr */
ands r12, r1, #3
bne .Lmemmove_bsrcul /* oh unaligned source addr */
.Lmemmove_bt8:
/* We have aligned source and destination */
subs r2, r2, #8
blt .Lmemmove_bl12 /* less than 12 bytes (4 from above) */
stmdb sp!, {r4}
subs r2, r2, #0x14 /* less than 32 bytes (12 from above) */
blt .Lmemmove_bl32
/* blat 32 bytes at a time */
/* XXX for really big copies perhaps we should use more registers */
.Lmemmove_bloop32:
ldmdb r1!, {r3, r4, r12, lr}
stmdb r0!, {r3, r4, r12, lr}
ldmdb r1!, {r3, r4, r12, lr}
stmdb r0!, {r3, r4, r12, lr}
subs r2, r2, #0x20
bge .Lmemmove_bloop32
.Lmemmove_bl32:
cmn r2, #0x10
ldmgedb r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
stmgedb r0!, {r3, r4, r12, lr}
subge r2, r2, #0x10
adds r2, r2, #0x14
ldmgedb r1!, {r3, r12, lr} /* blat a remaining 12 bytes */
stmgedb r0!, {r3, r12, lr}
subge r2, r2, #0x0c
ldmia sp!, {r4}
.Lmemmove_bl12:
adds r2, r2, #8
blt .Lmemmove_bl4
subs r2, r2, #4
ldrlt r3, [r1, #-4]!
strlt r3, [r0, #-4]!
ldmgedb r1!, {r3, r12}
stmgedb r0!, {r3, r12}
subge r2, r2, #4
.Lmemmove_bl4:
/* less than 4 bytes to go */
adds r2, r2, #4
ldmeqia sp!, {r0, pc}
/* copy the crud byte at a time */
cmp r2, #2
ldrb r3, [r1, #-1]!
strb r3, [r0, #-1]!
ldrgeb r3, [r1, #-1]!
strgeb r3, [r0, #-1]!
ldrgtb r3, [r1, #-1]!
strgtb r3, [r0, #-1]!
ldmia sp!, {r0, pc}
/* erg - unaligned destination */
.Lmemmove_bdestul:
cmp r12, #2
/* align destination with byte copies */
ldrb r3, [r1, #-1]!
strb r3, [r0, #-1]!
ldrgeb r3, [r1, #-1]!
strgeb r3, [r0, #-1]!
ldrgtb r3, [r1, #-1]!
strgtb r3, [r0, #-1]!
subs r2, r2, r12
blt .Lmemmove_bl4 /* less than 4 bytes to go */
ands r12, r1, #3
beq .Lmemmove_bt8 /* we have an aligned source */
/* erg - unaligned source */
/* This is where it gets nasty ... */
.Lmemmove_bsrcul:
bic r1, r1, #3
ldr r3, [r1, #0]
cmp r12, #2
blt .Lmemmove_bsrcul1
beq .Lmemmove_bsrcul2
cmp r2, #0x0c
blt .Lmemmove_bsrcul3loop4
sub r2, r2, #0x0c
stmdb sp!, {r4, r5}
.Lmemmove_bsrcul3loop16:
#ifdef __ARMEB__
mov lr, r3, lsr #8
#else
mov lr, r3, lsl #8
#endif
ldmdb r1!, {r3-r5, r12}
#ifdef __ARMEB__
orr lr, lr, r12, lsl #24
mov r12, r12, lsr #8
orr r12, r12, r5, lsl #24
mov r5, r5, lsr #8
orr r5, r5, r4, lsl #24
mov r4, r4, lsr #8
orr r4, r4, r3, lsl #24
#else
orr lr, lr, r12, lsr #24
mov r12, r12, lsl #8
orr r12, r12, r5, lsr #24
mov r5, r5, lsl #8
orr r5, r5, r4, lsr #24
mov r4, r4, lsl #8
orr r4, r4, r3, lsr #24
#endif
stmdb r0!, {r4, r5, r12, lr}
subs r2, r2, #0x10
bge .Lmemmove_bsrcul3loop16
ldmia sp!, {r4, r5}
adds r2, r2, #0x0c
blt .Lmemmove_bsrcul3l4
.Lmemmove_bsrcul3loop4:
#ifdef __ARMEB__
mov r12, r3, lsr #8
#else
mov r12, r3, lsl #8
#endif
ldr r3, [r1, #-4]!
#ifdef __ARMEB__
orr r12, r12, r3, lsl #24
#else
orr r12, r12, r3, lsr #24
#endif
str r12, [r0, #-4]!
subs r2, r2, #4
bge .Lmemmove_bsrcul3loop4
.Lmemmove_bsrcul3l4:
add r1, r1, #3
b .Lmemmove_bl4
.Lmemmove_bsrcul2:
cmp r2, #0x0c
blt .Lmemmove_bsrcul2loop4
sub r2, r2, #0x0c
stmdb sp!, {r4, r5}
.Lmemmove_bsrcul2loop16:
#ifdef __ARMEB__
mov lr, r3, lsr #16
#else
mov lr, r3, lsl #16
#endif
ldmdb r1!, {r3-r5, r12}
#ifdef __ARMEB__
orr lr, lr, r12, lsl #16
mov r12, r12, lsr #16
orr r12, r12, r5, lsl #16
mov r5, r5, lsr #16
orr r5, r5, r4, lsl #16
mov r4, r4, lsr #16
orr r4, r4, r3, lsl #16
#else
orr lr, lr, r12, lsr #16
mov r12, r12, lsl #16
orr r12, r12, r5, lsr #16
mov r5, r5, lsl #16
orr r5, r5, r4, lsr #16
mov r4, r4, lsl #16
orr r4, r4, r3, lsr #16
#endif
stmdb r0!, {r4, r5, r12, lr}
subs r2, r2, #0x10
bge .Lmemmove_bsrcul2loop16
ldmia sp!, {r4, r5}
adds r2, r2, #0x0c
blt .Lmemmove_bsrcul2l4
.Lmemmove_bsrcul2loop4:
#ifdef __ARMEB__
mov r12, r3, lsr #16
#else
mov r12, r3, lsl #16
#endif
ldr r3, [r1, #-4]!
#ifdef __ARMEB__
orr r12, r12, r3, lsl #16
#else
orr r12, r12, r3, lsr #16
#endif
str r12, [r0, #-4]!
subs r2, r2, #4
bge .Lmemmove_bsrcul2loop4
.Lmemmove_bsrcul2l4:
add r1, r1, #2
b .Lmemmove_bl4
.Lmemmove_bsrcul1:
cmp r2, #0x0c
blt .Lmemmove_bsrcul1loop4
sub r2, r2, #0x0c
stmdb sp!, {r4, r5}
.Lmemmove_bsrcul1loop32:
#ifdef __ARMEB__
mov lr, r3, lsr #24
#else
mov lr, r3, lsl #24
#endif
ldmdb r1!, {r3-r5, r12}
#ifdef __ARMEB__
orr lr, lr, r12, lsl #8
mov r12, r12, lsr #24
orr r12, r12, r5, lsl #8
mov r5, r5, lsr #24
orr r5, r5, r4, lsl #8
mov r4, r4, lsr #24
orr r4, r4, r3, lsl #8
#else
orr lr, lr, r12, lsr #8
mov r12, r12, lsl #24
orr r12, r12, r5, lsr #8
mov r5, r5, lsl #24
orr r5, r5, r4, lsr #8
mov r4, r4, lsl #24
orr r4, r4, r3, lsr #8
#endif
stmdb r0!, {r4, r5, r12, lr}
subs r2, r2, #0x10
bge .Lmemmove_bsrcul1loop32
ldmia sp!, {r4, r5}
adds r2, r2, #0x0c
blt .Lmemmove_bsrcul1l4
.Lmemmove_bsrcul1loop4:
#ifdef __ARMEB__
mov r12, r3, lsr #24
#else
mov r12, r3, lsl #24
#endif
ldr r3, [r1, #-4]!
#ifdef __ARMEB__
orr r12, r12, r3, lsl #8
#else
orr r12, r12, r3, lsr #8
#endif
str r12, [r0, #-4]!
subs r2, r2, #4
bge .Lmemmove_bsrcul1loop4
.Lmemmove_bsrcul1l4:
add r1, r1, #1
b .Lmemmove_bl4

View File

@ -1,5 +1,39 @@
/* $NetBSD: memset.S,v 1.3 2003/04/05 23:27:15 bjh21 Exp $ */
/* $NetBSD: memset.S,v 1.4 2003/10/13 19:59:24 scw Exp $ */
/*
* Copyright 2003 Wasabi Systems, Inc.
* All rights reserved.
*
* Written by Steve C. Woodford for Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed for the NetBSD Project by
* Wasabi Systems, Inc.
* 4. The name of Wasabi Systems, Inc. may not be used to endorse
* or promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1995 Mark Brinicombe.
* All rights reserved.
@ -35,7 +69,7 @@
#include <machine/asm.h>
/*
* Sets a block of memory to the specified value
* memset: Sets a block of memory to the specified value
*
* On entry:
* r0 - dest address
@ -45,82 +79,157 @@
* On exit:
* r0 - dest address
*/
#ifdef _BZERO
/* LINTSTUB: Func: void bzero(void *, size_t) */
ENTRY(bzero)
mov r3, #0x00
#else
/* LINTSTUB: Func: void *memset(void *, int, size_t) */
ENTRY(memset)
stmfd sp!, {r0} /* Remember address for return value */
and r1, r1, #0x000000ff /* We write bytes */
cmp r2, #0x00000004 /* Do we have less than 4 bytes */
and r3, r1, #0xff /* We deal with bytes */
mov r1, r2
#endif
cmp r1, #0x04 /* Do we have less than 4 bytes */
mov ip, r0
blt .Lmemset_lessthanfour
/* Ok first we will word align the address */
ands r2, ip, #0x03 /* Get the bottom two bits */
bne .Lmemset_wordunaligned /* The address is not word aligned */
ands r3, r0, #0x00000003 /* Get the bottom two bits */
beq .Lmemset_addraligned /* The address is word aligned */
/* We are now word aligned */
.Lmemset_wordaligned:
#ifndef _BZERO
orr r3, r3, r3, lsl #8 /* Extend value to 16-bits */
#endif
#ifdef __XSCALE__
tst ip, #0x04 /* Quad-align for Xscale */
#else
cmp r1, #0x10
#endif
#ifndef _BZERO
orr r3, r3, r3, lsl #16 /* Extend value to 32-bits */
#endif
#ifdef __XSCALE__
subne r1, r1, #0x04 /* Quad-align if necessary */
strne r3, [ip], #0x04
cmp r1, #0x10
#endif
blt .Lmemset_loop4 /* If less than 16 then use words */
mov r2, r3 /* Duplicate data */
cmp r1, #0x80 /* If < 128 then skip the big loop */
blt .Lmemset_loop32
rsb r3, r3, #0x00000004
sub r2, r2, r3
cmp r3, #0x00000002
strb r1, [r0], #0x0001 /* Set 1 byte */
strgeb r1, [r0], #0x0001 /* Set another byte */
strgtb r1, [r0], #0x0001 /* and a third */
/* Do 128 bytes at a time */
.Lmemset_loop128:
subs r1, r1, #0x80
#ifdef __XSCALE__
strged r2, [ip], #0x08
strged r2, [ip], #0x08
strged r2, [ip], #0x08
strged r2, [ip], #0x08
strged r2, [ip], #0x08
strged r2, [ip], #0x08
strged r2, [ip], #0x08
strged r2, [ip], #0x08
strged r2, [ip], #0x08
strged r2, [ip], #0x08
strged r2, [ip], #0x08
strged r2, [ip], #0x08
strged r2, [ip], #0x08
strged r2, [ip], #0x08
strged r2, [ip], #0x08
strged r2, [ip], #0x08
#else
stmgeia ip!, {r2-r3}
stmgeia ip!, {r2-r3}
stmgeia ip!, {r2-r3}
stmgeia ip!, {r2-r3}
stmgeia ip!, {r2-r3}
stmgeia ip!, {r2-r3}
stmgeia ip!, {r2-r3}
stmgeia ip!, {r2-r3}
stmgeia ip!, {r2-r3}
stmgeia ip!, {r2-r3}
stmgeia ip!, {r2-r3}
stmgeia ip!, {r2-r3}
stmgeia ip!, {r2-r3}
stmgeia ip!, {r2-r3}
stmgeia ip!, {r2-r3}
stmgeia ip!, {r2-r3}
#endif
bgt .Lmemset_loop128
moveq pc, lr /* Zero length so just exit */
cmp r2, #0x00000004
blt .Lmemset_lessthanfour
add r1, r1, #0x80 /* Adjust for extra sub */
/* Now we must be word aligned */
/* Do 32 bytes at a time */
.Lmemset_loop32:
subs r1, r1, #0x20
#ifdef __XSCALE__
strged r2, [ip], #0x08
strged r2, [ip], #0x08
strged r2, [ip], #0x08
strged r2, [ip], #0x08
#else
stmgeia ip!, {r2-r3}
stmgeia ip!, {r2-r3}
stmgeia ip!, {r2-r3}
stmgeia ip!, {r2-r3}
#endif
bgt .Lmemset_loop32
moveq pc, lr /* Zero length so just exit */
.Lmemset_addraligned:
adds r1, r1, #0x10 /* Partially adjust for extra sub */
orr r3, r1, r1, lsl #8 /* Repeat the byte into a word */
orr r3, r3, r3, lsl #16
/* Deal with 16 bytes or more */
#ifdef __XSCALE__
strged r2, [ip], #0x08
strged r2, [ip], #0x08
#else
stmgeia ip!, {r2-r3}
stmgeia ip!, {r2-r3}
#endif
moveq pc, lr /* Zero length so just exit */
/* We know we have at least 4 bytes ... */
cmp r2, #0x00000020 /* If less than 32 then use words */
blt .Lmemset_lessthan32
/* We have at least 32 so lets use quad words */
stmfd sp!, {r4-r6} /* Store registers */
mov r4, r3 /* Duplicate data */
mov r5, r3
mov r6, r3
.Lmemset_loop16:
stmia r0!, {r3-r6} /* Store 16 bytes */
sub r2, r2, #0x00000010 /* Adjust count */
cmp r2, #0x00000010 /* Still got at least 16 bytes ? */
bgt .Lmemset_loop16
ldmfd sp!, {r4-r6} /* Restore registers */
/* Do we need to set some words as well ? */
cmp r2, #0x00000004
blt .Lmemset_lessthanfour
/* Have either less than 16 or less than 32 depending on route taken */
.Lmemset_lessthan32:
addlt r1, r1, #0x10 /* Possibly adjust for extra sub */
/* We have at least 4 bytes so copy as words */
.Lmemset_loop4:
str r3, [r0], #0x0004
sub r2, r2, #0x0004
cmp r2, #0x00000004
bge .Lmemset_loop4
subs r1, r1, #0x04
strge r3, [ip], #0x04
bgt .Lmemset_loop4
moveq pc, lr /* Zero length so just exit */
#ifdef __XSCALE__
/* Compensate for 64-bit alignment check */
adds r1, r1, #0x04
moveq pc, lr
cmp r1, #2
#else
cmp r1, #-2
#endif
strb r3, [ip], #0x01 /* Set 1 byte */
strgeb r3, [ip], #0x01 /* Set another byte */
strgtb r3, [ip] /* and a third */
mov pc, lr /* Exit */
.Lmemset_wordunaligned:
rsb r2, r2, #0x004
strb r3, [ip], #0x01 /* Set 1 byte */
cmp r2, #0x02
strgeb r3, [ip], #0x01 /* Set another byte */
sub r1, r1, r2
strgtb r3, [ip], #0x01 /* and a third */
cmp r1, #0x04 /* More than 4 bytes left? */
bge .Lmemset_wordaligned /* Yup */
.Lmemset_lessthanfour:
cmp r2, #0x00000000
ldmeqfd sp!, {r0}
cmp r1, #0x00
moveq pc, lr /* Zero length so exit */
cmp r2, #0x00000002
strb r1, [r0], #0x0001 /* Set 1 byte */
strgeb r1, [r0], #0x0001 /* Set another byte */
strgtb r1, [r0], #0x0001 /* and a third */
ldmfd sp!, {r0}
strb r3, [ip], #0x01 /* Set 1 byte */
cmp r1, #0x02
strgeb r3, [ip], #0x01 /* Set another byte */
strgtb r3, [ip] /* and a third */
mov pc, lr /* Exit */