Xscale-optimised mem* functions, contributed by Wasabi Systems.

(Note: memcmp/memset improvements also benefit non-Xscale). memcmp() - Compare 32 bits at a time if possible. Special-case 6-byte comparisons, for the benefit of the network stack. memset() - More loop unrolling, plus use of 'strd' instruction, results in > 100% speedup on Xscale. memcpy() - Big-endian support, unrolled loops, 'strd/ldrd/pld', plus special-cases for very common length/alignment combinations (at least in the kernel). Benchmarks show ~50% improvement on Xscale. memmove() - Big-endian support. Use fast memcpy(), above, if the regions don't overlap. Otherwise unchanged.
2003-10-13 19:59:24 +00:00 · 2003-10-13 19:59:24 +00:00 · 5e7e19ec12
parent 9d9b366ce0
commit 5e7e19ec12
6 changed files with 2993 additions and 662 deletions
--- a/sys/lib/libkern/arch/arm/memcmp.S
+++ b/sys/lib/libkern/arch/arm/memcmp.S
@ -1,5 +1,39 @@
-/*      $NetBSD: memcmp.S,v 1.2 2003/04/05 23:27:15 bjh21 Exp $ */
+/*      $NetBSD: memcmp.S,v 1.3 2003/10/13 19:59:24 scw Exp $ */

+/*
+ * Copyright 2003 Wasabi Systems, Inc.
+ * All rights reserved.
+ *
+ * Written by Steve C. Woodford for Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed for the NetBSD Project by
+ *      Wasabi Systems, Inc.
+ * 4. The name of Wasabi Systems, Inc. may not be used to endorse
+ *    or promote products derived from this software without specific prior
+ *    written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
 /*
 * Copyright (c) 2002 ARM Ltd
 * All rights reserved.
@ -30,21 +64,117 @@

 #include <machine/asm.h>

-RCSID("$NetBSD: memcmp.S,v 1.2 2003/04/05 23:27:15 bjh21 Exp $")
+RCSID("$NetBSD: memcmp.S,v 1.3 2003/10/13 19:59:24 scw Exp $")

 ENTRY(memcmp)
-/* if ((len - 1) < 0) return 0 */
-	subs	r2, r2, #1
-	movmi	r0, #0
-	movmi	pc, lr
+	/*
+	 * In:      r0 = b1, r1 = b2, r2 = len
+	 * Out:     r0 = 0 if the buffers match, otherwise the difference
+	 *          (b1 byte - b2 byte) of the first mismatching byte pair
+	 * Scratch: r2, r3, ip (b1 lives in ip so r0 can hold the result)
+	 */
+	mov	ip, r0
+#if defined(_KERNEL) && !defined(_STANDALONE)
+	cmp	r2, #0x06
+	beq	.Lmemcmp_6bytes
+#endif
+	mov	r0, #0x00

-/* ip == last src address to compare */
-	add	ip, r0, r2
-1:
-	ldrb	r2, [r0], #1
-	ldrb	r3, [r1], #1
-	cmp	ip, r0
-	cmpcs	r2, r3
-	beq	1b
-	sub	r0, r2, r3
+	/* Are both addresses aligned the same way? */
+	cmp	r2, #0x00
+	eornes	r3, ip, r1
+	moveq	pc, lr			/* len == 0, or same addresses! */
+	tst	r3, #0x03
+	subne	r2, r2, #0x01		/* bytewise2 expects len - 1 */
+	bne	.Lmemcmp_bytewise2	/* Badly aligned. Do it the slow way */
+
+	/*
+	 * Word-align the addresses, if necessary.
+	 *
+	 * Each of the three byte-compare blocks below is 6 instructions
+	 * (24 bytes).  The number of blocks to skip is (b2 - 1) & 3:
+	 *   b2 & 3 == 1 -> skip 0 (compare 3 bytes, via the fall-through)
+	 *   b2 & 3 == 2 -> skip 1 (compare 2 bytes)
+	 *   b2 & 3 == 3 -> skip 2 (compare 1 byte)
+	 *   b2 & 3 == 0 -> skip 3 (already aligned)
+	 * pc reads as the address of the first ldrb after the nop, so
+	 * adding skip * 24 lands exactly on the wanted block.
+	 */
+	sub	r3, r1, #0x01
+	ands	r3, r3, #0x03		/* r3 = blocks to skip; Z if none */
+	add	r3, r3, r3, lsl #1	/* r3 *= 3 (flags untouched) */
+	addne	pc, pc, r3, lsl #3	/* jump forward r3 * 8 bytes */
+	nop
+
+	/* Compare up to 3 bytes */
+	ldrb	r0, [ip], #0x01
+	ldrb	r3, [r1], #0x01
+	subs	r0, r0, r3
+	movne	pc, lr
+	subs	r2, r2, #0x01
+	moveq	pc, lr
+
+	/* Compare up to 2 bytes */
+	ldrb	r0, [ip], #0x01
+	ldrb	r3, [r1], #0x01
+	subs	r0, r0, r3
+	movne	pc, lr
+	subs	r2, r2, #0x01
+	moveq	pc, lr
+
+	/* Compare 1 byte */
+	ldrb	r0, [ip], #0x01
+	ldrb	r3, [r1], #0x01
+	subs	r0, r0, r3
+	movne	pc, lr
+	subs	r2, r2, #0x01
+	moveq	pc, lr
+
+	/* Compare 4 bytes at a time, if possible */
+	subs	r2, r2, #0x04
+	bcc	.Lmemcmp_bytewise
+.Lmemcmp_word_aligned:
+	ldr	r0, [ip], #0x04
+	ldr	r3, [r1], #0x04
+	subs	r2, r2, #0x04		/* C clear: no further full word */
+	cmpcs	r0, r3			/* only compare if more words follow */
+	beq	.Lmemcmp_word_aligned
+	sub	r0, r0, r3
+
+	/* Correct for extra subtraction, and check if done */
+	adds	r2, r2, #0x04
+	cmpeq	r0, #0x00		/* If done, did all bytes match? */
+	moveq	pc, lr			/* Yup. Just return */
+
+	/* Re-do the final word byte-wise */
+	sub	ip, ip, #0x04
+	sub	r1, r1, #0x04
+
+.Lmemcmp_bytewise:
+	add	r2, r2, #0x03		/* r2 = bytes left - 1 */
+.Lmemcmp_bytewise2:
+	ldrb	r0, [ip], #0x01
+	ldrb	r3, [r1], #0x01
+	subs	r2, r2, #0x01		/* C clear: that was the last byte */
+	cmpcs	r0, r3
+	beq	.Lmemcmp_bytewise2
+	sub	r0, r0, r3
 	mov	pc, lr
+
+#if defined(_KERNEL) && !defined(_STANDALONE)
+	/*
+	 * 6 byte compares are very common, thanks to the network stack.
+	 * This code is hand-scheduled to reduce the number of stalls for
+	 * load results. Everything else being equal, this will be ~32%
+	 * faster than a byte-wise memcmp.
+	 *
+	 * On entry: ip = b1 (memcmp copies r0 to ip before branching here)
+	 * and r1 = b2.  r2 arrives holding the length (6) and is reused as
+	 * scratch, as is r3.
+	 * On exit: r0 = difference (b1 byte - b2 byte) of the first
+	 * mismatching byte pair, or zero if all six bytes match.
+	 *
+	 * Loads for the next byte pair are issued before the current result
+	 * is tested.  The conditional (ldreqb) loads are skipped once a
+	 * mismatch has been found, so the following "movne pc, lr" returns
+	 * with the non-zero difference still in r0.
+	 */
+	.align	5
+.Lmemcmp_6bytes:
+	ldrb	r3, [r1, #0x00]		/* r3 = b2#0 */
+	ldrb	r0, [ip, #0x00]		/* r0 = b1#0 */
+	ldrb	r2, [r1, #0x01]		/* r2 = b2#1 */
+	subs	r0, r0, r3		/* r0 = b1#0 - b2#0 */
+	ldreqb	r3, [ip, #0x01]		/* r3 = b1#1 */
+	movne	pc, lr			/* Return if mismatch on #0 */
+	subs	r0, r3, r2		/* r0 = b1#1 - b2#1 */
+	ldreqb	r3, [r1, #0x02]		/* r3 = b2#2 */
+	ldreqb	r0, [ip, #0x02]		/* r0 = b1#2 */
+	movne	pc, lr			/* Return if mismatch on #1 */
+	ldrb	r2, [r1, #0x03]		/* r2 = b2#3 */
+	subs	r0, r0, r3		/* r0 = b1#2 - b2#2 */
+	ldreqb	r3, [ip, #0x03]		/* r3 = b1#3 */
+	movne	pc, lr			/* Return if mismatch on #2 */
+	subs	r0, r3, r2		/* r0 = b1#3 - b2#3 */
+	ldreqb	r3, [r1, #0x04]		/* r3 = b2#4 */
+	ldreqb	r0, [ip, #0x04]		/* r0 = b1#4 */
+	movne	pc, lr			/* Return if mismatch on #3 */
+	ldrb	r2, [r1, #0x05]		/* r2 = b2#5 */
+	subs	r0, r0, r3		/* r0 = b1#4 - b2#4 */
+	ldreqb	r3, [ip, #0x05]		/* r3 = b1#5 */
+	movne	pc, lr			/* Return if mismatch on #4 */
+	sub	r0, r3, r2		/* r0 = b1#5 - b2#5 */
+	mov	pc, lr
+#endif
--- a/sys/lib/libkern/arch/arm/memcpy.S
+++ b/sys/lib/libkern/arch/arm/memcpy.S
@ -1,585 +1,7 @@
-/*	$NetBSD: memcpy.S,v 1.5 2003/10/09 08:54:54 ichiro Exp $	*/
+/*	$NetBSD: memcpy.S,v 1.6 2003/10/13 19:59:24 scw Exp $	*/

-/*-
- * Copyright (c) 1997 The NetBSD Foundation, Inc.
- * All rights reserved.
- *
- * This code is derived from software contributed to The NetBSD Foundation
- * by Neil A. Carson and Mark Brinicombe
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *    must display the following acknowledgement:
- *        This product includes software developed by the NetBSD
- *        Foundation, Inc. and its contributors.
- * 4. Neither the name of The NetBSD Foundation nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
- * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <machine/asm.h>
-
-/*
- * This is one fun bit of code ...
- * Some easy listening music is suggested while trying to understand this
- * code e.g. Iron Maiden
- *
- * For anyone attempting to understand it :
- *
- * The core code is implemented here with simple stubs for memcpy()
- * memmove() and bcopy().
- *
- * All local labels are prefixed with Lmemcpy_
- * Following the prefix a label starting f is used in the forward copy code
- * while a label using b is used in the backwards copy code
- * The source and destination addresses determine whether a forward or
- * backward copy is performed.
- * Separate bits of code are used to deal with the following situations
- * for both the forward and backwards copy.
- * unaligned source address
- * unaligned destination address
- * Separate copy routines are used to produce an optimised result for each
- * of these cases.
- * The copy code will use LDM/STM instructions to copy up to 32 bytes at
- * a time where possible.
- *
- * Note: r12 (aka ip) can be trashed during the function along with
- * r0-r3 although r0-r2 have defined uses i.e. src, dest, len through out.
- * Additional registers are preserved prior to use i.e. r4, r5 & lr
- *
- * Apologies for the state of the comments ;-)
- */
-
-ENTRY(memcpy)
-ENTRY_NP(memmove)
-	/* Determine copy direction */
-	cmp	r1, r0
-
-	moveq	r0, #0			/* Quick abort for len=0 */
-	moveq	pc, lr
-
-	/* save leaf functions having to store this away */
-	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */
-
-	bcc	.Lmemcpy_backwards
-
-	/* start of forwards copy */	
-	subs	r2, r2, #4
-	blt	.Lmemcpy_fl4		/* less than 4 bytes */
-	ands	r12, r0, #3
-	bne	.Lmemcpy_fdestul	/* oh unaligned destination addr */
-	ands	r12, r1, #3
-	bne	.Lmemcpy_fsrcul		/* oh unaligned source addr */
-
-.Lmemcpy_ft8:
-	/* We have aligned source and destination */
-	subs	r2, r2, #8
-	blt	.Lmemcpy_fl12		/* less than 12 bytes (4 from above) */
-	subs	r2, r2, #0x14         
-	blt	.Lmemcpy_fl32		/* less than 32 bytes (12 from above) */
-	stmdb	sp!, {r4}		/* borrow r4 */
-
-	/* blat 32 bytes at a time */
-	/* XXX for really big copies perhaps we should use more registers */
-.Lmemcpy_floop32:	
-	ldmia	r1!, {r3, r4, r12, lr}
-	stmia	r0!, {r3, r4, r12, lr}
-	ldmia	r1!, {r3, r4, r12, lr}
-	stmia	r0!, {r3, r4, r12, lr}
-	subs	r2, r2, #0x20         
-	bge	.Lmemcpy_floop32
-
-	cmn	r2, #0x10
-	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
-	stmgeia	r0!, {r3, r4, r12, lr}
-	subge	r2, r2, #0x10         
-	ldmia	sp!, {r4}		/* return r4 */
-
-.Lmemcpy_fl32:
-	adds	r2, r2, #0x14         
-
-	/* blat 12 bytes at a time */
-.Lmemcpy_floop12:
-	ldmgeia	r1!, {r3, r12, lr}
-	stmgeia	r0!, {r3, r12, lr}
-	subges	r2, r2, #0x0c         
-	bge	.Lmemcpy_floop12
-
-.Lmemcpy_fl12:
-	adds	r2, r2, #8
-	blt	.Lmemcpy_fl4
-
-	subs	r2, r2, #4
-	ldrlt	r3, [r1], #4
-	strlt	r3, [r0], #4
-	ldmgeia	r1!, {r3, r12}
-	stmgeia	r0!, {r3, r12}
-	subge	r2, r2, #4
-
-.Lmemcpy_fl4:
-	/* less than 4 bytes to go */
-	adds	r2, r2, #4
-#ifdef __APCS_26_
-	ldmeqia sp!, {r0, pc}^		/* done */
+#if !defined(__XSCALE__) || defined(_STANDALONE)
+#include "memcpy_arm.S"
 #else
-	ldmeqia	sp!, {r0, pc}		/* done */
+#include "memcpy_xscale.S"
 #endif
-	/* copy the crud byte at a time */
-	cmp	r2, #2
-	ldrb	r3, [r1], #1
-	strb	r3, [r0], #1
-	ldrgeb	r3, [r1], #1
-	strgeb	r3, [r0], #1
-	ldrgtb	r3, [r1], #1
-	strgtb	r3, [r0], #1
-	ldmia	sp!, {r0, pc}
-
-	/* erg - unaligned destination */
-.Lmemcpy_fdestul:
-	rsb	r12, r12, #4
-	cmp	r12, #2
-
-	/* align destination with byte copies */
-	ldrb	r3, [r1], #1
-	strb	r3, [r0], #1
-	ldrgeb	r3, [r1], #1
-	strgeb	r3, [r0], #1
-	ldrgtb	r3, [r1], #1
-	strgtb	r3, [r0], #1
-	subs	r2, r2, r12
-	blt	.Lmemcpy_fl4		/* less the 4 bytes */
-
-	ands	r12, r1, #3
-	beq	.Lmemcpy_ft8		/* we have an aligned source */
-
-	/* erg - unaligned source */
-	/* This is where it gets nasty ... */
-.Lmemcpy_fsrcul:
-	bic	r1, r1, #3
-	ldr	lr, [r1], #4
-	cmp	r12, #2
-	bgt	.Lmemcpy_fsrcul3
-	beq	.Lmemcpy_fsrcul2
-	cmp	r2, #0x0c            
-	blt	.Lmemcpy_fsrcul1loop4
-	sub	r2, r2, #0x0c         
-	stmdb	sp!, {r4, r5}
-
-.Lmemcpy_fsrcul1loop16:
-#ifdef __ARMEB__
-	mov	r3, lr, lsl #8
-	ldmia	r1!, {r4, r5, r12, lr}
-	orr	r3, r3, r4, lsr #24
-	mov	r4, r4, lsl #8
-	orr	r4, r4, r5, lsr #24
-	mov	r5, r5, lsl #8
-	orr	r5, r5, r12, lsr #24
-	mov	r12, r12, lsl #8
-	orr	r12, r12, lr, lsr #24
-#else
-	mov	r3, lr, lsr #8
-	ldmia	r1!, {r4, r5, r12, lr}
-	orr	r3, r3, r4, lsl #24
-	mov	r4, r4, lsr #8
-	orr	r4, r4, r5, lsl #24
-	mov	r5, r5, lsr #8
-	orr	r5, r5, r12, lsl #24
-	mov	r12, r12, lsr #8
-	orr	r12, r12, lr, lsl #24
-#endif
-	stmia	r0!, {r3-r5, r12}
-	subs	r2, r2, #0x10         
-	bge	.Lmemcpy_fsrcul1loop16
-	ldmia	sp!, {r4, r5}
-	adds	r2, r2, #0x0c         
-	blt	.Lmemcpy_fsrcul1l4
-
-.Lmemcpy_fsrcul1loop4:
-#ifdef __ARMEB__
-	mov	r12, lr, lsl #8
-	ldr	lr, [r1], #4
-	orr	r12, r12, lr, lsr #24
-#else
-	mov	r12, lr, lsr #8
-	ldr	lr, [r1], #4
-	orr	r12, r12, lr, lsl #24
-#endif
-	str	r12, [r0], #4
-	subs	r2, r2, #4
-	bge	.Lmemcpy_fsrcul1loop4
-
-.Lmemcpy_fsrcul1l4:
-	sub	r1, r1, #3
-	b	.Lmemcpy_fl4
-
-.Lmemcpy_fsrcul2:
-	cmp	r2, #0x0c            
-	blt	.Lmemcpy_fsrcul2loop4
-	sub	r2, r2, #0x0c         
-	stmdb	sp!, {r4, r5}
-
-.Lmemcpy_fsrcul2loop16:
-#ifdef __ARMEB__
-	mov	r3, lr, lsl #16
-	ldmia	r1!, {r4, r5, r12, lr}
-	orr	r3, r3, r4, lsr #16
-	mov	r4, r4, lsl #16
-	orr	r4, r4, r5, lsr #16
-	mov	r5, r5, lsl #16
-	orr	r5, r5, r12, lsr #16
-	mov	r12, r12, lsl #16
-	orr	r12, r12, lr, lsr #16
-#else
-	mov	r3, lr, lsr #16
-	ldmia	r1!, {r4, r5, r12, lr}
-	orr	r3, r3, r4, lsl #16
-	mov	r4, r4, lsr #16
-	orr	r4, r4, r5, lsl #16
-	mov	r5, r5, lsr #16
-	orr	r5, r5, r12, lsl #16
-	mov	r12, r12, lsr #16
-	orr	r12, r12, lr, lsl #16
-#endif
-	stmia	r0!, {r3-r5, r12}
-	subs	r2, r2, #0x10         
-	bge	.Lmemcpy_fsrcul2loop16
-	ldmia	sp!, {r4, r5}
-	adds	r2, r2, #0x0c         
-	blt	.Lmemcpy_fsrcul2l4
-
-.Lmemcpy_fsrcul2loop4:
-#ifdef __ARMEB__
-	mov	r12, lr, lsl #16
-	ldr	lr, [r1], #4
-	orr	r12, r12, lr, lsr #16
-#else
-	mov	r12, lr, lsr #16
-	ldr	lr, [r1], #4
-	orr	r12, r12, lr, lsl #16
-#endif
-	str	r12, [r0], #4
-	subs	r2, r2, #4
-	bge	.Lmemcpy_fsrcul2loop4
-
-.Lmemcpy_fsrcul2l4:
-	sub	r1, r1, #2
-	b	.Lmemcpy_fl4
-
-.Lmemcpy_fsrcul3:
-	cmp	r2, #0x0c            
-	blt	.Lmemcpy_fsrcul3loop4
-	sub	r2, r2, #0x0c         
-	stmdb	sp!, {r4, r5}
-
-.Lmemcpy_fsrcul3loop16:
-#ifdef __ARMEB__
-	mov	r3, lr, lsl #24
-	ldmia	r1!, {r4, r5, r12, lr}
-	orr	r3, r3, r4, lsr #8
-	mov	r4, r4, lsl #24
-	orr	r4, r4, r5, lsr #8
-	mov	r5, r5, lsl #24
-	orr	r5, r5, r12, lsr #8
-	mov	r12, r12, lsl #24
-	orr	r12, r12, lr, lsr #8
-#else
-	mov	r3, lr, lsr #24
-	ldmia	r1!, {r4, r5, r12, lr}
-	orr	r3, r3, r4, lsl #8
-	mov	r4, r4, lsr #24
-	orr	r4, r4, r5, lsl #8
-	mov	r5, r5, lsr #24
-	orr	r5, r5, r12, lsl #8
-	mov	r12, r12, lsr #24
-	orr	r12, r12, lr, lsl #8
-#endif
-	stmia	r0!, {r3-r5, r12}
-	subs	r2, r2, #0x10         
-	bge	.Lmemcpy_fsrcul3loop16
-	ldmia	sp!, {r4, r5}
-	adds	r2, r2, #0x0c         
-	blt	.Lmemcpy_fsrcul3l4
-
-.Lmemcpy_fsrcul3loop4:
-#ifdef __ARMEB__
-	mov	r12, lr, lsl #24
-	ldr	lr, [r1], #4
-	orr	r12, r12, lr, lsr #8
-#else
-	mov	r12, lr, lsr #24
-	ldr	lr, [r1], #4
-	orr	r12, r12, lr, lsl #8
-#endif
-	str	r12, [r0], #4
-	subs	r2, r2, #4
-	bge	.Lmemcpy_fsrcul3loop4
-
-.Lmemcpy_fsrcul3l4:
-	sub	r1, r1, #1
-	b	.Lmemcpy_fl4
-
-.Lmemcpy_backwards:
-	add	r1, r1, r2
-	add	r0, r0, r2
-	subs	r2, r2, #4
-	blt	.Lmemcpy_bl4		/* less than 4 bytes */
-	ands	r12, r0, #3
-	bne	.Lmemcpy_bdestul	/* oh unaligned destination addr */
-	ands	r12, r1, #3
-	bne	.Lmemcpy_bsrcul		/* oh unaligned source addr */
-
-.Lmemcpy_bt8:
-	/* We have aligned source and destination */
-	subs	r2, r2, #8
-	blt	.Lmemcpy_bl12		/* less than 12 bytes (4 from above) */
-	stmdb	sp!, {r4}
-	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
-	blt	.Lmemcpy_bl32
-
-	/* blat 32 bytes at a time */
-	/* XXX for really big copies perhaps we should use more registers */
-.Lmemcpy_bloop32:
-	ldmdb	r1!, {r3, r4, r12, lr}
-	stmdb	r0!, {r3, r4, r12, lr}
-	ldmdb	r1!, {r3, r4, r12, lr}
-	stmdb	r0!, {r3, r4, r12, lr}
-	subs	r2, r2, #0x20         
-	bge	.Lmemcpy_bloop32
-
-.Lmemcpy_bl32:
-	cmn	r2, #0x10            
-	ldmgedb	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
-	stmgedb	r0!, {r3, r4, r12, lr}
-	subge	r2, r2, #0x10         
-	adds	r2, r2, #0x14         
-	ldmgedb	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
-	stmgedb	r0!, {r3, r12, lr}
-	subge	r2, r2, #0x0c         
-	ldmia	sp!, {r4}
-
-.Lmemcpy_bl12:
-	adds	r2, r2, #8
-	blt	.Lmemcpy_bl4
-	subs	r2, r2, #4
-	ldrlt	r3, [r1, #-4]!
-	strlt	r3, [r0, #-4]!
-	ldmgedb	r1!, {r3, r12}
-	stmgedb	r0!, {r3, r12}
-	subge	r2, r2, #4
-
-.Lmemcpy_bl4:
-	/* less than 4 bytes to go */
-	adds	r2, r2, #4
-	ldmeqia	sp!, {r0, pc}
-
-	/* copy the crud byte at a time */
-	cmp	r2, #2
-	ldrb	r3, [r1, #-1]!
-	strb	r3, [r0, #-1]!
-	ldrgeb	r3, [r1, #-1]!
-	strgeb	r3, [r0, #-1]!
-	ldrgtb	r3, [r1, #-1]!
-	strgtb	r3, [r0, #-1]!
-	ldmia	sp!, {r0, pc}
-
-	/* erg - unaligned destination */
-.Lmemcpy_bdestul:
-	cmp	r12, #2
-
-	/* align destination with byte copies */
-	ldrb	r3, [r1, #-1]!
-	strb	r3, [r0, #-1]!
-	ldrgeb	r3, [r1, #-1]!
-	strgeb	r3, [r0, #-1]!
-	ldrgtb	r3, [r1, #-1]!
-	strgtb	r3, [r0, #-1]!
-	subs	r2, r2, r12
-	blt	.Lmemcpy_bl4		/* less than 4 bytes to go */
-	ands	r12, r1, #3
-	beq	.Lmemcpy_bt8		/* we have an aligned source */
-
-	/* erg - unaligned source */
-	/* This is where it gets nasty ... */
-.Lmemcpy_bsrcul:
-	bic	r1, r1, #3
-	ldr	r3, [r1, #0]
-	cmp	r12, #2
-	blt	.Lmemcpy_bsrcul1
-	beq	.Lmemcpy_bsrcul2
-	cmp	r2, #0x0c            
-	blt	.Lmemcpy_bsrcul3loop4
-	sub	r2, r2, #0x0c         
-	stmdb	sp!, {r4, r5}
-
-.Lmemcpy_bsrcul3loop16:
-#ifdef __ARMEB__
-	mov	lr, r3, lsr #8
-	ldmdb	r1!, {r3-r5, r12}
-	orr	lr, lr, r12, lsl #24
-	mov	r12, r12, lsr #8
-	orr	r12, r12, r5, lsl #24
-	mov	r5, r5, lsr #8
-	orr	r5, r5, r4, lsl #24
-	mov	r4, r4, lsr #8
-	orr	r4, r4, r3, lsl #24
-#else
-	mov	lr, r3, lsl #8
-	ldmdb	r1!, {r3-r5, r12}
-	orr	lr, lr, r12, lsr #24
-	mov	r12, r12, lsl #8
-	orr	r12, r12, r5, lsr #24
-	mov	r5, r5, lsl #8
-	orr	r5, r5, r4, lsr #24
-	mov	r4, r4, lsl #8
-	orr	r4, r4, r3, lsr #24
-#endif
-	stmdb	r0!, {r4, r5, r12, lr}
-	subs	r2, r2, #0x10         
-	bge	.Lmemcpy_bsrcul3loop16
-	ldmia	sp!, {r4, r5}
-	adds	r2, r2, #0x0c         
-	blt	.Lmemcpy_bsrcul3l4
-
-.Lmemcpy_bsrcul3loop4:
-#ifdef __ARMEB__
-	mov	r12, r3, lsr #8
-	ldr	r3, [r1, #-4]!
-	orr	r12, r12, r3, lsl #24
-#else
-	mov	r12, r3, lsl #8
-	ldr	r3, [r1, #-4]!
-	orr	r12, r12, r3, lsr #24
-#endif
-	str	r12, [r0, #-4]!
-	subs	r2, r2, #4
-	bge	.Lmemcpy_bsrcul3loop4
-
-.Lmemcpy_bsrcul3l4:
-	add	r1, r1, #3
-	b	.Lmemcpy_bl4
-
-.Lmemcpy_bsrcul2:
-	cmp	r2, #0x0c            
-	blt	.Lmemcpy_bsrcul2loop4
-	sub	r2, r2, #0x0c         
-	stmdb	sp!, {r4, r5}
-
-.Lmemcpy_bsrcul2loop16:
-#ifdef __ARMEB__
-	mov	lr, r3, lsr #16
-	ldmdb	r1!, {r3-r5, r12}
-	orr	lr, lr, r12, lsl #16
-	mov	r12, r12, lsr #16
-	orr	r12, r12, r5, lsl #16
-	mov	r5, r5, lsr #16
-	orr	r5, r5, r4, lsl #16
-	mov	r4, r4, lsr #16
-	orr	r4, r4, r3, lsl #16
-#else
-	mov	lr, r3, lsl #16
-	ldmdb	r1!, {r3-r5, r12}
-	orr	lr, lr, r12, lsr #16
-	mov	r12, r12, lsl #16
-	orr	r12, r12, r5, lsr #16
-	mov	r5, r5, lsl #16
-	orr	r5, r5, r4, lsr #16
-	mov	r4, r4, lsl #16
-	orr	r4, r4, r3, lsr #16
-#endif
-	stmdb	r0!, {r4, r5, r12, lr}
-	subs	r2, r2, #0x10         
-	bge	.Lmemcpy_bsrcul2loop16
-	ldmia	sp!, {r4, r5}
-	adds	r2, r2, #0x0c         
-	blt	.Lmemcpy_bsrcul2l4
-
-.Lmemcpy_bsrcul2loop4:
-#ifdef __ARMEB__
-	mov	r12, r3, lsr #16
-	ldr	r3, [r1, #-4]!
-	orr	r12, r12, r3, lsl #16
-#else
-	mov	r12, r3, lsl #16
-	ldr	r3, [r1, #-4]!
-	orr	r12, r12, r3, lsr #16
-#endif
-	str	r12, [r0, #-4]!
-	subs	r2, r2, #4
-	bge	.Lmemcpy_bsrcul2loop4
-
-.Lmemcpy_bsrcul2l4:
-	add	r1, r1, #2
-	b	.Lmemcpy_bl4
-
-.Lmemcpy_bsrcul1:
-	cmp	r2, #0x0c            
-	blt	.Lmemcpy_bsrcul1loop4
-	sub	r2, r2, #0x0c         
-	stmdb	sp!, {r4, r5}
-
-.Lmemcpy_bsrcul1loop32:
-#ifdef __ARMEB__
-	mov	lr, r3, lsr #24
-	ldmdb	r1!, {r3-r5, r12}
-	orr	lr, lr, r12, lsl #8
-	mov	r12, r12, lsr #24
-	orr	r12, r12, r5, lsl #8
-	mov	r5, r5, lsr #24
-	orr	r5, r5, r4, lsl #8
-	mov	r4, r4, lsr #24
-	orr	r4, r4, r3, lsl #8
-#else
-	mov	lr, r3, lsl #24
-	ldmdb	r1!, {r3-r5, r12}
-	orr	lr, lr, r12, lsr #8
-	mov	r12, r12, lsl #24
-	orr	r12, r12, r5, lsr #8
-	mov	r5, r5, lsl #24
-	orr	r5, r5, r4, lsr #8
-	mov	r4, r4, lsl #24
-	orr	r4, r4, r3, lsr #8
-#endif
-	stmdb	r0!, {r4, r5, r12, lr}
-	subs	r2, r2, #0x10         
-	bge	.Lmemcpy_bsrcul1loop32
-	ldmia	sp!, {r4, r5}
-	adds	r2, r2, #0x0c         
-	blt	.Lmemcpy_bsrcul1l4
-
-.Lmemcpy_bsrcul1loop4:
-#ifdef __ARMEB__
-	mov	r12, r3, lsr #24
-	ldr	r3, [r1, #-4]!
-	orr	r12, r12, r3, lsl #8
-#else
-	mov	r12, r3, lsl #24
-	ldr	r3, [r1, #-4]!
-	orr	r12, r12, r3, lsr #8
-#endif
-	str	r12, [r0, #-4]!
-	subs	r2, r2, #4
-	bge	.Lmemcpy_bsrcul1loop4
-
-.Lmemcpy_bsrcul1l4:
-	add	r1, r1, #1
-	b	.Lmemcpy_bl4
-	
--- a/sys/lib/libkern/arch/arm/memcpy_arm.S
+++ b/sys/lib/libkern/arch/arm/memcpy_arm.S
@ -0,0 +1,273 @@
+/*	$NetBSD: memcpy_arm.S,v 1.1 2003/10/13 19:59:24 scw Exp $	*/
+
+/*-
+ * Copyright (c) 1997 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Neil A. Carson and Mark Brinicombe
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *        This product includes software developed by the NetBSD
+ *        Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <machine/asm.h>
+
+/*
+ * This is one fun bit of code ...
+ * Some easy listening music is suggested while trying to understand this
+ * code e.g. Iron Maiden
+ *
+ * For anyone attempting to understand it :
+ *
+ * The core code is implemented here with simple stubs for memcpy().
+ *
+ * All local labels are prefixed with Lmemcpy_
+ * Following the prefix a label starting f is used in the forward copy code
+ * while a label using b is used in the backwards copy code
+ * The source and destination addresses determine whether a forward or
+ * backward copy is performed.
+ * Separate bits of code are used to deal with the following situations
+ * for both the forward and backwards copy.
+ * unaligned source address
+ * unaligned destination address
+ * Separate copy routines are used to produce an optimised result for each
+ * of these cases.
+ * The copy code will use LDM/STM instructions to copy up to 32 bytes at
+ * a time where possible.
+ *
+ * Note: r12 (aka ip) can be trashed during the function along with
+ * r0-r3 although r0-r2 have defined uses i.e. src, dest, len through out.
+ * Additional registers are preserved prior to use i.e. r4, r5 & lr
+ *
+ * Apologies for the state of the comments ;-)
+ */
+/* LINTSTUB: Func: void *memcpy(void *dst, const void *src, size_t len) */
+ENTRY(memcpy)
+	/*
+	 * In:      r0 = dst, r1 = src, r2 = len
+	 * Out:     r0 = original dst
+	 * Scratch: r1-r3, r12; r4, r5 and lr are saved before use.
+	 *
+	 * Forward copy only.  r2 is kept biased below the true remaining
+	 * count (by 4, 12, 16 or 32 depending on the section) so that the
+	 * sign of each subtraction doubles as the loop test.
+	 */
+	/* save leaf functions having to store this away */
+	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */
+
+	subs	r2, r2, #4
+	blt	.Lmemcpy_l4		/* less than 4 bytes */
+	ands	r12, r0, #3
+	bne	.Lmemcpy_destul		/* oh unaligned destination addr */
+	ands	r12, r1, #3
+	bne	.Lmemcpy_srcul		/* oh unaligned source addr */
+
+.Lmemcpy_t8:
+	/* We have aligned source and destination */
+	subs	r2, r2, #8
+	blt	.Lmemcpy_l12		/* less than 12 bytes (4 from above) */
+	subs	r2, r2, #0x14
+	blt	.Lmemcpy_l32		/* less than 32 bytes (12 from above) */
+	stmdb	sp!, {r4}		/* borrow r4 */
+
+	/* blat 32 bytes at a time */
+	/* XXX for really big copies perhaps we should use more registers */
+.Lmemcpy_loop32:
+	ldmia	r1!, {r3, r4, r12, lr}
+	stmia	r0!, {r3, r4, r12, lr}
+	ldmia	r1!, {r3, r4, r12, lr}
+	stmia	r0!, {r3, r4, r12, lr}
+	subs	r2, r2, #0x20
+	bge	.Lmemcpy_loop32
+
+	cmn	r2, #0x10
+	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
+	stmgeia	r0!, {r3, r4, r12, lr}
+	subge	r2, r2, #0x10
+	ldmia	sp!, {r4}		/* return r4 */
+
+.Lmemcpy_l32:
+	adds	r2, r2, #0x14
+
+	/* blat 12 bytes at a time */
+.Lmemcpy_loop12:
+	ldmgeia	r1!, {r3, r12, lr}
+	stmgeia	r0!, {r3, r12, lr}
+	subges	r2, r2, #0x0c
+	bge	.Lmemcpy_loop12
+
+.Lmemcpy_l12:
+	adds	r2, r2, #8
+	blt	.Lmemcpy_l4
+
+	/* 4..11 bytes left: copy one word, or two if at least 8 remain */
+	subs	r2, r2, #4
+	ldrlt	r3, [r1], #4
+	strlt	r3, [r0], #4
+	ldmgeia	r1!, {r3, r12}
+	stmgeia	r0!, {r3, r12}
+	subge	r2, r2, #4
+
+.Lmemcpy_l4:
+	/* less than 4 bytes to go */
+	adds	r2, r2, #4		/* r2 = true remaining count (0..3) */
+	/*
+	 * NOTE(review): the macro tested below ends in a single underscore;
+	 * compilers conventionally define __APCS_26__ - confirm whether the
+	 * first branch is ever selected.
+	 */
+#ifdef __APCS_26_
+	ldmeqia sp!, {r0, pc}^		/* done */
+#else
+	ldmeqia	sp!, {r0, pc}		/* done */
+#endif
+	/* copy the crud byte at a time */
+	cmp	r2, #2
+	ldrb	r3, [r1], #1		/* always: at least 1 byte left */
+	strb	r3, [r0], #1
+	ldrgeb	r3, [r1], #1		/* 2 or 3 bytes left */
+	strgeb	r3, [r0], #1
+	ldrgtb	r3, [r1], #1		/* 3 bytes left */
+	strgtb	r3, [r0], #1
+	ldmia	sp!, {r0, pc}
+
+	/* erg - unaligned destination */
+.Lmemcpy_destul:
+	rsb	r12, r12, #4		/* r12 = bytes needed to align dst */
+	cmp	r12, #2
+
+	/* align destination with byte copies */
+	ldrb	r3, [r1], #1
+	strb	r3, [r0], #1
+	ldrgeb	r3, [r1], #1
+	strgeb	r3, [r0], #1
+	ldrgtb	r3, [r1], #1
+	strgtb	r3, [r0], #1
+	subs	r2, r2, r12
+	blt	.Lmemcpy_l4		/* less than 4 bytes */
+
+	ands	r12, r1, #3
+	beq	.Lmemcpy_t8		/* we have an aligned source */
+
+	/* erg - unaligned source */
+	/* This is where it gets nasty ... */
+	/*
+	 * dst is word aligned, src is not (r12 = src & 3).  Read aligned
+	 * source words and splice each output word together from two
+	 * neighbouring input words.  lr always holds the most recently
+	 * loaded source word.  Which way the halves are shifted depends on
+	 * byte order, hence the __ARMEB__ variants.
+	 */
+.Lmemcpy_srcul:
+	bic	r1, r1, #3
+	ldr	lr, [r1], #4
+	cmp	r12, #2
+	bgt	.Lmemcpy_srcul3
+	beq	.Lmemcpy_srcul2
+	cmp	r2, #0x0c
+	blt	.Lmemcpy_srcul1loop4
+	sub	r2, r2, #0x0c
+	stmdb	sp!, {r4, r5}
+
+.Lmemcpy_srcul1loop16:
+#ifdef __ARMEB__
+	mov	r3, lr, lsl #8
+	ldmia	r1!, {r4, r5, r12, lr}
+	orr	r3, r3, r4, lsr #24
+	mov	r4, r4, lsl #8
+	orr	r4, r4, r5, lsr #24
+	mov	r5, r5, lsl #8
+	orr	r5, r5, r12, lsr #24
+	mov	r12, r12, lsl #8
+	orr	r12, r12, lr, lsr #24
+#else
+	mov	r3, lr, lsr #8
+	ldmia	r1!, {r4, r5, r12, lr}
+	orr	r3, r3, r4, lsl #24
+	mov	r4, r4, lsr #8
+	orr	r4, r4, r5, lsl #24
+	mov	r5, r5, lsr #8
+	orr	r5, r5, r12, lsl #24
+	mov	r12, r12, lsr #8
+	orr	r12, r12, lr, lsl #24
+#endif
+	stmia	r0!, {r3-r5, r12}
+	subs	r2, r2, #0x10
+	bge	.Lmemcpy_srcul1loop16
+	ldmia	sp!, {r4, r5}
+	adds	r2, r2, #0x0c
+	blt	.Lmemcpy_srcul1l4
+
+.Lmemcpy_srcul1loop4:
+#ifdef __ARMEB__
+	mov	r12, lr, lsl #8
+	ldr	lr, [r1], #4
+	orr	r12, r12, lr, lsr #24
+#else
+	mov	r12, lr, lsr #8
+	ldr	lr, [r1], #4
+	orr	r12, r12, lr, lsl #24
+#endif
+	str	r12, [r0], #4
+	subs	r2, r2, #4
+	bge	.Lmemcpy_srcul1loop4
+
+.Lmemcpy_srcul1l4:
+	sub	r1, r1, #3		/* back to the first uncopied src byte */
+	b	.Lmemcpy_l4
+
+.Lmemcpy_srcul2:
+	cmp	r2, #0x0c
+	blt	.Lmemcpy_srcul2loop4
+	sub	r2, r2, #0x0c
+	stmdb	sp!, {r4, r5}
+
+.Lmemcpy_srcul2loop16:
+#ifdef __ARMEB__
+	mov	r3, lr, lsl #16
+	ldmia	r1!, {r4, r5, r12, lr}
+	orr	r3, r3, r4, lsr #16
+	mov	r4, r4, lsl #16
+	orr	r4, r4, r5, lsr #16
+	mov	r5, r5, lsl #16
+	orr	r5, r5, r12, lsr #16
+	mov	r12, r12, lsl #16
+	orr	r12, r12, lr, lsr #16
+#else
+	mov	r3, lr, lsr #16
+	ldmia	r1!, {r4, r5, r12, lr}
+	orr	r3, r3, r4, lsl #16
+	mov	r4, r4, lsr #16
+	orr	r4, r4, r5, lsl #16
+	mov	r5, r5, lsr #16
+	orr	r5, r5, r12, lsl #16
+	mov	r12, r12, lsr #16
+	orr	r12, r12, lr, lsl #16
+#endif
+	stmia	r0!, {r3-r5, r12}
+	subs	r2, r2, #0x10
+	bge	.Lmemcpy_srcul2loop16
+	ldmia	sp!, {r4, r5}
+	adds	r2, r2, #0x0c
+	blt	.Lmemcpy_srcul2l4
+
+.Lmemcpy_srcul2loop4:
+#ifdef __ARMEB__
+	mov	r12, lr, lsl #16
+	ldr	lr, [r1], #4
+	orr	r12, r12, lr, lsr #16
+#else
+	mov	r12, lr, lsr #16
+	ldr	lr, [r1], #4
+	orr	r12, r12, lr, lsl #16
+#endif
+	str	r12, [r0], #4
+	subs	r2, r2, #4
+	bge	.Lmemcpy_srcul2loop4
+
+.Lmemcpy_srcul2l4:
+	sub	r1, r1, #2		/* back to the first uncopied src byte */
+	b	.Lmemcpy_l4
+
+.Lmemcpy_srcul3:
+	cmp	r2, #0x0c
+	blt	.Lmemcpy_srcul3loop4
+	sub	r2, r2, #0x0c
+	stmdb	sp!, {r4, r5}
+
+.Lmemcpy_srcul3loop16:
+#ifdef __ARMEB__
+	mov	r3, lr, lsl #24
+	ldmia	r1!, {r4, r5, r12, lr}
+	orr	r3, r3, r4, lsr #8
+	mov	r4, r4, lsl #24
+	orr	r4, r4, r5, lsr #8
+	mov	r5, r5, lsl #24
+	orr	r5, r5, r12, lsr #8
+	mov	r12, r12, lsl #24
+	orr	r12, r12, lr, lsr #8
+#else
+	mov	r3, lr, lsr #24
+	ldmia	r1!, {r4, r5, r12, lr}
+	orr	r3, r3, r4, lsl #8
+	mov	r4, r4, lsr #24
+	orr	r4, r4, r5, lsl #8
+	mov	r5, r5, lsr #24
+	orr	r5, r5, r12, lsl #8
+	mov	r12, r12, lsr #24
+	orr	r12, r12, lr, lsl #8
+#endif
+	stmia	r0!, {r3-r5, r12}
+	subs	r2, r2, #0x10
+	bge	.Lmemcpy_srcul3loop16
+	ldmia	sp!, {r4, r5}
+	adds	r2, r2, #0x0c
+	blt	.Lmemcpy_srcul3l4
+
+.Lmemcpy_srcul3loop4:
+#ifdef __ARMEB__
+	mov	r12, lr, lsl #24
+	ldr	lr, [r1], #4
+	orr	r12, r12, lr, lsr #8
+#else
+	mov	r12, lr, lsr #24
+	ldr	lr, [r1], #4
+	orr	r12, r12, lr, lsl #8
+#endif
+	str	r12, [r0], #4
+	subs	r2, r2, #4
+	bge	.Lmemcpy_srcul3loop4
+
+.Lmemcpy_srcul3l4:
+	sub	r1, r1, #1		/* back to the first uncopied src byte */
+	b	.Lmemcpy_l4
--- a/sys/lib/libkern/arch/arm/memcpy_xscale.S
+++ b/sys/lib/libkern/arch/arm/memcpy_xscale.S
--- a/sys/lib/libkern/arch/arm/memmove.S
+++ b/sys/lib/libkern/arch/arm/memmove.S
@ -1,5 +1,620 @@
-/*	$NetBSD: memmove.S,v 1.2 2001/11/20 00:29:20 chris Exp $	*/
+/*	$NetBSD: memmove.S,v 1.3 2003/10/13 19:59:24 scw Exp $	*/
+
+/*-
+ * Copyright (c) 1997 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Neil A. Carson and Mark Brinicombe
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *        This product includes software developed by the NetBSD
+ *        Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <machine/asm.h>

 /*
- * placeholder to keep the make system happy, memove is actually in memcpy.S
+ * This is one fun bit of code ...
+ * Some easy listening music is suggested while trying to understand this
+ * code e.g. Iron Maiden
+ *
+ * For anyone attempting to understand it :
+ *
+ * The core code is implemented here with simple stubs for memmove()
+ *
+ * All local labels are prefixed with Lmemmove_
+ * Following the prefix a label starting f is used in the forward copy code
+ * while a label using b is used in the backwards copy code
+ * The source and destination addresses determine whether a forward or
+ * backward copy is performed.
+ * Separate bits of code are used to deal with the following situations
+ * for both the forward and backwards copy.
+ * unaligned source address
+ * unaligned destination address
+ * Separate copy routines are used to produce an optimised result for each
+ * of these cases.
+ * The copy code will use LDM/STM instructions to copy up to 32 bytes at
+ * a time where possible.
+ *
+ * Note: r12 (aka ip) can be trashed during the function along with
+ * r0-r3 although r0-r2 have defined uses i.e. src, dest, len through out.
+ * Additional registers are preserved prior to use i.e. r4, r5 & lr
+ *
+ * Apologies for the state of the comments ;-)
 */
+
+ENTRY(memmove)
+	/*
+	 * void *memmove(void *dst, const void *src, size_t len)
+	 *
+	 * On entry:	r0 = dst, r1 = src, r2 = len
+	 * On exit:	r0 = dst
+	 */
+#ifdef __XSCALE__
+	/*
+	 * The XSCALE version of memcpy() is *way* faster than
+	 * this code, so tail-call it whenever the buffers do not overlap.
+	 */
+	cmp	r0, r1
+	moveq	pc, lr		/* Bail now if src/dst are the same */
+	subcs	r3, r0, r1	/* if (dst > src) r3 = dst - src */
+	subcc	r3, r1, r0	/* if (dst < src) r3 = src - dst */
+	cmp	r3, r2		/* if (r3 >= len) there is no overlap */
+	bcs	_C_LABEL(memcpy)
+#endif
+
+	/* Determine copy direction */
+	cmp	r1, r0
+
+	/* src == dst: nothing to copy; r0 still holds dst for the return */
+	moveq	pc, lr
+
+	/* save leaf functions having to store this away */
+	stmdb	sp!, {r0, lr}		/* memmove() returns dest addr */
+
+	/* flags are still those of "cmp r1, r0": src < dst => copy backwards */
+	bcc	.Lmemmove_backwards
+
+	/* start of forwards copy */
+	subs	r2, r2, #4		/* r2 = len - 4 from here on */
+	blt	.Lmemmove_fl4		/* less than 4 bytes */
+	ands	r12, r0, #3
+	bne	.Lmemmove_fdestul	/* oh unaligned destination addr */
+	ands	r12, r1, #3
+	bne	.Lmemmove_fsrcul	/* oh unaligned source addr */
+
+.Lmemmove_ft8:
+	/* We have aligned source and destination */
+	/*
+	 * r2 holds (bytes remaining - 4) on arrival.  The subs/adds pairs
+	 * below keep re-biasing r2 so that a plain sign test tells whether
+	 * the next chunk size still fits.
+	 */
+	subs	r2, r2, #8
+	blt	.Lmemmove_fl12		/* less than 12 bytes (4 from above) */
+	subs	r2, r2, #0x14         
+	blt	.Lmemmove_fl32		/* less than 32 bytes (12 from above) */
+	stmdb	sp!, {r4}		/* borrow r4 */
+
+	/* blat 32 bytes at a time */
+	/* XXX for really big copies perhaps we should use more registers */
+.Lmemmove_floop32:	
+	ldmia	r1!, {r3, r4, r12, lr}
+	stmia	r0!, {r3, r4, r12, lr}
+	ldmia	r1!, {r3, r4, r12, lr}
+	stmia	r0!, {r3, r4, r12, lr}
+	subs	r2, r2, #0x20         
+	bge	.Lmemmove_floop32
+
+	/* r2 is now (remaining - 32); r2 + 16 >= 0 means 16 or more are left */
+	cmn	r2, #0x10
+	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
+	stmgeia	r0!, {r3, r4, r12, lr}
+	subge	r2, r2, #0x10         
+	ldmia	sp!, {r4}		/* return r4 */
+
+.Lmemmove_fl32:
+	/* re-bias r2 to (remaining - 12); the flags feed the GE tests below */
+	adds	r2, r2, #0x14         
+
+	/* blat 12 bytes at a time */
+.Lmemmove_floop12:
+	ldmgeia	r1!, {r3, r12, lr}
+	stmgeia	r0!, {r3, r12, lr}
+	subges	r2, r2, #0x0c         
+	bge	.Lmemmove_floop12
+
+.Lmemmove_fl12:
+	/* re-bias r2 to (remaining - 4) */
+	adds	r2, r2, #8
+	blt	.Lmemmove_fl4
+
+	/* 4..11 bytes left: copy one word (LT) or two words (GE) */
+	subs	r2, r2, #4
+	ldrlt	r3, [r1], #4
+	strlt	r3, [r0], #4
+	ldmgeia	r1!, {r3, r12}
+	stmgeia	r0!, {r3, r12}
+	subge	r2, r2, #4
+
+.Lmemmove_fl4:
+	/* less than 4 bytes to go */
+	adds	r2, r2, #4
+	/*
+	 * NOTE(review): the guard below tests __APCS_26_ (one trailing
+	 * underscore), which looks like a misspelling of __APCS_26__;
+	 * none of the other exits in this function use the "^" form.
+	 * Confirm whether the 26-bit variant is meant to be reachable.
+	 */
+#ifdef __APCS_26_
+	ldmeqia sp!, {r0, pc}^		/* done */
+#else
+	ldmeqia	sp!, {r0, pc}		/* done */
+#endif
+	/* copy the crud byte at a time */
+	/* r2 = 1..3: one byte always, a second if r2 >= 2, a third if r2 > 2 */
+	cmp	r2, #2
+	ldrb	r3, [r1], #1
+	strb	r3, [r0], #1
+	ldrgeb	r3, [r1], #1
+	strgeb	r3, [r0], #1
+	ldrgtb	r3, [r1], #1
+	strgtb	r3, [r0], #1
+	ldmia	sp!, {r0, pc}
+
+	/* erg - unaligned destination */
+.Lmemmove_fdestul:
+	/*
+	 * r12 = dst & 3 (1..3) on arrival.  Turn it into the number of
+	 * bytes needed to reach the next word boundary (4 - r12).
+	 */
+	rsb	r12, r12, #4
+	cmp	r12, #2
+
+	/* align destination with byte copies */
+	/* one byte always, a second if r12 >= 2, a third if r12 == 3 */
+	ldrb	r3, [r1], #1
+	strb	r3, [r0], #1
+	ldrgeb	r3, [r1], #1
+	strgeb	r3, [r0], #1
+	ldrgtb	r3, [r1], #1
+	strgtb	r3, [r0], #1
+	subs	r2, r2, r12
+	blt	.Lmemmove_fl4		/* less than 4 bytes */
+
+	/* destination is aligned now; r12 = src & 3 selects the next path */
+	ands	r12, r1, #3
+	beq	.Lmemmove_ft8		/* we have an aligned source */
+
+	/* erg - unaligned source */
+	/* This is where it gets nasty ... */
+.Lmemmove_fsrcul:
+	/*
+	 * Destination is word aligned, source is not: r12 = src & 3 (1..3).
+	 * Round src down to a word boundary and preload that word into lr.
+	 * Every output word is then built from the not-yet-consumed bytes
+	 * of the previous source word (lr) plus the leading bytes of the
+	 * next one, using shifts whose direction depends on endianness.
+	 */
+	bic	r1, r1, #3
+	ldr	lr, [r1], #4
+	cmp	r12, #2
+	bgt	.Lmemmove_fsrcul3
+	beq	.Lmemmove_fsrcul2
+
+	/* src & 3 == 1: three bytes of each source word carry over */
+	cmp	r2, #0x0c		/* r2 = remaining - 4: 16 or more left? */
+	blt	.Lmemmove_fsrcul1loop4
+	sub	r2, r2, #0x0c
+	stmdb	sp!, {r4, r5}
+
+	/*
+	 * 16 bytes per pass.
+	 *   little-endian: out = (prev >> 8) | (next << 24)
+	 *   big-endian:    out = (prev << 8) | (next >> 24)
+	 */
+.Lmemmove_fsrcul1loop16:
+#ifdef __ARMEB__
+	mov	r3, lr, lsl #8
+#else
+	mov	r3, lr, lsr #8
+#endif
+	ldmia	r1!, {r4, r5, r12, lr}
+#ifdef __ARMEB__
+	orr	r3, r3, r4, lsr #24
+	mov	r4, r4, lsl #8
+	orr	r4, r4, r5, lsr #24
+	mov	r5, r5, lsl #8
+	orr	r5, r5, r12, lsr #24
+	mov	r12, r12, lsl #8
+	orr	r12, r12, lr, lsr #24
+#else
+	orr	r3, r3, r4, lsl #24
+	mov	r4, r4, lsr #8
+	orr	r4, r4, r5, lsl #24
+	mov	r5, r5, lsr #8
+	orr	r5, r5, r12, lsl #24
+	mov	r12, r12, lsr #8
+	orr	r12, r12, lr, lsl #24
+#endif
+	stmia	r0!, {r3-r5, r12}
+	subs	r2, r2, #0x10
+	bge	.Lmemmove_fsrcul1loop16
+	ldmia	sp!, {r4, r5}
+	adds	r2, r2, #0x0c		/* back to (remaining - 4) */
+	blt	.Lmemmove_fsrcul1l4
+
+	/* one word per pass, same merge as above */
+.Lmemmove_fsrcul1loop4:
+#ifdef __ARMEB__
+	mov	r12, lr, lsl #8
+#else
+	mov	r12, lr, lsr #8
+#endif
+	ldr	lr, [r1], #4
+#ifdef __ARMEB__
+	orr	r12, r12, lr, lsr #24
+#else
+	orr	r12, r12, lr, lsl #24
+#endif
+	str	r12, [r0], #4
+	subs	r2, r2, #4
+	bge	.Lmemmove_fsrcul1loop4
+
+.Lmemmove_fsrcul1l4:
+	/* r1 is one word ahead of the preloaded word: step back to the true src */
+	sub	r1, r1, #3
+	b	.Lmemmove_fl4
+
+.Lmemmove_fsrcul2:
+	/*
+	 * Forward copy, src & 3 == 2: lr holds the preloaded source word,
+	 * r1 points at the word after it.  Output words are built from
+	 * half of the previous word and half of the next (16-bit shifts).
+	 */
+	cmp	r2, #0x0c            
+	blt	.Lmemmove_fsrcul2loop4
+	sub	r2, r2, #0x0c         
+	stmdb	sp!, {r4, r5}
+
+	/* 16 bytes per pass */
+.Lmemmove_fsrcul2loop16:
+#ifdef __ARMEB__
+	mov	r3, lr, lsl #16
+#else
+	mov	r3, lr, lsr #16
+#endif
+	ldmia	r1!, {r4, r5, r12, lr}
+#ifdef __ARMEB__
+	orr	r3, r3, r4, lsr #16
+	mov	r4, r4, lsl #16
+	orr	r4, r4, r5, lsr #16
+	mov	r5, r5, lsl #16
+	orr	r5, r5, r12, lsr #16
+	mov	r12, r12, lsl #16
+	orr	r12, r12, lr, lsr #16
+#else
+	orr	r3, r3, r4, lsl #16
+	mov	r4, r4, lsr #16
+	orr	r4, r4, r5, lsl #16
+	mov	r5, r5, lsr #16
+	orr	r5, r5, r12, lsl #16
+	mov	r12, r12, lsr #16
+	orr	r12, r12, lr, lsl #16
+#endif
+	stmia	r0!, {r3-r5, r12}
+	subs	r2, r2, #0x10         
+	bge	.Lmemmove_fsrcul2loop16
+	ldmia	sp!, {r4, r5}
+	adds	r2, r2, #0x0c         
+	blt	.Lmemmove_fsrcul2l4
+
+	/* one word per pass */
+.Lmemmove_fsrcul2loop4:
+#ifdef __ARMEB__
+	mov	r12, lr, lsl #16
+#else
+	mov	r12, lr, lsr #16
+#endif
+	ldr	lr, [r1], #4
+#ifdef __ARMEB__
+	orr	r12, r12, lr, lsr #16
+#else
+	orr	r12, r12, lr, lsl #16
+#endif
+	str	r12, [r0], #4
+	subs	r2, r2, #4
+	bge	.Lmemmove_fsrcul2loop4
+
+.Lmemmove_fsrcul2l4:
+	/* undo the word read-ahead: true src is 2 bytes behind r1 */
+	sub	r1, r1, #2
+	b	.Lmemmove_fl4
+
+.Lmemmove_fsrcul3:
+	/*
+	 * Forward copy, src & 3 == 3: lr holds the preloaded source word,
+	 * r1 points at the word after it.  One byte of the previous word
+	 * is merged with three bytes of the next (24/8-bit shifts).
+	 */
+	cmp	r2, #0x0c            
+	blt	.Lmemmove_fsrcul3loop4
+	sub	r2, r2, #0x0c         
+	stmdb	sp!, {r4, r5}
+
+	/* 16 bytes per pass */
+.Lmemmove_fsrcul3loop16:
+#ifdef __ARMEB__
+	mov	r3, lr, lsl #24
+#else
+	mov	r3, lr, lsr #24
+#endif
+	ldmia	r1!, {r4, r5, r12, lr}
+#ifdef __ARMEB__
+	orr	r3, r3, r4, lsr #8
+	mov	r4, r4, lsl #24
+	orr	r4, r4, r5, lsr #8
+	mov	r5, r5, lsl #24
+	orr	r5, r5, r12, lsr #8
+	mov	r12, r12, lsl #24
+	orr	r12, r12, lr, lsr #8
+#else
+	orr	r3, r3, r4, lsl #8
+	mov	r4, r4, lsr #24
+	orr	r4, r4, r5, lsl #8
+	mov	r5, r5, lsr #24
+	orr	r5, r5, r12, lsl #8
+	mov	r12, r12, lsr #24
+	orr	r12, r12, lr, lsl #8
+#endif
+	stmia	r0!, {r3-r5, r12}
+	subs	r2, r2, #0x10         
+	bge	.Lmemmove_fsrcul3loop16
+	ldmia	sp!, {r4, r5}
+	adds	r2, r2, #0x0c         
+	blt	.Lmemmove_fsrcul3l4
+
+	/* one word per pass */
+.Lmemmove_fsrcul3loop4:
+#ifdef __ARMEB__
+	mov	r12, lr, lsl #24
+#else
+	mov	r12, lr, lsr #24
+#endif
+	ldr	lr, [r1], #4
+#ifdef __ARMEB__
+	orr	r12, r12, lr, lsr #8
+#else
+	orr	r12, r12, lr, lsl #8
+#endif
+	str	r12, [r0], #4
+	subs	r2, r2, #4
+	bge	.Lmemmove_fsrcul3loop4
+
+.Lmemmove_fsrcul3l4:
+	/* undo the word read-ahead: true src is 1 byte behind r1 */
+	sub	r1, r1, #1
+	b	.Lmemmove_fl4
+
+.Lmemmove_backwards:
+	/*
+	 * Backwards copy: point r1/r0 one past the end of the source and
+	 * destination and work downwards with pre-decrement accesses.
+	 * As in the forward path, r2 is kept biased so sign tests decide
+	 * whether the next chunk size still fits.
+	 */
+	add	r1, r1, r2
+	add	r0, r0, r2
+	subs	r2, r2, #4
+	blt	.Lmemmove_bl4		/* less than 4 bytes */
+	ands	r12, r0, #3
+	bne	.Lmemmove_bdestul	/* oh unaligned destination addr */
+	ands	r12, r1, #3
+	bne	.Lmemmove_bsrcul	/* oh unaligned source addr */
+
+.Lmemmove_bt8:
+	/* We have aligned source and destination */
+	subs	r2, r2, #8
+	blt	.Lmemmove_bl12		/* less than 12 bytes (4 from above) */
+	/* r4 is pushed before the branch; .Lmemmove_bl32 pops it on both routes */
+	stmdb	sp!, {r4}
+	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
+	blt	.Lmemmove_bl32
+
+	/* blat 32 bytes at a time */
+	/* XXX for really big copies perhaps we should use more registers */
+.Lmemmove_bloop32:
+	ldmdb	r1!, {r3, r4, r12, lr}
+	stmdb	r0!, {r3, r4, r12, lr}
+	ldmdb	r1!, {r3, r4, r12, lr}
+	stmdb	r0!, {r3, r4, r12, lr}
+	subs	r2, r2, #0x20         
+	bge	.Lmemmove_bloop32
+
+.Lmemmove_bl32:
+	/* r2 = (remaining - 32); r2 + 16 >= 0 means 16 or more are left */
+	cmn	r2, #0x10            
+	ldmgedb	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
+	stmgedb	r0!, {r3, r4, r12, lr}
+	subge	r2, r2, #0x10         
+	/* re-bias to (remaining - 12) */
+	adds	r2, r2, #0x14         
+	ldmgedb	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
+	stmgedb	r0!, {r3, r12, lr}
+	subge	r2, r2, #0x0c         
+	ldmia	sp!, {r4}
+
+.Lmemmove_bl12:
+	/* re-bias to (remaining - 4) */
+	adds	r2, r2, #8
+	blt	.Lmemmove_bl4
+	/* 4..11 bytes left: copy one word (LT) or two words (GE) */
+	subs	r2, r2, #4
+	ldrlt	r3, [r1, #-4]!
+	strlt	r3, [r0, #-4]!
+	ldmgedb	r1!, {r3, r12}
+	stmgedb	r0!, {r3, r12}
+	subge	r2, r2, #4
+
+.Lmemmove_bl4:
+	/* less than 4 bytes to go */
+	adds	r2, r2, #4
+	ldmeqia	sp!, {r0, pc}
+
+	/* copy the crud byte at a time */
+	/* r2 = 1..3: one byte always, a second if r2 >= 2, a third if r2 > 2 */
+	cmp	r2, #2
+	ldrb	r3, [r1, #-1]!
+	strb	r3, [r0, #-1]!
+	ldrgeb	r3, [r1, #-1]!
+	strgeb	r3, [r0, #-1]!
+	ldrgtb	r3, [r1, #-1]!
+	strgtb	r3, [r0, #-1]!
+	ldmia	sp!, {r0, pc}
+
+	/* erg - unaligned destination */
+.Lmemmove_bdestul:
+	/*
+	 * r12 = (end of dst) & 3, i.e. the number of bytes (1..3) that
+	 * must be copied downwards to reach a word boundary.
+	 */
+	cmp	r12, #2
+
+	/* align destination with byte copies */
+	/* one byte always, a second if r12 >= 2, a third if r12 == 3 */
+	ldrb	r3, [r1, #-1]!
+	strb	r3, [r0, #-1]!
+	ldrgeb	r3, [r1, #-1]!
+	strgeb	r3, [r0, #-1]!
+	ldrgtb	r3, [r1, #-1]!
+	strgtb	r3, [r0, #-1]!
+	subs	r2, r2, r12
+	blt	.Lmemmove_bl4		/* less than 4 bytes to go */
+	/* destination is aligned now; r12 = src & 3 selects the next path */
+	ands	r12, r1, #3
+	beq	.Lmemmove_bt8		/* we have an aligned source */
+
+	/* erg - unaligned source */
+	/* This is where it gets nasty ... */
+.Lmemmove_bsrcul:
+	/*
+	 * Backwards copy with aligned destination and unaligned source:
+	 * r12 = src & 3 (1..3).  Round src down to a word boundary and
+	 * preload that word into r3 (no writeback); each output word is
+	 * merged from the previously loaded word and the next lower one.
+	 */
+	bic	r1, r1, #3
+	ldr	r3, [r1, #0]
+	cmp	r12, #2
+	blt	.Lmemmove_bsrcul1
+	beq	.Lmemmove_bsrcul2
+	/* src & 3 == 3 */
+	cmp	r2, #0x0c            
+	blt	.Lmemmove_bsrcul3loop4
+	sub	r2, r2, #0x0c         
+	stmdb	sp!, {r4, r5}
+
+	/* 16 bytes per pass; r12 receives the highest-addressed loaded word */
+.Lmemmove_bsrcul3loop16:
+#ifdef __ARMEB__
+	mov	lr, r3, lsr #8
+#else
+	mov	lr, r3, lsl #8
+#endif
+	ldmdb	r1!, {r3-r5, r12}
+#ifdef __ARMEB__
+	orr	lr, lr, r12, lsl #24
+	mov	r12, r12, lsr #8
+	orr	r12, r12, r5, lsl #24
+	mov	r5, r5, lsr #8
+	orr	r5, r5, r4, lsl #24
+	mov	r4, r4, lsr #8
+	orr	r4, r4, r3, lsl #24
+#else
+	orr	lr, lr, r12, lsr #24
+	mov	r12, r12, lsl #8
+	orr	r12, r12, r5, lsr #24
+	mov	r5, r5, lsl #8
+	orr	r5, r5, r4, lsr #24
+	mov	r4, r4, lsl #8
+	orr	r4, r4, r3, lsr #24
+#endif
+	stmdb	r0!, {r4, r5, r12, lr}
+	subs	r2, r2, #0x10         
+	bge	.Lmemmove_bsrcul3loop16
+	ldmia	sp!, {r4, r5}
+	adds	r2, r2, #0x0c         
+	blt	.Lmemmove_bsrcul3l4
+
+	/* one word per pass */
+.Lmemmove_bsrcul3loop4:
+#ifdef __ARMEB__
+	mov	r12, r3, lsr #8
+#else
+	mov	r12, r3, lsl #8
+#endif
+	ldr	r3, [r1, #-4]!
+#ifdef __ARMEB__
+	orr	r12, r12, r3, lsl #24
+#else
+	orr	r12, r12, r3, lsr #24
+#endif
+	str	r12, [r0, #-4]!
+	subs	r2, r2, #4
+	bge	.Lmemmove_bsrcul3loop4
+
+.Lmemmove_bsrcul3l4:
+	/* restore the byte offset that bic removed from src */
+	add	r1, r1, #3
+	b	.Lmemmove_bl4
+
+.Lmemmove_bsrcul2:
+	/*
+	 * Backwards copy, src & 3 == 2: r3 holds the preloaded source
+	 * word at r1.  Output words are built from half of the previously
+	 * loaded word and half of the next lower one (16-bit shifts).
+	 */
+	cmp	r2, #0x0c            
+	blt	.Lmemmove_bsrcul2loop4
+	sub	r2, r2, #0x0c         
+	stmdb	sp!, {r4, r5}
+
+	/* 16 bytes per pass */
+.Lmemmove_bsrcul2loop16:
+#ifdef __ARMEB__
+	mov	lr, r3, lsr #16
+#else
+	mov	lr, r3, lsl #16
+#endif
+	ldmdb	r1!, {r3-r5, r12}
+#ifdef __ARMEB__
+	orr	lr, lr, r12, lsl #16
+	mov	r12, r12, lsr #16
+	orr	r12, r12, r5, lsl #16
+	mov	r5, r5, lsr #16
+	orr	r5, r5, r4, lsl #16
+	mov	r4, r4, lsr #16
+	orr	r4, r4, r3, lsl #16
+#else
+	orr	lr, lr, r12, lsr #16
+	mov	r12, r12, lsl #16
+	orr	r12, r12, r5, lsr #16
+	mov	r5, r5, lsl #16
+	orr	r5, r5, r4, lsr #16
+	mov	r4, r4, lsl #16
+	orr	r4, r4, r3, lsr #16
+#endif
+	stmdb	r0!, {r4, r5, r12, lr}
+	subs	r2, r2, #0x10         
+	bge	.Lmemmove_bsrcul2loop16
+	ldmia	sp!, {r4, r5}
+	adds	r2, r2, #0x0c         
+	blt	.Lmemmove_bsrcul2l4
+
+	/* one word per pass */
+.Lmemmove_bsrcul2loop4:
+#ifdef __ARMEB__
+	mov	r12, r3, lsr #16
+#else
+	mov	r12, r3, lsl #16
+#endif
+	ldr	r3, [r1, #-4]!
+#ifdef __ARMEB__
+	orr	r12, r12, r3, lsl #16
+#else
+	orr	r12, r12, r3, lsr #16
+#endif
+	str	r12, [r0, #-4]!
+	subs	r2, r2, #4
+	bge	.Lmemmove_bsrcul2loop4
+
+.Lmemmove_bsrcul2l4:
+	/* restore the byte offset that bic removed from src */
+	add	r1, r1, #2
+	b	.Lmemmove_bl4
+
+.Lmemmove_bsrcul1:
+	/*
+	 * Backwards copy, src & 3 == 1: r3 holds the preloaded source
+	 * word at r1.  One byte of the previously loaded word is merged
+	 * with three bytes of the next lower one (24/8-bit shifts).
+	 */
+	cmp	r2, #0x0c            
+	blt	.Lmemmove_bsrcul1loop4
+	sub	r2, r2, #0x0c         
+	stmdb	sp!, {r4, r5}
+
+	/* despite the "loop32" name, each pass moves 16 bytes */
+.Lmemmove_bsrcul1loop32:
+#ifdef __ARMEB__
+	mov	lr, r3, lsr #24
+#else
+	mov	lr, r3, lsl #24
+#endif
+	ldmdb	r1!, {r3-r5, r12}
+#ifdef __ARMEB__
+	orr	lr, lr, r12, lsl #8
+	mov	r12, r12, lsr #24
+	orr	r12, r12, r5, lsl #8
+	mov	r5, r5, lsr #24
+	orr	r5, r5, r4, lsl #8
+	mov	r4, r4, lsr #24
+	orr	r4, r4, r3, lsl #8
+#else
+	orr	lr, lr, r12, lsr #8
+	mov	r12, r12, lsl #24
+	orr	r12, r12, r5, lsr #8
+	mov	r5, r5, lsl #24
+	orr	r5, r5, r4, lsr #8
+	mov	r4, r4, lsl #24
+	orr	r4, r4, r3, lsr #8
+#endif
+	stmdb	r0!, {r4, r5, r12, lr}
+	subs	r2, r2, #0x10         
+	bge	.Lmemmove_bsrcul1loop32
+	ldmia	sp!, {r4, r5}
+	adds	r2, r2, #0x0c         
+	blt	.Lmemmove_bsrcul1l4
+
+	/* one word per pass */
+.Lmemmove_bsrcul1loop4:
+#ifdef __ARMEB__
+	mov	r12, r3, lsr #24
+#else
+	mov	r12, r3, lsl #24
+#endif
+	ldr	r3, [r1, #-4]!
+#ifdef __ARMEB__
+	orr	r12, r12, r3, lsl #8
+#else
+	orr	r12, r12, r3, lsr #8
+#endif
+	str	r12, [r0, #-4]!
+	subs	r2, r2, #4
+	bge	.Lmemmove_bsrcul1loop4
+
+.Lmemmove_bsrcul1l4:
+	/* restore the byte offset that bic removed from src */
+	add	r1, r1, #1
+	b	.Lmemmove_bl4
--- a/sys/lib/libkern/arch/arm/memset.S
+++ b/sys/lib/libkern/arch/arm/memset.S
@ -1,5 +1,39 @@
-/*	$NetBSD: memset.S,v 1.3 2003/04/05 23:27:15 bjh21 Exp $	*/
+/*	$NetBSD: memset.S,v 1.4 2003/10/13 19:59:24 scw Exp $	*/

+/*
+ * Copyright 2003 Wasabi Systems, Inc.
+ * All rights reserved.
+ *
+ * Written by Steve C. Woodford for Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *      This product includes software developed for the NetBSD Project by
+ *      Wasabi Systems, Inc.
+ * 4. The name of Wasabi Systems, Inc. may not be used to endorse
+ *    or promote products derived from this software without specific prior
+ *    written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
 /*
 * Copyright (c) 1995 Mark Brinicombe.
 * All rights reserved.
@ -35,7 +69,7 @@
 #include <machine/asm.h>

 /*
- * Sets a block of memory to the specified value
+ * memset: Sets a block of memory to the specified value
 *
 * On entry:
 *   r0 - dest address
@ -45,82 +79,157 @@
 * On exit:
 *   r0 - dest address
 */
-
+#ifdef _BZERO
+/* LINTSTUB: Func: void bzero(void *, size_t) */
+ENTRY(bzero)
+	mov	r3, #0x00
+#else
+/* LINTSTUB: Func: void *memset(void *, int, size_t) */
 ENTRY(memset)
-	stmfd	sp!, {r0}		/* Remember address for return value */
-	and	r1, r1, #0x000000ff	/* We write bytes */
-
-	cmp	r2, #0x00000004		/* Do we have less than 4 bytes */
+	and	r3, r1, #0xff		/* We deal with bytes */
+	mov	r1, r2
+#endif
+	/*
+	 * Common body: r1 = byte count, r3 = fill value, ip = write
+	 * pointer.  r0 is never written, so it still holds the
+	 * destination address on return.
+	 */
+	cmp	r1, #0x04		/* Do we have less than 4 bytes */
+	mov	ip, r0
 	blt	.Lmemset_lessthanfour

 	/* Ok first we will word align the address */
+	ands	r2, ip, #0x03		/* Get the bottom two bits */
+	bne	.Lmemset_wordunaligned	/* The address is not word aligned */

-	ands	r3, r0, #0x00000003	/* Get the bottom two bits */
-	beq	.Lmemset_addraligned	/* The address is word aligned */
+	/* We are now word aligned */
+.Lmemset_wordaligned:
+	/*
+	 * The fill-value widening (orr) is interleaved with the flag
+	 * setting tst/cmp; orr without "s" leaves those flags intact.
+	 */
+#ifndef _BZERO
+	orr	r3, r3, r3, lsl #8	/* Extend value to 16-bits */
+#endif
+#ifdef __XSCALE__
+	tst	ip, #0x04		/* Quad-align for Xscale */
+#else
+	cmp	r1, #0x10
+#endif
+#ifndef _BZERO
+	orr	r3, r3, r3, lsl #16	/* Extend value to 32-bits */
+#endif
+#ifdef __XSCALE__
+	subne	r1, r1, #0x04		/* Quad-align if necessary */
+	strne	r3, [ip], #0x04
+	cmp	r1, #0x10
+#endif
+	blt	.Lmemset_loop4		/* If less than 16 then use words */
+	mov	r2, r3			/* Duplicate data */
+	cmp	r1, #0x80		/* If < 128 then skip the big loop */
+	blt	.Lmemset_loop32

-	rsb	r3, r3, #0x00000004
-	sub	r2, r2, r3
-	cmp	r3, #0x00000002
-	strb	r1, [r0], #0x0001	/* Set 1 byte */
-	strgeb	r1, [r0], #0x0001	/* Set another byte */
-	strgtb	r1, [r0], #0x0001	/* and a third */
+	/* Do 128 bytes at a time */
+	/* The stores are conditional on GE, so a final short pass writes nothing */
+.Lmemset_loop128:
+	subs	r1, r1, #0x80
+#ifdef __XSCALE__
+	strged	r2, [ip], #0x08
+	strged	r2, [ip], #0x08
+	strged	r2, [ip], #0x08
+	strged	r2, [ip], #0x08
+	strged	r2, [ip], #0x08
+	strged	r2, [ip], #0x08
+	strged	r2, [ip], #0x08
+	strged	r2, [ip], #0x08
+	strged	r2, [ip], #0x08
+	strged	r2, [ip], #0x08
+	strged	r2, [ip], #0x08
+	strged	r2, [ip], #0x08
+	strged	r2, [ip], #0x08
+	strged	r2, [ip], #0x08
+	strged	r2, [ip], #0x08
+	strged	r2, [ip], #0x08
+#else
+	stmgeia	ip!, {r2-r3}
+	stmgeia	ip!, {r2-r3}
+	stmgeia	ip!, {r2-r3}
+	stmgeia	ip!, {r2-r3}
+	stmgeia	ip!, {r2-r3}
+	stmgeia	ip!, {r2-r3}
+	stmgeia	ip!, {r2-r3}
+	stmgeia	ip!, {r2-r3}
+	stmgeia	ip!, {r2-r3}
+	stmgeia	ip!, {r2-r3}
+	stmgeia	ip!, {r2-r3}
+	stmgeia	ip!, {r2-r3}
+	stmgeia	ip!, {r2-r3}
+	stmgeia	ip!, {r2-r3}
+	stmgeia	ip!, {r2-r3}
+	stmgeia	ip!, {r2-r3}
+#endif
+	bgt	.Lmemset_loop128
+	moveq	pc, lr			/* Zero length so just exit */

-	cmp	r2, #0x00000004
-	blt	.Lmemset_lessthanfour
+	add	r1, r1, #0x80		/* Adjust for extra sub */

-	/* Now we must be word aligned */
+	/* Do 32 bytes at a time */
+.Lmemset_loop32:
+	subs	r1, r1, #0x20
+#ifdef __XSCALE__
+	strged	r2, [ip], #0x08
+	strged	r2, [ip], #0x08
+	strged	r2, [ip], #0x08
+	strged	r2, [ip], #0x08
+#else
+	stmgeia	ip!, {r2-r3}
+	stmgeia	ip!, {r2-r3}
+	stmgeia	ip!, {r2-r3}
+	stmgeia	ip!, {r2-r3}
+#endif
+	bgt	.Lmemset_loop32
+	moveq	pc, lr			/* Zero length so just exit */

-.Lmemset_addraligned:
+	adds	r1, r1, #0x10		/* Partially adjust for extra sub */

-	orr	r3, r1, r1, lsl #8	/* Repeat the byte into a word */
-	orr	r3, r3, r3, lsl #16
+	/* Deal with 16 bytes or more */
+#ifdef __XSCALE__
+	strged	r2, [ip], #0x08
+	strged	r2, [ip], #0x08
+#else
+	stmgeia	ip!, {r2-r3}
+	stmgeia	ip!, {r2-r3}
+#endif
+	moveq	pc, lr			/* Zero length so just exit */

-	/* We know we have at least 4 bytes ... */
-
-	cmp	r2, #0x00000020		/* If less than 32 then use words */
-	blt	.Lmemset_lessthan32
-
-	/* We have at least 32 so lets use quad words */
-
-	stmfd	sp!, {r4-r6}		/* Store registers */
-	mov	r4, r3			/* Duplicate data */
-	mov	r5, r3
-	mov	r6, r3
-
-.Lmemset_loop16:
-	stmia	r0!, {r3-r6}		/* Store 16 bytes */
-	sub	r2, r2, #0x00000010	/* Adjust count */
-	cmp	r2, #0x00000010		/* Still got at least 16 bytes ? */
-	bgt	.Lmemset_loop16
-
-	ldmfd	sp!, {r4-r6}		/* Restore registers */
-
-	/* Do we need to set some words as well ? */
-
-	cmp	r2, #0x00000004
-	blt	.Lmemset_lessthanfour
-
-	/* Have either less than 16 or less than 32 depending on route taken */
-
-.Lmemset_lessthan32:
+	addlt	r1, r1, #0x10		/* Possibly adjust for extra sub */

 	/* We have at least 4 bytes so copy as words */
-
 .Lmemset_loop4:
-	str	r3, [r0], #0x0004
-	sub	r2, r2, #0x0004
-	cmp	r2, #0x00000004
-	bge	.Lmemset_loop4
+	subs	r1, r1, #0x04
+	strge	r3, [ip], #0x04
+	bgt	.Lmemset_loop4
+	moveq	pc, lr			/* Zero length so just exit */
+
+	/*
+	 * r1 is negative here (4 too low).  The Xscale variant adds the
+	 * 4 back and compares with 2; the other variant compares the
+	 * biased value with -2.  Either way GE/GT select the 2nd/3rd byte.
+	 */
+#ifdef __XSCALE__
+	/* Compensate for 64-bit alignment check */
+	adds	r1, r1, #0x04
+	moveq	pc, lr
+	cmp	r1, #2
+#else
+	cmp	r1, #-2
+#endif
+
+	strb	r3, [ip], #0x01		/* Set 1 byte */
+	strgeb	r3, [ip], #0x01		/* Set another byte */
+	strgtb	r3, [ip]		/* and a third */
+	mov	pc, lr			/* Exit */
+
+.Lmemset_wordunaligned:
+	/* r2 = ip & 3 on arrival; make it the byte count up to the next word */
+	rsb	r2, r2, #0x004
+	strb	r3, [ip], #0x01		/* Set 1 byte */
+	cmp	r2, #0x02
+	strgeb	r3, [ip], #0x01		/* Set another byte */
+	sub	r1, r1, r2
+	strgtb	r3, [ip], #0x01		/* and a third */
+	cmp	r1, #0x04		/* More than 4 bytes left? */
+	bge	.Lmemset_wordaligned	/* Yup */

 .Lmemset_lessthanfour:
-	cmp	r2, #0x00000000
-	ldmeqfd	sp!, {r0}
+	cmp	r1, #0x00
 	moveq	pc, lr			/* Zero length so exit */
-
-	cmp	r2, #0x00000002
-	strb	r1, [r0], #0x0001	/* Set 1 byte */
-	strgeb	r1, [r0], #0x0001	/* Set another byte */
-	strgtb	r1, [r0], #0x0001	/* and a third */
-
-	ldmfd	sp!, {r0}
+	strb	r3, [ip], #0x01		/* Set 1 byte */
+	cmp	r1, #0x02
+	strgeb	r3, [ip], #0x01		/* Set another byte */
+	strgtb	r3, [ip]		/* and a third */
 	mov	pc, lr			/* Exit */