From cbb2b5d0fb38f053785eafbda6f0c4c30ebea5f2 Mon Sep 17 00:00:00 2001
From: jonathan <jonathan@NetBSD.org>
Date: Sat, 9 Aug 1997 02:02:08 +0000
Subject: [PATCH] Substitute Mach 3.0 kernel bcopy() which is unrolled for
 aligned copies.

---
 lib/libc/arch/mips/string/bcopy.S | 293 ++++++++++++++++++------------
 1 file changed, 173 insertions(+), 120 deletions(-)

diff --git a/lib/libc/arch/mips/string/bcopy.S b/lib/libc/arch/mips/string/bcopy.S
index 09f7f34e94ae..24b2c8a999cc 100644
--- a/lib/libc/arch/mips/string/bcopy.S
+++ b/lib/libc/arch/mips/string/bcopy.S
@@ -1,145 +1,198 @@
-/*	$NetBSD: bcopy.S,v 1.5 1996/09/17 01:32:32 jonathan Exp $	*/
+/*	$NetBSD: bcopy.S,v 1.6 1997/08/09 02:02:08 jonathan Exp $	*/
 
-/*-
- * Copyright (c) 1991, 1993
- *	The Regents of the University of California.  All rights reserved.
- *
- * This code is derived from software contributed to Berkeley by
- * Ralph Campbell.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *    must display the following acknowledgement:
- *	This product includes software developed by the University of
- *	California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
+/* 
+ * Mach Operating System
+ * Copyright (c) 1993 Carnegie Mellon University
+ * All Rights Reserved.
+ * 
+ * Permission to use, copy, modify and distribute this software and its
+ * documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ * 
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
+ * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ * 
+ * Carnegie Mellon requests users of this software to return to
+ * 
+ *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
+ *  School of Computer Science
+ *  Carnegie Mellon University
+ *  Pittsburgh PA 15213-3890
+ * 
+ * any improvements or extensions that they make and grant Carnegie Mellon
+ * the rights to redistribute these changes.
  */
 
+/*
+ *	File:	mips_bcopy.s
+ *	Author:	Chris Maeda
+ *	Date:	June 1993
+ *
+ *	Fast copy routine.  Derived from aligned_block_copy.
+ */	
+
+
 #include <mips/asm.h>
 
 #if defined(LIBC_SCCS) && !defined(lint)
-	ASMSTR("from: @(#)bcopy.s	8.1 (Berkeley) 6/4/93")
-	ASMSTR("$NetBSD: bcopy.S,v 1.5 1996/09/17 01:32:32 jonathan Exp $")
+	ASMSTR("from: @(#)mips_bcopy.s	2.2 CMU 18/06/93")
+	ASMSTR("$NetBSD: bcopy.S,v 1.6 1997/08/09 02:02:08 jonathan Exp $")
 #endif /* LIBC_SCCS and not lint */
 
 #ifdef ABICALLS
 	.abicalls
 #endif
 
-/* bcopy(s1, s2, n) */
-
-#ifdef MIPSEL
-#	define	LWHI	lwr
-#	define	LWLO	lwl
-#	define	SWHI	swr
-#	define	SWLO	swl
-#endif
-#ifdef MIPSEB
-#	define	LWHI	lwl
-#	define	LWLO	lwr
-#	define	SWHI	swl
-#	define	SWLO	swr
-#endif
+/*
+ *	bcopy(caddr_t src, caddr_t dst, unsigned int len)
+ *
+ *	a0 	src address
+ *	a1	dst address
+ *	a2	length
+ */
 
 LEAF(bcopy)
+	.set	noat
 	.set	noreorder
-	addu	t0, a0, a2		# t0 = end of s1 region
-	sltu	t1, a1, t0
-	sltu	t2, a0, a1
-	and	t1, t1, t2		# t1 = true if from < to < (from+len)
-	beq	t1, zero, forward	# non overlapping, do forward copy
-	slt	t2, a2, 12		# check for small copy
+	/*
+	 *	Forward copy is unsafe only if src < dst < src+len.
+	 */
+	sltu	t0,a0,a1	# t0 = (src < dst), unsigned compare
+	addu	a3,a0,a2	# a3 = src + len (end of source)
+	sltu	t1,a1,a3	# t1 = (dst < src + len), unsigned compare
+	and	t2,t0,t1	# t2 = dst starts inside the source region
+	bne	t2,zero,backcopy	# overlap -- copy backwards
 
-	ble	a2, zero, 2f
-	addu	t1, a1, a2		# t1 = end of to region
+	/*
+	 *	There are four alignment cases (with frequency)
+	 *	(Based on measurements taken with a DECstation 5000/200
+	 *	 inside a Mach kernel.)
+	 *
+	 *	aligned   -> aligned		(mostly)
+	 *	unaligned -> aligned		(sometimes)
+	 *	aligned,unaligned -> unaligned	(almost never)
+	 *
+	 *	Note that we could add another case that checks if
+	 *	the destination and source are unaligned but the
+	 *	copy is alignable, e.g. if src and dest are both
+	 *	on a halfword boundary.
+	 */
+	andi	t1,a1,3		# t1 = low 2 bits of dest (offset within word)
+	bne	t1,zero,bytecopy	# dest unaligned: copy byte by byte
+	andi	t0,a0,3		# (delay slot) t0 = low 2 bits of src
+	bne	t0,zero,destaligned	# src unaligned, dest aligned
+
+	/*
+	 *	Forward aligned->aligned copy, 8*4 bytes at a time.
+	 */
+	li	AT,-32		# (delay slot) mask that clears the low 5 bits
+	and	t0,a2,AT	/* count truncated to multiple of 32 */
+	addu	a3,a0,t0	/* run fast loop up to this address */
+	sltu	AT,a0,a3	/* any work to do? */
+	beq	AT,zero,wordcopy
+	subu	a2,t0		# (delay slot) a2 = bytes left after the fast loop
+
+	/*
+	 *	Unrolled loop body: 32 bytes per pass, until a0 == a3.
+	 */
+cp:
+	lw	v0,0(a0)	# load words 0-3 of this 32-byte chunk
+	lw	v1,4(a0)
+	lw	t0,8(a0)
+	lw	t1,12(a0)
+	addu	a0,32		# advance src; words 4-7 are now at -16..-4
+	sw	v0,0(a1)	# store words 0-3
+	sw	v1,4(a1)
+	sw	t0,8(a1)
+	sw	t1,12(a1)
+	lw	t1,-4(a0)	# load words 7..4, offsets relative to new a0
+	lw	t0,-8(a0)
+	lw	v1,-12(a0)
+	lw	v0,-16(a0)
+	addu	a1,32		# advance dst; stores below use negative offsets
+	sw	t1,-4(a1)	# store words 7..5
+	sw	t0,-8(a1)
+	sw	v1,-12(a1)
+	bne	a0,a3,cp	# loop until src reaches the stop address
+	sw	v0,-16(a1)	# (delay slot) store word 4
+
+	/*
+	 *	Copy a word at a time, no loop unrolling.
+	 */
+wordcopy:
+	andi	t2,a2,3		# t2 = byte count mod 4
+	subu	t2,a2,t2	# t2 = bytes to copy as whole words
+	beq	t2,zero,bytecopy	# less than one word left
+	addu	t0,a0,t2	# (delay slot) stop at t0
+	subu	a2,a2,t2	# a2 = trailing bytes left for bytecopy
 1:
-	lb	v0, -1(t0)		# copy bytes backwards,
-	subu	t0, t0, 1		#   doesnt happen often so do slow way
-	subu	t1, t1, 1
-	bne	t0, a0, 1b
-	sb	v0, 0(t1)
+	lw	v0,0(a0)	# word loop: copy one word per pass
+	addu	a0,4
+	sw	v0,0(a1)
+	bne	a0,t0,1b	# until src reaches the stop address in t0
+	addu	a1,4		# (delay slot) advance dst
+
+bytecopy:
+	beq	a2,zero,copydone	# nothing left to do?
+	nop
 2:
+	lb	v0,0(a0)	# copy the remaining a2 bytes one at a time
+	addu	a0,1
+	sb	v0,0(a1)
+	subu	a2,1
+	bgtz	a2,2b		# signed test: loop while a2 > 0
+	addu	a1,1		# (delay slot) advance dst
+
+copydone:
 	j	ra
 	nop
-forward:
-	bne	t2, zero, smallcpy	# do a small bcopy
-	xor	v0, a0, a1		# compare low two bits of addresses
-	and	v0, v0, 3
-	subu	a3, zero, a1		# compute # bytes to word align address
-	beq	v0, zero, aligned	# addresses can be word aligned
-	and	a3, a3, 3
 
-	beq	a3, zero, 1f
-	subu	a2, a2, a3		# subtract from remaining count
-	LWHI	v0, 0(a0)		# get next 4 bytes (unaligned)
-	LWLO	v0, 3(a0)
-	addu	a0, a0, a3
-	SWHI	v0, 0(a1)		# store 1, 2, or 3 bytes to align a1
-	addu	a1, a1, a3
-1:
-	and	v0, a2, 3		# compute number of words left
-	subu	a3, a2, v0
-	move	a2, v0
-	addu	a3, a3, a0		# compute ending address
-2:
-	LWHI	v0, 0(a0)		# copy words a0 unaligned, a1 aligned
-	LWLO	v0, 3(a0)
-	addu	a0, a0, 4
-	addu	a1, a1, 4
-	bne	a0, a3, 2b
-	sw	v0, -4(a1)
-	b	smallcpy
+	/*
+	 *	Copy from unaligned source to aligned dest.
+	 */
+destaligned:
+	andi	t0,a2,3		# t0 = bytecount mod 4
+	subu	a3,a2,t0	# a3 = byte count rounded down to whole words
+	beq	a3,zero,bytecopy	# fewer than 4 bytes: byte loop only
 	nop
-aligned:
-	beq	a3, zero, 1f
-	subu	a2, a2, a3		# subtract from remaining count
-	LWHI	v0, 0(a0)		# copy 1, 2, or 3 bytes to align
-	addu	a0, a0, a3
-	SWHI	v0, 0(a1)
-	addu	a1, a1, a3
-1:
-	and	v0, a2, 3		# compute number of whole words left
-	subu	a3, a2, v0
-	move	a2, v0
-	addu	a3, a3, a0		# compute ending address
-2:
-	lw	v0, 0(a0)		# copy words
-	addu	a0, a0, 4
-	addu	a1, a1, 4
-	bne	a0, a3, 2b
-	sw	v0, -4(a1)
-smallcpy:
-	ble	a2, zero, 2f
-	addu	a3, a2, a0		# compute ending address
-1:
-	lbu	v0, 0(a0)		# copy bytes
-	addu	a0, a0, 1
-	addu	a1, a1, 1
-	bne	a0, a3, 1b
-	sb	v0, -1(a1)
-2:
+	move	a2,t0		# a2 = trailing bytes left for bytecopy
+	addu	a3,a0,a3	# a3 = source address at which to stop
+
+3:
+#if MIPSEL	/* little-endian */
+	lwr	v0,0(a0)	# assemble one unaligned source word in v0
+	lwl	v0,3(a0)
+#else		/* big-endian */
+	lwl	v0,0(a0)	# assemble one unaligned source word in v0
+	lwr	v0,3(a0)
+#endif
+	addu	a0,4		# addu, not addi: pointer math must not trap
+	sw	v0,0(a1)	# dest is word aligned, so a plain store works
+	bne	a0,a3,3b	# until src reaches the stop address
+	addu	a1,4		# (delay slot) advance dst, again non-trapping
+
+	j	bytecopy	# finish the 0-3 trailing bytes
+	nop
+
+	/*
+	 *	Copy by bytes backwards.
+	 */
+backcopy:
+	blez	a2,copydone	# nothing left to do?
+	addu	t0,a0,a2	# (delay slot) t0 = end of source
+	addu	t1,a1,a2	# t1 = end of destination
+4:
+	lb	v0,-1(t0)	# fetch the highest byte not yet copied
+	subu	t0,1
+	sb	v0,-1(t1)
+	bne	t0,a0,4b	# until t0 is back at the start of the source
+	subu	t1,1		# (delay slot) step dst back
 	j	ra
 	nop
+	
 	.set	reorder
-END(bcopy)
+	.set	at
+	END(bcopy)