From 8231ab74c5927d32fca5870b661ac5d14c3b9430 Mon Sep 17 00:00:00 2001
From: mjl <mjl@NetBSD.org>
Date: Fri, 30 Nov 2001 02:27:20 +0000
Subject: [PATCH] 	Add assembler version of strlen.

---
 sys/lib/libkern/arch/powerpc/Makefile.inc |   6 +-
 sys/lib/libkern/arch/powerpc/strlen.S     | 112 ++++++++++++++++++++++
 2 files changed, 115 insertions(+), 3 deletions(-)
 create mode 100644 sys/lib/libkern/arch/powerpc/strlen.S

diff --git a/sys/lib/libkern/arch/powerpc/Makefile.inc b/sys/lib/libkern/arch/powerpc/Makefile.inc
index 9fb9291ef33a..b9030695c9b8 100644
--- a/sys/lib/libkern/arch/powerpc/Makefile.inc
+++ b/sys/lib/libkern/arch/powerpc/Makefile.inc
@@ -1,15 +1,15 @@
-#	$NetBSD: Makefile.inc,v 1.17 2001/11/29 00:27:07 mjl Exp $
+#	$NetBSD: Makefile.inc,v 1.18 2001/11/30 02:27:20 mjl Exp $
 
 SRCS+=	__main.c __assert.c \
 	imax.c imin.c lmax.c lmin.c max.c min.c ulmax.c ulmin.c \
 	bswap16.c bswap32.c bswap64.c \
 	bcmp.c \
 	memchr.c memcmp.c \
-	strcat.c strcmp.c strcpy.c strlen.c strcasecmp.c \
+	strcat.c strcmp.c strcpy.c strcasecmp.c \
 	strncasecmp.c strncmp.c strncpy.c \
 	scanc.c skpc.c \
 	htonl.c htons.c ntohl.c ntohs.c \
 	random.c strtoul.c \
 	syncicache.c
 
-SRCS+=	ffs.S bzero.S
+SRCS+=	ffs.S bzero.S strlen.S
diff --git a/sys/lib/libkern/arch/powerpc/strlen.S b/sys/lib/libkern/arch/powerpc/strlen.S
new file mode 100644
index 000000000000..4276e68b64ec
--- /dev/null
+++ b/sys/lib/libkern/arch/powerpc/strlen.S
@@ -0,0 +1,112 @@
+/*	$NetBSD: strlen.S,v 1.1 2001/11/30 02:27:20 mjl Exp $ */
+
+/*-
+ * Copyright (C) 2001	Martin J. Laubach <mjl@netbsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+/*----------------------------------------------------------------------*/
+
+#include <machine/asm.h>
+
+/*----------------------------------------------------------------------*/
+/* The algorithm here uses the following techniques:
+
+   1) Given a word 'x', we can test to see if it contains any 0 bytes
+      by subtracting 0x01010101, and seeing if any of the high bits of each
+      byte changed from 0 to 1. This works because the least significant
+      0 byte must have had no incoming borrow (otherwise it's not the least
+      significant), so it is 0x00 - 0x01 == 0xff. For all other
+      byte values, either they have the high bit set initially, or when
+      1 is subtracted you get a value in the range 0x00-0x7f, none of which
+      have their high bit set. The expression here is
+      (x + 0xfefefeff) & ~(x | 0x7f7f7f7f), which gives 0x00000000 when
+      there were no 0x00 bytes in the word.
+
+   2) Given a word 'x', we can test to see _which_ byte was zero by
+      calculating ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | x | 0x7f7f7f7f).
+      This produces 0x80 in each byte that was zero, and 0x00 in all
+      the other bytes. The '| 0x7f7f7f7f' sets the low 7 bits of each
+      byte so the final '~' clears them, and the '| x' part ensures that
+      bytes with the high bit set produce 0x00. The addition carries into
+      the high bit of a byte iff that byte had one of its low 7 bits set.
+      Counting the leading zero bits of the result and dividing by 8 then
+      gives the (big-endian) byte offset of the first NUL within the word.
+      This is from the book 'The PowerPC Compiler Writer's Guide',
+      by Steve Hoxey, Faraydon Karim, Bill Hay and Hank Warren.
+*/
+/*----------------------------------------------------------------------*/
+
+		.text
+		.align 4
+
+ENTRY(strlen)
+/* In: r3 = string.  Out: r3 = length, r4 = &NUL.  Uses r0, r4-r10, cr0. */
+		/* Setup constants */
+		lis	r10, 0x7f7f
+		lis	r9, 0xfefe
+		ori	r10, r10, 0x7f7f	/* r10 = 0x7f7f7f7f */
+		ori	r9, r9, 0xfeff		/* r9 = 0xfefefeff */
+
+		/* Mask out leading bytes on non aligned strings */
+		rlwinm.	r8, r3, 3, 27, 28	/* r8 = 8 * (r3 & 3), sets cr0 */
+		clrrwi	r5, r3, 2		/* r5 = word-aligned address */
+		li	r0, -1			/* all ones, seed for the mask */
+		beq+	3f			/* skip alignment if already */
+						/* aligned */
+
+		srw	r0, r0, r8		/* make 0000...1111 mask */
+
+		lwz	r7, 0(r5)		/* word holding the first char */
+		nor	r0, r0, r0		/* invert mask: 1111...0000 */
+		or	r7, r7, r0		/* make leading bytes != 0 */
+		b	2f			/* test this word right away */
+
+3:		subi	r5, r5, 4		/* pre-bias for lwzu below */
+
+1:		lwzu	r7, 4(r5)		/* r5 += 4, fetch data word */
+
+2:		nor	r0, r7, r10		/* step 1: ~(x | 0x7f7f7f7f) */
+		add	r6, r7, r9		/* x + 0xfefefeff */
+		and.	r0, r0, r6		/* zero if x has no NUL byte */
+
+		beq+	1b			/* no NUL bytes here */
+	
+		and	r8, r7, r10		/* ok, a NUL is somewhere */
+		or	r7, r7, r10		/* do step 2 to find out */
+		add	r0, r8, r10		/* where */
+		nor	r8, r7, r0		/* 0x80 in each NUL byte */
+
+		cntlzw	r0, r8			/* bit offset of first NUL */
+		srwi	r4, r0, 3		/* bits -> bytes */
+
+		add	r4, r5, r4		/* r4 contains end pointer */
+		/* NOTE: Keep it so this function returns the end pointer
+		   in r4, so we can use it from other str* calls (strcat
+		   comes to mind). */
+
+		subf	r3, r3, r4		/* length = end - start */
+		blr
+
+/*----------------------------------------------------------------------*/