diff --git a/src/system/libroot/posix/glibc/arch/ppc/Jamfile b/src/system/libroot/posix/glibc/arch/ppc/Jamfile index 53add9237a..a89142a1ae 100644 --- a/src/system/libroot/posix/glibc/arch/ppc/Jamfile +++ b/src/system/libroot/posix/glibc/arch/ppc/Jamfile @@ -118,6 +118,7 @@ MergeObject posix_gnu_arch_$(TARGET_ARCH)_other.o : ldbl2mpn.c lshift.S rshift.S mul_1.S + strlen.S sub_n.S submul_1.S diff --git a/src/system/libroot/posix/glibc/arch/ppc/strlen.S b/src/system/libroot/posix/glibc/arch/ppc/strlen.S new file mode 100644 index 0000000000..5ff31506e2 --- /dev/null +++ b/src/system/libroot/posix/glibc/arch/ppc/strlen.S @@ -0,0 +1,159 @@ +/* Optimized strlen implementation for PowerPC. + Copyright (C) 1997, 1999, 2000 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include +#include +#include + +/* The algorithm here uses the following techniques: + + 1) Given a word 'x', we can test to see if it contains any 0 bytes + by subtracting 0x01010101, and seeing if any of the high bits of each + byte changed from 0 to 1. This works because the least significant + 0 byte must have had no incoming carry (otherwise it's not the least + significant), so it is 0x00 - 0x01 == 0xff. 
For all other + byte values, either they have the high bit set initially, or when + 1 is subtracted you get a value in the range 0x00-0x7f, none of which + have their high bit set. The expression here is + (x + 0xfefefeff) & ~(x | 0x7f7f7f7f), which gives 0x00000000 when + there were no 0x00 bytes in the word. + + 2) Given a word 'x', we can test to see _which_ byte was zero by + calculating ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | x | 0x7f7f7f7f). + This produces 0x80 in each byte that was zero, and 0x00 in all + the other bytes. The '| 0x7f7f7f7f' clears the low 7 bits in each + byte, and the '| x' part ensures that bytes with the high bit set + produce 0x00. The addition will carry into the high bit of each byte + iff that byte had one of its low 7 bits set. We can then just see + which was the most significant bit set and divide by 8 to find how + many to add to the index. + This is from the book 'The PowerPC Compiler Writer's Guide', + by Steve Hoxey, Faraydon Karim, Bill Hay and Hank Warren. + + We deal with strings not aligned to a word boundary by taking the + first word and ensuring that bytes not part of the string + are treated as nonzero. To allow for memory latency, we unroll the + loop a few times, being careful to ensure that we do not read ahead + across cache line boundaries. + + Questions to answer: + 1) How long are strings passed to strlen? If they're often really long, + we should probably use cache management instructions and/or unroll the + loop more. If they're often quite short, it might be better to use + fact (2) in the inner loop than have to recalculate it. + 2) How popular are bytes with the high bit set? If they are very rare, + on some processors it might be useful to use the simpler expression + ~((x - 0x01010101) | 0x7f7f7f7f) (that is, on processors with only one + ALU), but this fails when any character has its high bit set. 
*/
+
+/* Some notes on register usage: Under the SVR4 ABI, we can use registers
+   0 and 3 through 12 (so long as we don't call any procedures) without
+   saving them. We can also use registers 14 through 31 if we save them.
+   We can't use r1 (it's the stack pointer), r2 nor r13 because the user
+   program may expect them to hold their usual value if we get sent
+   a signal. Integer parameters are passed in r3 through r10.
+   We can use condition registers cr0, cr1, cr5, cr6, and cr7 without saving
+   them, the others we must save. */
+
+/* int [r3] strlen (char *s [r3]): returns the number of bytes that precede the first NUL byte of s. */
+
+ENTRY (BP_SYM (strlen))
+
+#define rTMP1	r0
+#define rRTN	r3	/* incoming STR arg, outgoing result */
+#define rSTR	r4	/* current string position */
+#define rPADN	r5	/* number of padding bits we prepend to the
+			   string to make it start at a word boundary */
+#define rFEFE	r6	/* constant 0xfefefeff (-0x01010101) */
+#define r7F7F	r7	/* constant 0x7f7f7f7f */
+#define rWORD1	r8	/* current string word */
+#define rWORD2	r9	/* next string word */
+#define rMASK	r9	/* mask for first string word */
+#define rTMP2	r10
+#define rTMP3	r11
+#define rTMP4	r12
+
+	CHECK_BOUNDS_LOW (rRTN, rTMP1, rTMP2)
+
+	clrrwi	rSTR, rRTN, 2		/* rSTR = s with its low 2 bits cleared (word aligned) */
+	lis	r7F7F, 0x7f7f		/* high half of the 0x7f7f7f7f constant */
+	rlwinm	rPADN, rRTN, 3, 27, 28	/* rPADN = (s & 3) * 8 */
+	lwz	rWORD1, 0(rSTR)		/* aligned word that contains s[0] */
+	li	rMASK, -1		/* all ones; shifted into the real mask below */
+	addi	r7F7F, r7F7F, 0x7f7f	/* low half: r7F7F = 0x7f7f7f7f */
+/* That's the setup done, now do the first pair of words.
+   We make an exception and use method (2) on the first two words, to reduce
+   overhead. */
+	srw	rMASK, rMASK, rPADN	/* zero the top rPADN bits, i.e. the prepended padding */
+	and	rTMP1, r7F7F, rWORD1	/* x & 0x7f7f7f7f */
+	or	rTMP2, r7F7F, rWORD1	/* x | 0x7f7f7f7f */
+	add	rTMP1, rTMP1, r7F7F	/* (x & 0x7f7f7f7f) + 0x7f7f7f7f */
+	nor	rTMP1, rTMP2, rTMP1	/* method (2): 0x80 in every byte of x that is zero */
+	and.	rWORD1, rTMP1, rMASK	/* ignore hits inside the padding; sets cr0 */
+	mtcrf	0x01, rRTN		/* copy the low bits of s into the CR for the bt below */
+	bne	L(done0)		/* NUL found in the first word */
+	lis	rFEFE, -0x101		/* rFEFE = 0xfeff0000 */
+	addi	rFEFE, rFEFE, -0x101	/* rFEFE = 0xfefefeff */
+/* Are we now aligned to a doubleword boundary? */
+	bt	29, L(loop)		/* taken when the first word was the second half of a doubleword */
+
+/* Handle second word of pair. */
+	lwzu	rWORD1, 4(rSTR)		/* load the next word and advance rSTR by 4 */
+	and	rTMP1, r7F7F, rWORD1	/* method (2) again, no mask needed this time */
+	or	rTMP2, r7F7F, rWORD1
+	add	rTMP1, rTMP1, r7F7F
+	nor.	rWORD1, rTMP2, rTMP1	/* 0x80 per zero byte; sets cr0 */
+	bne	L(done0)		/* NUL found in the second word */
+
+/* The loop: two words per iteration, each tested with method (1). */
+
+L(loop):
+	lwz	rWORD1, 4(rSTR)		/* first word of the pair */
+	lwzu	rWORD2, 8(rSTR)		/* second word; rSTR advances by 8 and now addresses it */
+	add	rTMP1, rFEFE, rWORD1	/* x + 0xfefefeff */
+	nor	rTMP2, r7F7F, rWORD1	/* ~(x | 0x7f7f7f7f) */
+	and.	rTMP1, rTMP1, rTMP2	/* nonzero iff rWORD1 has a zero byte */
+	add	rTMP3, rFEFE, rWORD2	/* same two steps for rWORD2 */
+	nor	rTMP4, r7F7F, rWORD2
+	bne	L(done1)		/* zero byte is in rWORD1 */
+	and.	rTMP1, rTMP3, rTMP4	/* nonzero iff rWORD2 has a zero byte */
+	beq	L(loop)			/* neither word has one: keep scanning */
+
+	and	rTMP1, r7F7F, rWORD2	/* zero byte is in rWORD2: finish method (2) on it */
+	add	rTMP1, rTMP1, r7F7F
+	andc	rWORD1, rTMP4, rTMP1	/* ~(x | 0x7f7f7f7f) & ~((x & 0x7f7f7f7f) + 0x7f7f7f7f) */
+	b	L(done0)
+
+L(done1):
+	and	rTMP1, r7F7F, rWORD1	/* finish method (2) on rWORD1 */
+	subi	rSTR, rSTR, 4		/* step rSTR back from rWORD2's address to rWORD1's */
+	add	rTMP1, rTMP1, r7F7F
+	andc	rWORD1, rTMP2, rTMP1	/* 0x80 in every byte of the word that is zero */
+
+/* When we get to here, rSTR points to the first word in the string that
+   contains a zero byte, and the most significant set bit in rWORD1 is in that
+   byte. */
+L(done0):
+	cntlzw	rTMP3, rWORD1		/* count of leading zero bits before the first 0x80 marker */
+	subf	rTMP1, rRTN, rSTR	/* rTMP1 = rSTR - s */
+	srwi	rTMP3, rTMP3, 3		/* bits / 8 = byte index of the NUL inside the word */
+	add	rRTN, rTMP1, rTMP3	/* result = word offset + byte index */
+	/* GKM FIXME: check high bound. */
+	blr
+END (BP_SYM (strlen))
diff --git a/src/system/libroot/posix/glibc/arch/x86/Jamfile b/src/system/libroot/posix/glibc/arch/x86/Jamfile
index 9e313f6e9a..109a47e400 100644
--- a/src/system/libroot/posix/glibc/arch/x86/Jamfile
+++ b/src/system/libroot/posix/glibc/arch/x86/Jamfile
@@ -101,6 +101,7 @@ MergeObject posix_gnu_arch_$(TARGET_ARCH)_other.o :
 	mplog.c
 	mul_1.S
 	lshift.S rshift.S
+	strlen.S
 	sub_n.S
 	submul_1.S
 	;
diff --git a/src/system/libroot/posix/glibc/arch/x86/strlen.S b/src/system/libroot/posix/glibc/arch/x86/strlen.S
new file mode 100644
index 0000000000..34151305cf
--- /dev/null
+++ b/src/system/libroot/posix/glibc/arch/x86/strlen.S
@@ -0,0 +1,188 @@
+/* strlen -- Compute length of NUL terminated string.
+   Highly optimized version for ix86, x>=5.
+   Copyright (C) 1995, 1996, 1997, 2000, 2002 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper, .
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA. */
+
+#include
+#include "asm-syntax.h"
+#include "bp-sym.h"
+#include "bp-asm.h"
+
+/* This version is especially optimized for the i586 (and following?)
+   processors. This is mainly done by using the two pipelines. The
+   version optimized for i486 is weak in this aspect because to get
+   as much parallelism we have to execute some *more* instructions.
+
+   The code below is structured to reflect the pairing of the instructions
+   as *I think* it is. I have no processor data book to verify this.
+   If you find something you think is incorrect let me know. */
+
+
+/* The magic value which is used throughout in the whole code. */
+#define magic 0xfefefeff
+
+#define PARMS	LINKAGE		/* no space for saved regs */
+#define STR	PARMS
+
+	.text
+ENTRY (BP_SYM (strlen))
+	ENTER
+
+	movl STR(%esp), %eax	/* %eax = s; it is the scan pointer from here on */
+	CHECK_BOUNDS_LOW (%eax, STR(%esp))
+	movl $3, %edx		/* load mask (= 3) */
+
+	andl %eax, %edx		/* separate last two bits of address */
+
+	jz L(1)			/* aligned => start loop */
+	jp L(0)			/* exactly two bits set (offset 3): one byte to check */
+
+	cmpb %dh, (%eax)	/* is byte NUL? (%dh is 0 because %edx <= 3) */
+	je L(2)			/* yes => return */
+
+	incl %eax		/* increment pointer */
+	cmpb %dh, (%eax)	/* is byte NUL? */
+
+	je L(2)			/* yes => return */
+
+	incl %eax		/* increment pointer */
+	xorl $2, %edx		/* %edx was 1 or 2 here; becomes 0 iff it was 2 */
+
+	jz L(1)			/* started at offset 2: now aligned, %edx == 0 */
+
+L(0):	cmpb %dh, (%eax)	/* is byte NUL? */
+	je L(2)			/* yes => return */
+
+	incl %eax		/* increment pointer */
+	xorl %edx, %edx		/* We need %edx == 0 for later */
+
+      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+	 change any of the hole bits of LONGWORD.
+
+	 1) Is this safe? Will it catch all the zero bytes?
+	 Suppose there is a byte with all zeros. Any carry bits
+	 propagating from its left will fall into the hole at its
+	 least significant bit and stop. Since there will be no
+	 carry from its most significant bit, the LSB of the
+	 byte to the left will be unchanged, and the zero will be
+	 detected.
+
+	 2) Is this worthwhile? Will it ignore everything except
+	 zero bytes? Suppose every byte of LONGWORD has a bit set
+	 somewhere. There will be a carry into bit 8. If bit 8
+	 is set, this will carry into bit 16. If bit 8 is clear,
+	 one of bits 9-15 must be set, so there will be a carry
+	 into bit 16. Similarly, there will be a carry into bit
+	 24. If one of bits 24-31 is set, there will be a carry
+	 into bit 32 (=carry flag), so all of the hole bits will
+	 be changed.
+
+	 Note: %edx == 0 in any case here. */
+
+L(1):
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	addl $4, %eax		/* adjust pointer for *next* word */
+
+	subl %ecx, %edx		/* %edx = -word (%edx was 0) */
+	addl $magic, %ecx	/* add magic word */
+
+	decl %edx		/* %edx = ~word; decl leaves CF from the addl intact */
+	jnc L(3)		/* no carry out of the addl => zero byte present */
+
+	xorl %ecx, %edx		/* (word+magic) ^ ~word: bit set where the add left a bit unchanged */
+
+	andl $~magic, %edx	/* keep only the hole bits (~magic = 0x01010100) */
+
+	jne L(3)		/* a hole bit did not change => determine byte */
+
+
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	addl $4, %eax		/* adjust pointer for *next* word */
+
+	subl %ecx, %edx		/* %edx = -word (%edx is 0 after the andl above) */
+	addl $magic, %ecx	/* add magic word */
+
+	decl %edx		/* %edx = ~word; decl leaves CF from the addl intact */
+	jnc L(3)		/* no carry out of the addl => zero byte present */
+
+	xorl %ecx, %edx		/* (word+magic) ^ ~word */
+
+	andl $~magic, %edx	/* keep only the hole bits */
+
+	jne L(3)		/* a hole bit did not change => determine byte */
+
+
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	addl $4, %eax		/* adjust pointer for *next* word */
+
+	subl %ecx, %edx		/* %edx = -word (%edx is 0 after the andl above) */
+	addl $magic, %ecx	/* add magic word */
+
+	decl %edx		/* %edx = ~word; decl leaves CF from the addl intact */
+	jnc L(3)		/* no carry out of the addl => zero byte present */
+
+	xorl %ecx, %edx		/* (word+magic) ^ ~word */
+
+	andl $~magic, %edx	/* keep only the hole bits */
+
+	jne L(3)		/* a hole bit did not change => determine byte */
+
+
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	addl $4, %eax		/* adjust pointer for *next* word */
+
+	subl %ecx, %edx		/* %edx = -word (%edx is 0 after the andl above) */
+	addl $magic, %ecx	/* add magic word */
+
+	decl %edx		/* %edx = ~word; decl leaves CF from the addl intact */
+	jnc L(3)		/* no carry out of the addl => zero byte present */
+
+	xorl %ecx, %edx		/* (word+magic) ^ ~word */
+
+	andl $~magic, %edx	/* keep only the hole bits */
+
+	je L(1)			/* no => start loop again (%edx == 0 here) */
+
+
+L(3):	subl $4, %eax		/* correct too early pointer increment */
+	subl $magic, %ecx	/* undo the addl: %ecx is the original word again */
+
+	cmpb $0, %cl		/* lowest byte NUL? */
+	jz L(2)			/* yes => return */
+
+	inc %eax		/* increment pointer */
+	testb %ch, %ch		/* second byte NUL? */
+
+	jz L(2)			/* yes => return */
+
+	shrl $16, %ecx		/* make upper bytes accessible */
+	incl %eax		/* increment pointer */
+
+	cmpb $0, %cl		/* is third byte NUL? */
+	jz L(2)			/* yes => return */
+
+	incl %eax		/* increment pointer: only the fourth byte is left */
+
+L(2):	CHECK_BOUNDS_HIGH (%eax, STR(%esp), jb)
+	subl STR(%esp), %eax	/* now compute the length as difference
+				   between start and terminating NUL
+				   character */
+	LEAVE
+	ret
+END (BP_SYM (strlen))
diff --git a/src/system/libroot/posix/string/Jamfile b/src/system/libroot/posix/string/Jamfile
index 21dffe5956..404b7cbac8 100644
--- a/src/system/libroot/posix/string/Jamfile
+++ b/src/system/libroot/posix/string/Jamfile
@@ -25,7 +25,6 @@ MergeObject posix_string.o :
 	strerror.c
 	strlcat.c
 	strlcpy.c
-	strlen.c
 	strncat.c
 	strncmp.c
 	strncpy.c
diff --git a/src/system/libroot/posix/string/strlen.c b/src/system/libroot/posix/string/strlen.c
deleted file mode 100644
index 08e0d9a037..0000000000
--- a/src/system/libroot/posix/string/strlen.c
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
-** Copyright 2001, Travis Geiselbrecht. All rights reserved.
-** Distributed under the terms of the NewOS License.
-*/
-
-#include
-#include
-
-
-size_t
-strlen(char const *s)
-{
-	size_t i = 0;
-
-	while (s[i]) {
-		i += 1;
-	}
-
-	return i;
-}