i386 libc string routines: replace the byte-at-a-time / rep-scasb versions
of memchr, strcat, strchr, strcmp, strlen and strrchr with word-at-a-time
implementations; index and rindex become thin wrappers around strchr.S and
strrchr.S.

Review notes for this revision of the patch:

 * memchr, strchr, strrchr: the search character is passed as a 32-bit
   int.  The byte-replication sequence used to OR the caller's upper 16
   bits back into the pattern, so a sign-extended char such as (char)0x80
   (0xffffff80) gave the pattern 0xffff8080 and the word loop could skip a
   match in the upper two bytes of a word.  The low half is now copied
   with movw, giving ch:ch:ch:ch for every argument value.
 * memchr: the loop-bound comment said "nbytes > 4"; the test continues
   when nbytes >= 4.
 * "#include" had lost its file name (angle-bracketed text was stripped
   when this patch was extracted).  It is restored as <machine/asm.h>,
   presumably the header that supplies ENTRY, RCSID and _ALIGN_TEXT --
   confirm against the tree.  E-mail addresses on the attribution lines
   were lost the same way and are left as found.
 * Spelling fixes in comments.  No lines were added or removed, so the
   hunk headers are unchanged; the blob ids on the "index" lines describe
   the pre-review contents.

diff --git a/lib/libc/arch/i386/string/index.S b/lib/libc/arch/i386/string/index.S
index 152294699a59..ad8943d65415 100644
--- a/lib/libc/arch/i386/string/index.S
+++ b/lib/libc/arch/i386/string/index.S
@@ -1,29 +1,4 @@
-/*
- * Written by J.T. Conklin .
- * Public domain.
- */
+/*	$NetBSD: index.S,v 1.12 2005/02/04 18:12:52 drochner Exp $	*/
 
-#include <machine/asm.h>
-
-#if defined(LIBC_SCCS)
-	RCSID("$NetBSD: index.S,v 1.11 2003/07/26 19:24:34 salo Exp $")
-#endif
-
-#ifdef STRCHR
-ENTRY(strchr)
-#else
-ENTRY(index)
-#endif
-	movl	4(%esp),%eax
-	movb	8(%esp),%cl
-	_ALIGN_TEXT,0x90
-L1:
-	movb	(%eax),%dl
-	cmpb	%dl,%cl			/* found char??? */
-	je	L2
-	incl	%eax
-	testb	%dl,%dl			/* null terminator??? */
-	jnz	L1
-	xorl	%eax,%eax
-L2:
-	ret
+#define INDEX
+#include "strchr.S"
diff --git a/lib/libc/arch/i386/string/memchr.S b/lib/libc/arch/i386/string/memchr.S
index aa58a4e44d32..d7b469e5dcb5 100644
--- a/lib/libc/arch/i386/string/memchr.S
+++ b/lib/libc/arch/i386/string/memchr.S
@@ -1,29 +1,108 @@
 /*
- * Written by J.T. Conklin .
+ * Written by J.T. Conklin
  * Public domain.
  */
 
 #include <machine/asm.h>
 
 #if defined(LIBC_SCCS)
-	RCSID("$NetBSD: memchr.S,v 1.10 2003/07/26 19:24:34 salo Exp $")
+	RCSID("$NetBSD: memchr.S,v 1.11 2005/02/04 18:12:52 drochner Exp $")
 #endif
 
 ENTRY(memchr)
-	pushl	%edi
-	movl	8(%esp),%edi		/* string address */
-	movl	12(%esp),%eax		/* set character to search for */
-	movl	16(%esp),%ecx		/* set length of search */
-	testl	%ecx,%ecx		/* test for len == 0 */
-	jz	L1
-	cld				/* set search forward */
-	repne				/* search! */
-	scasb
-	jne	L1			/* scan failed, return null */
-	leal	-1(%edi),%eax		/* adjust result of scan */
-	popl	%edi
-	ret
-	_ALIGN_TEXT,0x90
-L1:	xorl	%eax,%eax
-	popl	%edi
+	pushl	%esi
+	movl	8(%esp),%eax		/* eax = buffer */
+	movl	12(%esp),%ecx		/* ecx = ch (int; only %cl matters) */
+	movl	16(%esp),%esi		/* esi = nbytes remaining */
+
+	/*
+	 * Align to word boundary.
+	 * Consider unrolling loop?
+	 */
+	testl	%esi,%esi		/* nbytes == 0? */
+	je	.Lzero
+.Lalign:
+	testb	$3,%al
+	je	.Lword_aligned
+	cmpb	(%eax),%cl
+	je	.Ldone
+	incl	%eax
+	decl	%esi
+	jnz	.Lalign
+
+.Lword_aligned:
+	/* copy low byte of ch to all bytes in word; upper bits ignored */
+	movb	%cl,%ch			/*  cx = ch:ch */
+	movl	%ecx,%edx
+	sall	$16,%ecx		/* ecx = ch:ch:0:0 */
+	movw	%dx,%cx			/* ecx = ch:ch:ch:ch */
+
+	_ALIGN_TEXT
+.Lloop:
+	cmpl	$3,%esi			/* nbytes >= 4? */
+	jbe	.Lbyte
+	movl	(%eax),%edx
+	addl	$4,%eax
+	xorl	%ecx,%edx		/* bytes equal to ch become 0 */
+	subl	$4,%esi
+	subl	$0x01010101,%edx
+	testl	$0x80808080,%edx
+	je	.Lloop
+
+	/*
+	 * In rare cases, the above loop may exit prematurely. We must
+	 * return to the loop if none of the bytes in the word are
+	 * equal to ch.
+	 */
+
+	/*
+	 * High load-use latency on the Athlon leads to significant
+	 * stalls, so we preload the next char as soon as possible
+	 * instead of using cmp mem8, reg8.
+	 *
+	 * Alignment here avoids a stall on the Athlon, even though
+	 * it's not a branch target.
+	 */
+	_ALIGN_TEXT
+	cmpb	-4(%eax),%cl		/* 1st byte == ch? */
+	movb	-3(%eax),%dl
+	jne	1f
+	subl	$4,%eax
+	jmp	.Ldone
+
+	_ALIGN_TEXT
+1:	cmpb	%dl,%cl			/* 2nd byte == ch? */
+	movb	-2(%eax),%dl
+	jne	1f
+	subl	$3,%eax
+	jmp	.Ldone
+
+	_ALIGN_TEXT
+1:	cmpb	%dl,%cl			/* 3rd byte == ch? */
+	movb	-1(%eax),%dl
+	jne	1f
+	subl	$2,%eax
+	jmp	.Ldone
+
+	_ALIGN_TEXT
+1:	cmpb	%dl,%cl			/* 4th byte == ch? */
+	jne	.Lloop
+	decl	%eax
+	jmp	.Ldone
+
+.Lbyte:
+	testl	%esi,%esi
+	je	.Lzero
+.Lbyte_loop:
+	cmpb	(%eax),%cl
+	je	.Ldone
+	incl	%eax
+	decl	%esi
+	jnz	.Lbyte_loop
+
+.Lzero:
+	xorl	%eax,%eax
+
+.Ldone:
+	popl	%esi
 	ret
diff --git a/lib/libc/arch/i386/string/rindex.S b/lib/libc/arch/i386/string/rindex.S
index 10768d13a5f5..6feda25ef1d8 100644
--- a/lib/libc/arch/i386/string/rindex.S
+++ b/lib/libc/arch/i386/string/rindex.S
@@ -1,32 +1,4 @@
-/*
- * Written by J.T. Conklin .
- * Public domain.
- */
+/*	$NetBSD: rindex.S,v 1.12 2005/02/04 18:12:52 drochner Exp $	*/
 
-#include <machine/asm.h>
-
-#if defined(LIBC_SCCS)
-	RCSID("$NetBSD: rindex.S,v 1.11 2003/07/26 19:24:34 salo Exp $")
-#endif
-
-#ifdef STRRCHR
-ENTRY(strrchr)
-#else
-ENTRY(rindex)
-#endif
-	pushl	%ebx
-	movl	8(%esp),%edx
-	movb	12(%esp),%cl
-	xorl	%eax,%eax		/* init pointer to null */
-	_ALIGN_TEXT,0x90
-L1:
-	movb	(%edx),%bl
-	cmpb	%bl,%cl
-	jne	L2
-	movl	%edx,%eax
-L2:
-	incl	%edx
-	testb	%bl,%bl			/* null terminator??? */
-	jnz	L1
-	popl	%ebx
-	ret
+#define RINDEX
+#include "strrchr.S"
diff --git a/lib/libc/arch/i386/string/strcat.S b/lib/libc/arch/i386/string/strcat.S
index 090d1c23e171..68270f754c94 100644
--- a/lib/libc/arch/i386/string/strcat.S
+++ b/lib/libc/arch/i386/string/strcat.S
@@ -1,69 +1,127 @@
 /*
- * Written by J.T. Conklin .
+ * Written by J.T. Conklin
  * Public domain.
  */
 
 #include <machine/asm.h>
 
 #if defined(LIBC_SCCS)
-	RCSID("$NetBSD: strcat.S,v 1.10 2003/07/26 19:24:35 salo Exp $")
+	RCSID("$NetBSD: strcat.S,v 1.11 2005/02/04 18:12:52 drochner Exp $")
 #endif
 
-/*
- * NOTE: I've unrolled the loop eight times: large enough to make a
- * significant difference, and small enough not to totally trash the
- * cache.
- */
-
 ENTRY(strcat)
-	pushl	%edi			/* save edi */
-	movl	8(%esp),%edi		/* dst address */
-	movl	12(%esp),%edx		/* src address */
-	pushl	%edi			/* push destination address */
+	pushl	%ebx
+	movl	8(%esp),%ecx		/* ecx = dst */
+	movl	12(%esp),%eax		/* eax = src */
 
-	cld				/* set search forward */
-	xorl	%eax,%eax		/* set search for null terminator */
-	movl	$-1,%ecx		/* set search for lots of characters */
-	repne				/* search! */
-	scasb
+	/*
+	 * Align destination to word boundary.
+	 * Consider unrolling loop?
+	 */
+.Lscan:
+.Lscan_align:
+	testb	$3,%cl
+	je	.Lscan_aligned
+	cmpb	$0,(%ecx)
+	je	.Lcopy
+	incl	%ecx
+	jmp	.Lscan_align
 
-	leal	-1(%edi),%ecx		/* correct dst address */
+	_ALIGN_TEXT
+.Lscan_aligned:
+.Lscan_loop:
+	movl	(%ecx),%ebx
+	addl	$4,%ecx
+	leal	-0x01010101(%ebx),%edx
+	testl	$0x80808080,%edx
+	je	.Lscan_loop
 
-	_ALIGN_TEXT,0x90
-L1:	movb	(%edx),%al		/* unroll loop, but not too much */
-	movb	%al,(%ecx)
-	testb	%al,%al
-	jz	L2
-	movb	1(%edx),%al
-	movb	%al,1(%ecx)
-	testb	%al,%al
-	jz	L2
-	movb	2(%edx),%al
-	movb	%al,2(%ecx)
-	testb	%al,%al
-	jz	L2
-	movb	3(%edx),%al
-	movb	%al,3(%ecx)
-	testb	%al,%al
-	jz	L2
-	movb	4(%edx),%al
-	movb	%al,4(%ecx)
-	testb	%al,%al
-	jz	L2
-	movb	5(%edx),%al
-	movb	%al,5(%ecx)
-	testb	%al,%al
-	jz	L2
-	movb	6(%edx),%al
-	movb	%al,6(%ecx)
-	testb	%al,%al
-	jz	L2
-	movb	7(%edx),%al
-	movb	%al,7(%ecx)
-	addl	$8,%edx
-	addl	$8,%ecx
-	testb	%al,%al
-	jnz	L1
-L2:	popl	%eax			/* pop destination address */
-	popl	%edi			/* restore edi */
+	/*
+	 * In rare cases, the above loop may exit prematurely. We must
+	 * return to the loop if none of the bytes in the word equal 0.
+	 */
+
+	/*
+	 * The optimal code for determining whether each byte is zero
+	 * differs by processor.  This space-optimized code should be
+	 * acceptable on all, especially since we don't expect it to
+	 * be run frequently.
+	 */
+
+	testb	%bl,%bl			/* 1st byte == 0? */
+	jne	1f
+	subl	$4,%ecx
+	jmp	.Lcopy
+
+1:	testb	%bh,%bh			/* 2nd byte == 0? */
+	jne	1f
+	subl	$3,%ecx
+	jmp	.Lcopy
+
+1:	shrl	$16,%ebx
+	testb	%bl,%bl			/* 3rd byte == 0? */
+	jne	1f
+	subl	$2,%ecx
+	jmp	.Lcopy
+
+1:	testb	%bh,%bh			/* 4th byte == 0? */
+	jne	.Lscan_loop
+	subl	$1,%ecx
+
+	/*
+	 * Align source to a word boundary.
+	 * Consider unrolling loop?
+	 */
+.Lcopy:
+.Lcopy_align:
+	testl	$3,%eax
+	je	.Lcopy_aligned
+	movb	(%eax),%bl
+	incl	%eax
+	movb	%bl,(%ecx)
+	incl	%ecx
+	testb	%bl,%bl
+	jne	.Lcopy_align
+	jmp	.Ldone
+
+	_ALIGN_TEXT
+.Lcopy_loop:
+	movl	%ebx,(%ecx)
+	addl	$4,%ecx
+.Lcopy_aligned:
+	movl	(%eax),%ebx
+	addl	$4,%eax
+	leal	-0x01010101(%ebx),%edx
+	testl	$0x80808080,%edx
+	je	.Lcopy_loop
+
+	/*
+	 * In rare cases, the above loop may exit prematurely. We must
+	 * return to the loop if none of the bytes in the word equal 0.
+	 */
+
+	movb	%bl,(%ecx)
+	incl	%ecx
+	testb	%bl,%bl
+	je	.Ldone
+
+	movb	%bh,(%ecx)
+	incl	%ecx
+	testb	%bh,%bh
+	je	.Ldone
+
+	shrl	$16,%ebx
+	movb	%bl,(%ecx)
+	incl	%ecx
+	testb	%bl,%bl
+	je	.Ldone
+
+	movb	%bh,(%ecx)
+	incl	%ecx
+	testb	%bh,%bh
+	jne	.Lcopy_aligned
+
+.Ldone:
+	movl	8(%esp),%eax		/* return original dst */
+	popl	%ebx
 	ret
diff --git a/lib/libc/arch/i386/string/strchr.S b/lib/libc/arch/i386/string/strchr.S
index 6598976f3d62..50f33c2f18db 100644
--- a/lib/libc/arch/i386/string/strchr.S
+++ b/lib/libc/arch/i386/string/strchr.S
@@ -1,4 +1,105 @@
-/*	$NetBSD: strchr.S,v 1.4 1998/01/09 03:45:09 perry Exp $	*/
+/*
+ * Written by J.T. Conklin
+ * Public domain.
+ */
 
-#define STRCHR
-#include "index.S"
+#include <machine/asm.h>
+
+#if defined(LIBC_SCCS)
+	RCSID("$NetBSD: strchr.S,v 1.5 2005/02/04 18:12:52 drochner Exp $")
+#endif
+
+#ifdef INDEX
+ENTRY(index)
+#else
+ENTRY(strchr)
+#endif
+	pushl	%esi
+	pushl	%ebx
+	movl	12(%esp),%eax		/* eax = string */
+	movl	16(%esp),%ecx		/* ecx = ch (int; only %cl matters) */
+
+	/*
+	 * Align to word boundary.
+	 * Consider unrolling loop?
+	 */
+.Lalign:
+	testb	$3,%al
+	je	.Lword_aligned
+	movb	(%eax),%bl
+	cmpb	%cl,%bl
+	je	.Ldone
+	testb	%bl,%bl
+	je	.Lzero
+	incl	%eax
+	jmp	.Lalign
+
+.Lword_aligned:
+	/* copy low byte of ch to all bytes in word; upper bits ignored */
+	movb	%cl,%ch			/*  cx = ch:ch */
+	movl	%ecx,%edx
+	sall	$16,%ecx		/* ecx = ch:ch:0:0 */
+	movw	%dx,%cx			/* ecx = ch:ch:ch:ch */
+
+	/* Check whether any byte in the word is equal to ch or 0. */
+	_ALIGN_TEXT
+.Lloop:
+	movl	(%eax),%ebx
+	addl	$4,%eax
+	movl	%ebx,%esi
+	leal	-0x01010101(%ebx),%edx
+	xorl	%ecx,%esi
+	subl	$0x01010101,%esi
+	orl	%esi,%edx
+	testl	$0x80808080,%edx
+	je	.Lloop
+
+	/*
+	 * In rare cases, the above loop may exit prematurely. We must
+	 * return to the loop if none of the bytes in the word match
+	 * ch or are equal to 0.
+	 */
+
+	/*
+	 * Alignment here avoids a stall on the Athlon, even though
+	 * it's not a branch target.
+	 */
+
+	_ALIGN_TEXT
+	cmpb	%cl,%bl			/* 1st byte == ch? */
+	jne	1f
+	subl	$4,%eax
+	jmp	.Ldone
+1:	testb	%bl,%bl			/* 1st byte == 0? */
+	je	.Lzero
+
+	cmpb	%cl,%bh			/* 2nd byte == ch? */
+	jne	1f
+	subl	$3,%eax
+	jmp	.Ldone
+1:	testb	%bh,%bh			/* 2nd byte == 0? */
+	je	.Lzero
+
+	shrl	$16,%ebx
+	cmpb	%cl,%bl			/* 3rd byte == ch? */
+	jne	1f
+	subl	$2,%eax
+	jmp	.Ldone
+1:	testb	%bl,%bl			/* 3rd byte == 0? */
+	je	.Lzero
+
+	cmpb	%cl,%bh			/* 4th byte == ch? */
+	jne	1f
+	decl	%eax
+	jmp	.Ldone
+1:	testb	%bh,%bh			/* 4th byte == 0? */
+	jne	.Lloop
+
+.Lzero:
+	/* If a ch wasn't found, return 0. */
+	xorl	%eax,%eax
+
+.Ldone:
+	popl	%ebx
+	popl	%esi
+	ret
diff --git a/lib/libc/arch/i386/string/strcmp.S b/lib/libc/arch/i386/string/strcmp.S
index 795c8d459eaa..ad5361f72a78 100644
--- a/lib/libc/arch/i386/string/strcmp.S
+++ b/lib/libc/arch/i386/string/strcmp.S
@@ -1,84 +1,77 @@
 /*
- * Written by J.T. Conklin .
+ * Written by J.T. Conklin
  * Public domain.
  */
 
 #include <machine/asm.h>
 
 #if defined(LIBC_SCCS)
-	RCSID("$NetBSD: strcmp.S,v 1.13 2003/07/26 19:24:35 salo Exp $")
+	RCSID("$NetBSD: strcmp.S,v 1.14 2005/02/04 18:12:52 drochner Exp $")
 #endif
 
-/*
- * NOTE: I've unrolled the loop eight times: large enough to make a
- * significant difference, and small enough not to totally trash the
- * cache.
- */
-
 ENTRY(strcmp)
-	movl	0x04(%esp),%eax
-	movl	0x08(%esp),%edx
-	jmp	L2			/* Jump into the loop! */
+	pushl	%esi
+	pushl	%ebx
+	movl	12(%esp),%ebx		/* ebx = s1 */
+	movl	16(%esp),%esi		/* esi = s2 */
 
-	_ALIGN_TEXT,0x90
-L1:	incl	%eax
-	incl	%edx
-L2:	movb	(%eax),%cl
-	testb	%cl,%cl			/* null terminator??? */
-	jz	L3
-	cmpb	%cl,(%edx)		/* chars match??? */
-	jne	L3
-	incl	%eax
-	incl	%edx
-	movb	(%eax),%cl
-	testb	%cl,%cl
-	jz	L3
-	cmpb	%cl,(%edx)
-	jne	L3
-	incl	%eax
-	incl	%edx
-	movb	(%eax),%cl
-	testb	%cl,%cl
-	jz	L3
-	cmpb	%cl,(%edx)
-	jne	L3
-	incl	%eax
-	incl	%edx
-	movb	(%eax),%cl
-	testb	%cl,%cl
-	jz	L3
-	cmpb	%cl,(%edx)
-	jne	L3
-	incl	%eax
-	incl	%edx
-	movb	(%eax),%cl
-	testb	%cl,%cl
-	jz	L3
-	cmpb	%cl,(%edx)
-	jne	L3
-	incl	%eax
-	incl	%edx
-	movb	(%eax),%cl
-	testb	%cl,%cl
-	jz	L3
-	cmpb	%cl,(%edx)
-	jne	L3
-	incl	%eax
-	incl	%edx
-	movb	(%eax),%cl
-	testb	%cl,%cl
-	jz	L3
-	cmpb	%cl,(%edx)
-	jne	L3
-	incl	%eax
-	incl	%edx
-	movb	(%eax),%cl
-	testb	%cl,%cl
-	jz	L3
-	cmpb	%cl,(%edx)
-	je	L1
-	_ALIGN_TEXT,0x90
-L3:	movzbl	(%eax),%eax		/* unsigned comparison */
-	movzbl	(%edx),%edx
+	/*
+	 * Align s1 to word boundary.
+	 * Consider unrolling loop?
+	 */
+.Ls1align:
+	testb	$3,%bl
+	je	.Ls1aligned
+	movb	(%ebx),%al
+	incl	%ebx
+	movb	(%esi),%dl
+	incl	%esi
+	testb	%al,%al
+	je	.Ldone
+	cmpb	%al,%dl
+	je	.Ls1align
+	jmp	.Ldone
+
+	/*
+	 * Check whether s2 is aligned to a word boundary.  If it is, we
+	 * can compare by words.  Otherwise we have to compare by bytes.
+	 */
+.Ls1aligned:
+	testl	$3,%esi
+	jne	.Lbyte_loop
+
+	subl	$4,%ebx			/* bias so the loop can pre-increment */
+	subl	$4,%esi
+
+	_ALIGN_TEXT
+.Lword_loop:
+	movl	4(%ebx),%eax
+	addl	$4,%ebx
+	movl	4(%esi),%edx
+	addl	$4,%esi
+	cmpl	%eax,%edx
+	jne	.Lbyte_loop		/* words differ: rescan this word */
+	subl	$0x01010101,%edx
+	notl	%eax
+	andl	%eax,%edx
+	testl	$0x80808080,%edx	/* any zero byte in the word? */
+	je	.Lword_loop
+
+	_ALIGN_TEXT
+.Lbyte_loop:
+	movb	(%ebx),%al
+	incl	%ebx
+	movb	(%esi),%dl
+	incl	%esi
+	testb	%al,%al
+	je	.Ldone
+	cmpb	%al,%dl
+	je	.Lbyte_loop
+
+.Ldone:
+	movzbl	%al,%eax		/* unsigned comparison */
+	movzbl	%dl,%edx
 	subl	%edx,%eax
+	popl	%ebx
+	popl	%esi
 	ret
diff --git a/lib/libc/arch/i386/string/strlen.S b/lib/libc/arch/i386/string/strlen.S
index 38ebda3628c9..b8f5721c4537 100644
--- a/lib/libc/arch/i386/string/strlen.S
+++ b/lib/libc/arch/i386/string/strlen.S
@@ -1,23 +1,141 @@
 /*
- * Written by J.T. Conklin .
+ * Written by J.T. Conklin
  * Public domain.
  */
 
 #include <machine/asm.h>
 
 #if defined(LIBC_SCCS)
-	RCSID("$NetBSD: strlen.S,v 1.7 2003/07/26 19:24:35 salo Exp $")
+	RCSID("$NetBSD: strlen.S,v 1.8 2005/02/04 18:12:52 drochner Exp $")
 #endif
 
 ENTRY(strlen)
-	pushl	%edi
-	movl	8(%esp),%edi		/* string address */
-	cld				/* set search forward */
-	xorl	%eax,%eax		/* set search for null terminator */
-	movl	$-1,%ecx		/* set search for lots of characters */
-	repne				/* search! */
-	scasb
-	notl	%ecx			/* get length by taking complement */
-	leal	-1(%ecx),%eax		/* and subtracting one */
-	popl	%edi
+	movl	4(%esp),%eax
+
+.Lalign:
+	/* Consider unrolling loop? */
+	testb	$3,%al
+	je	.Lword_aligned
+	cmpb	$0,(%eax)
+	je	.Ldone
+	incl	%eax
+	jmp	.Lalign
+
+	/*
+	 * There are many well known branch-free sequences which are used
+	 * for determining whether a zero-byte is contained within a word.
+	 * These sequences are generally much more efficient than loading
+	 * and comparing each byte individually.
+	 *
+	 * The expression [1,2]:
+	 *
+	 * (1)  ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | (x | 0x7f7f7f7f))
+	 *
+	 * evaluates to a non-zero value if any of the bytes in the
+	 * original word is zero.
+	 *
+	 * It also has the useful property that bytes in the result word
+	 * that correspond to non-zero bytes in the original word have
+	 * the value 0x00, while bytes corresponding to zero bytes have
+	 * the value 0x80. This allows calculation of the first (and
+	 * last) occurrence of a zero byte within the word (useful for C's
+	 * str* primitives) by counting the number of leading (or
+	 * trailing) zeros and dividing the result by 8.  On machines
+	 * without (or with slow) clz() / ctz() instructions, testing
+	 * each byte in the result word for zero is necessary.
+	 *
+	 * This typically takes 4 instructions (5 on machines without
+	 * "not-or") not including those needed to load the constant.
+	 *
+	 *
+	 * The expression:
+	 *
+	 * (2)  ((x - 0x01010101) & ~x & 0x80808080)
+	 *
+	 * evaluates to a non-zero value if any of the bytes in the
+	 * original word is zero.
+	 *
+	 * On little endian machines, the first byte in the result word
+	 * that corresponds to a zero byte in the original byte is 0x80,
+	 * so clz() can be used as above.  On big endian machines, and
+	 * little endian machines without (or with a slow) clz() insn,
+	 * testing each byte in the original for zero is necessary.
+	 *
+	 * This typically takes 3 instructions (4 on machines without
+	 * "and with complement") not including those needed to load
+	 * constants.
+	 *
+	 *
+	 * The expression:
+	 *
+	 * (3)  ((x - 0x01010101) & 0x80808080)
+	 *
+	 * always evaluates to a non-zero value if any of the bytes in
+	 * the original word is zero.  However, in rare cases, it also
+	 * evaluates to a non-zero value when none of the bytes in the
+	 * original word is zero.
+	 *
+	 * To account for possible false positives, each byte of the
+	 * original word must be checked when the expression evaluates to
+	 * a non-zero value.  However, because it is simpler than those
+	 * presented above, code that uses it will be faster as long as
+	 * the rate of false positives is low.
+	 *
+	 * This is likely, because the false positive can only occur
+	 * if the most significant bit of a byte within the word is set.
+	 * The expression will never fail for typical 7-bit ASCII strings.
+	 *
+	 * This typically takes 2 instructions not including those needed
+	 * to load constants.
+	 *
+	 *
+	 * [1] Henry S. Warren Jr., "Hacker's Delight", Addison-Wesley 2003
+	 *
+	 * [2] International Business Machines, "The PowerPC Compiler Writer's
+	 *     Guide", Warthman Associates, 1996
+	 */
+
+	_ALIGN_TEXT
+.Lword_aligned:
+.Lloop:
+	movl	(%eax),%ecx
+	addl	$4,%eax
+	leal	-0x01010101(%ecx),%edx
+	testl	$0x80808080,%edx
+	je	.Lloop
+
+	/*
+	 * In rare cases, the above loop may exit prematurely. We must
+	 * return to the loop if none of the bytes in the word equal 0.
+	 */
+
+	/*
+	 * The optimal code for determining whether each byte is zero
+	 * differs by processor.  This space-optimized code should be
+	 * acceptable on all, especially since we don't expect it to
+	 * be run frequently.
+	 */
+
+	testb	%cl,%cl			/* 1st byte == 0? */
+	jne	1f
+	subl	$4,%eax
+	jmp	.Ldone
+
+1:	testb	%ch,%ch			/* 2nd byte == 0? */
+	jne	1f
+	subl	$3,%eax
+	jmp	.Ldone
+
+1:	shrl	$16,%ecx
+	testb	%cl,%cl			/* 3rd byte == 0? */
+	jne	1f
+	subl	$2,%eax
+	jmp	.Ldone
+
+1:	testb	%ch,%ch			/* 4th byte == 0? */
+	jne	.Lloop
+	decl	%eax
+
+.Ldone:
+	subl	4(%esp),%eax		/* length = end pointer - start */
 	ret
diff --git a/lib/libc/arch/i386/string/strrchr.S b/lib/libc/arch/i386/string/strrchr.S
index 8de16a951a15..eff8779363e6 100644
--- a/lib/libc/arch/i386/string/strrchr.S
+++ b/lib/libc/arch/i386/string/strrchr.S
@@ -1,4 +1,98 @@
-/*	$NetBSD: strrchr.S,v 1.5 1998/01/09 03:45:09 perry Exp $	*/
+/*
+ * Written by J.T. Conklin
+ * Public domain.
+ */
 
-#define STRRCHR
-#include "rindex.S"
+#include <machine/asm.h>
+
+#if defined(LIBC_SCCS)
+	RCSID("$NetBSD: strrchr.S,v 1.6 2005/02/04 18:12:52 drochner Exp $")
+#endif
+
+#ifdef RINDEX
+ENTRY(rindex)
+#else
+ENTRY(strrchr)
+#endif
+	pushl	%esi
+	pushl	%edi
+	pushl	%ebx
+	movl	16(%esp),%edx		/* edx = string */
+	movl	20(%esp),%ecx		/* ecx = ch (int; only %cl matters) */
+
+	/* zero return value */
+	xorl	%eax,%eax
+
+	/*
+	 * Align to word boundary.
+	 * Consider unrolling loop?
+	 */
+.Lalign:
+	testb	$3,%dl
+	je	.Lword_aligned
+	movb	(%edx),%bl
+	cmpb	%cl,%bl
+	jne	1f
+	movl	%edx,%eax
+1:	testb	%bl,%bl
+	je	.Ldone
+	incl	%edx
+	jmp	.Lalign
+
+.Lword_aligned:
+	/* copy low byte of ch to all bytes in word; upper bits ignored */
+	movb	%cl,%ch			/*  cx = ch:ch */
+	movl	%ecx,%edi
+	sall	$16,%ecx		/* ecx = ch:ch:0:0 */
+	movw	%di,%cx			/* ecx = ch:ch:ch:ch */
+
+	/* Check whether any byte in the word is equal to ch or 0. */
+	_ALIGN_TEXT
+.Lloop:
+	movl	(%edx),%ebx
+	addl	$4,%edx
+	movl	%ebx,%esi
+	leal	-0x01010101(%ebx),%edi
+	xorl	%ecx,%esi
+	subl	$0x01010101,%esi
+	orl	%esi,%edi
+	testl	$0x80808080,%edi
+	je	.Lloop
+
+	/*
+	 * In rare cases, the above loop may exit prematurely. We must
+	 * return to the loop if none of the bytes in the word match
+	 * ch or are equal to 0.
+	 */
+
+	_ALIGN_TEXT
+	cmpb	%cl,%bl			/* 1st byte == ch? */
+	jne	1f
+	leal	-4(%edx),%eax
+1:	testb	%bl,%bl			/* 1st byte == 0? */
+	je	.Ldone
+
+	cmpb	%cl,%bh			/* 2nd byte == ch? */
+	jne	1f
+	leal	-3(%edx),%eax
+1:	testb	%bh,%bh			/* 2nd byte == 0? */
+	je	.Ldone
+
+	shrl	$16,%ebx
+	cmpb	%cl,%bl			/* 3rd byte == ch? */
+	jne	1f
+	leal	-2(%edx),%eax
+1:	testb	%bl,%bl			/* 3rd byte == 0? */
+	je	.Ldone
+
+	cmpb	%cl,%bh			/* 4th byte == ch? */
+	jne	1f
+	leal	-1(%edx),%eax
+1:	testb	%bh,%bh			/* 4th byte == 0? */
+	jne	.Lloop
+
+.Ldone:
+	popl	%ebx
+	popl	%edi
+	popl	%esi
+	ret