After alg 2 triggers, mask with ~x (alg 3) to ignore bytes with top bit set.
Then use bit scan to work out which byte is zero. If the source is misaligned read the aligned word and make the unwanted (low order) bytes non-zero. Passes regression test - which probably tests just enough cases.
This commit is contained in:
parent
749e537220
commit
66df44074f
@ -6,151 +6,140 @@
|
||||
#include <machine/asm.h>
|
||||
|
||||
#if defined(LIBC_SCCS)
|
||||
RCSID("$NetBSD: strlen.S,v 1.2 2009/07/11 08:48:51 dsl Exp $")
|
||||
RCSID("$NetBSD: strlen.S,v 1.3 2009/07/11 11:57:47 dsl Exp $")
|
||||
#endif
|
||||
|
||||
/*
|
||||
* There are many well known branch-free sequences which are used
|
||||
* for determining whether a zero-byte is contained within a word.
|
||||
* These sequences are generally much more efficent than loading
|
||||
* and comparing each byte individually.
|
||||
*
|
||||
* The expression [1,2]:
|
||||
*
|
||||
* (1) ~(((x & 0x7f....7f) + 0x7f....7f) | (x | 0x7f....7f))
|
||||
*
|
||||
* evaluates to a non-zero value if any of the bytes in the
|
||||
* original word is zero.
|
||||
*
|
||||
* It also has the useful property that bytes in the result word
|
||||
* that correspond to non-zero bytes in the original word have
|
||||
* the value 0x00, while bytes corresponding to zero bytes have
|
||||
* the value 0x80. This allows calculation of the first (and
|
||||
* last) occurrence of a zero byte within the word (useful for C's
|
||||
* str* primitives) by counting the number of leading (or
|
||||
* trailing) zeros and dividing the result by 8. On machines
|
||||
* without (or with slow) clz() / ctz() instructions, testing
|
||||
* each byte in the result word for zero is necessary.
|
||||
*
|
||||
* This typically takes 4 instructions (5 on machines without
|
||||
* "not-or") not including those needed to load the constant.
|
||||
*
|
||||
*
|
||||
* The expression:
|
||||
*
|
||||
* (2) ((x - 0x01....01) & 0x80....80 & ~x)
|
||||
*
|
||||
* evaluates to a non-zero value if any of the bytes in the
|
||||
* original word is zero.
|
||||
*
|
||||
* On little endian machines, the first byte in the result word
|
||||
* that corresponds to a zero byte in the original byte is 0x80,
|
||||
* so clz() can be used as above. On big endian machines, and
|
||||
* little endian machines without (or with a slow) clz() insn,
|
||||
* testing each byte in the original for zero is necessary.
|
||||
*
|
||||
* This typically takes 3 instructions (4 on machines without
|
||||
* "and with complement") not including those needed to load
|
||||
* constants.
|
||||
*
|
||||
*
|
||||
* The expression:
|
||||
*
|
||||
* (3) ((x - 0x01....01) & 0x80....80)
|
||||
*
|
||||
* always evaluates to a non-zero value if any of the bytes in
|
||||
* the original word is zero or has the top bit set.
|
||||
* For strings that are likely to only contain 7-bit ascii these
|
||||
* false positives will be rare.
|
||||
*
|
||||
* To account for possible false positives, each byte of the
|
||||
* original word must be checked when the expression evaluates to
|
||||
* a non-zero value. However, because it is simpler than those
|
||||
* presented above, code that uses it will be faster as long as
|
||||
* the rate of false positives is low.
|
||||
*
|
||||
* This is likely, because the the false positive can only occur
|
||||
* if the most siginificant bit of a byte within the word is set.
|
||||
* The expression will never fail for typical 7-bit ASCII strings.
|
||||
*
|
||||
* This typically takes 2 instructions not including those needed
|
||||
* to load constants.
|
||||
*
|
||||
*
|
||||
* [1] Henry S. Warren Jr., "Hacker's Delight", Addison-Westley 2003
|
||||
*
|
||||
* [2] International Business Machines, "The PowerPC Compiler Writer's
|
||||
* Guide", Warthman Associates, 1996
|
||||
*/
|
||||
|
||||
#ifdef TEST_STRLEN
|
||||
ENTRY(test_strlen)
|
||||
#else
|
||||
ENTRY(strlen)
|
||||
movq %rdi,%rax
|
||||
negq %rdi
|
||||
#endif
|
||||
movabsq $0x0101010101010101,%r8
|
||||
|
||||
.Lalign:
|
||||
/* Consider unrolling loop? */
|
||||
testb $7,%al
|
||||
je .Lword_aligned
|
||||
cmpb $0,(%rax)
|
||||
jne 1f
|
||||
leaq (%rdi,%rax),%rax
|
||||
ret
|
||||
1: incq %rax
|
||||
jmp .Lalign
|
||||
|
||||
/*
|
||||
* There are many well known branch-free sequences which are used
|
||||
* for determining whether a zero-byte is contained within a word.
|
||||
* These sequences are generally much more efficent than loading
|
||||
* and comparing each byte individually.
|
||||
*
|
||||
* The expression [1,2]:
|
||||
*
|
||||
* (1) ~(((x & 0x7f....7f) + 0x7f....7f) | (x | 0x7f....7f))
|
||||
*
|
||||
* evaluates to a non-zero value if any of the bytes in the
|
||||
* original word is zero.
|
||||
*
|
||||
* It also has the useful property that bytes in the result word
|
||||
* that correspond to non-zero bytes in the original word have
|
||||
* the value 0x00, while bytes corresponding to zero bytes have
|
||||
* the value 0x80. This allows calculation of the first (and
|
||||
* last) occurrence of a zero byte within the word (useful for C's
|
||||
* str* primitives) by counting the number of leading (or
|
||||
* trailing) zeros and dividing the result by 8. On machines
|
||||
* without (or with slow) clz() / ctz() instructions, testing
|
||||
* each byte in the result word for zero is necessary.
|
||||
*
|
||||
* This typically takes 4 instructions (5 on machines without
|
||||
* "not-or") not including those needed to load the constant.
|
||||
*
|
||||
*
|
||||
* The expression:
|
||||
*
|
||||
* (2) ((x - 0x01....01) & ~x & 0x80....80)
|
||||
*
|
||||
* evaluates to a non-zero value if any of the bytes in the
|
||||
* original word is zero.
|
||||
*
|
||||
* On little endian machines, the first byte in the result word
|
||||
* that corresponds to a zero byte in the original byte is 0x80,
|
||||
* so clz() can be used as above. On big endian machines, and
|
||||
* little endian machines without (or with a slow) clz() insn,
|
||||
* testing each byte in the original for zero is necessary.
|
||||
*
|
||||
* This typically takes 3 instructions (4 on machines without
|
||||
* "and with complement") not including those needed to load
|
||||
* constants.
|
||||
*
|
||||
*
|
||||
* The expression:
|
||||
*
|
||||
* (3) ((x - 0x01....01) & 0x80....80)
|
||||
*
|
||||
* always evaluates to a non-zero value if any of the bytes in
|
||||
* the original word is zero or has the top bit set.
|
||||
* For strings that are likely to only contain 7-bit ascii these
|
||||
* false positives will be rare.
|
||||
*
|
||||
* To account for possible false positives, each byte of the
|
||||
* original word must be checked when the expression evaluates to
|
||||
* a non-zero value. However, because it is simpler than those
|
||||
* presented above, code that uses it will be faster as long as
|
||||
* the rate of false positives is low.
|
||||
*
|
||||
* This is likely, because the the false positive can only occur
|
||||
* if the most siginificant bit of a byte within the word is set.
|
||||
* The expression will never fail for typical 7-bit ASCII strings.
|
||||
*
|
||||
* This typically takes 2 instructions not including those needed
|
||||
* to load constants.
|
||||
*
|
||||
*
|
||||
* [1] Henry S. Warren Jr., "Hacker's Delight", Addison-Westley 2003
|
||||
*
|
||||
* [2] International Business Machines, "The PowerPC Compiler Writer's
|
||||
* Guide", Warthman Associates, 1996
|
||||
*/
|
||||
test $7,%dil
|
||||
movq %rdi,%rax /* Buffer, %rdi unchanged */
|
||||
movabsq $0x8080808080808080,%r9
|
||||
jnz 10f /* Jump if misaligned */
|
||||
|
||||
_ALIGN_TEXT
|
||||
.Lword_aligned:
|
||||
movabsq $0x0101010101010101,%r8
|
||||
movabsq $0x8080808080808080,%r9
|
||||
.Lloop:
|
||||
movq (%rax),%rdx
|
||||
1:
|
||||
movq (%rax),%rdx /* get bytes to check */
|
||||
2:
|
||||
addq $8,%rax
|
||||
subq %r8,%rdx
|
||||
testq %r9,%rdx
|
||||
je .Lloop
|
||||
mov %rdx,%rcx /* save for later check */
|
||||
subq %r8,%rdx /* alg (3) above first */
|
||||
not %rcx /* Invert of data */
|
||||
andq %r9,%rdx
|
||||
je 1b /* jump if all 0x01-0x7f */
|
||||
|
||||
/*
|
||||
* For bytes 0x80-0xff the above loop will exit prematurely. We must
|
||||
* return to the loop if none of the bytes in the word equal 0.
|
||||
*/
|
||||
cmpb $0,-8(%rax) /* 1st byte == 0? */
|
||||
je .Lsub8
|
||||
cmpb $0,-7(%rax) /* 2nd byte == 0? */
|
||||
je .Lsub7
|
||||
cmpb $0,-6(%rax) /* 3rd byte == 0? */
|
||||
je .Lsub6
|
||||
cmpb $0,-5(%rax) /* 4th byte == 0? */
|
||||
je .Lsub5
|
||||
cmpb $0,-4(%rax) /* 5th byte == 0? */
|
||||
je .Lsub4
|
||||
cmpb $0,-3(%rax) /* 6th byte == 0? */
|
||||
je .Lsub3
|
||||
cmpb $0,-2(%rax) /* 7th byte == 0? */
|
||||
je .Lsub2
|
||||
cmpb $0,-1(%rax) /* 8th byte == 0? */
|
||||
jne .Lloop
|
||||
/* Do check from alg (2) above - loops for 0x80..0xff bytes */
|
||||
andq %rcx,%rdx
|
||||
je 1b
|
||||
|
||||
.Lsub1:
|
||||
leaq -1(%rdi,%rax),%rax
|
||||
/* Since we are LE, use bit scan for first 0x80 byte */
|
||||
sub %rdi,%rax /* length to next word */
|
||||
bsf %rdx,%rdx /* 7, 15, 23 ... 63 */
|
||||
shr $3,%rdx /* 0, 1, 2 ... 7 */
|
||||
lea -8(%rax,%rdx),%rax
|
||||
ret
|
||||
.Lsub2:
|
||||
leaq -2(%rdi,%rax),%rax
|
||||
ret
|
||||
.Lsub3:
|
||||
leaq -3(%rdi,%rax),%rax
|
||||
ret
|
||||
.Lsub4:
|
||||
leaq -4(%rdi,%rax),%rax
|
||||
ret
|
||||
.Lsub5:
|
||||
leaq -5(%rdi,%rax),%rax
|
||||
ret
|
||||
.Lsub6:
|
||||
leaq -6(%rdi,%rax),%rax
|
||||
ret
|
||||
.Lsub7:
|
||||
leaq -7(%rdi,%rax),%rax
|
||||
ret
|
||||
.Lsub8:
|
||||
leaq -8(%rdi,%rax),%rax
|
||||
|
||||
/* Misaligned, read aligned word and make low bytes non-zero */
|
||||
_ALIGN_TEXT
|
||||
10:
|
||||
mov $1,%rsi
|
||||
mov %al,%cl
|
||||
and $~7,%al /* start of word with buffer */
|
||||
and $7,%cl /* offset into word 1..7 */
|
||||
shl $3,%cl /* bit count 8, 16 .. 56 */
|
||||
movq (%rax),%rdx /* first data in high bytes */
|
||||
shl %cl,%rsi
|
||||
dec %rsi
|
||||
or %rsi,%rdx /* low bytes now non-zero */
|
||||
jmp 2b
|
||||
|
||||
#ifdef TEST_STRLEN
|
||||
/* trivial implementation when testing above! */
|
||||
ENTRY(strlen)
|
||||
mov %rdi,%rax
|
||||
1:
|
||||
cmpb $0,(%rax)
|
||||
jz 2f
|
||||
inc %rax
|
||||
jmp 1b
|
||||
2: sub %rdi,%rax
|
||||
ret
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user