Add assembler version of in_cksum. Thanks to ad@ for his time as
peephole optimiser. This is 30% faster than the portable C version for checksumming 64-byte and larger mbufs, and as fast as or slightly faster for shorter mbufs (with a high variance).
This commit is contained in:
parent
96381cfdb1
commit
9d157a1ecb
283
sys/arch/amd64/amd64/in_cksum.S
Normal file
283
sys/arch/amd64/amd64/in_cksum.S
Normal file
@ -0,0 +1,283 @@
|
||||
/* $NetBSD: in_cksum.S,v 1.1 2008/01/09 16:40:17 joerg Exp $ */
|
||||
|
||||
/*-
|
||||
* Copyright (c) 2008 Joerg Sonnenberger <joerg@NetBSD.org>.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||
* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
|
||||
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <machine/asm.h>
|
||||
#include "assym.h"
|
||||
|
||||
/*
 * in_cksum: one's complement checksum over the data of an mbuf chain.
 *
 * In:	%rdi	first mbuf of the chain
 *	%esi	number of bytes to checksum
 * Out:	%eax	complement of the 16 bit sum, zero extended, or
 *		-1 if the chain ends before %esi bytes have been summed
 *
 * Uses %rbx and %rbp (saved and restored here) and %rax, %rcx, %rdx,
 * %rsi, %rdi, %r8, %r9, %r10 and the flags as scratch.
 *
 * NOTE(review): the register usage matches a C prototype of the form
 * int in_cksum(struct mbuf *, int); confirm against the header.
 */
ENTRY(in_cksum)
	pushq	%rbp
	pushq	%rbx

	/*
	 * During most of the function the following values can
	 * be found in the registers:
	 *
	 * %rdi: The current element in the mbuf chain.
	 * %esi: Remaining bytes to check after the current mbuf.
	 * %ebp: Minimum of %esi at the start of the loop and the
	 *       length of the current mbuf, counted down while the
	 *       mbuf is processed.
	 * %r8:  Overall sum.  Carry must be handled on increment.
	 * %r9 and %r10: Partial sums.  These are normally modified
	 *       without carry check, see comment in inner loop.
	 * %rbx: Remaining data of current mbuf.
	 * %dl:  Parity of the stream offset reached so far (1 = odd).
	 *       Inverted whenever a single byte is consumed.
	 * %dh:  Parity of the stream offset at which the aligned part
	 *       of the current mbuf starts.  If set, the partial sums
	 *       must be byte swapped before adding them up.
	 */

	xorl	%edx, %edx
	xorq	%r8, %r8

.Mmbuf_loop:
	/* All requested bytes checksummed? */
	testl	%esi, %esi
	jz	.Mdone

	/* No more data to process? */
	testq	%rdi, %rdi
	jz	.Mout_of_mbufs

	movl	M_LEN(%rdi), %ebp
	movq	M_DATA(%rdi), %rbx

	/* Skip empty mbufs. */
	testl	%ebp, %ebp
	jz	.Mmbuf_loop_next

	/* If this mbuf is longer than necessary, just truncate it. */
	cmpl	%ebp, %esi
	cmovb	%esi, %ebp
	subl	%ebp, %esi

	xorq	%r9, %r9
	xorq	%r10, %r10

.Mmbuf_align_word:
	/* Already aligned on a word boundary? */
	testb	$1, %bl
	jz	.Mmbuf_align_dword

	/*
	 * The data starts at an odd address.  Consume a single byte so
	 * that the rest can be read as aligned words and invert %dl to
	 * account for it.
	 *
	 * The byte is always added as the high byte of a word:
	 * - If %dl is now 0, the byte sits at an odd stream offset and
	 *   the high byte is where it belongs.
	 * - If %dl is now 1, the byte sits at an even stream offset.
	 *   The partial sums are rotated by 8 bit before they are
	 *   merged (see %dh), which moves the byte to bits 16-23.  In
	 *   one's complement arithmetic that is the same as the low
	 *   byte.
	 *
	 * The overall sum in %r8 is already in its final byte order
	 * and must not be rotated here.
	 */
	testb	%dl, %dl
	setz	%dl
	movzbl	(%rbx), %ecx
	xchgb	%cl, %ch
	addq	%rcx, %r9
	incq	%rbx
	decl	%ebp

.Mmbuf_align_dword:
	/*
	 * If the current position is equivalent to an odd index,
	 * byte swap the partial sums at the end to compensate.
	 */
	movb	%dl, %dh

	/*
	 * If the data is not already aligned at a dword boundary,
	 * just add the first word to one of the partial sums.
	 */
	testb	$2, %bl
	jz	.Mmbuf_inner_loop
	cmpl	$2, %ebp
	jb	.Mmbuf_trailing_bytes
	movzwl	(%rbx), %ecx
	addq	%rcx, %r9
	leaq	2(%rbx), %rbx
	leal	-2(%ebp), %ebp

	/*
	 * Inner loop is unrolled to handle 32 bytes at a time.
	 * Dwords are summed up in %r9 and %r10 without checking
	 * for overflow.  This exploits two adders and the order
	 * constraint on flags.
	 *
	 * After the summing up, %r9 and %r10 are merged into %r10 and
	 * the sum is tested for having either of the two highest
	 * bits set.  If that is the case, the partial sum is added
	 * to the overall sum and both registers are zeroed.
	 *
	 * The alignment directive has to precede the label, otherwise
	 * the loop branches would target the padding.
	 */
	.align	16
.Mmbuf_inner_loop:
	cmpl	$32, %ebp
	jb	.Mmbuf_trailing_owords
	movl	0(%rbx), %ecx
	movl	4(%rbx), %eax
	addq	%rcx, %r9
	addq	%rax, %r10

	movl	8(%rbx), %ecx
	movl	12(%rbx), %eax
	addq	%rcx, %r9
	addq	%rax, %r10

	movl	16(%rbx), %ecx
	movl	20(%rbx), %eax
	addq	%rcx, %r9
	addq	%rax, %r10

	movl	24(%rbx), %ecx
	movl	28(%rbx), %eax
	addq	%rcx, %r9
	addq	%rax, %r10

	leaq	32(%rbx), %rbx
	leal	-32(%ebp), %ebp

	/* Merge the partial sums; %rax = two highest bits of the result. */
	addq	%r9, %r10
	movq	%r10, %rax
	shrq	$62, %rax
	xorq	%r9, %r9
	testb	%al, %al
	jz	.Mmbuf_inner_loop

	/* Close to overflow: flush the partial sum into the overall sum. */
	testb	%dh, %dh
	jz	1f
	rolq	$8, %r10
1:
	addq	%r10, %r8
	adcq	$0, %r8
	xorq	%r10, %r10

	jmp	.Mmbuf_inner_loop

	/*
	 * One more check for 16, 8, 4, 2 and 1 remaining
	 * bytes in the mbuf...
	 *
	 * No more overflow checks needed here.
	 */
.Mmbuf_trailing_owords:
	cmpl	$16, %ebp
	jb	.Mmbuf_trailing_qwords
	movl	0(%rbx), %ecx
	movl	4(%rbx), %eax
	addq	%rcx, %r9
	addq	%rax, %r10

	movl	8(%rbx), %ecx
	movl	12(%rbx), %eax
	addq	%rcx, %r9
	addq	%rax, %r10

	leaq	16(%rbx), %rbx
	leal	-16(%ebp), %ebp

.Mmbuf_trailing_qwords:
	cmpl	$8, %ebp
	jb	.Mmbuf_trailing_dwords
	movl	0(%rbx), %ecx
	movl	4(%rbx), %eax
	addq	%rcx, %r9
	addq	%rax, %r10

	leaq	8(%rbx), %rbx
	leal	-8(%ebp), %ebp

.Mmbuf_trailing_dwords:
	cmpl	$4, %ebp
	jb	.Mmbuf_trailing_words
	movl	(%rbx), %ecx
	addq	%rcx, %r9
	leaq	4(%rbx), %rbx
	leal	-4(%ebp), %ebp

.Mmbuf_trailing_words:
	cmpl	$2, %ebp
	jb	.Mmbuf_trailing_bytes
	movzwl	(%rbx), %ecx
	addq	%rcx, %r9
	leaq	2(%rbx), %rbx
	leal	-2(%ebp), %ebp

.Mmbuf_trailing_bytes:
	cmpl	$1, %ebp
	jne	.Mbyte_swap
	/* The last byte is the low byte of a word that ends in the next mbuf. */
	movzbl	(%rbx), %ecx
	addq	%rcx, %r9
	/* Invert %dl as this is a split in a word. */
	testb	%dl, %dl
	setz	%dl

.Mbyte_swap:
	/* Byte swap by 8 bit rotate. */
	testb	%dh, %dh
	jz	1f
	rolq	$8, %r9
	rolq	$8, %r10
1:
	/* Add both partial sums to the overall sum with end-around carry. */
	addq	%r10, %r8
	adcq	%r9, %r8
	adcq	$0, %r8

.Mmbuf_loop_next:
	movq	M_NEXT(%rdi), %rdi
	jmp	.Mmbuf_loop

.Mdone:
	/*
	 * Reduce 64 bit overall sum into 16 bit sum and
	 * return the complement.
	 */
	movq	%r8, %rax
	movq	%r8, %rbx
	shrq	$32, %rax
	/* 64 -> 32 bit: add the two halves with end-around carry. */
	addl	%eax, %ebx
	adcl	$0, %ebx
	/* 32 -> 16 bit; movzwl also clears the upper half of %eax. */
	movzwl	%bx, %eax
	shrl	$16, %ebx
	addw	%ax, %bx
	adcw	$0, %bx
	movw	%bx, %ax
	notw	%ax

.Mreturn:
	popq	%rbx
	popq	%rbp
	ret

.Mout_of_mbufs:
	/*
	 * The chain ended before all requested bytes were summed.
	 *
	 * After the return address and the two pushes above %rsp is 8
	 * off a 16 byte boundary; realign it for the call.  %al is the
	 * number of vector registers used by the variadic call (none).
	 */
	subq	$8, %rsp
	movq	$.Mout_of_mbufs_msg, %rdi
	xorl	%eax, %eax
	call	_C_LABEL(printf)
	addq	$8, %rsp

	/*
	 * The sum registers are caller-saved and gone after the call.
	 * Return a defined value that no 16 bit checksum can equal.
	 */
	movl	$-1, %eax
	jmp	.Mreturn

	.section .rodata
.Mout_of_mbufs_msg:
	.string	"in_cksum: out of data\n"
|
@ -1,4 +1,4 @@
|
||||
# $NetBSD: files.amd64,v 1.53 2008/01/06 18:04:00 ad Exp $
|
||||
# $NetBSD: files.amd64,v 1.54 2008/01/09 16:40:17 joerg Exp $
|
||||
#
|
||||
# new style config file for amd64 architecture
|
||||
#
|
||||
@ -41,10 +41,10 @@ file arch/amd64/amd64/kobj_machdep.c modular
|
||||
file arch/amd64/amd64/kgdb_machdep.c kgdb
|
||||
file kern/subr_disk_mbr.c disk
|
||||
file arch/amd64/amd64/gdt.c
|
||||
file arch/amd64/amd64/in_cksum.S inet
|
||||
#
|
||||
# XXXfvdl write the optimized versions for these.
|
||||
#
|
||||
file netinet/in_cksum.c inet
|
||||
file netinet/in4_cksum.c inet
|
||||
|
||||
file arch/amd64/amd64/machdep.c
|
||||
|
Loading…
Reference in New Issue
Block a user