Translate in_cksum() to assembler, and speed it up a bit.

This commit is contained in:
mycroft 1996-07-03 13:01:40 +00:00
parent 78bf5c48e8
commit 5f51da7a03
3 changed files with 266 additions and 178 deletions

View File

@ -1,4 +1,4 @@
# $NetBSD: files.i386,v 1.73 1996/05/07 00:58:36 thorpej Exp $
# $NetBSD: files.i386,v 1.74 1996/07/03 13:01:40 mycroft Exp $
#
# new style config file for i386 architecture
#
@ -15,7 +15,7 @@ file arch/i386/i386/db_interface.c ddb
file arch/i386/i386/db_trace.c ddb
file arch/i386/i386/disksubr.c disk
file arch/i386/i386/gdt.c
file arch/i386/i386/in_cksum.c inet
file arch/i386/i386/in_cksum.s inet
file arch/i386/i386/machdep.c
file arch/i386/i386/math_emulate.c math_emulate
file arch/i386/i386/mem.c
@ -66,6 +66,13 @@ include "../../../dev/pci/files.pci"
file arch/i386/pci/pci_machdep.c pci
file arch/i386/pci/pci_compat.c pci # XXX compatibility
# XXX
# define pcmciabus here until config issues are resolved
device pcmciabus {[port = -1], [size = 0],
[iomem = -1], [iosiz = 0],
[irq = -1], [drq = -1]}
attach pcmciabus at isabus
#
# ISA and mixed ISA+EISA or ISA+PCI drivers
#
@ -141,6 +148,16 @@ file arch/i386/isa/joy.c joy needs-flag
include "../../../dev/eisa/files.eisa"
file arch/i386/eisa/eisa_machdep.c eisa
#
# PCMCIA-only drivers
#
include "../../../dev/pcmcia/files.pcmcia"
device pcic: pcmciabus
attach pcic at isa
file dev/isa/pcmcia_pcic.c pcic
#
# Compatibility modules
#

View File

@ -1,176 +0,0 @@
/* $NetBSD: in_cksum.c,v 1.9 1996/05/03 19:42:09 christos Exp $ */
/*-
* Copyright (c) 1994, 1995 Charles M. Hannum. All rights reserved.
* Copyright (c) 1990 The Regents of the University of California.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from tahoe: in_cksum.c 1.2 86/01/05
* @(#)in_cksum.c 1.3 (Berkeley) 1/19/91
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <netinet/in.h>
/*
* Checksum routine for Internet Protocol family headers.
*
* This routine is very heavily used in the network
* code and should be modified for each CPU to be as fast as possible.
*
* This implementation is the 386 version.
*/
/*
* Helper macros for in_cksum().  They all operate directly on the local
* variables of that function (sum, w, mlen, byte_swapped).
*/
/* Add the upper 16 bits of sum to its lower 16 bits (a single folding pass). */
#define REDUCE {sum = (sum & 0xffff) + (sum >> 16);}
/* If sum no longer fits in 16 bits, subtract 0xffff from it. */
#define ADDCARRY {if (sum > 0xffff) sum -= 0xffff;}
/* Shift the running sum left by one byte. */
#define SWAP {sum <<= 8;}
/* Step the data pointer forward by x bytes and count them off mlen. */
#define ADVANCE(x) {w += x; mlen -= x;}
/*
* Thanks to gcc we don't have to guess
* which registers contain sum & w.
*/
#define Asm __asm __volatile
/* sum += 32-bit word at w + n (addl). */
#define ADD(n) Asm("addl " #n "(%2),%0" : "=r" (sum) : "0" (sum), "r" (w))
/* sum += 32-bit word at w + n, plus the carry flag (adcl). */
#define ADC(n) Asm("adcl " #n "(%2),%0" : "=r" (sum) : "0" (sum), "r" (w))
/* Add the carry flag alone into sum (adcl $0). */
#define MOP Asm("adcl $0,%0" : "=r" (sum) : "0" (sum))
/* Rotate sum left by 8 bits; applied at the end if byte_swapped is set. */
#define UNSWAP Asm("roll $8,%0" : "=r" (sum) : "0" (sum))
/* Add the single byte at w, shift the sum by a byte, and toggle byte_swapped. */
#define ADDBYTE {sum += *w; SWAP; byte_swapped ^= 1;}
/* Add the 16-bit word at w. */
#define ADDWORD {sum += *(u_short *)w;}
/*
* in_cksum(m, len)
*
* Sum the first len bytes of data held in the mbuf chain starting at m,
* fold the result to 16 bits, and return it XORed with 0xffff.
* Prints "cksum: out of data" if the chain ends before len bytes were
* consumed; the checksum of the data that was seen is still returned.
*
* NOTE: ADD/ADC/MOP are separate asm statements that hand the carry flag
* from one to the next, so their relative order must not be changed.
*/
int
in_cksum(m, len)
register struct mbuf *m;
register int len;
{
/* w: current data pointer; sum: running 32-bit sum; mlen: bytes left in this mbuf */
register u_char *w;
register unsigned sum = 0;
register int mlen = 0;
/* toggled by every ADDBYTE; non-zero means the sum needs UNSWAP at the end */
int byte_swapped = 0;
for (; m && len; m = m->m_next) {
mlen = m->m_len;
/* nothing to add for an empty mbuf */
if (mlen == 0)
continue;
w = mtod(m, u_char *);
/* never read more than the caller asked for */
if (len < mlen)
mlen = len;
len -= mlen;
/* fewer than 16 bytes: skip the alignment and unrolled code */
if (mlen < 16)
goto short_mbuf;
/*
* Force to long boundary so we do longword aligned
* memory operations
*/
if ((3 & (long)w) != 0) {
/* fold first so the byte shift done by ADDBYTE cannot overflow sum */
REDUCE;
if ((1 & (long)w) != 0) {
ADDBYTE;
ADVANCE(1);
}
if ((2 & (long)w) != 0) {
ADDWORD;
ADVANCE(2);
}
}
/*
* Align 4 bytes past a 16-byte cache line boundary.
*/
if ((4 & (long)w) == 0) {
ADD(0);
MOP;
ADVANCE(4);
}
if ((8 & (long)w) != 0) {
ADD(0); ADC(4);
MOP;
ADVANCE(8);
}
/*
* Do as much of the checksum as possible 32 bits at a time.
* In fact, this loop is unrolled to make overhead from
* branches &c small.
*/
while ((mlen -= 32) >= 0) {
/*
* Add with carry 8 longwords (32 bytes) and fold in the
* last carry by adding a 0 with carry.
*
* We aligned the pointer above so that the out-of-
* order operations will cause the next cache line to
* be preloaded while we finish with the current one.
*/
ADD(12); ADC(0); ADC(4); ADC(8);
ADC(28); ADC(16); ADC(20); ADC(24);
MOP;
w += 32;
}
/* undo the final, failed subtraction of the loop test */
mlen += 32;
if (mlen >= 16) {
ADD(12); ADC(0); ADC(4); ADC(8);
MOP;
ADVANCE(16);
}
/* Tail handling: at most 15 bytes remain in this mbuf from here on. */
short_mbuf:
if (mlen >= 8) {
ADD(0); ADC(4);
MOP;
ADVANCE(8);
}
if (mlen >= 4) {
ADD(0);
MOP;
ADVANCE(4);
}
if (mlen > 0) {
/* fold before the 16-bit and 8-bit additions below */
REDUCE;
if (mlen >= 2) {
ADDWORD;
ADVANCE(2);
}
if (mlen >= 1) {
/* trailing odd byte: the following mbuf's data is summed byte-shifted */
ADDBYTE;
}
}
}
/* the chain ended before len bytes were summed */
if (len)
printf("cksum: out of data\n");
/* an odd number of ADDBYTEs leaves the sum rotated by one byte */
if (byte_swapped) {
UNSWAP;
}
/* fold to 16 bits and return the complement */
REDUCE;
ADDCARRY;
return (sum ^ 0xffff);
}

View File

@ -0,0 +1,247 @@
#include <machine/asm.h>
#include "assym.h"
/*
 * Register assignments used by the macros and by the checksum routine:
 *
 *	%eax = sum	running checksum accumulator
 *	%ebx = m	current mbuf
 *	%cl  = rotate	pending rotate count for %eax (0 or 8)
 *	%edx = len	bytes still requested by the caller
 *	%esi = mlen	bytes left in the current mbuf
 *	%ebp = w	current data pointer
 */

/* Rotate the sum by one byte and toggle the pending rotate count (0 <-> 8). */
#define SWAP \
	roll	$8, %eax	; \
	xorb	$8, %cl

/* Apply the pending rotate count, undoing an odd number of SWAPs. */
#define UNSWAP \
	roll	%cl, %eax

/* Add the carry flag left by the preceding add into the sum. */
#define MOP \
	adcl	$0, %eax

/*
 * Step w forward by n bytes and count them off mlen.  leal is used so
 * that the carry flag left by a preceding add is still intact for a
 * following MOP.  The macro body ends here: there is deliberately no
 * trailing line continuation, which would otherwise splice the next
 * #define into this macro.
 */
#define ADVANCE(n) \
	leal	n(%ebp), %ebp	; \
	leal	-n(%esi), %esi

/* Byte-rotate the sum, then add the single byte at w into bits 8-15. */
#define ADDBYTE \
	SWAP			; \
	addb	(%ebp), %ah

/* Add the 16-bit word at w into the low half of the sum. */
#define ADDWORD \
	addw	(%ebp), %ax

/*
 * Fold the 32-bit sum into %ax with end-around carry; the upper half of
 * %eax is left zero.  Clobbers %esi.
 */
#define REDUCE \
	movzwl	%ax, %esi	; \
	shrl	$16, %eax	; \
	addw	%si, %ax	; \
	adcw	$0, %ax

/* sum += 32-bit word at w + n. */
#define ADD(n) \
	addl	n(%ebp), %eax

/* sum += 32-bit word at w + n, plus the carry flag. */
#define ADC(n) \
	adcl	n(%ebp), %eax
/*
 * myc2_in_cksum(m, len)
 *
 * Sum up to len bytes of data from the mbuf chain starting at m, fold
 * the sum to 16 bits and return its complement in %ax (upper half of
 * %eax is zero).
 *
 * In:	 4(%esp) = m, 8(%esp) = len on entry (stack arguments)
 * Out:	 %eax = checksum
 * Saves: %ebp, %ebx, %esi.  %edi is not used and therefore not saved.
 * Uses: %ecx, %edx as scratch.
 *
 * If the chain runs out before len bytes were seen, a diagnostic is
 * printed and the checksum of the data that was seen is returned.
 *
 * NOTE(review): the entry point is named myc2_in_cksum, whereas the C
 * routine this file replaces was called in_cksum -- confirm that callers
 * resolve against this name.
 *
 * Every MOP below consumes the carry flag produced by the add(s) before
 * it; only leal (which leaves the flags alone) may sit in between.
 */
ENTRY(myc2_in_cksum)
	pushl	%ebp
	pushl	%ebx
	pushl	%esi

	movl	16(%esp), %ebx		/* m   (3 pushes + return address) */
	movl	20(%esp), %edx		/* len */
	xorl	%eax, %eax		/* sum = 0 */
	xorb	%cl, %cl		/* no byte rotation pending */

mbuf_loop_1:
	testl	%edx, %edx		/* all requested bytes summed? */
	jz	done

mbuf_loop_2:
	testl	%ebx, %ebx		/* chain exhausted with len != 0 */
	jz	out_of_mbufs

	movl	M_DATA(%ebx), %ebp	/* w = data pointer of this mbuf */
	movl	M_LEN(%ebx), %esi	/* mlen = length of this mbuf */
	movl	M_NEXT(%ebx), %ebx	/* advance m for the next round */

	cmpl	%edx, %esi		/* mlen = min(mlen, len) */
	jbe	1f
	movl	%edx, %esi
1:
	subl	%esi, %edx		/* len -= mlen */

	cmpl	$16, %esi		/* too short for the aligned path */
	jb	short_mbuf

	/* Bring w up to a 4-byte boundary. */
	testl	$3, %ebp
	jz	dword_aligned
	testl	$1, %ebp
	jz	byte_aligned
	ADDBYTE
	ADVANCE(1)
	MOP
	testl	$2, %ebp
	jz	word_aligned
byte_aligned:
	ADDWORD
	ADVANCE(2)
	MOP
word_aligned:
dword_aligned:

	/* Consume 4 bytes unless bit 2 of w is already set. */
	testl	$4, %ebp
	jnz	qword_aligned
	ADD(0)
	ADVANCE(4)
	MOP
qword_aligned:

	/* Consume 8 more bytes if bit 3 of w is set. */
	testl	$8, %ebp
	jz	oword_aligned
	ADD(0)
	ADC(4)
	ADVANCE(8)
	MOP
oword_aligned:

	/* Main loop: 128 bytes per iteration. */
	subl	$128, %esi
	jb	finished_128
loop_128:
	ADD(12)
	ADC(0)
	ADC(4)
	ADC(8)
	ADC(28)
	ADC(16)
	ADC(20)
	ADC(24)
	ADC(44)
	ADC(32)
	ADC(36)
	ADC(40)
	ADC(60)
	ADC(48)
	ADC(52)
	ADC(56)
	ADC(76)
	ADC(64)
	ADC(68)
	ADC(72)
	ADC(92)
	ADC(80)
	ADC(84)
	ADC(88)
	ADC(108)
	ADC(96)
	ADC(100)
	ADC(104)
	ADC(124)
	ADC(112)
	ADC(116)
	ADC(120)
	leal	128(%ebp), %ebp		/* leal keeps the carry for MOP */
	MOP
	subl	$128, %esi
	jnb	loop_128
finished_128:
	addl	$128, %esi		/* undo the failed subtraction */

	/* Secondary loop: 32 bytes per iteration. */
	subl	$32, %esi
	jb	finished_32
loop_32:
	ADD(12)
	ADC(0)
	ADC(4)
	ADC(8)
	ADC(28)
	ADC(16)
	ADC(20)
	ADC(24)
	leal	32(%ebp), %ebp
	MOP
	subl	$32, %esi
	jnb	loop_32
finished_32:

	/*
	 * On this path %esi holds (remaining - 32).  It is not restored:
	 * the tests below look only at its low five bits, which are the
	 * same as those of the remaining byte count.
	 */
	testl	$16, %esi
	jz	finished_16
	ADD(12)
	ADC(0)
	ADC(4)
	ADC(8)
	leal	16(%ebp), %ebp
	MOP
finished_16:

short_mbuf:
	testl	$8, %esi
	jz	finished_8
	ADD(0)
	ADC(4)
	leal	8(%ebp), %ebp
	MOP
finished_8:

	testl	$4, %esi
	jz	finished_4
	ADD(0)
	leal	4(%ebp), %ebp
	MOP
finished_4:

	testl	$3, %esi		/* 0..3 trailing bytes */
	jz	mbuf_loop_1
	testl	$2, %esi
	jz	finished_2		/* bit 1 clear, so bit 0 must be set */
	ADDWORD
	leal	2(%ebp), %ebp
	MOP
	testl	$1, %esi
	jz	finished_1
finished_2:
	ADDBYTE				/* trailing odd byte */
	MOP
finished_1:

mbuf_done:
	testl	%edx, %edx
	jnz	mbuf_loop_2

done:
	UNSWAP				/* undo an odd number of byte swaps */
	REDUCE				/* fold to 16 bits (clobbers %esi) */
	notw	%ax

return:
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret

out_of_mbufs:
	/*
	 * The called function is free to overwrite %eax and %ecx, which
	 * hold the sum and the rotate count.  Preserve both and finish
	 * through "done", so that a proper checksum of the data seen so
	 * far is returned instead of printf's return value.
	 */
	pushl	%eax
	pushl	%ecx
	pushl	$1f
	call	_printf
	leal	4(%esp), %esp		/* drop the format argument */
	popl	%ecx
	popl	%eax
	jmp	done

1:
	.asciz	"cksum: out of data\n"