Add fast assembler versions of in_cksum() and in4_cksum().

(Well, as fast as can be expected on a cpu with no carry flag)
This commit is contained in:
scw 2002-10-19 09:01:44 +00:00
parent 80b7fbca17
commit 873eee59ff
4 changed files with 370 additions and 247 deletions

View File

@ -1,4 +1,4 @@
# $NetBSD: files.sh5,v 1.9 2002/10/14 14:13:27 scw Exp $
# $NetBSD: files.sh5,v 1.10 2002/10/19 09:01:44 scw Exp $
#
@ -146,8 +146,7 @@ file dev/cninit.c
# the two architectures.
file arch/sh3/sh3/disksubr.c
file arch/sh5/sh5/in_cksum.c inet
file netinet/in4_cksum.c inet
file arch/sh5/sh5/in_cksum.S inet
file netns/ns_cksum.c ns
defflag opt_sh5_debug.h SH5_SIM SH5_DEBUG_ST50

View File

@ -1,4 +1,4 @@
# $NetBSD: genassym.cf,v 1.11 2002/10/12 11:39:54 scw Exp $
# $NetBSD: genassym.cf,v 1.12 2002/10/19 09:01:45 scw Exp $
# Copyright 2002 Wasabi Systems, Inc.
# All rights reserved.
@ -38,8 +38,14 @@ include "opt_kernel_ipt.h"
include <sys/param.h>
include <sys/types.h>
include <sys/proc.h>
include <sys/mbuf.h>
include <sys/signal.h>
include <sys/syscall.h>
include <netinet/in.h>
include <netinet/in_systm.h>
include <netinet/ip.h>
include <netinet/ip6.h>
include <netinet/ip_var.h>
include <uvm/uvm_extern.h>
@ -363,3 +369,12 @@ define SYS_exit SYS_exit
define MR_START offsetof(struct mem_region, mr_start)
define MR_SIZE offsetof(struct mem_region, mr_size)
define SIZEOF_MEM_REGION sizeof(struct mem_region)
# Constants required for in_cksum() and friends.
# Each define exports the byte offset of a C structure member under a
# symbolic name.  in_cksum.S (which includes "assym.h") uses M_LEN, M_DATA,
# M_NEXT and IP_SRC as load displacements.
# NOTE(review): IP_DST, IP6_SRC and IP6_DST are not referenced by the
# in_cksum.S added in this change (it loads ip_src and ip_dst as one
# 8-byte quantity starting at IP_SRC) -- presumably kept for later use;
# confirm before pruning.
define M_LEN offsetof(struct mbuf, m_len)
define M_DATA offsetof(struct mbuf, m_data)
define M_NEXT offsetof(struct mbuf, m_next)
define IP_SRC offsetof(struct ip, ip_src)
define IP_DST offsetof(struct ip, ip_dst)
define IP6_SRC offsetof(struct ip6_hdr, ip6_src)
define IP6_DST offsetof(struct ip6_hdr, ip6_dst)

352
sys/arch/sh5/sh5/in_cksum.S Normal file
View File

@ -0,0 +1,352 @@
/* $NetBSD: in_cksum.S,v 1.1 2002/10/19 09:01:45 scw Exp $ */
/*
* Copyright 2002 Wasabi Systems, Inc.
* All rights reserved.
*
* Written by Steve C. Woodford for Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed for the NetBSD Project by
* Wasabi Systems, Inc.
* 4. The name of Wasabi Systems, Inc. may not be used to endorse
* or promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* in_cksum() and in4_cksum() implementations for SH5.
*
* The SH5 does not have a carry flag, which complicates matters somewhat.
* On the plus side, misaligned buffers are a piece of cake to deal with
* thanks to the ldlo.q and ldhi.q instructions.
*/
#include "opt_inet.h"
#include <machine/asm.h>
#include "assym.h"
/*
* Add With Carry two quads, "q1" and "q2". Put the result in "result".
*
* The SH5 has no carry flag, so the carry out of the 64-bit add is
* recomputed by comparison:
*
*	r0     = q1 + q2		(wraps modulo 2^64)
*	q1     = q1 | q2
*	q1     = (q1 | q2) > r0		(unsigned compare; 1 on carry, else 0)
*	result = r0 + q1		(end-around carry)
*
* Since a + b == (a | b) + (a & b), the wrapped sum can only be smaller
* than (a | b) when the addition overflowed.
* (Assumes cmpgtu writes 1/0 for true/false -- confirm against the
* SHmedia instruction reference.)
*
* Trashes r0 and q1.
*
* "result" may be the same register as "q2" (every use in this file
* passes r7 for both), because q2 is only read by the first two
* instructions.  Neither "q1" nor "q2" may be r0.
*/
#define ADDC(q1, q2, result) \
add q2, q1, r0 ;\
or q2, q1, q1 ;\
cmpgtu q1, r0, q1 ;\
add r0, q1, result
/*
* Reduce the quad in "q" to a 32-bit sum, dealing with any
* resulting carry. Put the result in "result".
*
* The same three-instruction step is applied twice: split "q" into two
* 32-bit halves (one into r0, the other back into "q"; r63 supplies the
* zero padding) and add them.  The first pass can leave a 33-bit value,
* the second pass folds that carry bit back in.
* (Assumes mshflo.l/mshfhi.l with r63 as second operand yield the
* zero-extended low/high longword of "q" -- confirm against the SHmedia
* instruction reference.)
*
* Trashes r0 and q.
*
* "result" may be the same register as "q" (that is how it is used in
* this file); "q" must not be r0.
*/
#define REDUCE32(q, result) \
mshflo.l q, r63, r0 ;\
mshfhi.l q, r63, q ;\
add r0, q, q ;\
mshflo.l q, r63, r0 ;\
mshfhi.l q, r63, q ;\
add r0, q, result
/*
* Reduce the 32-bit int in "i" to a 16-bit sum, dealing with any
* resulting carry. Put the result in "result".
*
* As with REDUCE32 the fold is done twice: once to add the two 16-bit
* halves, and once more to add back the carry that the first addition
* may have produced.
*
* NOTE(review): mshflo.w interleaves 16-bit words, so bits above bit 15
* of "result" possibly hold left-over data rather than zero -- confirm
* against the SHmedia instruction reference.  The only user in this file
* (in_cksum) keeps just the low 16 bits afterwards with a
* shlli/shlri-by-48 pair, so treat only bits 0-15 as meaningful.
*
* Trashes r0 and i.
*
* "result" may be the same register as "i"; "i" must not be r0.
*/
#define REDUCE16(i, result) \
mshflo.w i, r63, r0 ;\
shlri i, 16, i ;\
add r0, i, i ;\
mshflo.w i, r63, r0 ;\
shlri i, 16, i ;\
add r0, i, result
/*
* Lcksumdata: sum one contiguous buffer into a 64-bit accumulator.
*
* This is an internal subroutine, reached with "blink tr4, r18" from the
* mbuf loop in in_cksum(); it is not a C-callable function.
*
* The work is done in three phases:
*   1. consume the bytes up to the next 8-byte (quad) boundary,
*   2. consume quads up to the next 32-byte boundary by jumping into the
*      middle of the unrolled loop, then run that loop 32 bytes at a time,
*   3. "end game": consume the remaining 0-3 whole quads plus a final
*      partial quad.
*
* Entry parameters:
*
* r3 Buffer length
* r4 Pointer to buffer
* r17 Must be set to 0x1f
* r18 Return address
*
* Returns:
*
* r7 Accumulated sum as two pairs of "carry:sum" words.
*    (The caller folds this value with REDUCE32.)
*
* Trashes:
* r0, r1, r3, r4, r19, r20, r21, r22
* tr0, tr1, tr2
*/
Lcksumdata:
movi 0, r7
ld.b r4, 0, r63 /* Pre-fetch the start of the buffer */
ptabs/u r18, tr0 /* tr0 = return address, used by every exit below */
pta/u Lend_game, tr1
/*
* We first have to quad-align the buffer.
*
* XXX: We may have to shift the result of the following "ldlo.q"
* depending on the buffer alignment, particularly for odd addresses,
* in the same way as we do for the "ldhi.q" in Lend_game.
*/
/* r0 = ((r4 ^ 7) & 7) + 1 == 8 - (r4 & 7); this is 8 when r4 is already aligned */
xori r4, 0x7, r0
andi r0, 0x7, r0
addi r0, 1, r0 /* r0 == # bytes to next quad */
bgtu/u r0, r3, tr1 /* Not enough bytes left to make it */
ldlo.q r4, 0, r19 /* Fetch 1 to 4 words */
add r4, r0, r4 /* r4 is now quad-aligned */
sub r3, r0, r3 /* Update remaining length */
ADDC (r19, r7, r7) /* Accumulate the words we just read */
beq/u r3, r63, tr0 /* Return to caller if done */
/*
* Buffer is now quad-aligned.
* We now need to align it to a 32-byte boundary.
*/
and r4, r17, r1 /* r1 = r4 & 0x1f: one of 0x00, 0x08, 0x10, 0x18 */
xor r1, r17, r0
addi r0, 1, r0 /* r0 == # bytes to 32-byte boundary */
bgtu/u r0, r3, tr1 /* Jump if not enough left to align */
add r4, r0, r4 /* Update buffer pointer */
sub r3, r0, r3 /* Update remaining bytes */
/*
* Turn r1 into a displacement for the computed branch below: halving
* it gives 0, 4, 8 or 12, i.e. one instruction per quad to skip.
* NOTE(review): the constant 17 is presumably the 16 bytes from the
* "ptrel" to the first "ld.q" of Lbig_loop plus the SHmedia mode bit;
* it silently depends on the exact instructions between here and
* that load -- confirm before inserting or removing any.
*/
shlri r1, 1, r1 /* Compute loop entry-point in order */
addi r1, 17, r1 /* align buffer to 32-byte boundary */
/* Loads skipped by the mid-loop entry must contribute zero to the sum */
movi 0, r19
movi 0, r20
movi 0, r21
pta/u Lbig_loop, tr2
ptrel/l r1, tr1
blink tr1, r63 /* Go for it. */
/*
* At this point:
*
* r0 == 0x00 Enter loop at 1st load.
* r0 == 0x08 Enter loop at 2nd load.
* r0 == 0x10 Enter loop at 3rd load.
* r0 == 0x18 Enter loop at 4th load.
*
* NOTE(review): the values listed above match r1 as computed by
* "and r4, r17, r1" (before it is scaled for the branch), not r0, which
* holds 32 minus that value at this point -- confirm and correct.
*
* r3 == # of bytes remaining, AFTER loop entry.
* r4 -> *next* 32-byte aligned chunk of buffer.
*
* The "big_loop" checksums 16 words at a time.
*/
Lbig_loop:
addi r4, 32, r4
addi r3, -32, r3
/* The loads address the chunk just stepped over, hence negative offsets */
ld.q r4, -32, r19
ld.q r4, -24, r20
ld.q r4, -16, r21
ld.q r4, -8, r22
ld.q r4, 0, r63 /* Pre-fetch next chunk */
ADDC (r19, r7, r7)
ADDC (r20, r7, r7)
ADDC (r21, r7, r7)
ADDC (r22, r7, r7)
bgt/l r3, r17, tr2 /* Loop while more than 0x1f bytes remain */
/*
* There are less than 32-bytes left.
*
* Also reached directly from the two "bgtu/u" tests above when the
* buffer ends before the next quad / 32-byte boundary.
* NOTE(review): on the first of those paths r4 has not been
* quad-aligned; whether the "ldhi.q" below then excludes bytes that
* precede the start of the buffer is not evident from this code --
* confirm (see also the XXX note near the top of this routine).
*/
Lend_game:
beq/u r3, r63, tr0 /* Exit if all done */
add r3, r4, r0 /* r0 = address one past the end of the buffer */
andi r3, 0x18, r3 /* r3 = bytes held in whole quads (0, 8, 16 or 24) */
add r4, r3, r4 /* r4 -> just past the last whole quad */
/*
* Computed branch into the load sequence below: the fewer whole quads
* there are, the more "ld.q" instructions are skipped.
* NOTE(review): as in the alignment code above, the constant 6
* presumably encodes the distance to the first "ld.q" plus the
* SHmedia mode bit -- confirm.
*/
xor r3, r17, r3
shlri r3, 1, r3
addi r3, 6, r3
/* Skipped loads must contribute zero */
movi 0, r19
movi 0, r20
movi 0, r21
ptrel/l r3, tr1
blink tr1, r63
ld.q r4, -24, r19
ld.q r4, -16, r20
ld.q r4, -8, r21
ldhi.q r0, -1, r22 /* The last quad needs special care */
xori r0, 7, r0 /* to deal with mis-alignment, and */
addi r0, 1, r0 /* to ensure we don't include any */
andi r0, 7, r0 /* bytes past the end of the buffer */
/*
* r0 is now (-end) & 7.  When it is zero the buffer ended exactly on a
* quad boundary; the cmveq presumably zeroes r22 in that case so the
* last whole quad is not counted twice -- confirm cmveq semantics.
*/
cmveq r0, r0, r22
shlli r0, 3, r0 /* Convert the byte count to a bit count */
#ifdef __LITTLE_ENDIAN__
shlrd r22, r0, r22
#else
shlld r22, r0, r22
#endif
ADDC (r19, r7, r7)
ADDC (r20, r7, r7)
ADDC (r21, r7, r7)
ADDC (r22, r7, r7)
blink tr0, r63 /* Return to caller; result is in r7 */
/*
* int in_cksum(struct mbuf *m, int len)
*
* Compute the 16-bit Internet checksum over the first "len" bytes of the
* mbuf chain "m".  Arguments arrive in r2 (m) and r3 (len); the result
* is returned in r2 as the complemented sum, masked to 16 bits.
*
* The labels Lcksum_top and Lcksum_entry4 are also branched to by
* in4_cksum() once it has set up r2, r3, r4, r5 and r6.
*/
ENTRY(in_cksum)
/* Move m out of r2, which becomes the running sum */
#ifndef _LP64
add.l r2, r63, r5
#else
add r2, r63, r5
#endif
addz.l r3, r63, r6
movi 0, r2
pta/l Lcksum_entry, tr2 /* Where Lcksum_top should continue */
/*
* r2 == current sum
* r5 == m
* r6 == len
*
* Shared set-up.  On entry tr2 selects where to continue:
* Lcksum_entry for in_cksum(), Lcksum_entry4 for in4_cksum().
*/
Lcksum_top:
pta/u Lcksumdata, tr4
pta/u Lcksum_loop, tr3
movi 0, r7
movi 0, r8 /* r8 = number of bytes summed so far */
movi 0x1f, r17 /* Constant required by Lcksumdata */
add r18, r63, r23 /* Save our return address; r18 is reused below */
blink tr2, r63
/* Fetch the next mbuf's length and data pointer, then step to m_next */
Lcksum_loop:
ld.l r5, M_LEN, r3
LDPTR r5, M_DATA, r4
LDPTR r5, M_NEXT, r5
/*
* r3 == bytes available in this mbuf
* r4 == first byte to sum
* r5 == next mbuf in the chain
*/
Lcksum_entry4:
cmpgt r3, r6, r0
cmvne r0, r6, r3 /* r3 = min(r3, len) */
sub r6, r3, r6 /* len -= r3 */
/*
* If the parity of the buffer address differs from the parity of the
* number of bytes already summed, this buffer's partial sum is shifted
* left by 8 bits (r9 == 8) before being added in; otherwise r9 == 0.
*/
xor r8, r4, r0
add r8, r3, r8
andi r0, 1, r0
shlli r0, 3, r9
blink tr4, r18 /* Call Lcksumdata; partial sum comes back in r7 */
REDUCE32(r7, r7)
shlld r7, r9, r7
add r7, r2, r2
/* tr0 was trashed by Lcksumdata, so it is reloaded on every pass */
Lcksum_entry:
pta/u 4f, tr0
beq/u r6, r63, tr0 /* Done once len reaches zero */
bne/l r5, r63, tr3 /* Otherwise continue while mbufs remain */
/*
* XXX: Do we need to take account of odd final r8?
*/
4: REDUCE32(r2, r2)
REDUCE16(r2, r2)
ptabs/l r23, tr0 /* Return address saved at Lcksum_top */
xori r2, -1, r2 /* One's complement of the sum */
shlli r2, 48, r2 /* Keep only the low 16 bits */
shlri r2, 48, r2
blink tr0, r63
#ifdef INET
/*
* int in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len)
*
* Checksum "len" bytes of the mbuf chain "m" starting "off" bytes in.
* When "nxt" is non-zero, a pseudo header (nxt + len plus the source and
* destination addresses read from the IP header at the start of the
* first mbuf) is added to the sum first.
*
* Arguments arrive in r2 (m), r3 (nxt), r4 (off) and r5 (len).  After
* the pseudo header and the skip loop, control transfers to Lcksum_top /
* Lcksum_entry4 in in_cksum(), which produces the return value.
*/
ENTRY(in4_cksum)
addz.l r5, r63, r6 /* Copy len first: r5 is about to be overwritten */
#ifndef _LP64
add.l r2, r63, r5
#else
add r2, r63, r5
#endif
shlli r3, 56, r3 /* Isolate the low 8 bits of nxt ... */
shlri r3, 56, r2 /* ... as the initial value of the sum */
addz.l r4, r63, r8
/*
* r2 == nxt (sum)
* r5 == m
* r6 == len
* r8 == off
*/
/*
* First, deal with a pseudo header, if present
*/
pta/l Lno_pseudo, tr0
beq/l r2, r63, tr0 /* Jump if no pseudo header */
LDPTR r5, M_DATA, r4
add r2, r6, r2 /* sum += len */
#ifdef __LITTLE_ENDIAN__
shlli r2, 8, r2 /* sum = htons(sum) */
#endif
/* Misaligned 8-byte fetch of ip_src followed by ip_dst */
ldlo.q r4, IP_SRC, r19 /* Note: Assumes ip_src/ip_dst are */
ldhi.q r4, IP_SRC+7, r20 /* contiguous in memory */
or r19, r20, r19
ADDC (r19, r2, r2) /* sum += ip->ip_{src,dst} */
#endif /* INET */
#if defined(INET) || defined(INET6)
/*
* Skip whole mbufs until the one containing byte offset "off" is found.
* r8 counts down by each mbuf's length and goes negative inside the
* mbuf that holds the first byte to be summed.
*/
Lno_pseudo:
pta/u Lskip_loop, tr0
pta/l Lskip_entry, tr1
pta/u Lskip_done, tr2
blink tr1, r63 /* Test before loading the first mbuf */
Lskip_loop:
ld.l r5, M_LEN, r3
LDPTR r5, M_DATA, r4
sub r8, r3, r8 /* off -= m->m_len */
LDPTR r5, M_NEXT, r5
Lskip_entry:
bgt/l r63, r8, tr2 /* Break loop if off < 0 */
bne/l r5, r63, tr0 /* Go back until out of mbufs */
/*
* Well wha' d'ya know, wan out of widdle piggies...
*/
pta/l Lout_of_mbufs, tr0
blink tr0, r63
/*
* Here r8 is negative and r3/r4 describe the mbuf holding the start:
*   r4 = m_data + m_len + r8   (address of the first byte to sum)
*   r3 = -r8                   (bytes of this mbuf from there onwards)
* r5 already points at the following mbuf.
*/
Lskip_done:
add r8, r4, r4
xori r8, -1, r8
add r3, r4, r4
addi r8, 1, r3
pta/l Lcksum_entry4, tr2 /* Continuation point for Lcksum_top */
pta/l Lcksum_top, tr0
blink tr0, r63
/*
* The chain ended before "off" bytes were skipped: report it.
* NOTE(review): this branches to printf without linking, so printf
* presumably returns straight to our caller (r18 is untouched on this
* path) and the caller then sees printf's result as the checksum --
* confirm this is the intended failure behaviour.
*/
Lout_of_mbufs:
LEA(Lmbuf_msg, r2)
pta/l _C_LABEL(printf), tr0
blink tr0, r63
Lmbuf_msg:
.asciz "cksum: out of mbufs\n"
#endif /* INET || INET6 */

View File

@ -1,243 +0,0 @@
/* $NetBSD: in_cksum.c,v 1.1 2002/07/05 13:32:05 scw Exp $ */
/*
* XXX: This MUST be replaced with an assembly version. SH-5's "Multi-
* Media" instruction set has some features which will make these
* checksum routines very fast indeed.
*/
/*
* Copyright (c) 1988, 1992, 1993
* The Regents of the University of California. All rights reserved.
* Copyright (c) 1996
* Matt Thomas <matt@3am-software.com>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in_cksum.c 8.1 (Berkeley) 6/10/93
*/
#include <sys/cdefs.h> /* RCS ID & Copyright macro defns */
__KERNEL_RCSID(0, "$NetBSD: in_cksum.c,v 1.1 2002/07/05 13:32:05 scw Exp $");
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/systm.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
/*
* Checksum routine for Internet Protocol family headers
* (Based on Portable Alpha version).
*
* This routine is very heavily used in the network
* code and should be modified for each CPU to be as fast as possible.
*/
/*
 * Folding helpers for the checksum accumulator.
 *
 * ADDCARRY(x) brings a value that has overflowed 16 bits by a single
 * carry back into range by subtracting 65535 (end-around carry).
 * "x" must be a modifiable lvalue.
 *
 * REDUCE32 and REDUCE16 operate on locals that must exist in the
 * expanding function: "sum" (the accumulator), "q_util" (a union q_util)
 * and, for REDUCE16, "l_util" (a union l_util).
 *
 *	REDUCE32: sum = the total of the four 16-bit pieces of sum.
 *	REDUCE16: as REDUCE32, then the two 16-bit pieces of that total
 *		  are added and the final carry is folded in.
 *
 * Both are wrapped in do { } while (0) so that "REDUCE32;" is a single
 * statement and stays correct inside an unbraced if/else.
 */
#define ADDCARRY(x) ((x) > 65535 ? (x) -= 65535 : (x))
#define REDUCE32 \
	do { \
		q_util.q = sum; \
		sum = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3]; \
	} while (0)
#define REDUCE16 \
	do { \
		q_util.q = sum; \
		l_util.l = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3]; \
		sum = l_util.s[0] + l_util.s[1]; \
		ADDCARRY(sum); \
	} while (0)
/*
 * Byte-lane masks for partial 32-bit words.
 *
 * The table is flat and is indexed as in_masks[(offset << 2) + nbytes],
 * where "offset" (0-3) is the position of the first wanted byte inside
 * the word and "nbytes" (0-3) is how many bytes are wanted.  A request
 * that would run past the end of the word is clipped to the bytes that
 * exist.
 */
static const u_int32_t in_masks[] = {
	/* offset 0: 0, 1, 2, 3 bytes */
	0x00000000,
	0x000000ff,
	0x0000ffff,
	0x00ffffff,
	/* offset 1: 0, 1, 2, 3 bytes */
	0x00000000,
	0x0000ff00,
	0x00ffff00,
	0xffffff00,
	/* offset 2: 0, 1, 2, 3 bytes */
	0x00000000,
	0x00ff0000,
	0xffff0000,
	0xffff0000,
	/* offset 3: 0, 1, 2, 3 bytes */
	0x00000000,
	0xff000000,
	0xff000000,
	0xff000000,
};
/*
* Overlay of one 32-bit value with its two 16-bit halves.
* REDUCE16 stores through "l" and then adds s[0] and s[1].
*/
union l_util {
u_int16_t s[2];
u_int32_t l;
};
/*
* Overlay of one 64-bit value with its four 16-bit pieces (and its two
* 32-bit halves).  REDUCE32 and REDUCE16 store the running sum through
* "q" and then add s[0] through s[3]; "l" is not used by the code in
* this file.
*/
union q_util {
u_int16_t s[4];
u_int32_t l[2];
u_int64_t q;
};
/*
 * in_cksumdata: sum "len" bytes starting at "buf" as 32-bit words.
 *
 * buf	start of the data; any alignment is accepted
 * len	number of bytes to sum
 *
 * Returns the sum after one REDUCE32 pass (the total of the four 16-bit
 * pieces of the 64-bit accumulator).  The caller is expected to finish
 * the fold with REDUCE16.
 *
 * A non-positive "len" yields 0 without touching memory.  Previously the
 * aligned path still loaded the first word, and the unaligned path
 * indexed the mask table with a negative length.
 *
 * Data is read in whole aligned 32-bit words; bytes outside the buffer
 * in the first and last word are removed with in_masks[].
 */
static u_int64_t
in_cksumdata(register caddr_t buf, register int len)
{
	const u_int32_t *lw = (u_int32_t *) buf;
	u_int64_t sum = 0;
	u_int64_t prefilled;
	int offset;
	union q_util q_util;

	/* Empty (or nonsensical) request: nothing to read, nothing to add. */
	if (len <= 0)
		return 0;

	/* Fast path: an aligned 20-byte buffer (five words). */
	if ((3 & (uintptr_t) lw) == 0 && len == 20) {
		sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4];
		REDUCE32;
		return sum;
	}

	/*
	 * Unaligned start: round the pointer down to the enclosing word
	 * and mask off the bytes that precede (or follow) the buffer.
	 */
	if ((offset = 3 & (uintptr_t) lw) != 0) {
		const u_int32_t *masks = in_masks + (offset << 2);

		lw = (u_int32_t *) (((uintptr_t) lw) - offset);
		sum = *lw++ & masks[len >= 3 ? 3 : len];
		len -= 4 - offset;
		if (len <= 0) {
			REDUCE32;
			return sum;
		}
	}

	/*
	 * access prefilling to start load of next cache line.
	 * then add current cache line
	 * save result of prefilling for loop iteration.
	 */
	prefilled = lw[0];
	while ((len -= 32) >= 4) {
		u_int64_t prefilling = lw[8];

		sum += prefilled + lw[1] + lw[2] + lw[3]
		    + lw[4] + lw[5] + lw[6] + lw[7];
		lw += 8;
		prefilled = prefilling;
	}
	if (len >= 0) {
		/* Exactly one more full 32-byte chunk remains. */
		sum += prefilled + lw[1] + lw[2] + lw[3]
		    + lw[4] + lw[5] + lw[6] + lw[7];
		lw += 8;
	} else {
		len += 32;
	}

	/* Remaining 16-byte groups, then single words. */
	while ((len -= 16) >= 0) {
		sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
		lw += 4;
	}
	len += 16;
	while ((len -= 4) >= 0) {
		sum += (u_int64_t) *lw++;
	}
	len += 4;

	/* Trailing 1-3 bytes: read the whole word, keep only those bytes. */
	if (len > 0)
		sum += (u_int64_t) (in_masks[len] & *lw);
	REDUCE32;
	return sum;
}
int
in_cksum(register struct mbuf *m, register int len)
{
register u_int64_t sum = 0;
register int mlen = 0;
register int clen = 0;
register caddr_t addr;
union q_util q_util;
union l_util l_util;
for (; m && len; m = m->m_next) {
if (m->m_len == 0)
continue;
mlen = m->m_len;
if (len < mlen)
mlen = len;
addr = mtod(m, caddr_t);
if ((clen ^ (uintptr_t) addr) & 1)
sum += in_cksumdata(addr, mlen) << 8;
else
sum += in_cksumdata(addr, mlen);
clen += mlen;
len -= mlen;
}
REDUCE16;
return (~sum & 0xffff);
}
#if 0
/*
 * in4_cksum: checksum "len" bytes of an mbuf chain starting "off" bytes
 * in, optionally preceded by an IPv4 pseudo header.
 *
 * m	head of the chain; when nxt != 0 the first mbuf must start with
 *	the IP header
 * nxt	protocol number for the pseudo header, or 0 for no pseudo header
 * off	byte offset of the first byte to sum
 * len	number of bytes to sum
 *
 * Returns the one's complement of the folded sum, masked to 16 bits.
 * Panics if a pseudo header is requested and "off" or the first mbuf is
 * too short.
 *
 * This function is compiled out.  The address-parity test casts through
 * uintptr_t, matching in_cksum(), instead of through u_int64_t, which is
 * the wrong width where pointers are 32 bits.
 */
int
in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len)
{
	register u_int64_t sum = 0;
	register int mlen = 0;
	register int clen = 0;
	register caddr_t addr;
	union q_util q_util;
	union l_util l_util;
	struct ipovly ipov;

	if (nxt != 0) {
		/* pseudo header */
		if (off < sizeof(struct ipovly))
			panic("in4_cksum: offset too short");
		if (m->m_len < sizeof(struct ip))
			panic("in4_cksum: bad mbuf chain");
		memset(&ipov, 0, sizeof(ipov));
		ipov.ih_len = htons(len);
		ipov.ih_pr = nxt;
		ipov.ih_src = mtod(m, struct ip *)->ip_src;
		ipov.ih_dst = mtod(m, struct ip *)->ip_dst;
		sum += in_cksumdata((caddr_t) &ipov, sizeof(ipov));
	}

	/* skip over unnecessary part */
	while (m != NULL && off > 0) {
		if (m->m_len > off)
			break;
		off -= m->m_len;
		m = m->m_next;
	}

	/* Only the first mbuf summed starts part-way in; off is 0 after it. */
	for (; m && len; m = m->m_next, off = 0) {
		if (m->m_len == 0)
			continue;
		mlen = m->m_len - off;
		if (len < mlen)
			mlen = len;
		addr = mtod(m, caddr_t) + off;
		if ((clen ^ (uintptr_t) addr) & 1)
			sum += in_cksumdata(addr, mlen) << 8;
		else
			sum += in_cksumdata(addr, mlen);
		clen += mlen;
		len -= mlen;
	}
	REDUCE16;
	return (~sum & 0xffff);
}
#endif