NetBSD/sys/netinet/cpu_in_cksum.c
joerg 6e869e402d Refactor in_cksum/in4_cksum/in6_cksum implementations:
- All three functions are included in the kernel by default.
  They call a backend function cpu_in_cksum after possibly
  computing the checksum of the pseudo header.
- cpu_in_cksum is the core to implement the ones'-complement sum.
  The default implementation is moderately fast on most platforms
  and provides a 32bit accumulator with 16bit addends for L32 platforms
  and a 64bit accumulator with 32bit addends for L64 platforms.
  It handles edge cases like very large mbuf chains (could happen with
  native IPv6 in the future) and provides a good base for new native
  implementations.
- Modify i386 and amd64 assembly to use the new interface.

This disables the MD implementations on !x86 until the conversion is
done. For Alpha, the portable version is faster.
2008-01-25 21:12:10 +00:00

379 lines
9.8 KiB
C

/* $NetBSD: cpu_in_cksum.c,v 1.1 2008/01/25 21:12:14 joerg Exp $ */
/*-
* Copyright (c) 2008 Joerg Sonnenberger <joerg@NetBSD.org>.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: cpu_in_cksum.c,v 1.1 2008/01/25 21:12:14 joerg Exp $");
#include <sys/param.h>
#include <sys/endian.h>
#include <sys/mbuf.h>
#ifdef _KERNEL
#include <sys/systm.h>
#else
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#define KASSERT(x) assert(x)
#endif
#include <machine/limits.h>
#include <netinet/in.h>
#ifndef _KERNEL
int cpu_in_cksum(struct mbuf*, int, int, uint32_t);
#endif
/*
* Checksum routine for Internet Protocol family headers (Portable Version).
*
* This routine is very heavily used in the network
* code and should be modified for each CPU to be as fast as possible.
*
* A discussion of different implementation techniques can be found in
* RFC 1071.
*
* The default implementation for 32bit architectures is using
* a 32bit accumulator and operating on 16bit operands.
*
* The default implementation for 64bit architectures is using
* a 64bit accumulator and operating on 32bit operands.
*
* Both versions are unrolled to handle 32 Byte / 64 Byte fragments as core
* of the inner loop. After each iteration of the inner loop, a partial
* reduction is done to avoid carry in long packets.
*/
#if ULONG_MAX == 0xffffffffUL
/* 32bit version */
/*
 * Portable 32bit backend for the Internet checksum (RFC 1071).
 *
 * Walks the mbuf chain 'm', skipping the first 'off' bytes, and folds
 * 'len' bytes of data into the ones'-complement sum seeded by
 * 'initial_sum' (typically the pseudo-header sum, already in network
 * byte order).  Uses a 32bit accumulator with 16bit addends.
 *
 * Returns the complemented 16bit checksum, or -1 if the chain holds
 * fewer than off + len bytes.
 */
int
cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
{
int mlen;
uint32_t sum, partial;
unsigned int final_acc;
uint8_t *data;
bool needs_swap, started_on_odd;
KASSERT(len >= 0);
KASSERT(off >= 0);
needs_swap = false;
started_on_odd = false;
/* Pre-fold the 32bit seed so the running sum starts reduced to 16bit. */
sum = (initial_sum >> 16) + (initial_sum & 0xffff);
/*
 * Consume the initial offset mbuf by mbuf.  When the offset ends
 * inside an mbuf, jump into the main loop with 'data'/'mlen' set to
 * the remainder of that mbuf.
 */
for (;;) {
if (__predict_false(m == NULL)) {
printf("in_cksum: out of data\n");
return -1;
}
mlen = m->m_len;
if (mlen > off) {
mlen -= off;
data = mtod(m, uint8_t *) + off;
goto post_initial_offset;
}
off -= mlen;
if (len == 0)
break;
m = m->m_next;
}
/* Main loop: checksum each mbuf until 'len' bytes are consumed. */
for (; len > 0; m = m->m_next) {
if (__predict_false(m == NULL)) {
printf("in_cksum: out of data\n");
return -1;
}
mlen = m->m_len;
data = mtod(m, uint8_t *);
post_initial_offset:
if (mlen == 0)
continue;
if (mlen > len)
mlen = len;
len -= mlen;
/* Per-mbuf partial sum; folded into 'sum' at the bottom. */
partial = 0;
if ((uintptr_t)data & 1) {
/* Align on word boundary */
/*
 * Chunks beginning at an odd offset within the packet
 * have their 16bit words byte-swapped relative to the
 * packet; track that parity so 'partial' can be
 * rotated back before folding.
 */
started_on_odd = !started_on_odd;
#if _BYTE_ORDER == _LITTLE_ENDIAN
partial = *data << 8;
#else
partial = *data;
#endif
++data;
--mlen;
}
needs_swap = started_on_odd;
/* Unrolled core: 16 x 16bit loads per iteration (32 bytes). */
while (mlen >= 32) {
__builtin_prefetch(data + 32);
partial += *(uint16_t *)data;
partial += *(uint16_t *)(data + 2);
partial += *(uint16_t *)(data + 4);
partial += *(uint16_t *)(data + 6);
partial += *(uint16_t *)(data + 8);
partial += *(uint16_t *)(data + 10);
partial += *(uint16_t *)(data + 12);
partial += *(uint16_t *)(data + 14);
partial += *(uint16_t *)(data + 16);
partial += *(uint16_t *)(data + 18);
partial += *(uint16_t *)(data + 20);
partial += *(uint16_t *)(data + 22);
partial += *(uint16_t *)(data + 24);
partial += *(uint16_t *)(data + 26);
partial += *(uint16_t *)(data + 28);
partial += *(uint16_t *)(data + 30);
data += 32;
mlen -= 32;
/*
 * If 'partial' nears the top two bits, fold it into
 * 'sum' early (rotating first if byte-swapped) so the
 * next unrolled round cannot overflow the accumulator.
 */
if (__predict_false(partial & 0xc0000000)) {
if (needs_swap)
partial = (partial << 8) + (partial >> 24);
sum += (partial >> 16);
sum += (partial & 0xffff);
partial = 0;
}
}
if (mlen & 16) {
partial += *(uint16_t *)data;
partial += *(uint16_t *)(data + 2);
partial += *(uint16_t *)(data + 4);
partial += *(uint16_t *)(data + 6);
partial += *(uint16_t *)(data + 8);
partial += *(uint16_t *)(data + 10);
partial += *(uint16_t *)(data + 12);
partial += *(uint16_t *)(data + 14);
data += 16;
mlen -= 16;
}
/*
 * mlen is not updated below as the remaining tests
 * are using bit masks, which are not affected.
 */
if (mlen & 8) {
partial += *(uint16_t *)data;
partial += *(uint16_t *)(data + 2);
partial += *(uint16_t *)(data + 4);
partial += *(uint16_t *)(data + 6);
data += 8;
}
if (mlen & 4) {
partial += *(uint16_t *)data;
partial += *(uint16_t *)(data + 2);
data += 4;
}
if (mlen & 2) {
partial += *(uint16_t *)data;
data += 2;
}
if (mlen & 1) {
/* Trailing odd byte: the next chunk starts on odd parity. */
#if _BYTE_ORDER == _LITTLE_ENDIAN
partial += *data;
#else
partial += *data << 8;
#endif
started_on_odd = !started_on_odd;
}
if (needs_swap)
/* Left-rotate by 8 to undo the odd-offset byte swap. */
partial = (partial << 8) + (partial >> 24);
sum += (partial >> 16) + (partial & 0xffff);
/*
 * Reduce sum to allow potential byte swap
 * in the next iteration without carry.
 */
sum = (sum >> 16) + (sum & 0xffff);
}
/* Final fold to 16 bits (two rounds absorb any remaining carry). */
final_acc = ((sum >> 16) & 0xffff) + (sum & 0xffff);
final_acc = (final_acc >> 16) + (final_acc & 0xffff);
return ~final_acc & 0xffff;
}
#else
/* 64bit version */
/*
 * Portable 64bit backend for the Internet checksum (RFC 1071).
 *
 * Same contract as the 32bit version: skip 'off' bytes of the mbuf
 * chain 'm', fold 'len' bytes into the ones'-complement sum seeded by
 * 'initial_sum', and return the complemented 16bit checksum (or -1 on
 * a short chain).  Uses a 64bit accumulator with 32bit addends, with
 * an additional 2-byte alignment step before the 32bit loads.
 */
int
cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
{
int mlen;
uint64_t sum, partial;
unsigned int final_acc;
uint8_t *data;
bool needs_swap, started_on_odd;
KASSERT(len >= 0);
KASSERT(off >= 0);
needs_swap = false;
started_on_odd = false;
/* The 64bit accumulator can take the 32bit seed unreduced. */
sum = initial_sum;
/*
 * Consume the initial offset mbuf by mbuf; jump into the main loop
 * when the offset ends inside an mbuf.
 */
for (;;) {
if (__predict_false(m == NULL)) {
printf("in_cksum: out of data\n");
return -1;
}
mlen = m->m_len;
if (mlen > off) {
mlen -= off;
data = mtod(m, uint8_t *) + off;
goto post_initial_offset;
}
off -= mlen;
if (len == 0)
break;
m = m->m_next;
}
/* Main loop: checksum each mbuf until 'len' bytes are consumed. */
for (; len > 0; m = m->m_next) {
if (__predict_false(m == NULL)) {
printf("in_cksum: out of data\n");
return -1;
}
mlen = m->m_len;
data = mtod(m, uint8_t *);
post_initial_offset:
if (mlen == 0)
continue;
if (mlen > len)
mlen = len;
len -= mlen;
/* Per-mbuf partial sum; folded into 'sum' at the bottom. */
partial = 0;
if ((uintptr_t)data & 1) {
/* Align on word boundary */
/*
 * Odd starting offset: this chunk's words are
 * byte-swapped relative to the packet; remember the
 * parity so 'partial' can be rotated back later.
 */
started_on_odd = !started_on_odd;
#if _BYTE_ORDER == _LITTLE_ENDIAN
partial = *data << 8;
#else
partial = *data;
#endif
++data;
--mlen;
}
needs_swap = started_on_odd;
/*
 * Align to a 4-byte boundary for the 32bit loads below.  If
 * fewer than 2 bytes remain, only the mlen & 1 tail applies.
 */
if ((uintptr_t)data & 2) {
if (mlen < 2)
goto trailing_bytes;
partial += *(uint16_t *)data;
data += 2;
mlen -= 2;
}
/* Unrolled core: 16 x 32bit loads per iteration (64 bytes). */
while (mlen >= 64) {
__builtin_prefetch(data + 32);
__builtin_prefetch(data + 64);
partial += *(uint32_t *)data;
partial += *(uint32_t *)(data + 4);
partial += *(uint32_t *)(data + 8);
partial += *(uint32_t *)(data + 12);
partial += *(uint32_t *)(data + 16);
partial += *(uint32_t *)(data + 20);
partial += *(uint32_t *)(data + 24);
partial += *(uint32_t *)(data + 28);
partial += *(uint32_t *)(data + 32);
partial += *(uint32_t *)(data + 36);
partial += *(uint32_t *)(data + 40);
partial += *(uint32_t *)(data + 44);
partial += *(uint32_t *)(data + 48);
partial += *(uint32_t *)(data + 52);
partial += *(uint32_t *)(data + 56);
partial += *(uint32_t *)(data + 60);
data += 64;
mlen -= 64;
/*
 * Fold 'partial' into 'sum' early (rotating first if
 * byte-swapped) when its top two bits fill up, so the
 * next unrolled round cannot overflow the accumulator.
 */
if (__predict_false(partial & (3ULL << 62))) {
if (needs_swap)
partial = (partial << 8) + (partial >> 56);
sum += (partial >> 32);
sum += (partial & 0xffffffff);
partial = 0;
}
}
/*
 * mlen is not updated below as the remaining tests
 * are using bit masks, which are not affected.
 */
if (mlen & 32) {
partial += *(uint32_t *)data;
partial += *(uint32_t *)(data + 4);
partial += *(uint32_t *)(data + 8);
partial += *(uint32_t *)(data + 12);
partial += *(uint32_t *)(data + 16);
partial += *(uint32_t *)(data + 20);
partial += *(uint32_t *)(data + 24);
partial += *(uint32_t *)(data + 28);
data += 32;
}
if (mlen & 16) {
partial += *(uint32_t *)data;
partial += *(uint32_t *)(data + 4);
partial += *(uint32_t *)(data + 8);
partial += *(uint32_t *)(data + 12);
data += 16;
}
if (mlen & 8) {
partial += *(uint32_t *)data;
partial += *(uint32_t *)(data + 4);
data += 8;
}
if (mlen & 4) {
partial += *(uint32_t *)data;
data += 4;
}
if (mlen & 2) {
partial += *(uint16_t *)data;
data += 2;
}
trailing_bytes:
if (mlen & 1) {
/* Trailing odd byte: the next chunk starts on odd parity. */
#if _BYTE_ORDER == _LITTLE_ENDIAN
partial += *data;
#else
partial += *data << 8;
#endif
started_on_odd = !started_on_odd;
}
if (needs_swap)
/* Left-rotate by 8 to undo the odd-offset byte swap. */
partial = (partial << 8) + (partial >> 56);
sum += (partial >> 32) + (partial & 0xffffffff);
/*
 * Reduce sum to allow potential byte swap
 * in the next iteration without carry.
 */
sum = (sum >> 32) + (sum & 0xffffffff);
}
/* Fold all four 16bit lanes, then two more rounds for the carries. */
final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
((sum >> 16) & 0xffff) + (sum & 0xffff);
final_acc = (final_acc >> 16) + (final_acc & 0xffff);
final_acc = (final_acc >> 16) + (final_acc & 0xffff);
return ~final_acc & 0xffff;
}
#endif