rewrite and hand-tune copy_gap16 loops:
* Hoist code to deal with fragments of a 16-byte chunk outside the main copy loop. * Over 98% of dynamic calls are 2-byte-aligned but not 4-byte-aligned (due to 14-byte Ethernet headers). Kernel bcopy is poorly tuned for this. Replace bcopy() with inline code tuned to minimize accesses to DMA buffers, which are uncached on mips. Tested on 5000/240 (3MAXPLUS) and 3000/700 (sandpiper) (mjacob@feral.com)
This commit is contained in:
parent
9ffe59d049
commit
6cd95f2a55
@ -1,4 +1,4 @@
|
||||
/* $NetBSD: if_le_ioasic.c,v 1.7 1997/07/22 03:44:30 jonathan Exp $ */
|
||||
/* $NetBSD: if_le_ioasic.c,v 1.8 1997/08/26 01:27:12 jonathan Exp $ */
|
||||
|
||||
/*
|
||||
* Copyright (c) 1996 Carnegie-Mellon University.
|
||||
@ -204,19 +204,79 @@ le_ioasic_copytobuf_gap16(sc, fromv, boff, len)
|
||||
volatile caddr_t buf = sc->sc_mem;
|
||||
register caddr_t from = fromv;
|
||||
register caddr_t bptr;
|
||||
register int xfer;
|
||||
|
||||
bptr = buf + ((boff << 1) & ~0x1f);
|
||||
boff &= 0xf;
|
||||
|
||||
/*
|
||||
* Dispose of boff so destination of subsequent copies is
|
||||
* 16-byte aligned.
|
||||
*/
|
||||
if (boff) {
|
||||
register int xfer;
|
||||
xfer = min(len, 16 - boff);
|
||||
while (len > 0) {
|
||||
bcopy(from, bptr + boff, xfer);
|
||||
from += xfer;
|
||||
bptr += 32;
|
||||
boff = 0;
|
||||
len -= xfer;
|
||||
xfer = min(len, 16);
|
||||
}
|
||||
|
||||
/* Destination of copies is now 16-byte aligned. */
|
||||
if (len >= 16)
|
||||
switch ((u_long)from & (sizeof(u_int32_t) -1)) {
|
||||
case 2:
|
||||
/* Ethernet headers make this the dominant case. */
|
||||
do {
|
||||
register u_int32_t *dst = (u_int32_t*)bptr;
|
||||
register u_int16_t t0;
|
||||
register u_int32_t t1, t2, t3, t4;
|
||||
|
||||
/* read from odd-16-bit-aligned, cached src */
|
||||
t0 = *(u_int16_t*)from;
|
||||
t1 = *(u_int32_t*)(from+2);
|
||||
t2 = *(u_int32_t*)(from+6);
|
||||
t3 = *(u_int32_t*)(from+10);
|
||||
t4 = *(u_int16_t*)(from+14);
|
||||
|
||||
/* DMA buffer is uncached on mips */
|
||||
dst[0] = t0 | (t1 << 16);
|
||||
dst[1] = (t1 >> 16) | (t2 << 16);
|
||||
dst[2] = (t2 >> 16) | (t3 << 16);
|
||||
dst[3] = (t3 >> 16) | (t4 << 16);
|
||||
|
||||
from += 16;
|
||||
bptr += 32;
|
||||
len -= 16;
|
||||
} while (len >= 16);
|
||||
break;
|
||||
|
||||
case 0:
|
||||
do {
|
||||
register u_int32_t *src = (u_int32_t*)from;
|
||||
register u_int32_t *dst = (u_int32_t*)bptr;
|
||||
register u_int32_t t0, t1, t2, t3;
|
||||
|
||||
t0 = src[0]; t1 = src[1]; t2 = src[2]; t3 = src[3];
|
||||
dst[0] = t0; dst[1] = t1; dst[2] = t2; dst[3] = t3;
|
||||
|
||||
from += 16;
|
||||
bptr += 32;
|
||||
len -= 16;
|
||||
} while (len >= 16);
|
||||
break;
|
||||
|
||||
default:
|
||||
/* Does odd-aligned case ever happen? */
|
||||
do {
|
||||
bcopy(from, bptr, 16);
|
||||
from += 16;
|
||||
bptr += 32;
|
||||
len -= 16;
|
||||
} while (len >= 16);
|
||||
break;
|
||||
}
|
||||
if (len)
|
||||
bcopy(from, bptr, len);
|
||||
}
|
||||
|
||||
void
|
||||
@ -228,19 +288,71 @@ le_ioasic_copyfrombuf_gap16(sc, tov, boff, len)
|
||||
volatile caddr_t buf = sc->sc_mem;
|
||||
register caddr_t to = tov;
|
||||
register caddr_t bptr;
|
||||
register int xfer;
|
||||
|
||||
bptr = buf + ((boff << 1) & ~0x1f);
|
||||
boff &= 0xf;
|
||||
|
||||
/* Dispose of boff so that the source of subsequent copies is 16-byte aligned. */
|
||||
if (boff) {
|
||||
register int xfer;
|
||||
xfer = min(len, 16 - boff);
|
||||
while (len > 0) {
|
||||
bcopy(bptr+boff, to, xfer);
|
||||
to += xfer;
|
||||
bptr += 32;
|
||||
boff = 0;
|
||||
len -= xfer;
|
||||
xfer = min(len, 16);
|
||||
}
|
||||
if (len >= 16)
|
||||
switch ((u_long)to & (sizeof(u_int32_t) -1)) {
|
||||
case 2:
|
||||
/*
|
||||
* to is aligned to an odd 16-bit boundary. Ethernet headers
|
||||
* make this the dominant case (98% or more).
|
||||
*/
|
||||
do {
|
||||
register u_int32_t *src = (u_int32_t*)bptr;
|
||||
register u_int32_t t0, t1, t2, t3;
|
||||
|
||||
/* read from uncached aligned DMA buf */
|
||||
t0 = src[0]; t1 = src[1]; t2 = src[2]; t3 = src[3];
|
||||
|
||||
/* write to odd-16-bit-word aligned dst */
|
||||
*(u_int16_t *) (to+0) = (u_short) t0;
|
||||
*(u_int32_t *) (to+2) = (t0 >> 16) | (t1 << 16);
|
||||
*(u_int32_t *) (to+6) = (t1 >> 16) | (t2 << 16);
|
||||
*(u_int32_t *) (to+10) = (t2 >> 16) | (t3 << 16);
|
||||
*(u_int16_t *) (to+14) = (t3 >> 16);
|
||||
bptr += 32;
|
||||
to += 16;
|
||||
len -= 16;
|
||||
} while (len > 16);
|
||||
break;
|
||||
case 0:
|
||||
/* 32-bit aligned copy. Rare. */
|
||||
do {
|
||||
register u_int32_t *src = (u_int32_t*)bptr;
|
||||
register u_int32_t *dst = (u_int32_t*)to;
|
||||
register u_int32_t t0, t1, t2, t3;
|
||||
|
||||
t0 = src[0]; t1 = src[1]; t2 = src[2]; t3 = src[3];
|
||||
dst[0] = t0; dst[1] = t1; dst[2] = t2; dst[3] = t3;
|
||||
to += 16;
|
||||
bptr += 32;
|
||||
len -= 16;
|
||||
} while (len > 16);
|
||||
break;
|
||||
|
||||
/* XXX Does odd-byte-aligned case ever happen? */
|
||||
default:
|
||||
do {
|
||||
bcopy(bptr, to, 16);
|
||||
to += 16;
|
||||
bptr += 32;
|
||||
len -= 16;
|
||||
} while (len > 16);
|
||||
break;
|
||||
}
|
||||
if (len)
|
||||
bcopy(bptr, to, len);
|
||||
}
|
||||
|
||||
void
|
||||
|
Loading…
Reference in New Issue
Block a user