New bzero() using block store insns.
This commit is contained in:
parent
d1b815fe32
commit
0ba088f0a3
@ -1,7 +1,7 @@
|
||||
/* $NetBSD: bzero.S,v 1.1 1998/09/11 04:56:32 eeh Exp $ */
|
||||
/* $NetBSD: bzero.S,v 1.2 1999/12/30 15:31:39 eeh Exp $ */
|
||||
|
||||
/*
|
||||
* Copyright (c) 1992, 1993
|
||||
* Copyright (c) 1992, 1993, 1999
|
||||
* The Regents of the University of California. All rights reserved.
|
||||
*
|
||||
* This software was developed by the Computer Systems Engineering group
|
||||
@ -40,111 +40,241 @@
|
||||
*/
|
||||
|
||||
#include <machine/asm.h>
|
||||
#ifndef _LOCORE
|
||||
#define _LOCORE
|
||||
#endif
|
||||
#include <machine/ctlreg.h>
|
||||
#include <machine/frame.h>
|
||||
#include <machine/psl.h>
|
||||
|
||||
#if defined(LIBC_SCCS) && !defined(lint)
|
||||
#if 0
|
||||
.asciz "@(#)bzero.s 8.1 (Berkeley) 6/4/93"
|
||||
#else
|
||||
RCSID("$NetBSD: bzero.S,v 1.1 1998/09/11 04:56:32 eeh Exp $")
|
||||
RCSID("$NetBSD: bzero.S,v 1.2 1999/12/30 15:31:39 eeh Exp $")
|
||||
#endif
|
||||
#endif /* LIBC_SCCS and not lint */
|
||||
|
||||
#ifdef MEMSET
|
||||
/*
|
||||
* memset(addr, c, len)
|
||||
*
|
||||
* Duplicate the pattern so it fills 64-bits, then swap around the
|
||||
* arguments and call bzero.
|
||||
*/
|
||||
ENTRY(memset)
|
||||
/*
 * Entry state, per the memset(addr, c, len) signature:
 *	%o0 = addr, %o1 = c, %o2 = len.
 * State when the code below falls through to the shared body:
 *	%o1 = len, %o2 = low byte of c replicated into all 8 bytes,
 *	%o4 = original addr; %o3 is scratch.
 */
and %o1, 0x0ff, %o3 /* %o3 = c & 0xff */
|
||||
mov %o2, %o1 /* len moves to %o1; %o2 is reused for the pattern */
|
||||
sllx %o3, 8, %o2
|
||||
or %o2, %o3, %o2 /* pattern now fills the low 16 bits */
|
||||
mov %o0, %o4 ! Save original pointer
|
||||
sllx %o2, 16, %o3
|
||||
or %o2, %o3, %o2 /* ... the low 32 bits */
|
||||
sllx %o2, 32, %o3
|
||||
or %o2, %o3, %o2 /* ... all 64 bits */
|
||||
#else
|
||||
/*
|
||||
* bzero(addr, len)
|
||||
*
|
||||
* We should unroll the loop, but at the moment this would
|
||||
* gain nothing since the `std' instructions are what limits us.
|
||||
* We want to use VIS instructions if we're clearing out more than
|
||||
* 256 bytes, but to do that we need to properly save and restore the
|
||||
* FP registers. Unfortunately the code to do that in the kernel needs
|
||||
* to keep track of the current owner of the FPU, hence the different
|
||||
* code.
|
||||
*
|
||||
*/
|
||||
ENTRY(bzero)
|
||||
! %o0 = addr, %o1 = len
|
||||
|
||||
! Optimize a common case: addr and len are both multiples of 8.
|
||||
or %o0, %o1, %o2
|
||||
btst 7, %o2 ! ((addr | len) & 7) != 0?
|
||||
bnz,pt %icc, 1f ! if so, cannot optimize
|
||||
cmp %o1, 15 ! len >= 15? -- 1st instr of 1: below
|
||||
|
||||
/* `Good' operands, can just store doubles. */
|
||||
clr %o2 ! Initialize our pattern
|
||||
#endif
|
||||
Lbzero_internal:
|
||||
brz,pn %o1, Lbzero_done ! No bytes to copy??
|
||||
! cmp %o1, 8 ! Less than 8 bytes to go?
|
||||
! ble,a,pn %icc, Lbzero_small ! Do it byte at a time.
|
||||
! deccc 8, %o1 ! pre-decrement
|
||||
|
||||
btst 7, %o0 ! 64-bit aligned? Optimization
|
||||
bz,pt %xcc, 2f
|
||||
btst 3, %o0 ! 32-bit aligned?
|
||||
bz,pt %xcc, 1f
|
||||
btst 1, %o0 ! 16-bit aligned?
|
||||
bz,pt %xcc, 0f
|
||||
btst 3, %o0
|
||||
|
||||
!! unaligned -- store 1 byte
|
||||
stb %o2, [%o0]
|
||||
dec 1, %o1 ! Record storing 1 byte
|
||||
inc %o0
|
||||
cmp %o1, 2
|
||||
bl,a,pn %icc, 7f ! 1 or 0 left
|
||||
dec 8, %o1 ! Fixup count -8
|
||||
0:
|
||||
deccc 8, %o1 ! while ((len -= 8) >= 0)
|
||||
bge,a 0b
|
||||
stx %g0, [%o0 + %o1] ! *(quad *)(addr + len) = 0;
|
||||
retl
|
||||
nop
|
||||
btst 3, %o0
|
||||
bz,pt %xcc, 1f
|
||||
btst 7, %o0 ! 64-bit aligned?
|
||||
|
||||
/*
|
||||
* Either the address is unaligned, or the count is not a
|
||||
* multiple of 8, or both. We will have to align the address
|
||||
* in order to use anything `better' than stb.
|
||||
*/
|
||||
!! 16-bit aligned -- store half word
|
||||
sth %o2, [%o0]
|
||||
dec 2, %o1 ! Prepare to store 2 bytes
|
||||
inc 2, %o0
|
||||
cmp %o1, 4
|
||||
bl,a,pn %icc, 5f ! Less than 4 left
|
||||
dec 8, %o1 ! Fixup count -8
|
||||
1:
|
||||
! cmp %o1, 15 ! len >= 15?
|
||||
bge,a,pn %xcc, Lstx ! yes, use stx
|
||||
btst 1, %o0 ! (but first check alignment)
|
||||
|
||||
! not enough to bother: do byte-at-a-time loop.
|
||||
2:
|
||||
deccc %o1 ! while (--len >= 0)
|
||||
brnz,a,pt %o1, 2b
|
||||
stb %g0, [%o0 + %o1] ! addr[len] = 0;
|
||||
retl
|
||||
btst 7, %o0 ! 64-bit aligned?
|
||||
bz,pt %xcc, 2f
|
||||
nop
|
||||
|
||||
Lstx:
|
||||
/*
|
||||
* There are at least 15 bytes to zero.
|
||||
* We may have to zero some initial stuff to align
|
||||
* the address.
|
||||
*/
|
||||
bz,a %icc, 1f ! if (addr & 1) {
|
||||
btst 2, %o0
|
||||
stb %g0, [%o0] ! *addr = 0;
|
||||
inc %o0 ! addr++;
|
||||
dec %o1 ! len--;
|
||||
btst 2, %o0 ! }
|
||||
1:
|
||||
bz,a 1f ! if (addr & 2) {
|
||||
btst 4, %o0
|
||||
sth %g0, [%o0] ! *(short *)addr = 0;
|
||||
inc 2, %o0 ! addr += 2;
|
||||
dec 2, %o1 ! len -= 2;
|
||||
btst 4, %o0 ! }
|
||||
1:
|
||||
bz 1f ! if (addr & 4) {
|
||||
dec 8, %o1
|
||||
st %g0, [%o0] ! *(int *)addr = 0;
|
||||
inc 4, %o0 ! addr += 4;
|
||||
dec 4, %o1 ! len -= 4;
|
||||
! }
|
||||
/*
|
||||
* Address is double word aligned; len is 8 less than
|
||||
* the number of bytes remaining (i.e., len is 0 if
|
||||
* the remaining count is 8, 1 if it is 9, etc.).
|
||||
*/
|
||||
1:
|
||||
stx %g0, [%o0] ! do {
|
||||
2: ! *(quad *)addr = 0;
|
||||
inc 8, %o0 ! addr += 8;
|
||||
deccc 8, %o1 ! } while ((len -= 8) >= 0);
|
||||
bge,a 2b
|
||||
stx %g0, [%o0]
|
||||
!! 32-bit aligned -- store word
|
||||
stw %o2, [%o0]
|
||||
dec 4, %o1
|
||||
inc 4, %o0
|
||||
cmp %o1, 8
|
||||
bl,a,pn %icc, Lbzero_cleanup ! Less than 8 left
|
||||
dec 8, %o1 ! Fixup count -8
|
||||
2:
|
||||
!! Now we're 64-bit aligned
|
||||
cmp %o1, 256 ! Use block clear if len >= 256
|
||||
bge,pt %xcc, Lbzero_block ! use block store insns
|
||||
deccc 8, %o1 /* delay slot: runs on both paths, so %o1 = len - 8 from here on */
|
||||
Lbzero_longs:
|
||||
/*
 * Entered with %o1 = (bytes left) - 8 and the condition codes
 * reflecting that value; negative means fewer than 8 bytes remain.
 */
bl,pn %xcc, Lbzero_cleanup ! Less than 8 bytes left
|
||||
nop
|
||||
3:
|
||||
stx %o2, [%o0] ! Do 1 longword at a time
|
||||
deccc 8, %o1
|
||||
bge,pt %xcc, 3b
|
||||
inc 8, %o0 /* delay slot: advance addr after every store */
|
||||
|
||||
/*
|
||||
* Len is in [-8..-1] where -8 => done, -7 => 1 byte to zero,
|
||||
* -6 => two bytes, etc. Mop up this remainder, if any.
|
||||
*/
|
||||
Lbzero_cleanup:
|
||||
btst 4, %o1
|
||||
bz 1f ! if (len & 4) {
|
||||
bz,pt %xcc, 6f ! if (len & 4) {
|
||||
btst 2, %o1
|
||||
stw %g0, [%o0] ! *(int *)addr = 0;
|
||||
stw %o2, [%o0] ! *(int *)addr = 0;
|
||||
inc 4, %o0 ! addr += 4;
|
||||
1:
|
||||
bz 1f ! if (len & 2) {
|
||||
5:
|
||||
btst 2, %o1
|
||||
6:
|
||||
bz,pt %xcc, 8f ! if (len & 2) {
|
||||
btst 1, %o1
|
||||
sth %g0, [%o0] ! *(short *)addr = 0;
|
||||
sth %o2, [%o0] ! *(short *)addr = 0;
|
||||
inc 2, %o0 ! addr += 2;
|
||||
1:
|
||||
bnz,a 1f ! if (len & 1)
|
||||
stb %g0, [%o0] ! *addr = 0;
|
||||
1:
|
||||
7:
|
||||
btst 1, %o1
|
||||
8:
|
||||
bnz,a %icc, Lbzero_done ! if (len & 1)
|
||||
stb %o2, [%o0] ! *addr = 0;
|
||||
Lbzero_done:
|
||||
/*
 * Common exit.  The mov in the retl delay slot copies %o4 to %o0.
 * In the code visible here %o4 is written only on the memset entry
 * path; the bzero entry path does not set it.
 */
retl
|
||||
mov %o4, %o0 ! Restore pointer for memset (ugh)
|
||||
|
||||
/*
|
||||
* Len is in [-8..-1] where -8 => done, -7 => 1 byte to zero,
|
||||
* -6 => two bytes, etc. but we're potentially unaligned.
|
||||
* Do byte stores since it's easiest.
|
||||
*/
|
||||
Lbzero_small:
|
||||
/*
 * On entry %o1 = (bytes left) - 8, %o0 = next byte to write and
 * %o2 = fill pattern (only its low byte is stored).
 * NOTE(review): in the code visible here the only reference to this
 * label is inside a commented-out instruction.
 */
inccc 8, %o1 /* undo the -8 bias: %o1 = bytes left */
|
||||
bz,pn %icc, Lbzero_done /* nothing left to store */
|
||||
1:
|
||||
deccc %o1 /* also the delay slot of the bz above */
|
||||
stb %o2, [%o0]
|
||||
/*
 * Loop only while bytes remain after this store.  Using `bge' here
 * would go round once more when the count has just reached zero and
 * store one byte more than requested.
 */
bg,pt %icc, 1b
|
||||
inc %o0 /* delay slot: advance the pointer on every pass */
|
||||
ba,a,pt %icc, Lbzero_done
|
||||
nop ! XXX spitfire bug?
|
||||
|
||||
Lbzero_block:
|
||||
/*
|
||||
* Userland:
|
||||
*
|
||||
* We allocate enough space on the stack to save our registers and save
|
||||
* our floating point state. We really don't need to do this if the
|
||||
* registers were not in use before, but we can't really tell if they
|
||||
* were in use or not.
|
||||
*
|
||||
* See locore.s for the kernel version.
|
||||
*
|
||||
*/
|
||||
/*
 * Register use after the save below (the caller's %o registers are
 * this window's %i registers):
 *	%i0 = addr, 8-byte aligned
 *	%i1 = len - 8 (already decremented in the delay slot of the
 *	      branch that came here)
 *	%i2 = 64-bit fill pattern
 *	%l0 = block-aligned save area for the FP registers
 *	%l1 = scratch
 */
save %sp, -(CC64FSZ+32*8+BLOCK_SIZE), %sp ! Allocate an fpstate
|
||||
add %sp, (CC64FSZ+BLOCK_SIZE-1), %l0 ! Calculate pointer to fpstate
|
||||
andn %l0, BLOCK_ALIGN, %l0 ! And make it block aligned
|
||||
btst 1, %sp
|
||||
add %l0, BIAS, %l1 ! Fixup 64-bit stack pointers
|
||||
movnz %xcc, %l1, %l0
|
||||
|
||||
! wr %g0, FPRS_FEF, %fprs ! Enable FPU
|
||||
stda %f0, [%l0] ASI_BLK_P
|
||||
add %l0, BLOCK_SIZE, %l1
|
||||
stda %f16, [%l1] ASI_BLK_COMMIT_P ! We only need two banks
|
||||
|
||||
!! We are now 8-byte aligned. We need to become 64-byte aligned.
|
||||
btst 63, %i0
|
||||
bz,pt %xcc, 2f
|
||||
nop
|
||||
1:
|
||||
stx %i2, [%i0]
|
||||
inc 8, %i0
|
||||
btst 63, %i0
|
||||
bnz,pt %xcc, 1b
|
||||
dec 8, %i1 /* delay slot: count the 8 bytes just stored */
|
||||
|
||||
2:
|
||||
/* Delay slot below: %f0 is zeroed whether or not the branch is taken. */
brz,pt %i2, 4f ! Do we have a pattern to load?
|
||||
fzero %f0 ! Set up FPU
|
||||
|
||||
btst 1, %fp
|
||||
bnz,pt %icc, 3f ! 64-bit stack?
|
||||
nop
|
||||
stw %i2, [%fp + 0x28] ! Flush this puppy to RAM
|
||||
membar #StoreLoad
|
||||
ld [%fp + 0x28], %f0
|
||||
ba,pt %icc, 4f
|
||||
fmovsa %icc, %f0, %f1 /* delay slot: %f1 = %f0, so the %f0/%f1 pair holds the word twice */
|
||||
3:
|
||||
stx %i2, [%fp + BIAS + 0x50] ! Flush this puppy to RAM
|
||||
membar #StoreLoad
|
||||
ldd [%fp + BIAS + 0x50], %f0
|
||||
4:
|
||||
fmovda %icc, %f0, %f2 ! Duplicate the pattern
|
||||
fmovda %icc, %f0, %f4
|
||||
fmovda %icc, %f0, %f6
|
||||
fmovda %icc, %f0, %f8
|
||||
fmovda %icc, %f0, %f10
|
||||
fmovda %icc, %f0, %f12
|
||||
fmovda %icc, %f0, %f14
|
||||
fmovda %icc, %f0, %f16 ! And second bank
|
||||
fmovda %icc, %f0, %f18
|
||||
fmovda %icc, %f0, %f20
|
||||
fmovda %icc, %f0, %f22
|
||||
fmovda %icc, %f0, %f24
|
||||
fmovda %icc, %f0, %f26
|
||||
fmovda %icc, %f0, %f28
|
||||
fmovda %icc, %f0, %f30
|
||||
|
||||
!! Remember: we were 8 bytes too far
|
||||
/* With the earlier -8 this leaves %i1 = (bytes left) - 64. */
dec 56, %i1 ! Go one iteration too far
|
||||
5:
|
||||
stda %f0, [%i0] ASI_BLK_COMMIT_P ! Store 64 bytes
|
||||
deccc 64, %i1
|
||||
ble,pn %xcc, 6f
|
||||
inc 64, %i0
|
||||
|
||||
stda %f0, [%i0] ASI_BLK_COMMIT_P ! Store 64 bytes
|
||||
deccc 64, %i1
|
||||
bg,pn %xcc, 5b
|
||||
inc 64, %i0
|
||||
6:
|
||||
/*
|
||||
* Now we're done we need to load the FPU state from where
|
||||
* we put it.
|
||||
*/
|
||||
ldda [%l0] ASI_BLK_P, %f0
|
||||
add %l0, BLOCK_SIZE, %l1
|
||||
ldda [%l1] ASI_BLK_P, %f16
|
||||
/*
 * Put the count back in the (bytes left) - 8 form Lbzero_longs
 * expects; addcc also sets the condition codes its first branch
 * tests.  The restore in the delay slot returns to the caller's
 * register window, where the updated %i0/%i1 are %o0/%o1.
 */
addcc %i1, 56, %i1 ! Restore the count
|
||||
ba,pt %xcc, Lbzero_longs ! Finish up the remainder
|
||||
restore
|
||||
|
Loading…
Reference in New Issue
Block a user