From c60a6af952ba1cd1865c044d95187dfd4702a0cc Mon Sep 17 00:00:00 2001 From: christos Date: Sun, 17 Mar 2013 00:43:45 +0000 Subject: [PATCH] now live in common/lib/libc/arch/sparc64/string --- lib/libc/arch/sparc64/string/memcpy.S | 1963 ------------------------- lib/libc/arch/sparc64/string/memset.S | 196 --- 2 files changed, 2159 deletions(-) delete mode 100644 lib/libc/arch/sparc64/string/memcpy.S delete mode 100644 lib/libc/arch/sparc64/string/memset.S diff --git a/lib/libc/arch/sparc64/string/memcpy.S b/lib/libc/arch/sparc64/string/memcpy.S deleted file mode 100644 index 6bc391b28094..000000000000 --- a/lib/libc/arch/sparc64/string/memcpy.S +++ /dev/null @@ -1,1963 +0,0 @@ -/* $NetBSD: memcpy.S,v 1.5 2011/07/12 07:51:33 mrg Exp $ */ - -/* - * Copyright (c) 2001 Eduardo E. Horvath - * - * This software was developed by the Computer Systems Engineering group - * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and - * contributed to Berkeley. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - */ - -#include -#ifndef _LOCORE -#define _LOCORE -#endif -#include -#include -#include - -#if defined(LIBC_SCCS) && !defined(lint) - RCSID("$NetBSD: memcpy.S,v 1.5 2011/07/12 07:51:33 mrg Exp $") -#endif /* LIBC_SCCS and not lint */ - -#define EMPTY nop -#define NOTREACHED ta 1 - -#define BCOPY_SMALL 16 -#define BLOCK_SIZE SPARC64_BLOCK_SIZE -#define BLOCK_ALIGN SPARC64_BLOCK_ALIGN - -#if 0 -#define ASI_STORE ASI_BLK_COMMIT_P -#else -#define ASI_STORE ASI_BLK_P -#endif - -#ifndef _ALIGN -#define _ALIGN .align 8 -#endif - -#if 1 -/* - * kernel bcopy/memcpy - * Assumes regions do not overlap; has no useful return value. - * - * Must not use %g7 (see copyin/copyout above). - */ -ENTRY(memcpy) /* dest, src, size */ - /* - * Swap args for bcopy. Gcc generates calls to memcpy for - * structure assignments. - */ - mov %o0, %o3 - mov %o1, %o0 - mov %o3, %o1 -#endif -ENTRY(bcopy) /* src, dest, size */ -#ifdef DEBUG - set pmapdebug, %o4 - ld [%o4], %o4 - btst 0x80, %o4 ! PDB_COPY - bz,pt %icc, 3f - nop - save %sp, -CC64FSZ, %sp - mov %i0, %o1 - set 2f, %o0 - mov %i1, %o2 - call printf - mov %i2, %o3 -! ta 1; nop - restore - .data -2: .asciz "bcopy(%p->%p,%x)\n" - _ALIGN - .text -3: -#endif - /* - * Check for overlaps and punt. - * - * If src <= dest <= src+len we have a problem. - */ - - sub %o1, %o0, %o3 - - cmp %o3, %o2 - blu,pn %xcc, Lovbcopy - cmp %o2, BCOPY_SMALL -Lbcopy_start: - bge,pt %xcc, 2f ! if >= this many, go be fancy. - cmp %o2, 256 - - mov %o1, %o5 ! Save memcpy return value - /* - * Not much to copy, just do it a byte at a time. - */ - deccc %o2 ! while (--len >= 0) - bl 1f - EMPTY -0: - inc %o0 - ldsb [%o0 - 1], %o4 ! (++dst)[-1] = *src++; - stb %o4, [%o1] - deccc %o2 - bge 0b - inc %o1 -1: - retl - mov %o5, %o0 - NOTREACHED - - /* - * Overlapping bcopies -- punt. - */ -Lovbcopy: - - /* - * Since src comes before dst, and the regions might overlap, - * we have to do the copy starting at the end and working backwards. - * - * We could optimize this, but it almost never happens. - */ - mov %o1, %o5 ! Retval - add %o2, %o0, %o0 ! src += len - add %o2, %o1, %o1 ! dst += len - - deccc %o2 - bl,pn %xcc, 1f - dec %o0 -0: - dec %o1 - ldsb [%o0], %o4 - dec %o0 - - deccc %o2 - bge,pt %xcc, 0b - stb %o4, [%o1] -1: - retl - mov %o5, %o0 - - /* - * Plenty of data to copy, so try to do it optimally. - */ -2: -#if 1 - ! If it is big enough, use VIS instructions - bge Lbcopy_block - nop -#endif -Lbcopy_fancy: - - !! - !! First align the output to a 8-byte entity - !! - - save %sp, -CC64FSZ, %sp - - mov %i0, %o0 - mov %i1, %o1 - - mov %i2, %o2 - btst 1, %o1 - - bz,pt %icc, 4f - btst 2, %o1 - ldub [%o0], %o4 ! Load 1st byte - - deccc 1, %o2 - ble,pn %xcc, Lbcopy_finish ! XXXX - inc 1, %o0 - - stb %o4, [%o1] ! Store 1st byte - inc 1, %o1 ! Update address - btst 2, %o1 -4: - bz,pt %icc, 4f - - btst 1, %o0 - bz,a 1f - lduh [%o0], %o4 ! Load short - - ldub [%o0], %o4 ! Load bytes - - ldub [%o0+1], %o3 - sllx %o4, 8, %o4 - or %o3, %o4, %o4 - -1: - deccc 2, %o2 - ble,pn %xcc, Lbcopy_finish ! XXXX - inc 2, %o0 - sth %o4, [%o1] ! Store 1st short - - inc 2, %o1 -4: - btst 4, %o1 - bz,pt %xcc, 4f - - btst 3, %o0 - bz,a,pt %xcc, 1f - lduw [%o0], %o4 ! Load word -1 - - btst 1, %o0 - bz,a,pt %icc, 2f - lduh [%o0], %o4 - - ldub [%o0], %o4 - - lduh [%o0+1], %o3 - sllx %o4, 16, %o4 - or %o4, %o3, %o4 - - ldub [%o0+3], %o3 - sllx %o4, 8, %o4 - ba,pt %icc, 1f - or %o4, %o3, %o4 - -2: - lduh [%o0+2], %o3 - sllx %o4, 16, %o4 - or %o4, %o3, %o4 - -1: - deccc 4, %o2 - ble,pn %xcc, Lbcopy_finish ! XXXX - inc 4, %o0 - - st %o4, [%o1] ! Store word - inc 4, %o1 -4: - !! - !! We are now 32-bit aligned in the dest. - !! -Lbcopy__common: - - and %o0, 7, %o4 ! Shift amount - andn %o0, 7, %o0 ! Source addr - - brz,pt %o4, Lbcopy_noshift8 ! No shift version... - - sllx %o4, 3, %o4 ! In bits - mov 8<<3, %o3 - - ldx [%o0], %l0 ! Load word -1 - sub %o3, %o4, %o3 ! Reverse shift - deccc 16*8, %o2 ! Have enough room? - - sllx %l0, %o4, %l0 - bl,pn %xcc, 2f - and %o3, 0x38, %o3 -Lbcopy_unrolled8: - - /* - * This is about as close to optimal as you can get, since - * the shifts require EU0 and cannot be paired, and you have - * 3 dependent operations on the data. - */ - -! ldx [%o0+0*8], %l0 ! Already done -! sllx %l0, %o4, %l0 ! Already done - ldx [%o0+1*8], %l1 - ldx [%o0+2*8], %l2 - ldx [%o0+3*8], %l3 - ldx [%o0+4*8], %l4 - ldx [%o0+5*8], %l5 - ldx [%o0+6*8], %l6 -#if 1 - ba,pt %icc, 1f - ldx [%o0+7*8], %l7 - .align 8 -1: - srlx %l1, %o3, %g1 - inc 8*8, %o0 - - sllx %l1, %o4, %l1 - or %g1, %l0, %o5 - ldx [%o0+0*8], %l0 - - stx %o5, [%o1+0*8] - srlx %l2, %o3, %g1 - - sllx %l2, %o4, %l2 - or %g1, %l1, %o5 - ldx [%o0+1*8], %l1 - - stx %o5, [%o1+1*8] - srlx %l3, %o3, %g1 - - sllx %l3, %o4, %l3 - or %g1, %l2, %o5 - ldx [%o0+2*8], %l2 - - stx %o5, [%o1+2*8] - srlx %l4, %o3, %g1 - - sllx %l4, %o4, %l4 - or %g1, %l3, %o5 - ldx [%o0+3*8], %l3 - - stx %o5, [%o1+3*8] - srlx %l5, %o3, %g1 - - sllx %l5, %o4, %l5 - or %g1, %l4, %o5 - ldx [%o0+4*8], %l4 - - stx %o5, [%o1+4*8] - srlx %l6, %o3, %g1 - - sllx %l6, %o4, %l6 - or %g1, %l5, %o5 - ldx [%o0+5*8], %l5 - - stx %o5, [%o1+5*8] - srlx %l7, %o3, %g1 - - sllx %l7, %o4, %l7 - or %g1, %l6, %o5 - ldx [%o0+6*8], %l6 - - stx %o5, [%o1+6*8] - srlx %l0, %o3, %g1 - deccc 8*8, %o2 ! Have enough room? - - sllx %l0, %o4, %l0 ! Next loop - or %g1, %l7, %o5 - ldx [%o0+7*8], %l7 - - stx %o5, [%o1+7*8] - bge,pt %xcc, 1b - inc 8*8, %o1 - -Lbcopy_unrolled8_cleanup: - !! - !! Finished 8 byte block, unload the regs. - !! - srlx %l1, %o3, %g1 - inc 7*8, %o0 - - sllx %l1, %o4, %l1 - or %g1, %l0, %o5 - - stx %o5, [%o1+0*8] - srlx %l2, %o3, %g1 - - sllx %l2, %o4, %l2 - or %g1, %l1, %o5 - - stx %o5, [%o1+1*8] - srlx %l3, %o3, %g1 - - sllx %l3, %o4, %l3 - or %g1, %l2, %o5 - - stx %o5, [%o1+2*8] - srlx %l4, %o3, %g1 - - sllx %l4, %o4, %l4 - or %g1, %l3, %o5 - - stx %o5, [%o1+3*8] - srlx %l5, %o3, %g1 - - sllx %l5, %o4, %l5 - or %g1, %l4, %o5 - - stx %o5, [%o1+4*8] - srlx %l6, %o3, %g1 - - sllx %l6, %o4, %l6 - or %g1, %l5, %o5 - - stx %o5, [%o1+5*8] - srlx %l7, %o3, %g1 - - sllx %l7, %o4, %l7 - or %g1, %l6, %o5 - - stx %o5, [%o1+6*8] - inc 7*8, %o1 - - mov %l7, %l0 ! Save our unused data - dec 7*8, %o2 -#else - /* - * This version also handles aligned copies at almost the - * same speed. It should take the same number of cycles - * as the previous version, but is slightly slower, probably - * due to i$ issues. - */ - ldx [%o0+7*8], %l7 - ba,pt %icc, 1f - clr %g1 - .align 32 -1: - srlx %l1, %o3, %g1 - bz,pn %xcc, 3f - inc 8*8, %o0 - - sllx %l1, %o4, %l1 - or %g1, %l0, %o5 - ba,pt %icc, 4f - ldx [%o0+0*8], %l0 - - nop -3: - mov %l0, %o5 - ldx [%o0+0*8], %l0 - -4: - bz,pn %icc, 3f - stx %o5, [%o1+0*8] - srlx %l2, %o3, %g1 - - sllx %l2, %o4, %l2 -3: - or %g1, %l1, %o5 - ldx [%o0+1*8], %l1 - - bz,pn %icc, 3f - stx %o5, [%o1+1*8] - srlx %l3, %o3, %g1 - - sllx %l3, %o4, %l3 -3: - or %g1, %l2, %o5 - ldx [%o0+2*8], %l2 - - bz,pn %icc, 3f - stx %o5, [%o1+2*8] - srlx %l4, %o3, %g1 - - sllx %l4, %o4, %l4 -3: - or %g1, %l3, %o5 - ldx [%o0+3*8], %l3 - - bz,pn %icc, 3f - stx %o5, [%o1+3*8] - srlx %l5, %o3, %g1 - - sllx %l5, %o4, %l5 -3: - or %g1, %l4, %o5 - ldx [%o0+4*8], %l4 - - bz,pn %icc, 3f - stx %o5, [%o1+4*8] - srlx %l6, %o3, %g1 - - sllx %l6, %o4, %l6 -3: - or %g1, %l5, %o5 - ldx [%o0+5*8], %l5 - - bz,pn %icc, 3f - stx %o5, [%o1+5*8] - srlx %l7, %o3, %g1 - - sllx %l7, %o4, %l7 -3: - or %g1, %l6, %o5 - ldx [%o0+6*8], %l6 - - bz,pn %icc, 3f - stx %o5, [%o1+6*8] - srlx %l0, %o3, %g1 - - sllx %l0, %o4, %l0 ! Next loop -3: - or %g1, %l7, %o5 - ldx [%o0+7*8], %l7 - deccc 8*8, %o2 ! Have enough room? - - stx %o5, [%o1+7*8] - inc 8*8, %o1 - bge,pt %xcc, 1b - tst %o4 - - - !! - !! Now unload all those regs - !! -Lbcopy_unrolled8_cleanup: - srlx %l1, %o3, %g1 - bz,pn %xcc, 3f - inc 7*8, %o0 ! Point at the last load - - sllx %l1, %o4, %l1 - ba,pt %icc, 4f - or %g1, %l0, %o5 - -3: - mov %l0, %o5 - -4: - bz,pn %icc, 3f - stx %o5, [%o1+0*8] - srlx %l2, %o3, %g1 - - sllx %l2, %o4, %l2 -3: - or %g1, %l1, %o5 - - bz,pn %icc, 3f - stx %o5, [%o1+1*8] - srlx %l3, %o3, %g1 - - sllx %l3, %o4, %l3 -3: - or %g1, %l2, %o5 - - bz,pn %icc, 3f - stx %o5, [%o1+2*8] - srlx %l4, %o3, %g1 - - sllx %l4, %o4, %l4 -3: - or %g1, %l3, %o5 - - bz,pn %icc, 3f - stx %o5, [%o1+3*8] - srlx %l5, %o3, %g1 - - sllx %l5, %o4, %l5 -3: - or %g1, %l4, %o5 - - bz,pn %icc, 3f - stx %o5, [%o1+4*8] - srlx %l6, %o3, %g1 - - sllx %l6, %o4, %l6 -3: - or %g1, %l5, %o5 - - bz,pn %icc, 3f - stx %o5, [%o1+5*8] - srlx %l7, %o3, %g1 - - sllx %l7, %o4, %l7 -3: - or %g1, %l6, %o5 - mov %l7, %l0 ! Shuffle to %l0 - - stx %o5, [%o1+6*8] - or %g1, %l7, %o5 - dec 7*8, %o2 - - inc 7*8, %o1 ! Point at last store -#endif -2: - inccc 16*8, %o2 - bz,pn %icc, Lbcopy_complete - - !! Unrolled 8 times -Lbcopy_aligned8: -! ldx [%o0], %l0 ! Already done -! sllx %l0, %o4, %l0 ! Shift high word - - deccc 8, %o2 ! Pre-decrement - bl,pn %xcc, Lbcopy_finish -1: - ldx [%o0+8], %l1 ! Load word 0 - inc 8, %o0 - - srlx %l1, %o3, %o5 - or %o5, %l0, %o5 ! Combine - - stx %o5, [%o1] ! Store result - inc 8, %o1 - - deccc 8, %o2 - bge,pn %xcc, 1b - sllx %l1, %o4, %l0 - - btst 7, %o2 ! Done? - bz,pt %xcc, Lbcopy_complete - - !! - !! Loadup the last dregs into %l0 and shift it into place - !! - srlx %o3, 3, %o5 ! # bytes in %l0 - dec 8, %o5 ! - 8 - !! n-8 - (by - 8) -> n - by - subcc %o2, %o5, %g0 ! # bytes we need - ble,pt %icc, Lbcopy_finish - nop - ldx [%o0+8], %l1 ! Need another word - srlx %l1, %o3, %l1 - ba,pt %icc, Lbcopy_finish - or %l0, %l1, %l0 ! All loaded up. - -Lbcopy_noshift8: - deccc 8*8, %o2 ! Have enough room? - bl,pn %xcc, 2f - nop - ba,pt %icc, 1f - nop - .align 32 -1: - ldx [%o0+0*8], %l0 - ldx [%o0+1*8], %l1 - ldx [%o0+2*8], %l2 - ldx [%o0+3*8], %l3 - stx %l0, [%o1+0*8] - stx %l1, [%o1+1*8] - stx %l2, [%o1+2*8] - stx %l3, [%o1+3*8] - - - ldx [%o0+4*8], %l4 - ldx [%o0+5*8], %l5 - ldx [%o0+6*8], %l6 - ldx [%o0+7*8], %l7 - inc 8*8, %o0 - stx %l4, [%o1+4*8] - stx %l5, [%o1+5*8] - deccc 8*8, %o2 - stx %l6, [%o1+6*8] - stx %l7, [%o1+7*8] - stx %l2, [%o1+2*8] - bge,pt %xcc, 1b - inc 8*8, %o1 -2: - inc 8*8, %o2 -1: - deccc 8, %o2 - bl,pn %icc, 1f ! < 0 --> sub word - nop - ldx [%o0], %o5 - inc 8, %o0 - stx %o5, [%o1] - bg,pt %icc, 1b ! Exactly 0 --> done - inc 8, %o1 -1: - btst 7, %o2 ! Done? - bz,pt %xcc, Lbcopy_complete - clr %o4 - ldx [%o0], %l0 -Lbcopy_finish: - - brz,pn %o2, 2f ! 100% complete? - cmp %o2, 8 ! Exactly 8 bytes? - bz,a,pn %xcc, 2f - stx %l0, [%o1] - - btst 4, %o2 ! Word store? - bz %xcc, 1f - srlx %l0, 32, %o5 ! Shift high word down - stw %o5, [%o1] - inc 4, %o1 - mov %l0, %o5 ! Operate on the low bits -1: - btst 2, %o2 - mov %o5, %l0 - bz 1f - srlx %l0, 16, %o5 - - sth %o5, [%o1] ! Store short - inc 2, %o1 - mov %l0, %o5 ! Operate on low bytes -1: - mov %o5, %l0 - btst 1, %o2 ! Byte aligned? - bz 2f - srlx %l0, 8, %o5 - - stb %o5, [%o1] ! Store last byte - inc 1, %o1 ! Update address -2: -Lbcopy_complete: -#if 0 - !! - !! verify copy success. - !! - - mov %i0, %o2 - mov %i1, %o4 - mov %i2, %l4 -0: - ldub [%o2], %o1 - inc %o2 - ldub [%o4], %o3 - inc %o4 - cmp %o3, %o1 - bnz 1f - dec %l4 - brnz %l4, 0b - nop - ba 2f - nop - -1: - set 0f, %o0 - call printf - sub %i2, %l4, %o5 - set 1f, %o0 - mov %i0, %o1 - mov %i1, %o2 - call printf - mov %i2, %o3 - ta 1 - .data -0: .asciz "bcopy failed: %x@%p != %x@%p byte %d\n" -1: .asciz "bcopy(%p, %p, %lx)\n" - _ALIGN - .text -2: -#endif - ret - restore %i1, %g0, %o0 - -#if 1 - -/* - * Block copy. Useful for >256 byte copies. - * - * Benchmarking has shown this always seems to be slower than - * the integer version, so this is disabled. Maybe someone will - * figure out why sometime. - */ - -Lbcopy_block: -#ifdef _KERNEL -/* - * Kernel: - * - * Here we use VIS instructions to do a block clear of a page. - * But before we can do that we need to save and enable the FPU. - * The last owner of the FPU registers is fpproc, and - * fpproc->p_md.md_fpstate is the current fpstate. If that's not - * null, call savefpstate() with it to store our current fp state. - * - * Next, allocate an aligned fpstate on the stack. We will properly - * nest calls on a particular stack so this should not be a problem. - * - * Now we grab either curproc (or if we're on the interrupt stack - * proc0). We stash its existing fpstate in a local register and - * put our new fpstate in curproc->p_md.md_fpstate. We point - * fpproc at curproc (or proc0) and enable the FPU. - * - * If we are ever preempted, our FPU state will be saved in our - * fpstate. Then, when we're resumed and we take an FPDISABLED - * trap, the trap handler will be able to fish our FPU state out - * of curproc (or proc0). - * - * On exiting this routine we undo the damage: restore the original - * pointer to curproc->p_md.md_fpstate, clear our fpproc, and disable - * the MMU. - * - * - * Register usage, Kernel only (after save): - * - * %i0 src - * %i1 dest - * %i2 size - * - * %l0 XXXX DEBUG old fpstate - * %l1 fpproc (hi bits only) - * %l2 orig fpproc - * %l3 orig fpstate - * %l5 curproc - * %l6 old fpstate - * - * Register ussage, Kernel and user: - * - * %g1 src (retval for memcpy) - * - * %o0 src - * %o1 dest - * %o2 end dest - * %o5 last safe fetchable address - */ - -#if 1 - ENABLE_FPU(0) -#else - save %sp, -(CC64FSZ+FS_SIZE+BLOCK_SIZE), %sp ! Allocate an fpstate - sethi %hi(FPPROC), %l1 - LDPTR [%l1 + %lo(FPPROC)], %l2 ! Load fpproc - add %sp, (CC64FSZ+STKB+BLOCK_SIZE-1), %l0 ! Calculate pointer to fpstate - brz,pt %l2, 1f ! fpproc == NULL? - andn %l0, BLOCK_ALIGN, %l0 ! And make it block aligned - LDPTR [%l2 + P_FPSTATE], %l3 - brz,pn %l3, 1f ! Make sure we have an fpstate - mov %l3, %o0 - call _C_LABEL(savefpstate) ! Save the old fpstate - set EINTSTACK-STKB, %l4 ! Are we on intr stack? - cmp %sp, %l4 - bgu,pt %xcc, 1f - set INTSTACK-STKB, %l4 - cmp %sp, %l4 - blu %xcc, 1f -0: - sethi %hi(_C_LABEL(proc0)), %l4 ! Yes, use proc0 - ba,pt %xcc, 2f ! XXXX needs to change to CPUs idle proc - or %l4, %lo(_C_LABEL(proc0)), %l5 -1: - sethi %hi(CURPROC), %l4 ! Use curproc - LDPTR [%l4 + %lo(CURPROC)], %l5 - brz,pn %l5, 0b ! If curproc is NULL need to use proc0 - nop -2: - LDPTR [%l5 + P_FPSTATE], %l6 ! Save old fpstate - STPTR %l0, [%l5 + P_FPSTATE] ! Insert new fpstate - STPTR %l5, [%l1 + %lo(FPPROC)] ! Set new fpproc - wr %g0, FPRS_FEF, %fprs ! Enable FPU -#endif - mov %i0, %o0 ! Src addr. - mov %i1, %o1 ! Store our dest ptr here. - mov %i2, %o2 ! Len counter -#endif - - !! - !! First align the output to a 64-bit entity - !! - - mov %o1, %g1 ! memcpy retval - add %o0, %o2, %o5 ! End of source block - - andn %o0, 7, %o3 ! Start of block - dec %o5 - fzero %f0 - - andn %o5, BLOCK_ALIGN, %o5 ! Last safe addr. - ldd [%o3], %f2 ! Load 1st word - - dec 8, %o3 ! Move %o3 1 word back - btst 1, %o1 - bz 4f - - mov -7, %o4 ! Lowest src addr possible - alignaddr %o0, %o4, %o4 ! Base addr for load. - - cmp %o3, %o4 - be,pt %xcc, 1f ! Already loaded? - mov %o4, %o3 - fmovd %f2, %f0 ! No. Shift - ldd [%o3+8], %f2 ! And load -1: - - faligndata %f0, %f2, %f4 ! Isolate 1st byte - - stda %f4, [%o1] ASI_FL8_P ! Store 1st byte - inc 1, %o1 ! Update address - inc 1, %o0 - dec 1, %o2 -4: - btst 2, %o1 - bz 4f - - mov -6, %o4 ! Calculate src - 6 - alignaddr %o0, %o4, %o4 ! calculate shift mask and dest. - - cmp %o3, %o4 ! Addresses same? - be,pt %xcc, 1f - mov %o4, %o3 - fmovd %f2, %f0 ! Shuffle data - ldd [%o3+8], %f2 ! Load word 0 -1: - faligndata %f0, %f2, %f4 ! Move 1st short low part of f8 - - stda %f4, [%o1] ASI_FL16_P ! Store 1st short - dec 2, %o2 - inc 2, %o1 - inc 2, %o0 -4: - brz,pn %o2, Lbcopy_blockfinish ! XXXX - - btst 4, %o1 - bz 4f - - mov -4, %o4 - alignaddr %o0, %o4, %o4 ! calculate shift mask and dest. - - cmp %o3, %o4 ! Addresses same? - beq,pt %xcc, 1f - mov %o4, %o3 - fmovd %f2, %f0 ! Shuffle data - ldd [%o3+8], %f2 ! Load word 0 -1: - faligndata %f0, %f2, %f4 ! Move 1st short low part of f8 - - st %f5, [%o1] ! Store word - dec 4, %o2 - inc 4, %o1 - inc 4, %o0 -4: - brz,pn %o2, Lbcopy_blockfinish ! XXXX - !! - !! We are now 32-bit aligned in the dest. - !! -Lbcopy_block_common: - - mov -0, %o4 - alignaddr %o0, %o4, %o4 ! base - shift - - cmp %o3, %o4 ! Addresses same? - beq,pt %xcc, 1f - mov %o4, %o3 - fmovd %f2, %f0 ! Shuffle data - ldd [%o3+8], %f2 ! Load word 0 -1: - add %o3, 8, %o0 ! now use %o0 for src - - !! - !! Continue until our dest is block aligned - !! -Lbcopy_block_aligned8: -1: - brz %o2, Lbcopy_blockfinish - btst BLOCK_ALIGN, %o1 ! Block aligned? - bz 1f - - faligndata %f0, %f2, %f4 ! Generate result - deccc 8, %o2 - ble,pn %icc, Lbcopy_blockfinish ! Should never happen - fmovd %f4, %f48 - - std %f4, [%o1] ! Store result - inc 8, %o1 - - fmovd %f2, %f0 - inc 8, %o0 - ba,pt %xcc, 1b ! Not yet. - ldd [%o0], %f2 ! Load next part -Lbcopy_block_aligned64: -1: - -/* - * 64-byte aligned -- ready for block operations. - * - * Here we have the destination block aligned, but the - * source pointer may not be. Sub-word alignment will - * be handled by faligndata instructions. But the source - * can still be potentially aligned to 8 different words - * in our 64-bit block, so we have 8 different copy routines. - * - * Once we figure out our source alignment, we branch - * to the appropriate copy routine, which sets up the - * alignment for faligndata and loads (sets) the values - * into the source registers and does the copy loop. - * - * When were down to less than 1 block to store, we - * exit the copy loop and execute cleanup code. - * - * Block loads and stores are not properly interlocked. - * Stores save one reg/cycle, so you can start overwriting - * registers the cycle after the store is issued. - * - * Block loads require a block load to a different register - * block or a membar #Sync before accessing the loaded - * data. - * - * Since the faligndata instructions may be offset as far - * as 7 registers into a block (if you are shifting source - * 7 -> dest 0), you need 3 source register blocks for full - * performance: one you are copying, one you are loading, - * and one for interlocking. Otherwise, we would need to - * sprinkle the code with membar #Sync and lose the advantage - * of running faligndata in parallel with block stores. This - * means we are fetching a full 128 bytes ahead of the stores. - * We need to make sure the prefetch does not inadvertently - * cross a page boundary and fault on data that we will never - * store. - * - */ -#if 1 - and %o0, BLOCK_ALIGN, %o3 - srax %o3, 3, %o3 ! Isolate the offset - - brz %o3, L100 ! 0->0 - btst 4, %o3 - bnz %xcc, 4f - btst 2, %o3 - bnz %xcc, 2f - btst 1, %o3 - ba,pt %xcc, L101 ! 0->1 - nop /* XXX spitfire bug */ -2: - bz %xcc, L102 ! 0->2 - nop - ba,pt %xcc, L103 ! 0->3 - nop /* XXX spitfire bug */ -4: - bnz %xcc, 2f - btst 1, %o3 - bz %xcc, L104 ! 0->4 - nop - ba,pt %xcc, L105 ! 0->5 - nop /* XXX spitfire bug */ -2: - bz %xcc, L106 ! 0->6 - nop - ba,pt %xcc, L107 ! 0->7 - nop /* XXX spitfire bug */ -#else - - !! - !! Isolate the word offset, which just happens to be - !! the slot in our jump table. - !! - !! This is 6 insns, most of which cannot be paired, - !! which is about the same as the above version. - !! - rd %pc, %o4 -1: - and %o0, 0x31, %o3 - add %o3, (Lbcopy_block_jmp - 1b), %o3 - jmpl %o4 + %o3, %g0 - nop - - !! - !! Jump table - !! - -Lbcopy_block_jmp: - ba,a,pt %xcc, L100 - nop - ba,a,pt %xcc, L101 - nop - ba,a,pt %xcc, L102 - nop - ba,a,pt %xcc, L103 - nop - ba,a,pt %xcc, L104 - nop - ba,a,pt %xcc, L105 - nop - ba,a,pt %xcc, L106 - nop - ba,a,pt %xcc, L107 - nop -#endif - - !! - !! Source is block aligned. - !! - !! Just load a block and go. - !! -L100: -#ifdef RETURN_NAME - sethi %hi(1f), %g1 - ba,pt %icc, 2f - or %g1, %lo(1f), %g1 -1: - .asciz "L100" - .align 8 -2: -#endif - fmovd %f0 , %f62 - ldda [%o0] ASI_BLK_P, %f0 - inc BLOCK_SIZE, %o0 - cmp %o0, %o5 - bleu,a,pn %icc, 3f - ldda [%o0] ASI_BLK_P, %f16 - ba,pt %icc, 3f - membar #Sync - - .align 32 ! ICache align. -3: - faligndata %f62, %f0, %f32 - inc BLOCK_SIZE, %o0 - faligndata %f0, %f2, %f34 - dec BLOCK_SIZE, %o2 - faligndata %f2, %f4, %f36 - cmp %o0, %o5 - faligndata %f4, %f6, %f38 - faligndata %f6, %f8, %f40 - faligndata %f8, %f10, %f42 - faligndata %f10, %f12, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f12, %f14, %f46 - - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f48 - membar #Sync -2: - stda %f32, [%o1] ASI_STORE - faligndata %f14, %f16, %f32 - inc BLOCK_SIZE, %o0 - faligndata %f16, %f18, %f34 - inc BLOCK_SIZE, %o1 - faligndata %f18, %f20, %f36 - dec BLOCK_SIZE, %o2 - faligndata %f20, %f22, %f38 - cmp %o0, %o5 - faligndata %f22, %f24, %f40 - faligndata %f24, %f26, %f42 - faligndata %f26, %f28, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f28, %f30, %f46 - - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f0 - membar #Sync -2: - stda %f32, [%o1] ASI_STORE - faligndata %f30, %f48, %f32 - inc BLOCK_SIZE, %o0 - faligndata %f48, %f50, %f34 - inc BLOCK_SIZE, %o1 - faligndata %f50, %f52, %f36 - dec BLOCK_SIZE, %o2 - faligndata %f52, %f54, %f38 - cmp %o0, %o5 - faligndata %f54, %f56, %f40 - faligndata %f56, %f58, %f42 - faligndata %f58, %f60, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f60, %f62, %f46 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f16 ! Increment is at top - membar #Sync -2: - stda %f32, [%o1] ASI_STORE - ba 3b - inc BLOCK_SIZE, %o1 - - !! - !! Source at BLOCK_ALIGN+8 - !! - !! We need to load almost 1 complete block by hand. - !! -L101: -#ifdef RETURN_NAME - sethi %hi(1f), %g1 - ba,pt %icc, 2f - or %g1, %lo(1f), %g1 -1: - .asciz "L101" - .align 8 -2: -#endif -! fmovd %f0, %f0 ! Hoist fmovd - ldd [%o0], %f2 - inc 8, %o0 - ldd [%o0], %f4 - inc 8, %o0 - ldd [%o0], %f6 - inc 8, %o0 - ldd [%o0], %f8 - inc 8, %o0 - ldd [%o0], %f10 - inc 8, %o0 - ldd [%o0], %f12 - inc 8, %o0 - ldd [%o0], %f14 - inc 8, %o0 - - cmp %o0, %o5 - bleu,a,pn %icc, 3f - ldda [%o0] ASI_BLK_P, %f16 - membar #Sync -3: - faligndata %f0, %f2, %f32 - inc BLOCK_SIZE, %o0 - faligndata %f2, %f4, %f34 - cmp %o0, %o5 - faligndata %f4, %f6, %f36 - dec BLOCK_SIZE, %o2 - faligndata %f6, %f8, %f38 - faligndata %f8, %f10, %f40 - faligndata %f10, %f12, %f42 - faligndata %f12, %f14, %f44 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f48 - membar #Sync -2: - brlez,pn %o2, Lbcopy_blockdone - faligndata %f14, %f16, %f46 - - stda %f32, [%o1] ASI_STORE - - faligndata %f16, %f18, %f32 - inc BLOCK_SIZE, %o0 - faligndata %f18, %f20, %f34 - inc BLOCK_SIZE, %o1 - faligndata %f20, %f22, %f36 - cmp %o0, %o5 - faligndata %f22, %f24, %f38 - dec BLOCK_SIZE, %o2 - faligndata %f24, %f26, %f40 - faligndata %f26, %f28, %f42 - faligndata %f28, %f30, %f44 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f0 - membar #Sync -2: - brlez,pn %o2, Lbcopy_blockdone - faligndata %f30, %f48, %f46 - - stda %f32, [%o1] ASI_STORE - - faligndata %f48, %f50, %f32 - inc BLOCK_SIZE, %o0 - faligndata %f50, %f52, %f34 - inc BLOCK_SIZE, %o1 - faligndata %f52, %f54, %f36 - cmp %o0, %o5 - faligndata %f54, %f56, %f38 - dec BLOCK_SIZE, %o2 - faligndata %f56, %f58, %f40 - faligndata %f58, %f60, %f42 - faligndata %f60, %f62, %f44 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f16 - membar #Sync -2: - brlez,pn %o2, Lbcopy_blockdone - faligndata %f62, %f0, %f46 - - stda %f32, [%o1] ASI_STORE - ba 3b - inc BLOCK_SIZE, %o1 - - !! - !! Source at BLOCK_ALIGN+16 - !! - !! We need to load 6 doubles by hand. - !! -L102: -#ifdef RETURN_NAME - sethi %hi(1f), %g1 - ba,pt %icc, 2f - or %g1, %lo(1f), %g1 -1: - .asciz "L102" - .align 8 -2: -#endif - ldd [%o0], %f4 - inc 8, %o0 - fmovd %f0, %f2 ! Hoist fmovd - ldd [%o0], %f6 - inc 8, %o0 - - ldd [%o0], %f8 - inc 8, %o0 - ldd [%o0], %f10 - inc 8, %o0 - ldd [%o0], %f12 - inc 8, %o0 - ldd [%o0], %f14 - inc 8, %o0 - - cmp %o0, %o5 - bleu,a,pn %icc, 3f - ldda [%o0] ASI_BLK_P, %f16 - membar #Sync -3: - faligndata %f2, %f4, %f32 - inc BLOCK_SIZE, %o0 - faligndata %f4, %f6, %f34 - cmp %o0, %o5 - faligndata %f6, %f8, %f36 - dec BLOCK_SIZE, %o2 - faligndata %f8, %f10, %f38 - faligndata %f10, %f12, %f40 - faligndata %f12, %f14, %f42 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f48 - membar #Sync -2: - faligndata %f14, %f16, %f44 - - brlez,pn %o2, Lbcopy_blockdone - faligndata %f16, %f18, %f46 - - stda %f32, [%o1] ASI_STORE - - faligndata %f18, %f20, %f32 - inc BLOCK_SIZE, %o0 - faligndata %f20, %f22, %f34 - inc BLOCK_SIZE, %o1 - faligndata %f22, %f24, %f36 - cmp %o0, %o5 - faligndata %f24, %f26, %f38 - dec BLOCK_SIZE, %o2 - faligndata %f26, %f28, %f40 - faligndata %f28, %f30, %f42 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f0 - membar #Sync -2: - faligndata %f30, %f48, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f48, %f50, %f46 - - stda %f32, [%o1] ASI_STORE - - faligndata %f50, %f52, %f32 - inc BLOCK_SIZE, %o0 - faligndata %f52, %f54, %f34 - inc BLOCK_SIZE, %o1 - faligndata %f54, %f56, %f36 - cmp %o0, %o5 - faligndata %f56, %f58, %f38 - dec BLOCK_SIZE, %o2 - faligndata %f58, %f60, %f40 - faligndata %f60, %f62, %f42 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f16 - membar #Sync -2: - faligndata %f62, %f0, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f0, %f2, %f46 - - stda %f32, [%o1] ASI_STORE - ba 3b - inc BLOCK_SIZE, %o1 - - !! - !! Source at BLOCK_ALIGN+24 - !! - !! We need to load 5 doubles by hand. - !! -L103: -#ifdef RETURN_NAME - sethi %hi(1f), %g1 - ba,pt %icc, 2f - or %g1, %lo(1f), %g1 -1: - .asciz "L103" - .align 8 -2: -#endif - fmovd %f0, %f4 - ldd [%o0], %f6 - inc 8, %o0 - ldd [%o0], %f8 - inc 8, %o0 - ldd [%o0], %f10 - inc 8, %o0 - ldd [%o0], %f12 - inc 8, %o0 - ldd [%o0], %f14 - inc 8, %o0 - - cmp %o0, %o5 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f16 - membar #Sync -2: - inc BLOCK_SIZE, %o0 -3: - faligndata %f4, %f6, %f32 - cmp %o0, %o5 - faligndata %f6, %f8, %f34 - dec BLOCK_SIZE, %o2 - faligndata %f8, %f10, %f36 - faligndata %f10, %f12, %f38 - faligndata %f12, %f14, %f40 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f48 - membar #Sync -2: - faligndata %f14, %f16, %f42 - inc BLOCK_SIZE, %o0 - faligndata %f16, %f18, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f18, %f20, %f46 - - stda %f32, [%o1] ASI_STORE - - faligndata %f20, %f22, %f32 - cmp %o0, %o5 - faligndata %f22, %f24, %f34 - dec BLOCK_SIZE, %o2 - faligndata %f24, %f26, %f36 - inc BLOCK_SIZE, %o1 - faligndata %f26, %f28, %f38 - faligndata %f28, %f30, %f40 - ble,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f0 - membar #Sync -2: - faligndata %f30, %f48, %f42 - inc BLOCK_SIZE, %o0 - faligndata %f48, %f50, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f50, %f52, %f46 - - stda %f32, [%o1] ASI_STORE - - faligndata %f52, %f54, %f32 - cmp %o0, %o5 - faligndata %f54, %f56, %f34 - dec BLOCK_SIZE, %o2 - faligndata %f56, %f58, %f36 - faligndata %f58, %f60, %f38 - inc BLOCK_SIZE, %o1 - faligndata %f60, %f62, %f40 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f16 - membar #Sync -2: - faligndata %f62, %f0, %f42 - inc BLOCK_SIZE, %o0 - faligndata %f0, %f2, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f2, %f4, %f46 - - stda %f32, [%o1] ASI_STORE - ba 3b - inc BLOCK_SIZE, %o1 - - !! - !! Source at BLOCK_ALIGN+32 - !! - !! We need to load 4 doubles by hand. - !! -L104: -#ifdef RETURN_NAME - sethi %hi(1f), %g1 - ba,pt %icc, 2f - or %g1, %lo(1f), %g1 -1: - .asciz "L104" - .align 8 -2: -#endif - fmovd %f0, %f6 - ldd [%o0], %f8 - inc 8, %o0 - ldd [%o0], %f10 - inc 8, %o0 - ldd [%o0], %f12 - inc 8, %o0 - ldd [%o0], %f14 - inc 8, %o0 - - cmp %o0, %o5 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f16 - membar #Sync -2: - inc BLOCK_SIZE, %o0 -3: - faligndata %f6, %f8, %f32 - cmp %o0, %o5 - faligndata %f8, %f10, %f34 - dec BLOCK_SIZE, %o2 - faligndata %f10, %f12, %f36 - faligndata %f12, %f14, %f38 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f48 - membar #Sync -2: - faligndata %f14, %f16, %f40 - faligndata %f16, %f18, %f42 - inc BLOCK_SIZE, %o0 - faligndata %f18, %f20, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f20, %f22, %f46 - - stda %f32, [%o1] ASI_STORE - - faligndata %f22, %f24, %f32 - cmp %o0, %o5 - faligndata %f24, %f26, %f34 - faligndata %f26, %f28, %f36 - inc BLOCK_SIZE, %o1 - faligndata %f28, %f30, %f38 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f0 - membar #Sync -2: - faligndata %f30, %f48, %f40 - dec BLOCK_SIZE, %o2 - faligndata %f48, %f50, %f42 - inc BLOCK_SIZE, %o0 - faligndata %f50, %f52, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f52, %f54, %f46 - - stda %f32, [%o1] ASI_STORE - - faligndata %f54, %f56, %f32 - cmp %o0, %o5 - faligndata %f56, %f58, %f34 - faligndata %f58, %f60, %f36 - inc BLOCK_SIZE, %o1 - faligndata %f60, %f62, %f38 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f16 - membar #Sync -2: - faligndata %f62, %f0, %f40 - dec BLOCK_SIZE, %o2 - faligndata %f0, %f2, %f42 - inc BLOCK_SIZE, %o0 - faligndata %f2, %f4, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f4, %f6, %f46 - - stda %f32, [%o1] ASI_STORE - ba 3b - inc BLOCK_SIZE, %o1 - - !! - !! Source at BLOCK_ALIGN+40 - !! - !! We need to load 3 doubles by hand. - !! -L105: -#ifdef RETURN_NAME - sethi %hi(1f), %g1 - ba,pt %icc, 2f - or %g1, %lo(1f), %g1 -1: - .asciz "L105" - .align 8 -2: -#endif - fmovd %f0, %f8 - ldd [%o0], %f10 - inc 8, %o0 - ldd [%o0], %f12 - inc 8, %o0 - ldd [%o0], %f14 - inc 8, %o0 - - cmp %o0, %o5 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f16 - membar #Sync -2: - inc BLOCK_SIZE, %o0 -3: - faligndata %f8, %f10, %f32 - cmp %o0, %o5 - faligndata %f10, %f12, %f34 - faligndata %f12, %f14, %f36 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f48 - membar #Sync -2: - faligndata %f14, %f16, %f38 - dec BLOCK_SIZE, %o2 - faligndata %f16, %f18, %f40 - inc BLOCK_SIZE, %o0 - faligndata %f18, %f20, %f42 - faligndata %f20, %f22, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f22, %f24, %f46 - - stda %f32, [%o1] ASI_STORE - - faligndata %f24, %f26, %f32 - cmp %o0, %o5 - faligndata %f26, %f28, %f34 - dec BLOCK_SIZE, %o2 - faligndata %f28, %f30, %f36 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f0 - membar #Sync -2: - faligndata %f30, %f48, %f38 - inc BLOCK_SIZE, %o1 - faligndata %f48, %f50, %f40 - inc BLOCK_SIZE, %o0 - faligndata %f50, %f52, %f42 - faligndata %f52, %f54, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f54, %f56, %f46 - - stda %f32, [%o1] ASI_STORE - - faligndata %f56, %f58, %f32 - cmp %o0, %o5 - faligndata %f58, %f60, %f34 - dec BLOCK_SIZE, %o2 - faligndata %f60, %f62, %f36 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f16 - membar #Sync -2: - faligndata %f62, %f0, %f38 - inc BLOCK_SIZE, %o1 - faligndata %f0, %f2, %f40 - inc BLOCK_SIZE, %o0 - faligndata %f2, %f4, %f42 - faligndata %f4, %f6, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f6, %f8, %f46 - - stda %f32, [%o1] ASI_STORE - ba 3b - inc BLOCK_SIZE, %o1 - - - !! - !! Source at BLOCK_ALIGN+48 - !! - !! We need to load 2 doubles by hand. - !! -L106: -#ifdef RETURN_NAME - sethi %hi(1f), %g1 - ba,pt %icc, 2f - or %g1, %lo(1f), %g1 -1: - .asciz "L106" - .align 8 -2: -#endif - fmovd %f0, %f10 - ldd [%o0], %f12 - inc 8, %o0 - ldd [%o0], %f14 - inc 8, %o0 - - cmp %o0, %o5 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f16 - membar #Sync -2: - inc BLOCK_SIZE, %o0 -3: - faligndata %f10, %f12, %f32 - cmp %o0, %o5 - faligndata %f12, %f14, %f34 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f48 - membar #Sync -2: - faligndata %f14, %f16, %f36 - dec BLOCK_SIZE, %o2 - faligndata %f16, %f18, %f38 - inc BLOCK_SIZE, %o0 - faligndata %f18, %f20, %f40 - faligndata %f20, %f22, %f42 - faligndata %f22, %f24, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f24, %f26, %f46 - - stda %f32, [%o1] ASI_STORE - - faligndata %f26, %f28, %f32 - cmp %o0, %o5 - faligndata %f28, %f30, %f34 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f0 - membar #Sync -2: - faligndata %f30, %f48, %f36 - dec BLOCK_SIZE, %o2 - faligndata %f48, %f50, %f38 - inc BLOCK_SIZE, %o1 - faligndata %f50, %f52, %f40 - faligndata %f52, %f54, %f42 - inc BLOCK_SIZE, %o0 - faligndata %f54, %f56, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f56, %f58, %f46 - - stda %f32, [%o1] ASI_STORE - - faligndata %f58, %f60, %f32 - cmp %o0, %o5 - faligndata %f60, %f62, %f34 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f16 - membar #Sync -2: - faligndata %f62, %f0, %f36 - dec BLOCK_SIZE, %o2 - faligndata %f0, %f2, %f38 - inc BLOCK_SIZE, %o1 - faligndata %f2, %f4, %f40 - faligndata %f4, %f6, %f42 - inc BLOCK_SIZE, %o0 - faligndata %f6, %f8, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f8, %f10, %f46 - - stda %f32, [%o1] ASI_STORE - ba 3b - inc BLOCK_SIZE, %o1 - - - !! - !! Source at BLOCK_ALIGN+56 - !! - !! We need to load 1 double by hand. - !! -L107: -#ifdef RETURN_NAME - sethi %hi(1f), %g1 - ba,pt %icc, 2f - or %g1, %lo(1f), %g1 -1: - .asciz "L107" - .align 8 -2: -#endif - fmovd %f0, %f12 - ldd [%o0], %f14 - inc 8, %o0 - - cmp %o0, %o5 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f16 - membar #Sync -2: - inc BLOCK_SIZE, %o0 -3: - faligndata %f12, %f14, %f32 - cmp %o0, %o5 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f48 - membar #Sync -2: - faligndata %f14, %f16, %f34 - dec BLOCK_SIZE, %o2 - faligndata %f16, %f18, %f36 - inc BLOCK_SIZE, %o0 - faligndata %f18, %f20, %f38 - faligndata %f20, %f22, %f40 - faligndata %f22, %f24, %f42 - faligndata %f24, %f26, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f26, %f28, %f46 - - stda %f32, [%o1] ASI_STORE - - faligndata %f28, %f30, %f32 - cmp %o0, %o5 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f0 - membar #Sync -2: - faligndata %f30, %f48, %f34 - dec BLOCK_SIZE, %o2 - faligndata %f48, %f50, %f36 - inc BLOCK_SIZE, %o1 - faligndata %f50, %f52, %f38 - faligndata %f52, %f54, %f40 - inc BLOCK_SIZE, %o0 - faligndata %f54, %f56, %f42 - faligndata %f56, %f58, %f44 - brlez,pn %o2, Lbcopy_blockdone - faligndata %f58, %f60, %f46 - - stda %f32, [%o1] ASI_STORE - - faligndata %f60, %f62, %f32 - cmp %o0, %o5 - bleu,a,pn %icc, 2f - ldda [%o0] ASI_BLK_P, %f16 - membar #Sync -2: - faligndata %f62, %f0, %f34 - dec BLOCK_SIZE, %o2 - faligndata %f0, %f2, %f36 - inc BLOCK_SIZE, %o1 - faligndata %f2, %f4, %f38 - faligndata %f4, %f6, %f40 - inc BLOCK_SIZE, %o0 - faligndata %f6, %f8, %f42 - faligndata %f8, %f10, %f44 - - brlez,pn %o2, Lbcopy_blockdone - faligndata %f10, %f12, %f46 - - stda %f32, [%o1] ASI_STORE - ba 3b - inc BLOCK_SIZE, %o1 - -Lbcopy_blockdone: - inc BLOCK_SIZE, %o2 ! Fixup our overcommit - membar #Sync ! Finish any pending loads -#define FINISH_REG(f) \ - deccc 8, %o2; \ - bl,a Lbcopy_blockfinish; \ - fmovd f, %f48; \ - std f, [%o1]; \ - inc 8, %o1 - - FINISH_REG(%f32) - FINISH_REG(%f34) - FINISH_REG(%f36) - FINISH_REG(%f38) - FINISH_REG(%f40) - FINISH_REG(%f42) - FINISH_REG(%f44) - FINISH_REG(%f46) - FINISH_REG(%f48) -#undef FINISH_REG - !! - !! The low 3 bits have the sub-word bits needed to be - !! stored [because (x-8)&0x7 == x]. - !! -Lbcopy_blockfinish: - brz,pn %o2, 2f ! 100% complete? - fmovd %f48, %f4 - cmp %o2, 8 ! Exactly 8 bytes? - bz,a,pn %xcc, 2f - std %f4, [%o1] - - btst 4, %o2 ! Word store? - bz %xcc, 1f - nop - st %f4, [%o1] - inc 4, %o1 -1: - btst 2, %o2 - fzero %f0 - bz 1f - - mov -6, %o4 - alignaddr %o1, %o4, %g0 - - faligndata %f0, %f4, %f8 - - stda %f8, [%o1] ASI_FL16_P ! Store short - inc 2, %o1 -1: - btst 1, %o2 ! Byte aligned? - bz 2f - - mov -7, %o0 ! Calculate dest - 7 - alignaddr %o1, %o0, %g0 ! Calculate shift mask and dest. - - faligndata %f0, %f4, %f8 ! Move 1st byte to low part of f8 - - stda %f8, [%o1] ASI_FL8_P ! Store 1st byte - inc 1, %o1 ! Update address -2: - membar #Sync -#if 0 - !! - !! verify copy success. - !! - - mov %i0, %o2 - mov %i1, %o4 - mov %i2, %l4 -0: - ldub [%o2], %o1 - inc %o2 - ldub [%o4], %o3 - inc %o4 - cmp %o3, %o1 - bnz 1f - dec %l4 - brnz %l4, 0b - nop - ba 2f - nop - -1: - set block_disable, %o0 - stx %o0, [%o0] - - set 0f, %o0 - call prom_printf - sub %i2, %l4, %o5 - set 1f, %o0 - mov %i0, %o1 - mov %i1, %o2 - call prom_printf - mov %i2, %o3 - ta 1 - .data - _ALIGN -block_disable: .xword 0 -0: .asciz "bcopy failed: %x@%p != %x@%p byte %d\r\n" -1: .asciz "bcopy(%p, %p, %lx)\r\n" - _ALIGN - .text -2: -#endif -#ifdef _KERNEL - - set 1f, %o0 - mov %i0, %o1 - mov %i1, %o2 - call printf - mov %i2, %o3 - - .data - _ALIGN -1: .asciz "block exit (%p, %p, %d)\n" - _ALIGN - .text -/* - * Weve saved our possible fpstate, now disable the fpu - * and continue with life. - */ -#if 1 - RESTORE_FPU -#else -#ifdef DEBUG - LDPTR [%l1 + %lo(FPPROC)], %l7 - cmp %l7, %l5 -! tnz 1 ! fpproc has changed! - LDPTR [%l5 + P_FPSTATE], %l7 - cmp %l7, %l0 - tnz 1 ! fpstate has changed! -#endif - andcc %l2, %l3, %g0 ! If (fpproc && fpstate) - STPTR %l2, [%l1 + %lo(FPPROC)] ! Restore old fproc - bz,pt %xcc, 1f ! Skip if no fpstate - STPTR %l6, [%l5 + P_FPSTATE] ! Restore old fpstate - - call _C_LABEL(loadfpstate) ! Re-load orig fpstate - mov %l3, %o0 -1: -#endif - set 1f, %o0 - mov %i0, %o1 - mov %i1, %o2 - call printf - mov %i2, %o3 - - .data - _ALIGN -1: .asciz "block done (%p, %p, %d)\n" - _ALIGN - .text - - - ret - restore %g1, 0, %o0 ! Return DEST for memcpy -#endif - retl - mov %g1, %o0 -#endif - - diff --git a/lib/libc/arch/sparc64/string/memset.S b/lib/libc/arch/sparc64/string/memset.S deleted file mode 100644 index ca10dc702843..000000000000 --- a/lib/libc/arch/sparc64/string/memset.S +++ /dev/null @@ -1,196 +0,0 @@ -/* $NetBSD: memset.S,v 1.4 2001/08/02 01:17:28 eeh Exp $ */ - -/* - * Copyright (c) 2001, Eduardo E. Horvath - * - * This software was developed by the Computer Systems Engineering group - * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and - * contributed to Berkeley. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * from: Header: bzero.s,v 1.1 92/06/25 12:52:46 torek Exp - */ - -#include -#ifndef _LOCORE -#define _LOCORE -#endif -#include -#include -#include - -#if defined(LIBC_SCCS) && !defined(lint) - RCSID("$NetBSD: memset.S,v 1.4 2001/08/02 01:17:28 eeh Exp $") -#endif /* LIBC_SCCS and not lint */ - -/* - * bzero(addr, len) - * - * We want to use VIS instructions if we're clearing out more than - * 256 bytes, but to do that we need to properly save and restore the - * FP registers. Unfortunately the code to do that in the kernel needs - * to keep track of the current owner of the FPU, hence the different - * code. - * - * XXXXX To produce more efficient code, we do not allow lengths - * greater than 0x80000000000000000, which are negative numbers. - * This should not really be an issue since the VA hole should - * cause any such ranges to fail anyway. - */ -ENTRY(bzero) - ! %o0 = addr, %o1 = len - mov %o1, %o2 - clr %o1 ! Initialize our pattern -/* - * memset(addr, c, len) - * - */ -ENTRY(memset) - ! %o0 = addr, %o1 = pattern, %o2 = len - mov %o0, %o4 ! Save original pointer - -Lbzero_internal: - btst 7, %o0 ! Word aligned? - bz,pn %xcc, 0f - nop - inc %o0 - deccc %o2 ! Store up to 7 bytes - bge,a,pt %xcc, Lbzero_internal - stb %o1, [%o0 - 1] - - retl ! Duplicate Lbzero_done - mov %o4, %o0 -0: - /* - * Duplicate the pattern so it fills 64-bits. - */ - andcc %o1, 0x0ff, %o1 ! No need to extend zero - bz,pt %icc, 1f - sllx %o1, 8, %o3 ! sigh. all dependent insns. - or %o1, %o3, %o1 - sllx %o1, 16, %o3 - or %o1, %o3, %o1 - sllx %o1, 32, %o3 - or %o1, %o3, %o1 -1: -#if 1 - !! Now we are 64-bit aligned - cmp %o2, 256 ! Use block clear if len > 256 - bge,pt %xcc, Lbzero_block ! use block store insns -#endif - deccc 8, %o2 -Lbzero_longs: - bl,pn %xcc, Lbzero_cleanup ! Less than 8 bytes left - nop -3: - inc 8, %o0 - deccc 8, %o2 - bge,pt %xcc, 3b - stx %o1, [%o0 - 8] ! Do 1 longword at a time - - /* - * Len is in [-8..-1] where -8 => done, -7 => 1 byte to zero, - * -6 => two bytes, etc. Mop up this remainder, if any. - */ -Lbzero_cleanup: - btst 4, %o2 - bz,pt %xcc, 5f ! if (len & 4) { - nop - stw %o1, [%o0] ! *(int *)addr = 0; - inc 4, %o0 ! addr += 4; -5: - btst 2, %o2 - bz,pt %xcc, 7f ! if (len & 2) { - nop - sth %o1, [%o0] ! *(short *)addr = 0; - inc 2, %o0 ! addr += 2; -7: - btst 1, %o2 - bnz,a %icc, Lbzero_done ! if (len & 1) - stb %o1, [%o0] ! *addr = 0; -Lbzero_done: - retl - mov %o4, %o0 ! Restore ponter for memset (ugh) - -#if 1 -Lbzero_block: -/* - * Userland: - * - * Floating point registers are volatile. What luck. - * - * See locore.s for the kernel version. - * - */ -! wr %g0, FPRS_FEF, %fprs ! Enable FPU - - !! We are now 8-byte aligned. We need to become 64-byte aligned. - btst 63, %o0 - bz,pt %xcc, 2f - nop -1: - stx %o1, [%o0] - inc 8, %o0 - btst 63, %o0 - bnz,pt %xcc, 1b - dec 8, %o2 - -2: - brz %o1, 3f ! Skip the memory op - fzero %f0 ! for bzero - - stx %o1, [%o0] ! Flush this puppy to RAM - membar #StoreLoad - ldd [%o0], %f0 -3: - fmovd %f0, %f2 ! Duplicate the pattern - fmovd %f0, %f4 - fmovd %f0, %f6 - fmovd %f0, %f8 - fmovd %f0, %f10 - fmovd %f0, %f12 - fmovd %f0, %f14 - - !! Remember: we were 8 bytes too far - dec 56, %o2 ! Go one iteration too far -5: - stda %f0, [%o0] ASI_BLK_P ! Store 64 bytes - deccc 64, %o2 - bg,pn %xcc, 5b - inc 64, %o0 - - membar #Sync -/* - * Now we're done we need to load the FPU state from where - * we put it. - */ - ba,pt %xcc, Lbzero_longs ! Finish up the remainder - inccc 56, %o2 ! Restore the count -#endif