libroot: New memcpy for ARM.

It seems the old one was simply broken (see the mailing list discussion).

This new one was generated from the generic implementation using a
trunk build of Clang 8 with tuned optimization flags. It is smaller
(101 instructions) than GCC 8's output (134 instructions) and the old
hand-optimized routine (~125 instructions?), and it targets ARMv7
instead of ARMv6, which the old one was written for.
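
For context, the "generic one" is a plain C byte/word copy loop; the sketch
below only illustrates that shape. The function name, the exact source, and
the build flags (something like -O2 plus -fno-builtin, so the loops are not
pattern-matched back into a call to memcpy itself) are assumptions here, not
taken from this commit. A vectorizing compiler can turn the aligned word loop
into 16-byte vld1/vst1 loops like the ones in the new assembly below.

/* A minimal sketch, not the actual Haiku generic memcpy. */
#include <stddef.h>
#include <stdint.h>

void*
generic_memcpy(void* dest, const void* src, size_t count)
{
	uint8_t* d = dest;
	const uint8_t* s = src;

	/* nothing to do for a zero-length or self copy */
	if (count == 0 || d == s)
		return dest;

	/* if src and dst can be word-aligned together, align dst one byte
	   at a time, then copy whole words; a vectorizer widens this loop */
	if ((((uintptr_t)d ^ (uintptr_t)s) & 3) == 0) {
		while (count > 0 && ((uintptr_t)d & 3) != 0) {
			*d++ = *s++;
			count--;
		}
		while (count >= 4) {
			*(uint32_t*)d = *(const uint32_t*)s;
			d += 4;
			s += 4;
			count -= 4;
		}
	}

	/* 0-3 tail bytes, or the whole buffer when the pointers are
	   misaligned relative to each other */
	while (count > 0) {
		*d++ = *s++;
		count--;
	}

	return dest;
}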

In the future, we may want to look at newlib's ARM memcpy, which is
probably much more hand-optimized than this one, but this is at least
better than what we had before.
Augustin Cavalier 2018-08-20 11:53:47 -04:00
parent fbc02f9437
commit b2cb85f4d5


@@ -1,170 +1,139 @@
/*
** Copyright 2001, Travis Geiselbrecht. All rights reserved.
** Distributed under the terms of the NewOS License.
*/
* Copyright 2018, Haiku, Inc. All rights reserved.
* Distributed under the terms of the MIT License.
*/
#include <asm_defs.h>
#if 1
/* that should be enough for now */
.align 4
FUNCTION(memcpy):
FUNCTION(__aeabi_memcpy):
// check for zero length copy or the same pointer
cmp r2, #0
cmpne r1, r0
bxeq lr
// save a few registers for use and the return code (input dst)
stmfd sp!, {r0, r4, r5, lr}
// check for forwards overlap (src > dst, distance < len)
subs r3, r0, r1
cmpgt r2, r3
bgt .L_forwardoverlap
// check for a short copy len.
// 20 bytes is enough so that if a 16 byte alignment needs to happen there is at least a
// wordwise copy worth of work to be done.
cmp r2, #(16+4)
blt .L_bytewise
// see if they are similarly aligned on 4 byte boundaries
eor r3, r0, r1
tst r3, #3
bne .L_bytewise // dissimilarly aligned, nothing we can do (for now)
// check for 16 byte alignment on dst.
// this will also catch src being not 4 byte aligned, since it is similarly 4 byte
// aligned with dst at this point.
tst r0, #15
bne .L_not16bytealigned
// check to see if we have at least 32 bytes of data to copy.
// if not, just revert to wordwise copy
cmp r2, #32
blt .L_wordwise
.L_bigcopy:
// copy 32 bytes at a time. src & dst need to be at least 4 byte aligned,
// and we need at least 32 bytes remaining to copy
// save r6-r7 for use in the big copy
stmfd sp!, {r6-r7}
sub r2, r2, #32 // subtract an extra 32 from the len so we can avoid an extra compare
.L_bigcopy_loop:
ldmia r1!, {r4, r5, r6, r7}
stmia r0!, {r4, r5, r6, r7}
ldmia r1!, {r4, r5, r6, r7}
subs r2, r2, #32
stmia r0!, {r4, r5, r6, r7}
bge .L_bigcopy_loop
// restore r6-r7
ldmfd sp!, {r6-r7}
// see if we are done
adds r2, r2, #32
beq .L_done
// less than 4 bytes left?
cmp r2, #4
blt .L_bytewise
.L_wordwise:
// copy 4 bytes at a time.
// src & dst are guaranteed to be word aligned, and at least 4 bytes are left to copy.
subs r2, r2, #4
.L_wordwise_loop:
ldr r3, [r1], #4
subs r2, r2, #4
str r3, [r0], #4
bge .L_wordwise_loop
// correct the remaining len and test for completion
adds r2, r2, #4
beq .L_done
.L_bytewise:
// simple bytewise copy
ldrb r3, [r1], #1
subs r2, r2, #1
strb r3, [r0], #1
bgt .L_bytewise
.L_done:
// load dst for return and restore r4,r5
//#if ARM_ARCH_LEVEL >= 5
// ldmfd sp!, {r0, r4, r5, pc}
//#else
ldmfd sp!, {r0, r4, r5, lr}
bx lr
//#endif
.L_not16bytealigned:
// dst is not 16 byte aligned, so we will copy up to 15 bytes to get it aligned.
// src is guaranteed to be similarly word aligned with dst.
// set the condition flags based on the alignment.
lsl r12, r0, #28
rsb r12, r12, #0
msr CPSR_f, r12 // move into NZCV fields in CPSR
// move as many bytes as necessary to get the dst aligned
#ifdef __clang__
ldrbvs r3, [r1], #1 // V set
ldrhcs r4, [r1], #2 // C set
ldreq r5, [r1], #4 // Z set
strbvs r3, [r0], #1
strhcs r4, [r0], #2
streq r5, [r0], #4
ldmiami r1!, {r3-r4} // N set
stmiami r0!, {r3-r4}
#else
ldrvsb r3, [r1], #1 // V set
ldrcsh r4, [r1], #2 // C set
ldreq r5, [r1], #4 // Z set
strvsb r3, [r0], #1
strcsh r4, [r0], #2
streq r5, [r0], #4
ldmmiia r1!, {r3-r4} // N set
stmmiia r0!, {r3-r4}
#endif
// fix the remaining len
sub r2, r2, r12, lsr #28
// test to see what we should do now
cmp r2, #32
bge .L_bigcopy
b .L_wordwise
// src and dest overlap 'forwards' or dst > src
.L_forwardoverlap:
// do a bytewise reverse copy for now
add r1, r1, r2
add r0, r0, r2
.L_bytewisereverse:
// simple bytewise reverse copy
ldrb r3, [r1], #-1
subs r2, r2, #1
strb r3, [r0], #-1
bgt .L_bytewisereverse
b .L_done
// check for zero length copy or the same pointer
push {r4, r5, r6, r7, r8, r10, r11, lr}
add r11, sp, #24
cmp r0, r1
cmpne r2, #0
bne .LBB0_2
.LBB0_1:
pop {r4, r5, r6, r7, r8, r10, r11, pc}
.LBB0_2:
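// length is nonzero and src != dst; check whether both pointers are word-aligned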
orr r3, r1, r0
tst r3, #3
beq .LBB0_9
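// not both word-aligned: compute the head byte count (r12) to copy first;
// if src and dst are misaligned relative to each other, this covers the whole buffer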
eor r3, r1, r0
and r7, r0, #3
tst r3, #3
mov r12, r2
rsbeq r12, r7, #4
cmp r2, #4
movlo r12, r2
sub r2, r2, r12
cmp r12, #1
blt .LBB0_9
mvn r7, r12
cmn r7, #2
mvn r3, #1
mvn r6, #1
movgt r3, r7
add r3, r12, r3
add r3, r3, #2
cmp r3, #16
blo .LBB0_22
cmn r7, #2
movgt r6, r7
add r7, r12, r6
add r7, r7, #2
add r6, r1, r7
cmp r6, r0
addhi r7, r0, r7
cmphi r7, r1
bhi .LBB0_22
bic r4, r3, #15
mov r7, r0
add r5, r0, r4
add lr, r1, r4
sub r12, r12, r4
mov r6, r4
.LBB0_7:
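// NEON loop: copy 16 bytes per iteration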
vld1.8 {d16, d17}, [r1]!
subs r6, r6, #16
vst1.8 {d16, d17}, [r7]!
bne .LBB0_7
cmp r3, r4
bne .LBB0_23
b .LBB0_25
.LBB0_9:
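// word-aligned path: skip straight to the tail if less than one word remains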
mov r5, r0
mov r3, #0
cmp r3, r2, lsr #2
beq .LBB0_19
.LBB0_10:
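// word-copy phase: r12 = byte count rounded down to a whole number of words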
bic r12, r2, #3
lsr r3, r2, #2
cmp r2, #16
blo .LBB0_15
add r7, r1, r12
cmp r5, r7
addlo r7, r5, r12
cmplo r1, r7
blo .LBB0_15
and lr, r3, #3
mov r7, r1
sub r8, r3, lr
mov r6, r5
sub r3, r12, lr, lsl #2
add r4, r5, r3
add r3, r1, r3
.LBB0_13:
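// NEON loop: copy four words (16 bytes) per iteration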
vld1.32 {d16, d17}, [r7]!
subs r8, r8, #4
vst1.32 {d16, d17}, [r6]!
bne .LBB0_13
cmp lr, #0
bne .LBB0_16
b .LBB0_18
.LBB0_15:
mov lr, r3
mov r3, r1
mov r4, r5
.LBB0_16:
add r6, lr, #1
.LBB0_17:
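// scalar loop: copy one word per iteration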
ldr r7, [r3], #4
sub r6, r6, #1
str r7, [r4], #4
cmp r6, #1
bgt .LBB0_17
.LBB0_18:
add r5, r5, r12
add r1, r1, r12
.LBB0_19:
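// copy the remaining 0-3 tail bytes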
ands r2, r2, #3
beq .LBB0_1
add r2, r2, #1
.LBB0_21:
ldrb r3, [r1], #1
sub r2, r2, #1
strb r3, [r5], #1
cmp r2, #1
bgt .LBB0_21
b .LBB0_1
.LBB0_22:
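// copy the head bytewise (short copy, or src and dst too close together for the vector loop)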
mov lr, r1
mov r5, r0
.LBB0_23:
add r1, r12, #1
.LBB0_24:
ldrb r3, [lr], #1
sub r1, r1, #1
strb r3, [r5], #1
cmp r1, #1
bgt .LBB0_24
.LBB0_25:
mov r1, lr
mov r3, #0
cmp r3, r2, lsr #2
bne .LBB0_10
b .LBB0_19
FUNCTION_END(memcpy)
FUNCTION_END(__aeabi_memcpy)
#endif