mirror of https://github.com/libsdl-org/SDL
ARM: NEON assembly optimization for SDL_FillRect
This commit is contained in:
parent
1187b013a5
commit
72f8044a42
|
@ -281,6 +281,27 @@ SDL_FillRects(SDL_Surface * dst, const SDL_Rect * rects, int count,
|
|||
return SDL_SetError("SDL_FillRects() passed NULL rects");
|
||||
}
|
||||
|
||||
#if SDL_ARM_NEON_BLITTERS
|
||||
if (SDL_HasNEON() && dst->format->BytesPerPixel != 3) {
|
||||
void FillRect8ARMNEONAsm(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
|
||||
void FillRect16ARMNEONAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint16_t src);
|
||||
void FillRect32ARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t src);
|
||||
switch (dst->format->BytesPerPixel) {
|
||||
case 1:
|
||||
FillRect8ARMNEONAsm(rect->w, rect->h, (uint8_t *) pixels, dst->pitch >> 0, color);
|
||||
break;
|
||||
case 2:
|
||||
FillRect16ARMNEONAsm(rect->w, rect->h, (uint16_t *) pixels, dst->pitch >> 1, color);
|
||||
break;
|
||||
case 4:
|
||||
FillRect32ARMNEONAsm(rect->w, rect->h, (uint32_t *) pixels, dst->pitch >> 2, color);
|
||||
break;
|
||||
}
|
||||
|
||||
SDL_UnlockSurface(dst);
|
||||
return(0);
|
||||
}
|
||||
#endif
|
||||
#if SDL_ARM_SIMD_BLITTERS
|
||||
if (SDL_HasARMSIMD() && dst->format->BytesPerPixel != 3) {
|
||||
void FillRect8ARMSIMDAsm(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
|
||||
|
|
|
@ -95,6 +95,134 @@
|
|||
|
||||
/******************************************************************************/
|
||||
|
||||
/* We can actually do significantly better than the Pixman macros, at least for
|
||||
* the case of fills, by using a carefully scheduled inner loop. Cortex-A53
|
||||
* shows an improvement of up to 78% in ideal cases (large fills to L1 cache).
|
||||
*/
|
||||
|
||||
.macro generate_fillrect_function name, bpp, log2Bpp
/*
 * void name(int32_t w, int32_t h, T *dst, int32_t dst_stride, T src);
 *   where T is uint8_t, uint16_t or uint32_t according to bpp.
 *
 * Fills a rectangle of w x h pixels with a constant pixel value.
 *
 * Macro parameters:
 *   name    = symbol name of the generated function
 *   bpp     = bits per pixel (instantiated below with 8, 16 and 32)
 *   log2Bpp = log2 of the number of bytes per pixel (0, 1 or 2); used to
 *             convert between byte counts and pixel counts
 *
 * On entry:
 *   a1   = width, pixels
 *   a2   = height, rows
 *   a3   = pointer to top-left destination pixel
 *   a4   = stride, pixels
 *   [sp] = pixel value to fill with
 * Within the function:
 *   a4    = stride minus width, pixels (added to a3 at the end of each row)
 *   v1    = width remaining
 *   v2    = vst offset (long-row case only)
 *   v3    = alternate pointer (long-row case only); also alignment scratch
 *   ip    = data ARM register (pixel value replicated to 32 bits)
 *   q0-q1 = pixel value replicated across every lane
 *
 * NOTE(review): the row loops test the height only after a row has been
 * processed, so h is presumably always >= 1 on entry - confirm at the caller.
 */
pixman_asm_function name
    /* The stacked argument must be read before the push below moves sp. */
    vld1.\bpp {d0[],d1[]}, [sp]         /* q0 = pixel in all lanes */
    sub     a4, a1                      /* a4 = stride - width */
    vld1.\bpp {d2[],d3[]}, [sp]         /* q1 = pixel in all lanes */
    /* The long-row path needs room for up to 15 alignment bytes plus one
     * unconditional 64-byte inner-loop iteration. */
    cmp     a1, #(15+64) >> \log2Bpp
    push    {v1-v3,lr}                  /* push leaves the flags intact */
    vmov    ip, s0
    blo     51f

    /* Long-row case */
    mov     v2, #64
1:  mov     v1, a1
    ands    v3, a3, #15
    beq     2f
    /* Leading pixels */
    rsb     v3, v3, #16                 /* number of leading bytes until 16-byte aligned */
    sub     v1, v1, v3, lsr #\log2Bpp
    rbit    v3, v3                      /* count bits 0..3 now sit in bits 31..28 */
.if \bpp <= 16
.if \bpp == 8
    tst     a3, #1                      /* bit 0 unaffected by rsb so can avoid register interlock */
    strneb  ip, [a3], #1
    tst     v3, #1<<30                  /* bit 1 of the leading byte count */
.else
    tst     a3, #2                      /* bit 1 unaffected by rsb (assuming halfword alignment) so can avoid register interlock */
.endif
    strneh  ip, [a3], #2
.endif
    movs    v3, v3, lsl #3              /* C = 4-byte bit, N = 8-byte bit */
    vstmcs  a3!, {s0}
    vstmmi  a3!, {d0}
2:  sub     v1, v1, #64 >> \log2Bpp     /* simplifies inner loop termination */
    add     v3, a3, #32
    /* Inner loop: 64 bytes per iteration through two pointers, each
     * stepping by v2 (64 bytes). */
3:  vst1.\bpp {q0-q1}, [a3 :128], v2
    subs    v1, v1, #64 >> \log2Bpp
    vst1.\bpp {q0-q1}, [v3 :128], v2
    bhs     3b
    /* Trailing pixels: the low bits of v1 still hold the 0..63 bytes left
     * over; shift them out pairwise into C and N. */
4:  movs    v1, v1, lsl #27 + \log2Bpp  /* C = 32-byte bit, N = 16-byte bit */
    bcc     5f
    vst1.\bpp {q0-q1}, [a3 :128]!
5:  bpl     6f
    vst1.\bpp {q0}, [a3 :128]!
6:  movs    v1, v1, lsl #2              /* C = 8-byte bit, N = 4-byte bit */
    vstmcs  a3!, {d0}
    vstmmi  a3!, {s0}
.if \bpp <= 16
    movs    v1, v1, lsl #2              /* C = 2-byte bit, N = 1-byte bit */
    strcsh  ip, [a3], #2
.if \bpp == 8
    strmib  ip, [a3], #1
.endif
.endif
    subs    a2, a2, #1
    add     a3, a3, a4, lsl #\log2Bpp   /* advance to the start of the next row */
    bhi     1b
    pop     {v1-v3,pc}

    /* Short-row case */
51: movs    v1, a1
.if \bpp == 8
    /* Store single bytes until the pointer is word aligned. */
    tst     a3, #3
    beq     53f
52: subs    v1, v1, #1
    /* Row exhausted before reaching alignment: skip to the end-of-row
     * bookkeeping so that the remaining rows are still filled. */
    blo     57f
    strb    ip, [a3], #1
    tst     a3, #3
    bne     52b
.elseif \bpp == 16
    /* Store one halfword if needed to reach word alignment; every step is
     * skipped when the width is zero (Z set by the movs above). */
    tstne   a3, #2
    subne   v1, v1, #1
    strneh  ip, [a3], #2
.endif
    /* Only word alignment is established here, so the vst1 stores below
     * carry no alignment qualifier. */
53: cmp     v1, #32 >> \log2Bpp
    bcc     54f
    vst1.\bpp {q0-q1}, [a3]!
    sub     v1, v1, #32 >> \log2Bpp
    /* Trailing pixels */
54: movs    v1, v1, lsl #27 + \log2Bpp  /* C = 32-byte bit, N = 16-byte bit */
    bcc     55f
    vst1.\bpp {q0-q1}, [a3]!
55: bpl     56f
    vst1.\bpp {q0}, [a3]!
56: movs    v1, v1, lsl #2              /* C = 8-byte bit, N = 4-byte bit */
    vstmcs  a3!, {d0}
    vstmmi  a3!, {s0}
.if \bpp <= 16
    movs    v1, v1, lsl #2              /* C = 2-byte bit, N = 1-byte bit */
    strcsh  ip, [a3], #2
.if \bpp == 8
    strmib  ip, [a3], #1
.endif
.endif
    /* End of row. On every path reaching here a3 has advanced by exactly
     * the row width, so adding (stride - width) lands on the next row. */
57: subs    a2, a2, #1
    add     a3, a3, a4, lsl #\log2Bpp
    bhi     51b
    pop     {v1-v3,pc}

.endfunc
.endm

/* Instantiate one fill routine per supported pixel size:
 * name, bits per pixel, log2(bytes per pixel). */
generate_fillrect_function FillRect32ARMNEONAsm, 32, 2
generate_fillrect_function FillRect16ARMNEONAsm, 16, 1
generate_fillrect_function FillRect8ARMNEONAsm,  8,  0
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
.macro RGBtoRGBPixelAlpha_process_pixblock_head
|
||||
vmvn d30, d3 /* get inverted source alpha */
|
||||
vmov d31, d7 /* dest alpha is always unchanged */
|
||||
|
|
Loading…
Reference in New Issue