diff --git a/CMakeLists.txt b/CMakeLists.txt index a40bc609a..14be99089 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -292,7 +292,6 @@ dep_option(SDL_MMX "Use MMX assembly routines" ON "SDL_ASSEMBLY; dep_option(SDL_ALTIVEC "Use Altivec assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_POWERPC32 OR SDL_CPU_POWERPC64" OFF) dep_option(SDL_ARMSIMD "Use SIMD assembly blitters on ARM" OFF "SDL_ASSEMBLY;SDL_CPU_ARM32" OFF) dep_option(SDL_ARMNEON "Use NEON assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_ARM32 OR SDL_CPU_ARM64" OFF) -dep_option(SDL_ARMNEON_BLITTERS "Use NEON assembly blitters on ARM32" OFF "SDL_VIDEO;SDL_ASSEMBLY;SDL_ARMNEON;SDL_CPU_ARM32" OFF) dep_option(SDL_LSX "Use LSX assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_LOONGARCH64" OFF) dep_option(SDL_LASX "Use LASX assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_LOONGARCH64" OFF) @@ -883,67 +882,6 @@ if(SDL_ASSEMBLY) endif() endif() - if(SDL_ARMSIMD) - cmake_push_check_state() - string(APPEND CMAKE_REQUIRED_FLAGS " -x assembler-with-cpp") - list(APPEND CMAKE_REQUIRED_LINK_OPTIONS -x none) - check_c_source_compiles(" - .text - .arch armv6 - .object_arch armv4 - .arm - .altmacro - #ifndef __ARM_EABI__ - #error EABI is required (to be sure that calling conventions are compatible) - #endif - main: - .global main - pld [r0] - uqadd8 r0, r0, r0 - " ARMSIMD_FOUND) - cmake_pop_check_state() - - if(ARMSIMD_FOUND) - set(HAVE_ARMSIMD TRUE) - set(SDL_ARM_SIMD_BLITTERS 1) - enable_language(ASM) - sdl_glob_sources("${SDL3_SOURCE_DIR}/src/video/arm/pixman-arm-simd*.S") - set_property(SOURCE ${ARMSIMD_SOURCES} APPEND PROPERTY COMPILE_OPTIONS -x assembler-with-cpp) - set(WARN_ABOUT_ARM_SIMD_ASM_MIT TRUE) - endif() - endif() - - if(SDL_ARMNEON_BLITTERS) - cmake_push_check_state() - string(APPEND CMAKE_REQUIRED_FLAGS " -x assembler-with-cpp") - list(APPEND CMAKE_REQUIRED_LINK_OPTIONS -x none) - check_c_source_compiles(" - .text - .fpu neon - .arch armv7a - .object_arch armv4 - .eabi_attribute 10, 0 - .arm - .altmacro - #ifndef __ARM_EABI__ - #error EABI is required (to be sure that calling conventions are compatible) - #endif - main: - .global main - pld [r0] - vmovn.u16 d0, q0 - " COMPILER_SUPPORTS_ARMNEON_ASSEMBLY) - cmake_pop_check_state() - if(COMPILER_SUPPORTS_ARMNEON_ASSEMBLY) - set(HAVE_ARMNEON_BLITTERS TRUE) - set(SDL_ARM_NEON_BLITTERS 1) - enable_language(ASM) - sdl_glob_sources("${SDL3_SOURCE_DIR}/src/video/arm/pixman-arm-neon*.S") - set_property(SOURCE ${ARMNEON_SOURCES} APPEND PROPERTY COMPILE_OPTIONS -x assembler-with-cpp) - set(WARN_ABOUT_ARM_NEON_ASM_MIT TRUE) - endif() - endif() - if(SDL_ARMNEON) check_c_source_compiles(" #include diff --git a/cmake/3rdparty.cmake b/cmake/3rdparty.cmake index becb6d3ab..8b38a5ded 100644 --- a/cmake/3rdparty.cmake +++ b/cmake/3rdparty.cmake @@ -25,10 +25,6 @@ function(get_clang_tidy_ignored_files OUTVAR) # HIDAPI Steam controller "controller_constants.h" "controller_structs.h" - # Nokia Pixman - "pixman-arm-asm.h" - "pixman-arm-neon-asm.h" - "pixman-arm-simd-asm.h" # YUV2RGB "yuv_rgb.c" "yuv_rgb_lsx_func.h" diff --git a/src/video/SDL_blit.h b/src/video/SDL_blit.h index ba5b95904..90588153a 100644 --- a/src/video/SDL_blit.h +++ b/src/video/SDL_blit.h @@ -23,12 +23,6 @@ #ifndef SDL_blit_h_ #define SDL_blit_h_ -/* pixman ARM blitters are 32 bit only : */ -#if defined(__aarch64__) || defined(_M_ARM64) -#undef SDL_ARM_SIMD_BLITTERS -#undef SDL_ARM_NEON_BLITTERS -#endif - /* Table to do pixel byte expansion */ extern const Uint8 *SDL_expand_byte[9]; extern const Uint16 SDL_expand_byte_10[]; diff --git 
a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c index ed3c941e0..7f272fd09 100644 --- a/src/video/SDL_blit_A.c +++ b/src/video/SDL_blit_A.c @@ -421,66 +421,6 @@ static void SDL_TARGETING("mmx") BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info) #endif /* SDL_MMX_INTRINSICS */ -#ifdef SDL_ARM_SIMD_BLITTERS -void BlitARGBto565PixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride); - -static void BlitARGBto565PixelAlphaARMSIMD(SDL_BlitInfo *info) -{ - int32_t width = info->dst_w; - int32_t height = info->dst_h; - uint16_t *dstp = (uint16_t *)info->dst; - int32_t dststride = width + (info->dst_skip >> 1); - uint32_t *srcp = (uint32_t *)info->src; - int32_t srcstride = width + (info->src_skip >> 2); - - BlitARGBto565PixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride); -} - -void BlitRGBtoRGBPixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride); - -static void BlitRGBtoRGBPixelAlphaARMSIMD(SDL_BlitInfo *info) -{ - int32_t width = info->dst_w; - int32_t height = info->dst_h; - uint32_t *dstp = (uint32_t *)info->dst; - int32_t dststride = width + (info->dst_skip >> 2); - uint32_t *srcp = (uint32_t *)info->src; - int32_t srcstride = width + (info->src_skip >> 2); - - BlitRGBtoRGBPixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride); -} -#endif - -#ifdef SDL_ARM_NEON_BLITTERS -void BlitARGBto565PixelAlphaARMNEONAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride); - -static void BlitARGBto565PixelAlphaARMNEON(SDL_BlitInfo *info) -{ - int32_t width = info->dst_w; - int32_t height = info->dst_h; - uint16_t *dstp = (uint16_t *)info->dst; - int32_t dststride = width + (info->dst_skip >> 1); - uint32_t *srcp = (uint32_t *)info->src; - int32_t srcstride = width + (info->src_skip >> 2); - - BlitARGBto565PixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride); -} - -void BlitRGBtoRGBPixelAlphaARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride); - -static void BlitRGBtoRGBPixelAlphaARMNEON(SDL_BlitInfo *info) -{ - int32_t width = info->dst_w; - int32_t height = info->dst_h; - uint32_t *dstp = (uint32_t *)info->dst; - int32_t dststride = width + (info->dst_skip >> 2); - uint32_t *srcp = (uint32_t *)info->src; - int32_t srcstride = width + (info->src_skip >> 2); - - BlitRGBtoRGBPixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride); -} -#endif - /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info) { @@ -1274,21 +1214,7 @@ SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface) } case 2: -#if defined(SDL_ARM_NEON_BLITTERS) || defined(SDL_ARM_SIMD_BLITTERS) - if (sf->bytes_per_pixel == 4 && sf->Amask == 0xff000000 && sf->Gmask == 0xff00 && df->Gmask == 0x7e0 && ((sf->Rmask == 0xff && df->Rmask == 0x1f) || (sf->Bmask == 0xff && df->Bmask == 0x1f))) { -#ifdef SDL_ARM_NEON_BLITTERS - if (SDL_HasNEON()) { - return BlitARGBto565PixelAlphaARMNEON; - } -#endif -#ifdef SDL_ARM_SIMD_BLITTERS - if (SDL_HasARMSIMD()) { - return BlitARGBto565PixelAlphaARMSIMD; - } -#endif - } -#endif - if (sf->bytes_per_pixel == 4 && sf->Amask == 0xff000000 && sf->Gmask == 0xff00 && ((sf->Rmask == 0xff && df->Rmask == 0x1f) || (sf->Bmask == 0xff && df->Bmask == 0x1f))) { + if (sf->bytes_per_pixel == 4 && sf->Amask == 0xff000000 && sf->Gmask == 0xff00 && ((sf->Rmask == 0xff && df->Rmask == 0x1f) ||
(sf->Bmask == 0xff && df->Bmask == 0x1f))) { if (df->Gmask == 0x7e0) { return BlitARGBto565PixelAlpha; } else if (df->Gmask == 0x3e0) { @@ -1311,18 +1237,6 @@ SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface) } } #endif /* SDL_MMX_INTRINSICS */ - if (sf->Amask == 0xff000000) { -#ifdef SDL_ARM_NEON_BLITTERS - if (SDL_HasNEON()) { - return BlitRGBtoRGBPixelAlphaARMNEON; - } -#endif -#ifdef SDL_ARM_SIMD_BLITTERS - if (SDL_HasARMSIMD()) { - return BlitRGBtoRGBPixelAlphaARMSIMD; - } -#endif - } } return BlitNtoNPixelAlpha; diff --git a/src/video/SDL_fillrect.c b/src/video/SDL_fillrect.c index e38d7f73d..04aa925fb 100644 --- a/src/video/SDL_fillrect.c +++ b/src/video/SDL_fillrect.c @@ -247,54 +247,6 @@ int SDL_FillSurfaceRect(SDL_Surface *dst, const SDL_Rect *rect, Uint32 color) return SDL_FillSurfaceRects(dst, rect, 1, color); } -#ifdef SDL_ARM_NEON_BLITTERS -void FillSurfaceRect8ARMNEONAsm(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src); -void FillSurfaceRect16ARMNEONAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint16_t src); -void FillSurfaceRect32ARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t src); - -static void fill_8_neon(Uint8 *pixels, int pitch, Uint32 color, int w, int h) -{ - FillSurfaceRect8ARMNEONAsm(w, h, (uint8_t *)pixels, pitch >> 0, color); - return; -} - -static void fill_16_neon(Uint8 *pixels, int pitch, Uint32 color, int w, int h) -{ - FillSurfaceRect16ARMNEONAsm(w, h, (uint16_t *)pixels, pitch >> 1, color); - return; -} - -static void fill_32_neon(Uint8 *pixels, int pitch, Uint32 color, int w, int h) -{ - FillSurfaceRect32ARMNEONAsm(w, h, (uint32_t *)pixels, pitch >> 2, color); - return; -} -#endif - -#ifdef SDL_ARM_SIMD_BLITTERS -void FillSurfaceRect8ARMSIMDAsm(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src); -void FillSurfaceRect16ARMSIMDAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint16_t src); -void FillSurfaceRect32ARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t src); - -static void fill_8_simd(Uint8 *pixels, int pitch, Uint32 color, int w, int h) -{ - FillSurfaceRect8ARMSIMDAsm(w, h, (uint8_t *)pixels, pitch >> 0, color); - return; -} - -static void fill_16_simd(Uint8 *pixels, int pitch, Uint32 color, int w, int h) -{ - FillSurfaceRect16ARMSIMDAsm(w, h, (uint16_t *)pixels, pitch >> 1, color); - return; -} - -static void fill_32_simd(Uint8 *pixels, int pitch, Uint32 color, int w, int h) -{ - FillSurfaceRect32ARMSIMDAsm(w, h, (uint32_t *)pixels, pitch >> 2, color); - return; -} -#endif - int SDL_FillSurfaceRects(SDL_Surface *dst, const SDL_Rect *rects, int count, Uint32 color) { @@ -339,39 +291,8 @@ int SDL_FillSurfaceRects(SDL_Surface *dst, const SDL_Rect *rects, int count, return SDL_SetError("SDL_FillSurfaceRects(): Unsupported surface format"); } -#ifdef SDL_ARM_NEON_BLITTERS - if (SDL_HasNEON() && dst->format->bytes_per_pixel != 3 && !fill_function) { - switch (dst->format->bytes_per_pixel) { - case 1: - fill_function = fill_8_neon; - break; - case 2: - fill_function = fill_16_neon; - break; - case 4: - fill_function = fill_32_neon; - break; - } - } -#endif -#ifdef SDL_ARM_SIMD_BLITTERS - if (SDL_HasARMSIMD() && dst->format->bytes_per_pixel != 3 && !fill_function) { - switch (dst->format->bytes_per_pixel) { - case 1: - fill_function = fill_8_simd; - break; - case 2: - fill_function = fill_16_simd; - break; - case 4: - fill_function = fill_32_simd; - break; - } - } -#endif - - if (!fill_function) { - switch 
(dst->format->bytes_per_pixel) { + if (fill_function == NULL) { + switch (dst->format->bytes_per_pixel) { case 1: { color |= (color << 8); diff --git a/src/video/arm/pixman-arm-asm.h b/src/video/arm/pixman-arm-asm.h deleted file mode 100644 index 3f13ba049..000000000 --- a/src/video/arm/pixman-arm-asm.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright © 2010 Nokia Corporation - * - * Permission to use, copy, modify, distribute, and sell this software and its - * documentation for any purpose is hereby granted without fee, provided that - * the above copyright notice appear in all copies and that both that - * copyright notice and this permission notice appear in supporting - * documentation, and that the name of Mozilla Corporation not be used in - * advertising or publicity pertaining to distribution of the software without - * specific, written prior permission. Mozilla Corporation makes no - * representations about the suitability of this software for any purpose. It - * is provided "as is" without express or implied warranty. - * - * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS - * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY - * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN - * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING - * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS - * SOFTWARE. - * - * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com) - * - */ - -/* Supplementary macro for setting function attributes */ -.macro pixman_asm_function fname - .func fname - .global fname -#ifdef __ELF__ - .hidden fname - .type fname, %function -#endif -fname: -.endm diff --git a/src/video/arm/pixman-arm-neon-asm.S b/src/video/arm/pixman-arm-neon-asm.S deleted file mode 100644 index f9549621a..000000000 --- a/src/video/arm/pixman-arm-neon-asm.S +++ /dev/null @@ -1,375 +0,0 @@ -/* - * Copyright © 2009 Nokia Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - * - * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com) - */ - -/* - * Copyright (c) 2018 RISC OS Open Ltd - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software.
- * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - */ - -/* Prevent the stack from becoming executable for no reason... */ -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - - .text - .fpu neon - .arch armv7a - .object_arch armv4 - .eabi_attribute 10, 0 /* suppress Tag_FP_arch */ - .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */ - .arm - .altmacro - .p2align 2 - -#include "pixman-arm-asm.h" -#include "pixman-arm-neon-asm.h" - -/* Global configuration options and preferences */ - -/* - * The code can optionally make use of unaligned memory accesses to improve - * performance of handling leading/trailing pixels for each scanline. - * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for - * example in linux if unaligned memory accesses are not configured to - * generate.exceptions. - */ -.set RESPECT_STRICT_ALIGNMENT, 1 - -/* - * Set default prefetch type. There is a choice between the following options: - * - * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work - * as NOP to workaround some HW bugs or for whatever other reason) - * - * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where - * advanced prefetch introduces heavy overhead) - * - * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8 - * which can run ARM and NEON instructions simultaneously so that extra ARM - * instructions do not add (many) extra cycles, but improve prefetch efficiency) - * - * Note: some types of function can't support advanced prefetch and fallback - * to simple one (those which handle 24bpp pixels) - */ -.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED - -/* Prefetch distance in pixels for simple prefetch */ -.set PREFETCH_DISTANCE_SIMPLE, 64 - -/******************************************************************************/ - -/* We can actually do significantly better than the Pixman macros, at least for - * the case of fills, by using a carefully scheduled inner loop. Cortex-A53 - * shows an improvement of up to 78% in ideal cases (large fills to L1 cache). 
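For reference, the fill functions generated by the macro below all implement the same simple contract; a minimal scalar C sketch (the function name is illustrative, and the stride is in pixels, matching the pitch >> 2 style conversions done by the removed SDL_fillrect.c wrappers):

```c
#include <stdint.h>

/* Reference behaviour of the generated fill routines: write `value` into a
   w x h rectangle.  dst_stride is in pixels, matching the removed wrappers
   that passed SDL's byte pitch shifted down by the pixel size. */
static void fill_rect_32_reference(int32_t w, int32_t h,
                                   uint32_t *dst, int32_t dst_stride,
                                   uint32_t value)
{
    for (int32_t y = 0; y < h; ++y) {
        for (int32_t x = 0; x < w; ++x) {
            dst[x] = value;
        }
        dst += dst_stride; /* down one scanline */
    }
}
```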
- */ - -.macro generate_fillrect_function name, bpp, log2Bpp -/* - * void name(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src); - * On entry: - * a1 = width, pixels - * a2 = height, rows - * a3 = pointer to top-left destination pixel - * a4 = stride, pixels - * [sp] = pixel value to fill with - * Within the function: - * v1 = width remaining - * v2 = vst offset - * v3 = alternate pointer - * ip = data ARM register - */ -pixman_asm_function name - vld1.\bpp {d0[],d1[]}, [sp] - sub a4, a1 - vld1.\bpp {d2[],d3[]}, [sp] - cmp a1, #(15+64) >> \log2Bpp - push {v1-v3,lr} - vmov ip, s0 - blo 51f - - /* Long-row case */ - mov v2, #64 -1: mov v1, a1 - ands v3, a3, #15 - beq 2f - /* Leading pixels */ - rsb v3, v3, #16 /* number of leading bytes until 16-byte aligned */ - sub v1, v1, v3, lsr #\log2Bpp - rbit v3, v3 -.if bpp <= 16 -.if bpp == 8 - tst a3, #1 /* bit 0 unaffected by rsb so can avoid register interlock */ - strneb ip, [a3], #1 - tst v3, #1<<30 -.else - tst a3, #2 /* bit 1 unaffected by rsb (assuming halfword alignment) so can avoid register interlock */ -.endif - strneh ip, [a3], #2 -.endif - movs v3, v3, lsl #3 - vstmcs a3!, {s0} - vstmmi a3!, {d0} -2: sub v1, v1, #64 >> \log2Bpp /* simplifies inner loop termination */ - add v3, a3, #32 - /* Inner loop */ -3: vst1.\bpp {q0-q1}, [a3 :128], v2 - subs v1, v1, #64 >> \log2Bpp - vst1.\bpp {q0-q1}, [v3 :128], v2 - bhs 3b - /* Trailing pixels */ -4: movs v1, v1, lsl #27 + \log2Bpp - bcc 5f - vst1.\bpp {q0-q1}, [a3 :128]! -5: bpl 6f - vst1.\bpp {q0}, [a3 :128]! -6: movs v1, v1, lsl #2 - vstmcs a3!, {d0} - vstmmi a3!, {s0} -.if bpp <= 16 - movs v1, v1, lsl #2 - strcsh ip, [a3], #2 -.if bpp == 8 - strmib ip, [a3], #1 -.endif -.endif - subs a2, a2, #1 - add a3, a3, a4, lsl #\log2Bpp - bhi 1b - pop {v1-v3,pc} - - /* Short-row case */ -51: movs v1, a1 -.if bpp == 8 - tst a3, #3 - beq 53f -52: subs v1, v1, #1 - blo 57f - strb ip, [a3], #1 - tst a3, #3 - bne 52b -.elseif bpp == 16 - tstne a3, #2 - subne v1, v1, #1 - strneh ip, [a3], #2 -.endif -53: cmp v1, #32 >> \log2Bpp - bcc 54f - vst1.\bpp {q0-q1}, [a3]! - sub v1, v1, #32 >> \log2Bpp - /* Trailing pixels */ -54: movs v1, v1, lsl #27 + \log2Bpp - bcc 55f - vst1.\bpp {q0-q1}, [a3]! -55: bpl 56f - vst1.\bpp {q0}, [a3]! -56: movs v1, v1, lsl #2 - vstmcs a3!, {d0} - vstmmi a3!, {s0} -.if bpp <= 16 - movs v1, v1, lsl #2 - strcsh ip, [a3], #2 -.if bpp == 8 - strmib ip, [a3], #1 -.endif -.endif - subs a2, a2, #1 - add a3, a3, a4, lsl #\log2Bpp - bhi 51b -57: pop {v1-v3,pc} - -.endfunc -.endm - -generate_fillrect_function FillSurfaceRect32ARMNEONAsm, 32, 2 -generate_fillrect_function FillSurfaceRect16ARMNEONAsm, 16, 1 -generate_fillrect_function FillSurfaceRect8ARMNEONAsm, 8, 0 - -/******************************************************************************/ - -.macro RGBtoRGBPixelAlpha_process_pixblock_head - vmvn d30, d3 /* get inverted source alpha */ - vmov d31, d7 /* dest alpha is always unchanged */ - vmull.u8 q14, d0, d3 - vmlal.u8 q14, d4, d30 - vmull.u8 q0, d1, d3 - vmlal.u8 q0, d5, d30 - vmull.u8 q1, d2, d3 - vmlal.u8 q1, d6, d30 - vrshr.u16 q2, q14, #8 - vrshr.u16 q3, q0, #8 - vraddhn.u16 d28, q14, q2 - vrshr.u16 q2, q1, #8 - vraddhn.u16 d29, q0, q3 - vraddhn.u16 d30, q1, q2 -.endm - -.macro RGBtoRGBPixelAlpha_process_pixblock_tail - /* nothing */ -.endm - -.macro RGBtoRGBPixelAlpha_process_pixblock_tail_head - vld4.8 {d0-d3}, [SRC]! - PF add PF_X, PF_X, #8 - vst4.8 {d28-d31}, [DST_W :128]! - PF tst PF_CTL, #0xF - vld4.8 {d4-d7}, [DST_R :128]! 
- PF addne PF_X, PF_X, #8 - vmvn d30, d3 /* get inverted source alpha */ - vmov d31, d7 /* dest alpha is always unchanged */ - vmull.u8 q14, d0, d3 - PF subne PF_CTL, PF_CTL, #1 - vmlal.u8 q14, d4, d30 - PF cmp PF_X, ORIG_W - vmull.u8 q0, d1, d3 - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - vmlal.u8 q0, d5, d30 - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - vmull.u8 q1, d2, d3 - PF subge PF_X, PF_X, ORIG_W - vmlal.u8 q1, d6, d30 - PF subges PF_CTL, PF_CTL, #0x10 - vrshr.u16 q2, q14, #8 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - vrshr.u16 q3, q0, #8 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! - vraddhn.u16 d28, q14, q2 - vrshr.u16 q2, q1, #8 - vraddhn.u16 d29, q0, q3 - vraddhn.u16 d30, q1, q2 -.endm - -generate_composite_function \ - BlitRGBtoRGBPixelAlphaARMNEONAsm, 32, 0, 32, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 5, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - RGBtoRGBPixelAlpha_process_pixblock_head, \ - RGBtoRGBPixelAlpha_process_pixblock_tail, \ - RGBtoRGBPixelAlpha_process_pixblock_tail_head - - /******************************************************************************/ - -.macro ARGBto565PixelAlpha_process_pixblock_head - vmvn d6, d3 - vshr.u8 d1, #2 - vshr.u8 d3, #3 - vshr.u8 d0, #3 - vshrn.u16 d7, q2, #3 - vshrn.u16 d25, q2, #8 - vbic.i16 q2, #0xe0 - vshr.u8 d6, #3 - vshr.u8 d7, #2 - vshr.u8 d2, #3 - vmovn.u16 d24, q2 - vshr.u8 d25, #3 - vmull.u8 q13, d1, d3 - vmlal.u8 q13, d7, d6 - vmull.u8 q14, d0, d3 - vmlal.u8 q14, d24, d6 - vmull.u8 q15, d2, d3 - vmlal.u8 q15, d25, d6 -.endm - -.macro ARGBto565PixelAlpha_process_pixblock_tail - vsra.u16 q13, #5 - vsra.u16 q14, #5 - vsra.u16 q15, #5 - vrshr.u16 q13, #5 - vrshr.u16 q14, #5 - vrshr.u16 q15, #5 - vsli.u16 q14, q13, #5 - vsli.u16 q14, q15, #11 -.endm - -.macro ARGBto565PixelAlpha_process_pixblock_tail_head - vld4.8 {d0-d3}, [SRC]! - PF add PF_X, PF_X, #8 - vsra.u16 q13, #5 - PF tst PF_CTL, #0xF - vsra.u16 q14, #5 - PF addne PF_X, PF_X, #8 - vsra.u16 q15, #5 - PF subne PF_CTL, PF_CTL, #1 - vrshr.u16 q13, #5 - PF cmp PF_X, ORIG_W - vrshr.u16 q14, #5 - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] - vrshr.u16 q15, #5 - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] - vld1.8 {d4-d5}, [DST_R]! - PF subge PF_X, PF_X, ORIG_W - vsli.u16 q14, q13, #5 - PF subges PF_CTL, PF_CTL, #0x10 - vsli.u16 q14, q15, #11 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! - vst1.8 {q14}, [DST_W :128]! - vmvn d6, d3 - vshr.u8 d1, #2 - vshr.u8 d3, #3 - vshr.u8 d0, #3 - vshrn.u16 d7, q2, #3 - vshrn.u16 d25, q2, #8 - vbic.i16 q2, #0xe0 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! 
- vshr.u8 d6, #3 - vshr.u8 d7, #2 - vshr.u8 d2, #3 - vmovn.u16 d24, q2 - vshr.u8 d25, #3 - vmull.u8 q13, d1, d3 - vmlal.u8 q13, d7, d6 - vmull.u8 q14, d0, d3 - vmlal.u8 q14, d24, d6 - vmull.u8 q15, d2, d3 - vmlal.u8 q15, d25, d6 -.endm - -generate_composite_function \ - BlitARGBto565PixelAlphaARMNEONAsm, 32, 0, 16, \ - FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ - 8, /* number of pixels, processed in a single block */ \ - 6, /* prefetch distance */ \ - default_init, \ - default_cleanup, \ - ARGBto565PixelAlpha_process_pixblock_head, \ - ARGBto565PixelAlpha_process_pixblock_tail, \ - ARGBto565PixelAlpha_process_pixblock_tail_head diff --git a/src/video/arm/pixman-arm-neon-asm.h b/src/video/arm/pixman-arm-neon-asm.h deleted file mode 100644 index bdcf6a9d4..000000000 --- a/src/video/arm/pixman-arm-neon-asm.h +++ /dev/null @@ -1,1184 +0,0 @@ -/* - * Copyright © 2009 Nokia Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - * - * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com) - */ - -/* - * This file contains a macro ('generate_composite_function') which can - * construct 2D image processing functions, based on a common template. - * Any combinations of source, destination and mask images with 8bpp, - * 16bpp, 24bpp, 32bpp color formats are supported. - * - * This macro takes care of: - * - handling of leading and trailing unaligned pixels - * - doing most of the work related to L2 cache preload - * - encourages the use of software pipelining for better instructions - * scheduling - * - * The user of this macro has to provide some configuration parameters - * (bit depths for the images, prefetch distance, etc.) and a set of - * macros, which should implement basic code chunks responsible for - * pixels processing. See 'pixman-arm-neon-asm.S' file for the usage - * examples. - * - * TODO: - * - try overlapped pixel method (from Ian Rickards) when processing - * exactly two blocks of pixels - * - maybe add an option to do reverse scanline processing - */ - -/* - * Bit flags for 'generate_composite_function' macro which are used - * to tune generated functions behavior. - */ -.set FLAG_DST_WRITEONLY, 0 -.set FLAG_DST_READWRITE, 1 -.set FLAG_DEINTERLEAVE_32BPP, 2 - -/* - * Offset in stack where mask and source pointer/stride can be accessed - * from 'init' macro. This is useful for doing special handling for solid mask. - */ -.set ARGS_STACK_OFFSET, 40 - -/* - * Constants for selecting preferable prefetch type. 
- */ -.set PREFETCH_TYPE_NONE, 0 /* No prefetch at all */ -.set PREFETCH_TYPE_SIMPLE, 1 /* A simple, fixed-distance-ahead prefetch */ -.set PREFETCH_TYPE_ADVANCED, 2 /* Advanced fine-grained prefetch */ - -/* - * Definitions of supplementary pixld/pixst macros (for partial load/store of - * pixel data). - */ - -.macro pixldst1 op, elem_size, reg1, mem_operand, abits -.if abits > 0 - op&.&elem_size {d®1}, [&mem_operand&, :&abits&]! -.else - op&.&elem_size {d®1}, [&mem_operand&]! -.endif -.endm - -.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits -.if abits > 0 - op&.&elem_size {d®1, d®2}, [&mem_operand&, :&abits&]! -.else - op&.&elem_size {d®1, d®2}, [&mem_operand&]! -.endif -.endm - -.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits -.if abits > 0 - op&.&elem_size {d®1, d®2, d®3, d®4}, [&mem_operand&, :&abits&]! -.else - op&.&elem_size {d®1, d®2, d®3, d®4}, [&mem_operand&]! -.endif -.endm - -.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits - op&.&elem_size {d®1[idx]}, [&mem_operand&]! -.endm - -.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand - op&.&elem_size {d®1, d®2, d®3}, [&mem_operand&]! -.endm - -.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand - op&.&elem_size {d®1[idx], d®2[idx], d®3[idx]}, [&mem_operand&]! -.endm - -.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits -.if numbytes == 32 - pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \ - %(basereg+6), %(basereg+7), mem_operand, abits -.elseif numbytes == 16 - pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits -.elseif numbytes == 8 - pixldst1 op, elem_size, %(basereg+1), mem_operand, abits -.elseif numbytes == 4 - .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32) - pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits - .elseif elem_size == 16 - pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits - pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits - .else - pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits - pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits - pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits - pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits - .endif -.elseif numbytes == 2 - .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16) - pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits - .else - pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits - pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits - .endif -.elseif numbytes == 1 - pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits -.else - .error "unsupported size: numbytes" -.endif -.endm - -.macro pixld numpix, bpp, basereg, mem_operand, abits=0 -.if bpp > 0 -.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) - pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \ - %(basereg+6), %(basereg+7), mem_operand, abits -.elseif (bpp == 24) && (numpix == 8) - pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand -.elseif (bpp == 24) && (numpix == 4) - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand -.elseif (bpp == 24) && (numpix == 2) - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand - pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand -.elseif (bpp == 24) && (numpix == 1) - pixldst30 vld3, 8, 
%(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand -.else - pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits -.endif -.endif -.endm - -.macro pixst numpix, bpp, basereg, mem_operand, abits=0 -.if bpp > 0 -.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) - pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \ - %(basereg+6), %(basereg+7), mem_operand, abits -.elseif (bpp == 24) && (numpix == 8) - pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand -.elseif (bpp == 24) && (numpix == 4) - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand -.elseif (bpp == 24) && (numpix == 2) - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand -.elseif (bpp == 24) && (numpix == 1) - pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand -.else - pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits -.endif -.endif -.endm - -.macro pixld_a numpix, bpp, basereg, mem_operand -.if (bpp * numpix) <= 128 - pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix) -.else - pixld numpix, bpp, basereg, mem_operand, 128 -.endif -.endm - -.macro pixst_a numpix, bpp, basereg, mem_operand -.if (bpp * numpix) <= 128 - pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix) -.else - pixst numpix, bpp, basereg, mem_operand, 128 -.endif -.endm - -/* - * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register - * aliases to be defined) - */ -.macro pixld1_s elem_size, reg1, mem_operand -.if elem_size == 16 - mov TMP1, VX, asr #16 - adds VX, VX, UNIT_X -5: subpls VX, VX, SRC_WIDTH_FIXED - bpl 5b - add TMP1, mem_operand, TMP1, asl #1 - mov TMP2, VX, asr #16 - adds VX, VX, UNIT_X -5: subpls VX, VX, SRC_WIDTH_FIXED - bpl 5b - add TMP2, mem_operand, TMP2, asl #1 - vld1.16 {d®1&[0]}, [TMP1, :16] - mov TMP1, VX, asr #16 - adds VX, VX, UNIT_X -5: subpls VX, VX, SRC_WIDTH_FIXED - bpl 5b - add TMP1, mem_operand, TMP1, asl #1 - vld1.16 {d®1&[1]}, [TMP2, :16] - mov TMP2, VX, asr #16 - adds VX, VX, UNIT_X -5: subpls VX, VX, SRC_WIDTH_FIXED - bpl 5b - add TMP2, mem_operand, TMP2, asl #1 - vld1.16 {d®1&[2]}, [TMP1, :16] - vld1.16 {d®1&[3]}, [TMP2, :16] -.elseif elem_size == 32 - mov TMP1, VX, asr #16 - adds VX, VX, UNIT_X -5: subpls VX, VX, SRC_WIDTH_FIXED - bpl 5b - add TMP1, mem_operand, TMP1, asl #2 - mov TMP2, VX, asr #16 - adds VX, VX, UNIT_X -5: subpls VX, VX, SRC_WIDTH_FIXED - bpl 5b - add TMP2, mem_operand, TMP2, asl #2 - vld1.32 {d®1&[0]}, [TMP1, :32] - vld1.32 {d®1&[1]}, [TMP2, :32] -.else - .error "unsupported" -.endif -.endm - -.macro pixld2_s elem_size, reg1, reg2, mem_operand -.if 0 /* elem_size == 32 */ - mov TMP1, VX, asr #16 - add VX, VX, UNIT_X, asl #1 - add TMP1, mem_operand, TMP1, asl #2 - mov TMP2, VX, asr #16 - sub VX, VX, UNIT_X - add TMP2, mem_operand, TMP2, asl #2 - vld1.32 {d®1&[0]}, [TMP1, :32] - mov TMP1, VX, asr #16 - add VX, VX, UNIT_X, asl #1 - add TMP1, mem_operand, TMP1, asl #2 - vld1.32 {d®2&[0]}, [TMP2, :32] - mov TMP2, VX, asr #16 - add VX, VX, UNIT_X - add TMP2, mem_operand, TMP2, asl #2 - vld1.32 {d®1&[1]}, [TMP1, :32] - vld1.32 {d®2&[1]}, [TMP2, :32] -.else - pixld1_s elem_size, reg1, mem_operand - pixld1_s elem_size, reg2, 
mem_operand -.endif -.endm - -.macro pixld0_s elem_size, reg1, idx, mem_operand -.if elem_size == 16 - mov TMP1, VX, asr #16 - adds VX, VX, UNIT_X -5: subpls VX, VX, SRC_WIDTH_FIXED - bpl 5b - add TMP1, mem_operand, TMP1, asl #1 - vld1.16 {d®1&[idx]}, [TMP1, :16] -.elseif elem_size == 32 - mov TMP1, VX, asr #16 - adds VX, VX, UNIT_X -5: subpls VX, VX, SRC_WIDTH_FIXED - bpl 5b - add TMP1, mem_operand, TMP1, asl #2 - vld1.32 {d®1&[idx]}, [TMP1, :32] -.endif -.endm - -.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand -.if numbytes == 32 - pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand - pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand - pixdeinterleave elem_size, %(basereg+4) -.elseif numbytes == 16 - pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand -.elseif numbytes == 8 - pixld1_s elem_size, %(basereg+1), mem_operand -.elseif numbytes == 4 - .if elem_size == 32 - pixld0_s elem_size, %(basereg+0), 1, mem_operand - .elseif elem_size == 16 - pixld0_s elem_size, %(basereg+0), 2, mem_operand - pixld0_s elem_size, %(basereg+0), 3, mem_operand - .else - pixld0_s elem_size, %(basereg+0), 4, mem_operand - pixld0_s elem_size, %(basereg+0), 5, mem_operand - pixld0_s elem_size, %(basereg+0), 6, mem_operand - pixld0_s elem_size, %(basereg+0), 7, mem_operand - .endif -.elseif numbytes == 2 - .if elem_size == 16 - pixld0_s elem_size, %(basereg+0), 1, mem_operand - .else - pixld0_s elem_size, %(basereg+0), 2, mem_operand - pixld0_s elem_size, %(basereg+0), 3, mem_operand - .endif -.elseif numbytes == 1 - pixld0_s elem_size, %(basereg+0), 1, mem_operand -.else - .error "unsupported size: numbytes" -.endif -.endm - -.macro pixld_s numpix, bpp, basereg, mem_operand -.if bpp > 0 - pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand -.endif -.endm - -.macro vuzp8 reg1, reg2 - vuzp.8 d®1, d®2 -.endm - -.macro vzip8 reg1, reg2 - vzip.8 d®1, d®2 -.endm - -/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */ -.macro pixdeinterleave bpp, basereg -.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) - vuzp8 %(basereg+0), %(basereg+1) - vuzp8 %(basereg+2), %(basereg+3) - vuzp8 %(basereg+1), %(basereg+3) - vuzp8 %(basereg+0), %(basereg+2) -.endif -.endm - -/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */ -.macro pixinterleave bpp, basereg -.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) - vzip8 %(basereg+0), %(basereg+2) - vzip8 %(basereg+1), %(basereg+3) - vzip8 %(basereg+2), %(basereg+3) - vzip8 %(basereg+0), %(basereg+1) -.endif -.endm - -/* - * This is a macro for implementing cache preload. The main idea is that - * cache preload logic is mostly independent from the rest of pixels - * processing code. It starts at the top left pixel and moves forward - * across pixels and can jump across scanlines. Prefetch distance is - * handled in an 'incremental' way: it starts from 0 and advances to the - * optimal distance over time. After reaching optimal prefetch distance, - * it is kept constant. There are some checks which prevent prefetching - * unneeded pixel lines below the image (but it still can prefetch a bit - * more data on the right side of the image - not a big issue and may - * be actually helpful when rendering text glyphs). Additional trick is - * the use of LDR instruction for prefetch instead of PLD when moving to - * the next line, the point is that we have a high chance of getting TLB - * miss in this case, and PLD would be useless. 
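The "simple" prefetch scheme that this advanced logic replaces is easy to picture in C. A rough sketch using the GCC/Clang __builtin_prefetch intrinsic (the fixed distance and function name are illustrative, not SDL code):

```c
#include <stddef.h>
#include <stdint.h>

#define PREFETCH_DISTANCE 64 /* pixels ahead; analogous to PREFETCH_DISTANCE_SIMPLE */

/* Copy one 32bpp scanline, issuing a fixed-distance read prefetch (the
   PREFETCH_TYPE_SIMPLE idea).  The "advanced" scheme described above instead
   tracks its position in PF_X/PF_CTL and can step into the next scanline. */
static void copy_row_with_prefetch(uint32_t *dst, const uint32_t *src, size_t w)
{
    for (size_t x = 0; x < w; ++x) {
#if defined(__GNUC__) || defined(__clang__)
        __builtin_prefetch(src + x + PREFETCH_DISTANCE); /* read hint, like PLD */
#endif
        dst[x] = src[x];
    }
}
```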
- * - * This sounds like it may introduce a noticeable overhead (when working with - * fully cached data). But in reality, due to having a separate pipeline and - * instruction queue for NEON unit in ARM Cortex-A8, normal ARM code can - * execute simultaneously with NEON and be completely shadowed by it. Thus - * we get no performance overhead at all (*). This looks like a very nice - * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher, - * but still can implement some rather advanced prefetch logic in software - * for almost zero cost! - * - * (*) The overhead of the prefetcher is visible when running some trivial - * pixels processing like simple copy. Anyway, having prefetch is a must - * when working with the graphics data. - */ -.macro PF a, x:vararg -.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED) - a x -.endif -.endm - -.macro cache_preload std_increment, boost_increment -.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0) -.if regs_shortage - PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */ -.endif -.if std_increment != 0 - PF add PF_X, PF_X, #std_increment -.endif - PF tst PF_CTL, #0xF - PF addne PF_X, PF_X, #boost_increment - PF subne PF_CTL, PF_CTL, #1 - PF cmp PF_X, ORIG_W -.if src_bpp_shift >= 0 - PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] -.endif -.if dst_r_bpp != 0 - PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] -.endif -.if mask_bpp_shift >= 0 - PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift] -.endif - PF subge PF_X, PF_X, ORIG_W - PF subges PF_CTL, PF_CTL, #0x10 -.if src_bpp_shift >= 0 - PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! -.endif -.if dst_r_bpp != 0 - PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! -.endif -.if mask_bpp_shift >= 0 - PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]! -.endif -.endif -.endm - -.macro cache_preload_simple -.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE) -.if src_bpp > 0 - pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)] -.endif -.if dst_r_bpp > 0 - pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)] -.endif -.if mask_bpp > 0 - pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)] -.endif -.endif -.endm - -.macro fetch_mask_pixblock - pixld pixblock_size, mask_bpp, \ - (mask_basereg - pixblock_size * mask_bpp / 64), MASK -.endm - -/* - * Macro which is used to process leading pixels until destination - * pointer is properly aligned (at 16 bytes boundary). When destination - * buffer uses 16bpp format, this is unnecessary, or even pointless. 
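In scalar terms, ensure_destination_ptr_alignment peels off single pixels until the destination address reaches a 16-byte boundary so the bulk loop can use aligned stores. A hedged C sketch for the 32bpp copy case (helper name invented for illustration):

```c
#include <stdint.h>

/* Peel off leading pixels until dst is 16-byte aligned; returns how many
   pixels remain for the aligned SIMD loop.  Purely illustrative. */
static int32_t align_destination_32(uint32_t **pdst, const uint32_t **psrc, int32_t w)
{
    while (w > 0 && ((uintptr_t)*pdst & 15u) != 0) {
        **pdst = **psrc;   /* one-pixel "head/tail" step */
        (*pdst)++;
        (*psrc)++;
        --w;
    }
    return w;
}
```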
- */ -.macro ensure_destination_ptr_alignment process_pixblock_head, \ - process_pixblock_tail, \ - process_pixblock_tail_head -.if dst_w_bpp != 24 - tst DST_R, #0xF - beq 2f - -.irp lowbit, 1, 2, 4, 8, 16 -local skip1 -.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) -.if lowbit < 16 /* we don't need more than 16-byte alignment */ - tst DST_R, #lowbit - beq 1f -.endif - pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC - pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK -.if dst_r_bpp > 0 - pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R -.else - add DST_R, DST_R, #lowbit -.endif - PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp) - sub W, W, #(lowbit * 8 / dst_w_bpp) -1: -.endif -.endr - pixdeinterleave src_bpp, src_basereg - pixdeinterleave mask_bpp, mask_basereg - pixdeinterleave dst_r_bpp, dst_r_basereg - - process_pixblock_head - cache_preload 0, pixblock_size - cache_preload_simple - process_pixblock_tail - - pixinterleave dst_w_bpp, dst_w_basereg -.irp lowbit, 1, 2, 4, 8, 16 -.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) -.if lowbit < 16 /* we don't need more than 16-byte alignment */ - tst DST_W, #lowbit - beq 1f -.endif - pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W -1: -.endif -.endr -.endif -2: -.endm - -/* - * Special code for processing up to (pixblock_size - 1) remaining - * trailing pixels. As SIMD processing performs operation on - * pixblock_size pixels, anything smaller than this has to be loaded - * and stored in a special way. Loading and storing of pixel data is - * performed in such a way that we fill some 'slots' in the NEON - * registers (some slots naturally are unused), then perform compositing - * operation as usual. In the end, the data is taken from these 'slots' - * and saved to memory. - * - * cache_preload_flag - allows to suppress prefetch if - * set to 0 - * dst_aligned_flag - selects whether destination buffer - * is aligned - */ -.macro process_trailing_pixels cache_preload_flag, \ - dst_aligned_flag, \ - process_pixblock_head, \ - process_pixblock_tail, \ - process_pixblock_tail_head - tst W, #(pixblock_size - 1) - beq 2f -.irp chunk_size, 16, 8, 4, 2, 1 -.if pixblock_size > chunk_size - tst W, #chunk_size - beq 1f - pixld_src chunk_size, src_bpp, src_basereg, SRC - pixld chunk_size, mask_bpp, mask_basereg, MASK -.if dst_aligned_flag != 0 - pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R -.else - pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R -.endif -.if cache_preload_flag != 0 - PF add PF_X, PF_X, #chunk_size -.endif -1: -.endif -.endr - pixdeinterleave src_bpp, src_basereg - pixdeinterleave mask_bpp, mask_basereg - pixdeinterleave dst_r_bpp, dst_r_basereg - - process_pixblock_head -.if cache_preload_flag != 0 - cache_preload 0, pixblock_size - cache_preload_simple -.endif - process_pixblock_tail - pixinterleave dst_w_bpp, dst_w_basereg -.irp chunk_size, 16, 8, 4, 2, 1 -.if pixblock_size > chunk_size - tst W, #chunk_size - beq 1f -.if dst_aligned_flag != 0 - pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W -.else - pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W -.endif -1: -.endif -.endr -2: -.endm - -/* - * Macro, which performs all the needed operations to switch to the next - * scanline and start the next loop iteration unless all the scanlines - * are already processed. 
- */ -.macro advance_to_next_scanline start_of_loop_label -.if regs_shortage - ldrd W, [sp] /* load W and H (width and height) from stack */ -.else - mov W, ORIG_W -.endif - add DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift -.if src_bpp != 0 - add SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift -.endif -.if mask_bpp != 0 - add MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift -.endif -.if (dst_w_bpp != 24) - sub DST_W, DST_W, W, lsl #dst_bpp_shift -.endif -.if (src_bpp != 24) && (src_bpp != 0) - sub SRC, SRC, W, lsl #src_bpp_shift -.endif -.if (mask_bpp != 24) && (mask_bpp != 0) - sub MASK, MASK, W, lsl #mask_bpp_shift -.endif - subs H, H, #1 - mov DST_R, DST_W -.if regs_shortage - str H, [sp, #4] /* save updated height to stack */ -.endif - bge start_of_loop_label -.endm - -/* - * Registers are allocated in the following way by default: - * d0, d1, d2, d3 - reserved for loading source pixel data - * d4, d5, d6, d7 - reserved for loading destination pixel data - * d24, d25, d26, d27 - reserved for loading mask pixel data - * d28, d29, d30, d31 - final destination pixel data for writeback to memory - */ -.macro generate_composite_function fname, \ - src_bpp_, \ - mask_bpp_, \ - dst_w_bpp_, \ - flags, \ - pixblock_size_, \ - prefetch_distance, \ - init, \ - cleanup, \ - process_pixblock_head, \ - process_pixblock_tail, \ - process_pixblock_tail_head, \ - dst_w_basereg_ = 28, \ - dst_r_basereg_ = 4, \ - src_basereg_ = 0, \ - mask_basereg_ = 24 - - pixman_asm_function fname - - push {r4-r12, lr} /* save all registers */ - -/* - * Select prefetch type for this function. If prefetch distance is - * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch - * has to be used instead of ADVANCED. - */ - .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT -.if prefetch_distance == 0 - .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE -.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \ - ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24)) - .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE -.endif - -/* - * Make some macro arguments globally visible and accessible - * from other macros - */ - .set src_bpp, src_bpp_ - .set mask_bpp, mask_bpp_ - .set dst_w_bpp, dst_w_bpp_ - .set pixblock_size, pixblock_size_ - .set dst_w_basereg, dst_w_basereg_ - .set dst_r_basereg, dst_r_basereg_ - .set src_basereg, src_basereg_ - .set mask_basereg, mask_basereg_ - - .macro pixld_src x:vararg - pixld x - .endm - .macro fetch_src_pixblock - pixld_src pixblock_size, src_bpp, \ - (src_basereg - pixblock_size * src_bpp / 64), SRC - .endm -/* - * Assign symbolic names to registers - */ - W .req r0 /* width (is updated during processing) */ - H .req r1 /* height (is updated during processing) */ - DST_W .req r2 /* destination buffer pointer for writes */ - DST_STRIDE .req r3 /* destination image stride */ - SRC .req r4 /* source buffer pointer */ - SRC_STRIDE .req r5 /* source image stride */ - DST_R .req r6 /* destination buffer pointer for reads */ - - MASK .req r7 /* mask pointer */ - MASK_STRIDE .req r8 /* mask stride */ - - PF_CTL .req r9 /* combined lines counter and prefetch */ - /* distance increment counter */ - PF_X .req r10 /* pixel index in a scanline for current */ - /* pretetch position */ - PF_SRC .req r11 /* pointer to source scanline start */ - /* for prefetch purposes */ - PF_DST .req r12 /* pointer to destination scanline start */ - /* for prefetch purposes */ - PF_MASK .req r14 /* pointer to mask scanline start */ - /* for prefetch purposes */ -/* - * Check whether we have enough registers for all 
the local variables. - * If we don't have enough registers, original width and height are - * kept on top of stack (and 'regs_shortage' variable is set to indicate - * this for the rest of code). Even if there are enough registers, the - * allocation scheme may be a bit different depending on whether source - * or mask is not used. - */ -.if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED) - ORIG_W .req r10 /* saved original width */ - DUMMY .req r12 /* temporary register */ - .set regs_shortage, 0 -.elseif mask_bpp == 0 - ORIG_W .req r7 /* saved original width */ - DUMMY .req r8 /* temporary register */ - .set regs_shortage, 0 -.elseif src_bpp == 0 - ORIG_W .req r4 /* saved original width */ - DUMMY .req r5 /* temporary register */ - .set regs_shortage, 0 -.else - ORIG_W .req r1 /* saved original width */ - DUMMY .req r1 /* temporary register */ - .set regs_shortage, 1 -.endif - - .set mask_bpp_shift, -1 -.if src_bpp == 32 - .set src_bpp_shift, 2 -.elseif src_bpp == 24 - .set src_bpp_shift, 0 -.elseif src_bpp == 16 - .set src_bpp_shift, 1 -.elseif src_bpp == 8 - .set src_bpp_shift, 0 -.elseif src_bpp == 0 - .set src_bpp_shift, -1 -.else - .error "requested src bpp (src_bpp) is not supported" -.endif -.if mask_bpp == 32 - .set mask_bpp_shift, 2 -.elseif mask_bpp == 24 - .set mask_bpp_shift, 0 -.elseif mask_bpp == 8 - .set mask_bpp_shift, 0 -.elseif mask_bpp == 0 - .set mask_bpp_shift, -1 -.else - .error "requested mask bpp (mask_bpp) is not supported" -.endif -.if dst_w_bpp == 32 - .set dst_bpp_shift, 2 -.elseif dst_w_bpp == 24 - .set dst_bpp_shift, 0 -.elseif dst_w_bpp == 16 - .set dst_bpp_shift, 1 -.elseif dst_w_bpp == 8 - .set dst_bpp_shift, 0 -.else - .error "requested dst bpp (dst_w_bpp) is not supported" -.endif - -.if (((flags) & FLAG_DST_READWRITE) != 0) - .set dst_r_bpp, dst_w_bpp -.else - .set dst_r_bpp, 0 -.endif -.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0) - .set DEINTERLEAVE_32BPP_ENABLED, 1 -.else - .set DEINTERLEAVE_32BPP_ENABLED, 0 -.endif - -.if prefetch_distance < 0 || prefetch_distance > 15 - .error "invalid prefetch distance (prefetch_distance)" -.endif - -.if src_bpp > 0 - ldr SRC, [sp, #40] -.endif -.if mask_bpp > 0 - ldr MASK, [sp, #48] -.endif - PF mov PF_X, #0 -.if src_bpp > 0 - ldr SRC_STRIDE, [sp, #44] -.endif -.if mask_bpp > 0 - ldr MASK_STRIDE, [sp, #52] -.endif - mov DST_R, DST_W - -.if src_bpp == 24 - sub SRC_STRIDE, SRC_STRIDE, W - sub SRC_STRIDE, SRC_STRIDE, W, lsl #1 -.endif -.if mask_bpp == 24 - sub MASK_STRIDE, MASK_STRIDE, W - sub MASK_STRIDE, MASK_STRIDE, W, lsl #1 -.endif -.if dst_w_bpp == 24 - sub DST_STRIDE, DST_STRIDE, W - sub DST_STRIDE, DST_STRIDE, W, lsl #1 -.endif - -/* - * Setup advanced prefetcher initial state - */ - PF mov PF_SRC, SRC - PF mov PF_DST, DST_R - PF mov PF_MASK, MASK - /* PF_CTL = prefetch_distance | ((h - 1) << 4) */ - PF mov PF_CTL, H, lsl #4 - PF add PF_CTL, #(prefetch_distance - 0x10) - - init -.if regs_shortage - push {r0, r1} -.endif - subs H, H, #1 -.if regs_shortage - str H, [sp, #4] /* save updated height to stack */ -.else - mov ORIG_W, W -.endif - blt 9f - cmp W, #(pixblock_size * 2) - blt 8f -/* - * This is the start of the pipelined loop, which if optimized for - * long scanlines - */ -0: - ensure_destination_ptr_alignment process_pixblock_head, \ - process_pixblock_tail, \ - process_pixblock_tail_head - - /* Implement "head (tail_head) ... 
(tail_head) tail" loop pattern */ - pixld_a pixblock_size, dst_r_bpp, \ - (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R - fetch_src_pixblock - pixld pixblock_size, mask_bpp, \ - (mask_basereg - pixblock_size * mask_bpp / 64), MASK - PF add PF_X, PF_X, #pixblock_size - process_pixblock_head - cache_preload 0, pixblock_size - cache_preload_simple - subs W, W, #(pixblock_size * 2) - blt 2f -1: - process_pixblock_tail_head - cache_preload_simple - subs W, W, #pixblock_size - bge 1b -2: - process_pixblock_tail - pixst_a pixblock_size, dst_w_bpp, \ - (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W - - /* Process the remaining trailing pixels in the scanline */ - process_trailing_pixels 1, 1, \ - process_pixblock_head, \ - process_pixblock_tail, \ - process_pixblock_tail_head - advance_to_next_scanline 0b - -.if regs_shortage - pop {r0, r1} -.endif - cleanup - pop {r4-r12, pc} /* exit */ -/* - * This is the start of the loop, designed to process images with small width - * (less than pixblock_size * 2 pixels). In this case neither pipelining - * nor prefetch are used. - */ -8: - /* Process exactly pixblock_size pixels if needed */ - tst W, #pixblock_size - beq 1f - pixld pixblock_size, dst_r_bpp, \ - (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R - fetch_src_pixblock - pixld pixblock_size, mask_bpp, \ - (mask_basereg - pixblock_size * mask_bpp / 64), MASK - process_pixblock_head - process_pixblock_tail - pixst pixblock_size, dst_w_bpp, \ - (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W -1: - /* Process the remaining trailing pixels in the scanline */ - process_trailing_pixels 0, 0, \ - process_pixblock_head, \ - process_pixblock_tail, \ - process_pixblock_tail_head - advance_to_next_scanline 8b -9: -.if regs_shortage - pop {r0, r1} -.endif - cleanup - pop {r4-r12, pc} /* exit */ - - .purgem fetch_src_pixblock - .purgem pixld_src - - .unreq SRC - .unreq MASK - .unreq DST_R - .unreq DST_W - .unreq ORIG_W - .unreq W - .unreq H - .unreq SRC_STRIDE - .unreq DST_STRIDE - .unreq MASK_STRIDE - .unreq PF_CTL - .unreq PF_X - .unreq PF_SRC - .unreq PF_DST - .unreq PF_MASK - .unreq DUMMY - .endfunc -.endm - -/* - * A simplified variant of function generation template for a single - * scanline processing (for implementing pixman combine functions) - */ -.macro generate_composite_function_scanline use_nearest_scaling, \ - fname, \ - src_bpp_, \ - mask_bpp_, \ - dst_w_bpp_, \ - flags, \ - pixblock_size_, \ - init, \ - cleanup, \ - process_pixblock_head, \ - process_pixblock_tail, \ - process_pixblock_tail_head, \ - dst_w_basereg_ = 28, \ - dst_r_basereg_ = 4, \ - src_basereg_ = 0, \ - mask_basereg_ = 24 - - pixman_asm_function fname - - .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE -/* - * Make some macro arguments globally visible and accessible - * from other macros - */ - .set src_bpp, src_bpp_ - .set mask_bpp, mask_bpp_ - .set dst_w_bpp, dst_w_bpp_ - .set pixblock_size, pixblock_size_ - .set dst_w_basereg, dst_w_basereg_ - .set dst_r_basereg, dst_r_basereg_ - .set src_basereg, src_basereg_ - .set mask_basereg, mask_basereg_ - -.if use_nearest_scaling != 0 - /* - * Assign symbolic names to registers for nearest scaling - */ - W .req r0 - DST_W .req r1 - SRC .req r2 - VX .req r3 - UNIT_X .req ip - MASK .req lr - TMP1 .req r4 - TMP2 .req r5 - DST_R .req r6 - SRC_WIDTH_FIXED .req r7 - - .macro pixld_src x:vararg - pixld_s x - .endm - - ldr UNIT_X, [sp] - push {r4-r8, lr} - ldr SRC_WIDTH_FIXED, [sp, #(24 + 4)] - .if mask_bpp != 0 - ldr MASK, [sp, #(24 + 8)] - .endif -.else - 
/* - * Assign symbolic names to registers - */ - W .req r0 /* width (is updated during processing) */ - DST_W .req r1 /* destination buffer pointer for writes */ - SRC .req r2 /* source buffer pointer */ - DST_R .req ip /* destination buffer pointer for reads */ - MASK .req r3 /* mask pointer */ - - .macro pixld_src x:vararg - pixld x - .endm -.endif - -.if (((flags) & FLAG_DST_READWRITE) != 0) - .set dst_r_bpp, dst_w_bpp -.else - .set dst_r_bpp, 0 -.endif -.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0) - .set DEINTERLEAVE_32BPP_ENABLED, 1 -.else - .set DEINTERLEAVE_32BPP_ENABLED, 0 -.endif - - .macro fetch_src_pixblock - pixld_src pixblock_size, src_bpp, \ - (src_basereg - pixblock_size * src_bpp / 64), SRC - .endm - - init - mov DST_R, DST_W - - cmp W, #pixblock_size - blt 8f - - ensure_destination_ptr_alignment process_pixblock_head, \ - process_pixblock_tail, \ - process_pixblock_tail_head - - subs W, W, #pixblock_size - blt 7f - - /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */ - pixld_a pixblock_size, dst_r_bpp, \ - (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R - fetch_src_pixblock - pixld pixblock_size, mask_bpp, \ - (mask_basereg - pixblock_size * mask_bpp / 64), MASK - process_pixblock_head - subs W, W, #pixblock_size - blt 2f -1: - process_pixblock_tail_head - subs W, W, #pixblock_size - bge 1b -2: - process_pixblock_tail - pixst_a pixblock_size, dst_w_bpp, \ - (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W -7: - /* Process the remaining trailing pixels in the scanline (dst aligned) */ - process_trailing_pixels 0, 1, \ - process_pixblock_head, \ - process_pixblock_tail, \ - process_pixblock_tail_head - - cleanup -.if use_nearest_scaling != 0 - pop {r4-r8, pc} /* exit */ -.else - bx lr /* exit */ -.endif -8: - /* Process the remaining trailing pixels in the scanline (dst unaligned) */ - process_trailing_pixels 0, 0, \ - process_pixblock_head, \ - process_pixblock_tail, \ - process_pixblock_tail_head - - cleanup - -.if use_nearest_scaling != 0 - pop {r4-r8, pc} /* exit */ - - .unreq DST_R - .unreq SRC - .unreq W - .unreq VX - .unreq UNIT_X - .unreq TMP1 - .unreq TMP2 - .unreq DST_W - .unreq MASK - .unreq SRC_WIDTH_FIXED - -.else - bx lr /* exit */ - - .unreq SRC - .unreq MASK - .unreq DST_R - .unreq DST_W - .unreq W -.endif - - .purgem fetch_src_pixblock - .purgem pixld_src - - .endfunc -.endm - -.macro generate_composite_function_single_scanline x:vararg - generate_composite_function_scanline 0, x -.endm - -.macro generate_composite_function_nearest_scanline x:vararg - generate_composite_function_scanline 1, x -.endm - -/* Default prologue/epilogue, nothing special needs to be done */ - -.macro default_init -.endm - -.macro default_cleanup -.endm - -/* - * Prologue/epilogue variant which additionally saves/restores d8-d15 - * registers (they need to be saved/restored by callee according to ABI). - * This is required if the code needs to use all the NEON registers. - */ - -.macro default_init_need_all_regs - vpush {d8-d15} -.endm - -.macro default_cleanup_need_all_regs - vpop {d8-d15} -.endm - -/******************************************************************************/ - -/* - * Conversion of 8 r5g6b6 pixels packed in 128-bit register (in) - * into a planar a8r8g8b8 format (with a, r, g, b color components - * stored into 64-bit registers out_a, out_r, out_g, out_b respectively). - * - * Warning: the conversion is destructive and the original - * value (in) is lost. 
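Per pixel, the convert_0565_to_8888 macro that follows expands each 5- or 6-bit field to 8 bits by replicating its top bits into the low bits (the job the vsri instructions do, eight pixels at a time). The same conversion in scalar C, as an illustrative sketch:

```c
#include <stdint.h>

/* Expand one RGB565 pixel to ARGB8888, replicating the top bits of each
   field so that 0x1F and 0x3F map to exactly 0xFF. */
static uint32_t rgb565_to_argb8888(uint16_t p)
{
    uint32_t r5 = (p >> 11) & 0x1F;
    uint32_t g6 = (p >> 5) & 0x3F;
    uint32_t b5 = p & 0x1F;

    uint32_t r = (r5 << 3) | (r5 >> 2); /* 5 -> 8 bits */
    uint32_t g = (g6 << 2) | (g6 >> 4); /* 6 -> 8 bits */
    uint32_t b = (b5 << 3) | (b5 >> 2); /* 5 -> 8 bits */

    return 0xFF000000u | (r << 16) | (g << 8) | b;
}
```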
- */ -.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b - vshrn.u16 out_r, in, #8 - vshrn.u16 out_g, in, #3 - vsli.u16 in, in, #5 - vmov.u8 out_a, #255 - vsri.u8 out_r, out_r, #5 - vsri.u8 out_g, out_g, #6 - vshrn.u16 out_b, in, #2 -.endm - -.macro convert_0565_to_x888 in, out_r, out_g, out_b - vshrn.u16 out_r, in, #8 - vshrn.u16 out_g, in, #3 - vsli.u16 in, in, #5 - vsri.u8 out_r, out_r, #5 - vsri.u8 out_g, out_g, #6 - vshrn.u16 out_b, in, #2 -.endm - -/* - * Conversion from planar a8r8g8b8 format (with a, r, g, b color components - * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b6 - * pixels packed in 128-bit register (out). Requires two temporary 128-bit - * registers (tmp1, tmp2) - */ -.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2 - vshll.u8 tmp1, in_g, #8 - vshll.u8 out, in_r, #8 - vshll.u8 tmp2, in_b, #8 - vsri.u16 out, tmp1, #5 - vsri.u16 out, tmp2, #11 -.endm - -/* - * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels - * returned in (out0, out1) registers pair. Requires one temporary - * 64-bit register (tmp). 'out1' and 'in' may overlap, the original - * value from 'in' is lost - */ -.macro convert_four_0565_to_x888_packed in, out0, out1, tmp - vshl.u16 out0, in, #5 /* G top 6 bits */ - vshl.u16 tmp, in, #11 /* B top 5 bits */ - vsri.u16 in, in, #5 /* R is ready in top bits */ - vsri.u16 out0, out0, #6 /* G is ready in top bits */ - vsri.u16 tmp, tmp, #5 /* B is ready in top bits */ - vshr.u16 out1, in, #8 /* R is in place */ - vsri.u16 out0, tmp, #8 /* G & B is in place */ - vzip.u16 out0, out1 /* everything is in place */ -.endm diff --git a/src/video/arm/pixman-arm-simd-asm.S b/src/video/arm/pixman-arm-simd-asm.S deleted file mode 100644 index 57449c5f3..000000000 --- a/src/video/arm/pixman-arm-simd-asm.S +++ /dev/null @@ -1,532 +0,0 @@ -/* - * Copyright (c) 2016 RISC OS Open Ltd - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - */ - -/* Prevent the stack from becoming executable */ -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif - - .text - .arch armv6 - .object_arch armv4 - .arm - .altmacro - .p2align 2 - -#include "pixman-arm-asm.h" -#include "pixman-arm-simd-asm.h" - -/* A head macro should do all processing which results in an output of up to - * 16 bytes, as far as the final load instruction. The corresponding tail macro - * should complete the processing of the up-to-16 bytes. The calling macro will - * sometimes choose to insert a preload or a decrement of X between them. 
- * cond ARM condition code for code block - * numbytes Number of output bytes that should be generated this time - * firstreg First WK register in which to place output - * unaligned_src Whether to use non-wordaligned loads of source image - * unaligned_mask Whether to use non-wordaligned loads of mask image - * preload If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output - */ - -/******************************************************************************/ - -.macro FillRect32_init - ldr SRC, [sp, #ARGS_STACK_OFFSET] - mov STRIDE_S, SRC - mov MASK, SRC - mov STRIDE_M, SRC -.endm - -.macro FillRect16_init - ldrh SRC, [sp, #ARGS_STACK_OFFSET] - orr SRC, SRC, lsl #16 - mov STRIDE_S, SRC - mov MASK, SRC - mov STRIDE_M, SRC -.endm - -.macro FillRect8_init - ldrb SRC, [sp, #ARGS_STACK_OFFSET] - orr SRC, SRC, lsl #8 - orr SRC, SRC, lsl #16 - mov STRIDE_S, SRC - mov MASK, SRC - mov STRIDE_M, SRC -.endm - -.macro FillRect_process_tail cond, numbytes, firstreg - WK4 .req SRC - WK5 .req STRIDE_S - WK6 .req MASK - WK7 .req STRIDE_M - pixst cond, numbytes, 4, DST - .unreq WK4 - .unreq WK5 - .unreq WK6 - .unreq WK7 -.endm - -generate_composite_function \ - FillSurfaceRect32ARMSIMDAsm, 0, 0, 32, \ - FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \ - 0, /* prefetch distance doesn't apply */ \ - FillRect32_init \ - nop_macro, /* newline */ \ - nop_macro /* cleanup */ \ - nop_macro /* process head */ \ - FillRect_process_tail - -generate_composite_function \ - FillSurfaceRect16ARMSIMDAsm, 0, 0, 16, \ - FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \ - 0, /* prefetch distance doesn't apply */ \ - FillRect16_init \ - nop_macro, /* newline */ \ - nop_macro /* cleanup */ \ - nop_macro /* process head */ \ - FillRect_process_tail - -generate_composite_function \ - FillSurfaceRect8ARMSIMDAsm, 0, 0, 8, \ - FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \ - 0, /* prefetch distance doesn't apply */ \ - FillRect8_init \ - nop_macro, /* newline */ \ - nop_macro /* cleanup */ \ - nop_macro /* process head */ \ - FillRect_process_tail - -/******************************************************************************/ - -/* This differs from the over_8888_8888 routine in Pixman in that the destination - * alpha component is always left unchanged, and RGB components are not - * premultiplied by alpha. It differs from BlitRGBtoRGBPixelAlpha in that - * renormalisation is done by multiplying by 257/256 (with rounding) rather than - * simply shifting right by 8 bits - removing the need to special-case alpha=0xff. 
- */ - -.macro RGBtoRGBPixelAlpha_init - line_saved_regs STRIDE_S, ORIG_W - mov MASK, #0x80 -.endm - -.macro RGBtoRGBPixelAlpha_1pixel_translucent s, d, tmp0, tmp1, tmp2, tmp3, half - uxtb tmp3, s - uxtb tmp0, d - sub tmp0, tmp3, tmp0 - uxtb tmp3, s, ror #16 - uxtb tmp1, d, ror #16 - sub tmp1, tmp3, tmp1 - uxtb tmp3, s, ror #8 - mov s, s, lsr #24 - uxtb tmp2, d, ror #8 - sub tmp2, tmp3, tmp2 - smlabb tmp0, tmp0, s, half - smlabb tmp1, tmp1, s, half - smlabb tmp2, tmp2, s, half - add tmp0, tmp0, asr #8 - add tmp1, tmp1, asr #8 - add tmp2, tmp2, asr #8 - pkhbt tmp0, tmp0, tmp1, lsl #16 - and tmp2, tmp2, #0xff00 - uxtb16 tmp0, tmp0, ror #8 - orr tmp0, tmp0, tmp2 - uadd8 d, d, tmp0 -.endm - -.macro RGBtoRGBPixelAlpha_1pixel_opaque s, d - and d, d, #0xff000000 - bic s, s, #0xff000000 - orr d, d, s -.endm - -.macro RGBtoRGBPixelAlpha_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload - .if numbytes == 16 - ldm SRC!, {WK0, WK1} - ldm SRC!, {STRIDE_S, STRIDE_M} - ldrd WK2, WK3, [DST], #16 - orr SCRATCH, WK0, WK1 - and ORIG_W, WK0, WK1 - orr SCRATCH, SCRATCH, STRIDE_S - and ORIG_W, ORIG_W, STRIDE_S - orr SCRATCH, SCRATCH, STRIDE_M - and ORIG_W, ORIG_W, STRIDE_M - tst SCRATCH, #0xff000000 - .elseif numbytes == 8 - ldm SRC!, {WK0, WK1} - ldm DST!, {WK2, WK3} - orr SCRATCH, WK0, WK1 - and ORIG_W, WK0, WK1 - tst SCRATCH, #0xff000000 - .else // numbytes == 4 - ldr WK0, [SRC], #4 - ldr WK2, [DST], #4 - tst WK0, #0xff000000 - .endif -.endm - -.macro RGBtoRGBPixelAlpha_process_tail cond, numbytes, firstreg - beq 20f @ all transparent - .if numbytes == 16 - cmp ORIG_W, #0xff000000 - bhs 10f @ all opaque - RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK - RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK - strd WK2, WK3, [DST, #-16] - ldrd WK0, WK1, [SRC, #-8] - ldrd WK2, WK3, [DST, #-8] - RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK - RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK - b 19f -10: RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2 - RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3 - strd WK2, WK3, [DST, #-16] - ldrd WK0, WK1, [SRC, #-8] - ldrd WK2, WK3, [DST, #-8] - RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2 - RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3 -19: strd WK2, WK3, [DST, #-8] - .elseif numbytes == 8 - cmp ORIG_W, #0xff000000 - bhs 10f @ all opaque - RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK - RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK - b 19f -10: RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2 - RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3 -19: strd WK2, WK3, [DST, #-8] - .else // numbytes == 4 - cmp WK0, #0xff000000 - bhs 10f @ opaque - RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK - b 19f -10: RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2 -19: str WK2, [DST, #-4] - .endif -20: -.endm - -generate_composite_function \ - BlitRGBtoRGBPixelAlphaARMSIMDAsm, 32, 0, 32, \ - FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \ - 2, /* prefetch distance */ \ - RGBtoRGBPixelAlpha_init, \ - nop_macro, /* newline */ \ - nop_macro, /* cleanup */ \ - RGBtoRGBPixelAlpha_process_head, \ - RGBtoRGBPixelAlpha_process_tail - -/******************************************************************************/ - 
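
For reference, the per-pixel arithmetic performed by the deleted BlitRGBtoRGBPixelAlphaARMSIMDAsm routine above can be written as a scalar C sketch. This is illustrative only and is not part of the patch; the helper name blend_argb8888_over_xrgb8888 is made up here, and the code assumes arithmetic right shift of negative signed values (as on ARM).

    #include <stdint.h>

    /* Scalar sketch of the deleted blitter's per-pixel blend: destination
     * alpha is left untouched, RGB is not premultiplied, and renormalisation
     * is done with rounding (multiply by 257/256) instead of a plain >> 8. */
    static uint32_t blend_argb8888_over_xrgb8888(uint32_t src, uint32_t dst)
    {
        uint32_t alpha = src >> 24;

        if (alpha == 0) {            /* fully transparent: keep destination */
            return dst;
        }
        if (alpha == 0xff) {         /* fully opaque: copy RGB, keep dst alpha */
            return (dst & 0xff000000u) | (src & 0x00ffffffu);
        }

        uint32_t out = dst & 0xff000000u;    /* destination alpha preserved */
        for (int shift = 0; shift < 24; shift += 8) {
            int32_t s = (int32_t)((src >> shift) & 0xff);
            int32_t d = (int32_t)((dst >> shift) & 0xff);
            /* t = (s - d)*alpha + 0x80;  d += (t + (t >> 8)) >> 8
             * (arithmetic right shift of negatives assumed, as on ARM) */
            int32_t t = (s - d) * (int32_t)alpha + 0x80;
            d += (t + (t >> 8)) >> 8;
            out |= (uint32_t)d << shift;
        }
        return out;
    }

Because the rounding renormalisation maps alpha == 0xff to an exact copy of the source channel, the all-opaque branch in the assembly is a fast path rather than a correctness requirement, as the comment above notes.
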
-.macro ARGBto565PixelAlpha_init - line_saved_regs STRIDE_D, STRIDE_S, ORIG_W - mov MASK, #0x001f - mov STRIDE_M, #0x0010 - orr MASK, MASK, MASK, lsl #16 - orr STRIDE_M, STRIDE_M, STRIDE_M, lsl #16 -.endm - -.macro ARGBto565PixelAlpha_newline - mov STRIDE_S, #0x0200 -.endm - -/* On entry: - * s1 holds 1 32bpp source pixel - * d holds 1 16bpp destination pixel - * rbmask, rbhalf, ghalf hold 0x001f001f, 0x00100010, 0x00000200 respectively - * other registers are temporaries - * On exit: - * Constant registers preserved - */ - -.macro ARGBto565PixelAlpha_1pixel_translucent s, d, rbmask, rbhalf, ghalf, alpha, rb, g, misc - mov alpha, s, lsr #27 - and misc, s, #0xfc00 - and g, d, #0x07e0 - pkhbt rb, d, d, lsl #5 - rsb misc, g, misc, lsr #5 - and s, rbmask, s, lsr #3 - and rb, rbmask, rb - sub s, s, rb - smlabb misc, misc, alpha, ghalf - mla s, s, alpha, rbhalf - add misc, misc, misc, lsl #5 - add g, g, misc, asr #10 - add s, s, s, lsl #5 - and g, g, #0x07e0 - add rb, rb, s, asr #10 - and rb, rb, rbmask - pkhbt rb, rb, rb, lsl #11 - orr d, rb, g - orr d, d, rb, lsr #16 -.endm - -/* On entry: - * s1 holds 1 32bpp source pixel - * d holds 1 16bpp destination pixel - * rbmask holds 0x001f001f - * On exit: - * Constant registers preserved - */ - -.macro ARGBto565PixelAlpha_1pixel_opaque s, d, rbmask - and d, rbmask, s, lsr #3 - and s, s, #0xfc00 - orr d, d, d, lsr #5 - orr d, d, s, lsr #5 -.endm - -/* On entry: - * s1, s2 hold 2 32bpp source pixels - * d holds 2 16bpp destination pixels - * rbmask, rbhalf, ghalf hold 0x001f001f, 0x00100010, 0x00000200 respectively - * other registers are temporaries - * On exit: - * Constant registers preserved - * Blended results have been written through destination pointer - */ - -.macro ARGBto565PixelAlpha_2pixels_translucent s1, s2, d, rbmask, rbhalf, ghalf, alpha, rb, g, misc - mov alpha, s1, lsr #27 - and misc, s1, #0xfc00 - and g, d, #0x07e0 - pkhbt rb, d, d, lsl #5 - rsb misc, g, misc, lsr #5 - and s1, rbmask, s1, lsr #3 - and rb, rbmask, rb - sub s1, s1, rb - smlabb misc, misc, alpha, ghalf - mla s1, s1, alpha, rbhalf - uxth d, d, ror #16 - add misc, misc, misc, lsl #5 - mov alpha, s2, lsr #27 - add g, g, misc, asr #10 - add s1, s1, s1, lsl #5 - and g, g, #0x07e0 - add rb, rb, s1, asr #10 - and rb, rb, rbmask - and misc, s2, #0xfc00 - pkhbt rb, rb, rb, lsl #11 - and s1, d, #0x07e0 - pkhbt d, d, d, lsl #5 - rsb misc, s1, misc, lsr #5 - and s2, rbmask, s2, lsr #3 - and d, rbmask, d - sub s2, s2, d - smlabb misc, misc, alpha, ghalf - mla s2, s2, alpha, rbhalf - orr alpha, rb, g - add misc, misc, misc, lsl #5 - orr alpha, alpha, rb, lsr #16 - add s1, s1, misc, asr #10 - add s2, s2, s2, lsl #5 - and s1, s1, #0x07e0 - add d, d, s2, asr #10 - and d, d, rbmask - strh alpha, [DST, #-4] - pkhbt d, d, d, lsl #11 - orr alpha, d, s1 - orr alpha, alpha, d, lsr #16 - strh alpha, [DST, #-2] -.endm - -/* On entry: - * s1, s2 hold 2 32bpp source pixels - * rbmask holds 0x001f001f - * other registers are temporaries - * On exit: - * Constant registers preserved - * Blended results have been written through destination pointer - */ - -.macro ARGBto565PixelAlpha_2pixels_opaque s1, s2, d, rbmask, g - and g, s1, #0xfc00 - and d, rbmask, s1, lsr #3 - and s1, rbmask, s2, lsr #3 - orr d, d, d, lsr #5 - orr d, d, g, lsr #5 - and g, s2, #0xfc00 - strh d, [DST, #-4] - orr s1, s1, s1, lsr #5 - orr s1, s1, g, lsr #5 - strh s1, [DST, #-2] -.endm - -.macro ARGBto565PixelAlpha_2pixels_head - ldrd WK0, WK1, [SRC], #8 - ldr WK2, [DST], #4 - orr SCRATCH, WK0, WK1 - and ORIG_W, WK0, WK1 - tst 
SCRATCH, #0xff000000 -.endm - -.macro ARGBto565PixelAlpha_2pixels_tail - beq 20f @ all transparent - cmp ORIG_W, #0xff000000 - bhs 10f @ all opaque - ARGBto565PixelAlpha_2pixels_translucent WK0, WK1, WK2, MASK, STRIDE_M, STRIDE_S, STRIDE_D, WK3, SCRATCH, ORIG_W - b 20f -10: ARGBto565PixelAlpha_2pixels_opaque WK0, WK1, WK2, MASK, SCRATCH -20: -.endm - -.macro ARGBto565PixelAlpha_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload - .if numbytes == 16 - ARGBto565PixelAlpha_2pixels_head - ARGBto565PixelAlpha_2pixels_tail - ARGBto565PixelAlpha_2pixels_head - ARGBto565PixelAlpha_2pixels_tail - .endif - .if numbytes >= 8 - ARGBto565PixelAlpha_2pixels_head - ARGBto565PixelAlpha_2pixels_tail - .endif - .if numbytes >= 4 - ARGBto565PixelAlpha_2pixels_head - .else // numbytes == 2 - ldr WK0, [SRC], #4 - ldrh WK2, [DST], #2 - tst WK0, #0xff000000 - .endif -.endm - -.macro ARGBto565PixelAlpha_process_tail cond, numbytes, firstreg - .if numbytes >= 4 - ARGBto565PixelAlpha_2pixels_tail - .else // numbytes == 2 - beq 20f @ all transparent - cmp WK0, #0xff000000 - bhs 10f @ opaque - ARGBto565PixelAlpha_1pixel_translucent WK0, WK2, MASK, STRIDE_M, STRIDE_S, STRIDE_D, WK3, SCRATCH, ORIG_W - b 19f -10: ARGBto565PixelAlpha_1pixel_opaque WK0, WK2, MASK -19: strh WK2, [DST, #-2] -20: - .endif -.endm - -generate_composite_function \ - BlitARGBto565PixelAlphaARMSIMDAsm, 32, 0, 16, \ - FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \ - 2, /* prefetch distance */ \ - ARGBto565PixelAlpha_init, \ - ARGBto565PixelAlpha_newline, \ - nop_macro, /* cleanup */ \ - ARGBto565PixelAlpha_process_head, \ - ARGBto565PixelAlpha_process_tail - - /******************************************************************************/ - -.macro BGR888toRGB888_1pixel cond, reg, tmp - uxtb16&cond tmp, WK®, ror #8 - uxtb16&cond WK®, WK®, ror #16 - orr&cond WK®, WK®, tmp, lsl #8 -.endm - -.macro BGR888toRGB888_2pixels cond, reg1, reg2, tmp1, tmp2 - uxtb16&cond tmp1, WK®1, ror #8 - uxtb16&cond WK®1, WK®1, ror #16 - uxtb16&cond tmp2, WK®2, ror #8 - uxtb16&cond WK®2, WK®2, ror #16 - orr&cond WK®1, WK®1, tmp1, lsl #8 - orr&cond WK®2, WK®2, tmp2, lsl #8 -.endm - -.macro BGR888toRGB888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload - pixld cond, numbytes, firstreg, SRC, unaligned_src -.endm - -.macro BGR888toRGB888_process_tail cond, numbytes, firstreg - .if numbytes >= 8 - BGR888toRGB888_2pixels cond, %(firstreg+0), %(firstreg+1), MASK, STRIDE_M - .if numbytes == 16 - BGR888toRGB888_2pixels cond, %(firstreg+2), %(firstreg+3), MASK, STRIDE_M - .endif - .else @ numbytes == 4 - BGR888toRGB888_1pixel cond, %(firstreg+0), MASK - .endif -.endm - -generate_composite_function \ - Blit_XBGR8888_XRGB8888ARMSIMDAsm, 32, 0, 32, \ - FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \ - 2, /* prefetch distance */ \ - nop_macro, /* init */ \ - nop_macro, /* newline */ \ - nop_macro, /* cleanup */ \ - BGR888toRGB888_process_head, \ - BGR888toRGB888_process_tail - -/******************************************************************************/ - -.macro RGB444toRGB888_init - ldr MASK, =0x0f0f0f0f - /* Set GE[3:0] to 0101 so SEL instructions do what we want */ - msr CPSR_s, #0x50000 -.endm - -.macro RGB444toRGB888_1pixel reg, mask, tmp - pkhbt WK®, WK®, WK®, lsl #12 @ 0000aaaarrrrggggaaaarrrrggggbbbb - and WK®, mask, WK® @ 0000aaaa0000gggg0000rrrr0000bbbb - orr WK®, WK®, WK®, lsl #4 @ 
aaaaaaaaggggggggrrrrrrrrbbbbbbbb - pkhtb tmp, WK®, WK®, asr #8 @ aaaaaaaaggggggggggggggggrrrrrrrr - pkhbt WK®, WK®, WK®, lsl #8 @ ggggggggrrrrrrrrrrrrrrrrbbbbbbbb - sel WK®, WK®, tmp @ aaaaaaaarrrrrrrrggggggggbbbbbbbb -.endm - -.macro RGB444toRGB888_2pixels in, out1, out2, mask, tmp1, tmp2 - and tmp1, mask, WK&in @ 0000RRRR0000BBBB0000rrrr0000bbbb - and tmp2, mask, WK&in, lsr #4 @ 0000AAAA0000GGGG0000aaaa0000gggg - orr tmp1, tmp1, tmp1, lsl #4 @ RRRRRRRRBBBBBBBBrrrrrrrrbbbbbbbb - orr tmp2, tmp2, tmp2, lsl #4 @ AAAAAAAAGGGGGGGGaaaaaaaagggggggg - pkhtb WK&out2, tmp2, tmp1, asr #16 @ AAAAAAAAGGGGGGGGRRRRRRRRBBBBBBBB - pkhbt WK&out1, tmp1, tmp2, lsl #16 @ aaaaaaaaggggggggrrrrrrrrbbbbbbbb - pkhtb tmp2, WK&out2, WK&out2, asr #8 @ AAAAAAAAGGGGGGGGGGGGGGGGRRRRRRRR - pkhtb tmp1, WK&out1, WK&out1, asr #8 @ aaaaaaaaggggggggggggggggrrrrrrrr - pkhbt WK&out1, WK&out1, WK&out1, lsl #8 @ ggggggggrrrrrrrrrrrrrrrrbbbbbbbb - pkhbt WK&out2, WK&out2, WK&out2, lsl #8 @ GGGGGGGGRRRRRRRRRRRRRRRRBBBBBBBB - sel WK&out1, WK&out1, tmp1 @ aaaaaaaarrrrrrrrggggggggbbbbbbbb - sel WK&out2, WK&out2, tmp2 @ AAAAAAAARRRRRRRRGGGGGGGGBBBBBBBB -.endm - -.macro RGB444toRGB888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload - pixld cond, numbytes/2, firstreg, SRC, unaligned_src -.endm - -.macro RGB444toRGB888_process_tail cond, numbytes, firstreg - .if numbytes >= 8 - .if numbytes == 16 - RGB444toRGB888_2pixels %(firstreg+1), %(firstreg+2), %(firstreg+3), MASK, STRIDE_M, SCRATCH - .endif - RGB444toRGB888_2pixels %(firstreg+0), %(firstreg+0), %(firstreg+1), MASK, STRIDE_M, SCRATCH - .else @ numbytes == 4 - RGB444toRGB888_1pixel %(firstreg+0), MASK, SCRATCH - .endif -.endm - -generate_composite_function \ - Blit_RGB444_XRGB8888ARMSIMDAsm, 16, 0, 32, \ - FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \ - 2, /* prefetch distance */ \ - RGB444toRGB888_init, \ - nop_macro, /* newline */ \ - nop_macro, /* cleanup */ \ - RGB444toRGB888_process_head, \ - RGB444toRGB888_process_tail diff --git a/src/video/arm/pixman-arm-simd-asm.h b/src/video/arm/pixman-arm-simd-asm.h deleted file mode 100644 index 067d52c1a..000000000 --- a/src/video/arm/pixman-arm-simd-asm.h +++ /dev/null @@ -1,1034 +0,0 @@ -/* - * Copyright (c) 2012 Raspberry Pi Foundation - * Copyright (c) 2012 RISC OS Open Ltd - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - */ - -/* - * Because the alignment of pixel data to cachelines, and even the number of - * cachelines per row can vary from row to row, and because of the need to - * preload each scanline once and only once, this prefetch strategy treats - * each row of pixels independently. 
When a pixel row is long enough, there - * are three distinct phases of prefetch: - * * an inner loop section, where each time a cacheline of data is - * processed, another cacheline is preloaded (the exact distance ahead is - * determined empirically using profiling results from lowlevel-blt-bench) - * * a leading section, where enough cachelines are preloaded to ensure no - * cachelines escape being preloaded when the inner loop starts - * * a trailing section, where a limited number (0 or more) of cachelines - * are preloaded to deal with data (if any) that hangs off the end of the - * last iteration of the inner loop, plus any trailing bytes that were not - * enough to make up one whole iteration of the inner loop - * - * There are (in general) three distinct code paths, selected between - * depending upon how long the pixel row is. If it is long enough that there - * is at least one iteration of the inner loop (as described above) then - * this is described as the "wide" case. If it is shorter than that, but - * there are still enough bytes output that there is at least one 16-byte- - * long, 16-byte-aligned write to the destination (the optimum type of - * write), then this is the "medium" case. If it is not even this long, then - * this is the "narrow" case, and there is no attempt to align writes to - * 16-byte boundaries. In the "medium" and "narrow" cases, all the - * cachelines containing data from the pixel row are prefetched up-front. - */ - -/* - * Determine whether we put the arguments on the stack for debugging. - */ -#undef DEBUG_PARAMS - -/* - * Bit flags for 'generate_composite_function' macro which are used - * to tune generated functions behavior. - */ -.set FLAG_DST_WRITEONLY, 0 -.set FLAG_DST_READWRITE, 1 -.set FLAG_COND_EXEC, 0 -.set FLAG_BRANCH_OVER, 2 -.set FLAG_PROCESS_PRESERVES_PSR, 0 -.set FLAG_PROCESS_CORRUPTS_PSR, 4 -.set FLAG_PROCESS_DOESNT_STORE, 0 -.set FLAG_PROCESS_DOES_STORE, 8 /* usually because it needs to conditionally skip it */ -.set FLAG_NO_SPILL_LINE_VARS, 0 -.set FLAG_SPILL_LINE_VARS_WIDE, 16 -.set FLAG_SPILL_LINE_VARS_NON_WIDE, 32 -.set FLAG_SPILL_LINE_VARS, 48 -.set FLAG_PROCESS_CORRUPTS_SCRATCH, 0 -.set FLAG_PROCESS_PRESERVES_SCRATCH, 64 -.set FLAG_PROCESS_PRESERVES_WK0, 0 -.set FLAG_PROCESS_CORRUPTS_WK0, 128 /* if possible, use the specified register(s) instead so WK0 can hold number of leading pixels */ -.set FLAG_PRELOAD_DST, 0 -.set FLAG_NO_PRELOAD_DST, 256 - -/* - * Number of bytes by which to adjust preload offset of destination - * buffer (allows preload instruction to be moved before the load(s)) - */ -.set DST_PRELOAD_BIAS, 0 - -/* - * Offset into stack where mask and source pointer/stride can be accessed. - */ -#ifdef DEBUG_PARAMS -.set ARGS_STACK_OFFSET, (9*4+9*4) -#else -.set ARGS_STACK_OFFSET, (9*4) -#endif - -/* - * Offset into stack where space allocated during init macro can be accessed. - */ -.set LOCALS_STACK_OFFSET, 0 - -/* - * Constants for selecting preferable prefetch type. - */ -.set PREFETCH_TYPE_NONE, 0 -.set PREFETCH_TYPE_STANDARD, 1 - -/* - * Definitions of macros for load/store of pixel data. 
- */ - -.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0 - .if numbytes == 16 - .if unaligned == 1 - op&r&cond WK®0, [base], #4 - op&r&cond WK®1, [base], #4 - op&r&cond WK®2, [base], #4 - op&r&cond WK®3, [base], #4 - .else - op&m&cond&ia base!, {WK®0,WK®1,WK®2,WK®3} - .endif - .elseif numbytes == 8 - .if unaligned == 1 - op&r&cond WK®0, [base], #4 - op&r&cond WK®1, [base], #4 - .else - op&m&cond&ia base!, {WK®0,WK®1} - .endif - .elseif numbytes == 4 - op&r&cond WK®0, [base], #4 - .elseif numbytes == 2 - op&r&cond&h WK®0, [base], #2 - .elseif numbytes == 1 - op&r&cond&b WK®0, [base], #1 - .else - .error "unsupported size: numbytes" - .endif -.endm - -.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base - .if numbytes == 16 - stm&cond&db base, {WK®0,WK®1,WK®2,WK®3} - .elseif numbytes == 8 - stm&cond&db base, {WK®0,WK®1} - .elseif numbytes == 4 - str&cond WK®0, [base, #-4] - .elseif numbytes == 2 - str&cond&h WK®0, [base, #-2] - .elseif numbytes == 1 - str&cond&b WK®0, [base, #-1] - .else - .error "unsupported size: numbytes" - .endif -.endm - -.macro pixld cond, numbytes, firstreg, base, unaligned - pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned -.endm - -.macro pixst cond, numbytes, firstreg, base - .if (flags) & FLAG_DST_READWRITE - pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base - .else - pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base - .endif -.endm - -.macro PF a, x:vararg - .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD) - a x - .endif -.endm - - -.macro preload_leading_step1 bpp, ptr, base -/* If the destination is already 16-byte aligned, then we need to preload - * between 0 and prefetch_distance (inclusive) cache lines ahead so there - * are no gaps when the inner loop starts. - */ - .if bpp > 0 - PF bic, ptr, base, #31 - .set OFFSET, 0 - .rept prefetch_distance+1 - PF pld, [ptr, #OFFSET] - .set OFFSET, OFFSET+32 - .endr - .endif -.endm - -.macro preload_leading_step2 bpp, bpp_shift, ptr, base -/* However, if the destination is not 16-byte aligned, we may need to - * preload more cache lines than that. The question we need to ask is: - * are the bytes corresponding to the leading pixels more than the amount - * by which the source pointer will be rounded down for preloading, and if - * so, by how many cache lines? Effectively, we want to calculate - * leading_bytes = ((-dst)&15)*src_bpp/dst_bpp - * inner_loop_offset = (src+leading_bytes)&31 - * extra_needed = leading_bytes - inner_loop_offset - * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only - * possible when there are 4 src bytes for every 1 dst byte). 
- */ - .if bpp > 0 - .ifc base,DST - /* The test can be simplified further when preloading the destination */ - PF tst, base, #16 - PF beq, 61f - .else - .if bpp/dst_w_bpp == 4 - PF add, SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift - PF and, SCRATCH, SCRATCH, #31 - PF rsb, SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift - PF sub, SCRATCH, SCRATCH, #1 /* so now ranges are -16..-1 / 0..31 / 32..63 */ - PF movs, SCRATCH, SCRATCH, lsl #32-6 /* so this sets NC / nc / Nc */ - PF bcs, 61f - PF bpl, 60f - PF pld, [ptr, #32*(prefetch_distance+2)] - .else - PF mov, SCRATCH, base, lsl #32-5 - PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift - PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift - PF bls, 61f - .endif - .endif -60: PF pld, [ptr, #32*(prefetch_distance+1)] -61: - .endif -.endm - -#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2)) -.macro preload_middle bpp, base, scratch_holds_offset - .if bpp > 0 - /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */ - .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp) - .if scratch_holds_offset - PF pld, [base, SCRATCH] - .else - PF bic, SCRATCH, base, #31 - PF pld, [SCRATCH, #32*prefetch_distance] - .endif - .endif - .endif -.endm - -.macro preload_trailing bpp, bpp_shift, base - .if bpp > 0 - .if bpp*pix_per_block > 256 - /* Calculations are more complex if more than one fetch per block */ - PF and, WK1, base, #31 - PF add, WK1, WK1, WK0, lsl #bpp_shift - PF add, WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1) - PF bic, SCRATCH, base, #31 -80: PF pld, [SCRATCH, #32*(prefetch_distance+1)] - PF add, SCRATCH, SCRATCH, #32 - PF subs, WK1, WK1, #32 - PF bhi, 80b - .else - /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */ - PF mov, SCRATCH, base, lsl #32-5 - PF adds, SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift - PF adceqs, SCRATCH, SCRATCH, #0 - /* The instruction above has two effects: ensures Z is only - * set if C was clear (so Z indicates that both shifted quantities - * were 0), and clears C if Z was set (so C indicates that the sum - * of the shifted quantities was greater and not equal to 32) */ - PF beq, 82f - PF bic, SCRATCH, base, #31 - PF bcc, 81f - PF pld, [SCRATCH, #32*(prefetch_distance+2)] -81: PF pld, [SCRATCH, #32*(prefetch_distance+1)] -82: - .endif - .endif -.endm - - -.macro preload_line narrow_case, bpp, bpp_shift, base -/* "narrow_case" - just means that the macro was invoked from the "narrow" - * code path rather than the "medium" one - because in the narrow case, - * the row of pixels is known to output no more than 30 bytes, then - * (assuming the source pixels are no wider than the the destination - * pixels) they cannot possibly straddle more than 2 32-byte cachelines, - * meaning there's no need for a loop. 
- * "bpp" - number of bits per pixel in the channel (source, mask or - * destination) that's being preloaded, or 0 if this channel is not used - * for reading - * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course) - * "base" - base address register of channel to preload (SRC, MASK or DST) - */ - .if bpp > 0 - .if narrow_case && (bpp <= dst_w_bpp) - /* In these cases, each line for each channel is in either 1 or 2 cache lines */ - PF bic, WK0, base, #31 - PF pld, [WK0] - PF add, WK1, base, X, LSL #bpp_shift - PF sub, WK1, WK1, #1 - PF bic, WK1, WK1, #31 - PF cmp, WK1, WK0 - PF beq, 90f - PF pld, [WK1] -90: - .else - PF bic, WK0, base, #31 - PF pld, [WK0] - PF add, WK1, base, X, lsl #bpp_shift - PF sub, WK1, WK1, #1 - PF bic, WK1, WK1, #31 - PF cmp, WK1, WK0 - PF beq, 92f -91: PF add, WK0, WK0, #32 - PF cmp, WK0, WK1 - PF pld, [WK0] - PF bne, 91b -92: - .endif - .endif -.endm - - -.macro conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx - process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0 - .if decrementx - sub&cond X, X, #8*numbytes/dst_w_bpp - .endif - process_tail cond, numbytes, firstreg - .if !((flags) & FLAG_PROCESS_DOES_STORE) - pixst cond, numbytes, firstreg, DST - .endif -.endm - -.macro conditional_process1 cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx - .if (flags) & FLAG_BRANCH_OVER - .ifc cond,mi - bpl 100f - .endif - .ifc cond,cs - bcc 100f - .endif - .ifc cond,ne - beq 100f - .endif - conditional_process1_helper , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx -100: - .else - conditional_process1_helper cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx - .endif -.endm - -.macro conditional_process2 test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx - .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE) - /* Can't interleave reads and writes */ - test - conditional_process1 cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx - .if (flags) & FLAG_PROCESS_CORRUPTS_PSR - test - .endif - conditional_process1 cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx - .else - /* Can interleave reads and writes for better scheduling */ - test - process_head cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0 - process_head cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0 - .if decrementx - sub&cond1 X, X, #8*numbytes1/dst_w_bpp - sub&cond2 X, X, #8*numbytes2/dst_w_bpp - .endif - process_tail cond1, numbytes1, firstreg1 - process_tail cond2, numbytes2, firstreg2 - pixst cond1, numbytes1, firstreg1, DST - pixst cond2, numbytes2, firstreg2, DST - .endif -.endm - - -.macro test_bits_1_0_ptr - .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 - movs SCRATCH, X, lsl #32-1 /* C,N = bits 1,0 of DST */ - .else - movs SCRATCH, WK0, lsl #32-1 /* C,N = bits 1,0 of DST */ - .endif -.endm - -.macro test_bits_3_2_ptr - .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 - movs SCRATCH, X, lsl #32-3 /* C,N = bits 3, 2 of DST */ - .else - movs SCRATCH, WK0, lsl #32-3 /* C,N = bits 3, 2 of DST */ - .endif -.endm - -.macro leading_15bytes process_head, process_tail - /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte 
aligned */ - .set DECREMENT_X, 1 - .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 - .set DECREMENT_X, 0 - sub X, X, WK0, lsr #dst_bpp_shift - str X, [sp, #LINE_SAVED_REG_COUNT*4] - mov X, WK0 - .endif - /* Use unaligned loads in all cases for simplicity */ - .if dst_w_bpp == 8 - conditional_process2 test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X - .elseif dst_w_bpp == 16 - test_bits_1_0_ptr - conditional_process1 cs, process_head, process_tail, 2, 2, 1, 1, DECREMENT_X - .endif - conditional_process2 test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X - .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 - ldr X, [sp, #LINE_SAVED_REG_COUNT*4] - .endif -.endm - -.macro test_bits_3_2_pix - movs SCRATCH, X, lsl #dst_bpp_shift+32-3 -.endm - -.macro test_bits_1_0_pix - .if dst_w_bpp == 8 - movs SCRATCH, X, lsl #dst_bpp_shift+32-1 - .else - movs SCRATCH, X, lsr #1 - .endif -.endm - -.macro trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask - conditional_process2 test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0 - .if dst_w_bpp == 16 - test_bits_1_0_pix - conditional_process1 cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0 - .elseif dst_w_bpp == 8 - conditional_process2 test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0 - .endif -.endm - - -.macro wide_case_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment -110: - .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */ - .rept pix_per_block*dst_w_bpp/128 - process_head , 16, 0, unaligned_src, unaligned_mask, 1 - .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) - preload_middle src_bpp, SRC, 1 - .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) - preload_middle mask_bpp, MASK, 1 - .else - preload_middle src_bpp, SRC, 0 - preload_middle mask_bpp, MASK, 0 - .endif - .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0) - /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that - * destination prefetches are 32-byte aligned. It's also the easiest channel to offset - * preloads for, to achieve staggered prefetches for multiple channels, because there are - * always two STMs per prefetch, so there is always an opposite STM on which to put the - * preload. 
Note, no need to BIC the base register here */ - PF pld, [DST, #32*prefetch_distance - dst_alignment] - .endif - process_tail , 16, 0 - .if !((flags) & FLAG_PROCESS_DOES_STORE) - pixst , 16, 0, DST - .endif - .set SUBBLOCK, SUBBLOCK+1 - .endr - subs X, X, #pix_per_block - bhs 110b -.endm - -.macro wide_case_inner_loop_and_trailing_pixels process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask - /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */ - .if dst_r_bpp > 0 - tst DST, #16 - bne 111f - process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 16 + DST_PRELOAD_BIAS - b 112f -111: - .endif - process_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, 0 + DST_PRELOAD_BIAS -112: - /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */ - .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256) - PF and, WK0, X, #pix_per_block-1 - .endif - preload_trailing src_bpp, src_bpp_shift, SRC - preload_trailing mask_bpp, mask_bpp_shift, MASK - .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 - preload_trailing dst_r_bpp, dst_bpp_shift, DST - .endif - add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp - /* The remainder of the line is handled identically to the medium case */ - medium_case_inner_loop_and_trailing_pixels process_head, process_tail,, exit_label, unaligned_src, unaligned_mask -.endm - -.macro medium_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask -120: - process_head , 16, 0, unaligned_src, unaligned_mask, 0 - process_tail , 16, 0 - .if !((flags) & FLAG_PROCESS_DOES_STORE) - pixst , 16, 0, DST - .endif - subs X, X, #128/dst_w_bpp - bhs 120b - /* Trailing pixels */ - tst X, #128/dst_w_bpp - 1 - beq exit_label - trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask -.endm - -.macro narrow_case_inner_loop_and_trailing_pixels process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask - tst X, #16*8/dst_w_bpp - conditional_process1 ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0 - /* Trailing pixels */ - /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */ - trailing_15bytes process_head, process_tail, unaligned_src, unaligned_mask -.endm - -.macro switch_on_alignment action, process_head, process_tail, process_inner_loop, exit_label - /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */ - .if mask_bpp == 8 || mask_bpp == 16 - tst MASK, #3 - bne 141f - .endif - .if src_bpp == 8 || src_bpp == 16 - tst SRC, #3 - bne 140f - .endif - action process_head, process_tail, process_inner_loop, exit_label, 0, 0 - .if src_bpp == 8 || src_bpp == 16 - b exit_label -140: - action process_head, process_tail, process_inner_loop, exit_label, 1, 0 - .endif - .if mask_bpp == 8 || mask_bpp == 16 - b exit_label -141: - .if src_bpp == 8 || src_bpp == 16 - tst SRC, #3 - bne 142f - .endif - action process_head, process_tail, process_inner_loop, exit_label, 0, 1 - .if src_bpp == 8 || src_bpp == 16 - b exit_label -142: - action process_head, process_tail, process_inner_loop, exit_label, 1, 1 - .endif - .endif -.endm - - -.macro end_of_line restore_x, vars_spilled, loop_label, last_one - .if SINGLE_SCANLINE - .ifc "last_one","" - b 198f - .endif - .else - .if vars_spilled - /* Sadly, GAS doesn't 
seem have an equivalent of the DCI directive? */ - /* This is ldmia sp,{} */ - .word 0xE89D0000 | LINE_SAVED_REGS - .endif - subs Y, Y, #1 - .if vars_spilled - .if (LINE_SAVED_REGS) & (1<<1) - str Y, [sp] - .endif - .endif - add DST, DST, STRIDE_D - .if src_bpp > 0 - add SRC, SRC, STRIDE_S - .endif - .if mask_bpp > 0 - add MASK, MASK, STRIDE_M - .endif - .if restore_x - mov X, ORIG_W - .endif - bhs loop_label - .ifc "last_one","" - .if vars_spilled - b 197f - .else - b 198f - .endif - .else - .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS) - b 198f - .endif - .endif - .endif -.endm - - -.macro generate_composite_function_common fname, \ - src_bpp_, \ - mask_bpp_, \ - dst_w_bpp_, \ - flags_, \ - prefetch_distance_, \ - init, \ - newline, \ - cleanup, \ - process_head, \ - process_tail, \ - process_inner_loop - - pixman_asm_function fname - -/* - * Make some macro arguments globally visible and accessible - * from other macros - */ - .set src_bpp, src_bpp_ - .set mask_bpp, mask_bpp_ - .set dst_w_bpp, dst_w_bpp_ - .set flags, flags_ - .set prefetch_distance, prefetch_distance_ - -/* - * Select prefetch type for this function. - */ - .if prefetch_distance == 0 - .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE - .else - .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD - .endif - - .if src_bpp == 32 - .set src_bpp_shift, 2 - .elseif src_bpp == 24 - .set src_bpp_shift, 0 - .elseif src_bpp == 16 - .set src_bpp_shift, 1 - .elseif src_bpp == 8 - .set src_bpp_shift, 0 - .elseif src_bpp == 0 - .set src_bpp_shift, -1 - .else - .error "requested src bpp (src_bpp) is not supported" - .endif - - .if mask_bpp == 32 - .set mask_bpp_shift, 2 - .elseif mask_bpp == 24 - .set mask_bpp_shift, 0 - .elseif mask_bpp == 8 - .set mask_bpp_shift, 0 - .elseif mask_bpp == 0 - .set mask_bpp_shift, -1 - .else - .error "requested mask bpp (mask_bpp) is not supported" - .endif - - .if dst_w_bpp == 32 - .set dst_bpp_shift, 2 - .elseif dst_w_bpp == 24 - .set dst_bpp_shift, 0 - .elseif dst_w_bpp == 16 - .set dst_bpp_shift, 1 - .elseif dst_w_bpp == 8 - .set dst_bpp_shift, 0 - .else - .error "requested dst bpp (dst_w_bpp) is not supported" - .endif - - .if (((flags) & FLAG_DST_READWRITE) != 0) - .set dst_r_bpp, dst_w_bpp - .else - .set dst_r_bpp, 0 - .endif - - .set pix_per_block, 16*8/dst_w_bpp - .if src_bpp != 0 - .if 32*8/src_bpp > pix_per_block - .set pix_per_block, 32*8/src_bpp - .endif - .endif - .if mask_bpp != 0 - .if 32*8/mask_bpp > pix_per_block - .set pix_per_block, 32*8/mask_bpp - .endif - .endif - .if dst_r_bpp != 0 - .if 32*8/dst_r_bpp > pix_per_block - .set pix_per_block, 32*8/dst_r_bpp - .endif - .endif - -/* The standard entry conditions set up by pixman-arm-common.h are: - * r0 = width (pixels) - * r1 = height (rows) - * r2 = pointer to top-left pixel of destination - * r3 = destination stride (pixels) - * [sp] = source pixel value, or pointer to top-left pixel of source - * [sp,#4] = 0 or source stride (pixels) - * The following arguments are unused for non-mask operations - * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask - * [sp,#12] = 0 or mask stride (pixels) - * - * or in the single-scanline case: - * r0 = width (pixels) - * r1 = pointer to top-left pixel of destination - * r2 = pointer to top-left pixel of source - * The following argument is unused for non-mask operations - * r3 = pointer to top-left pixel of mask - */ - -/* - * Assign symbolic names to registers - */ - X .req r0 /* pixels to go on this line */ - .if SINGLE_SCANLINE - DST .req r1 /* destination pixel pointer 
*/ - SRC .req r2 /* source pixel pointer */ - MASK .req r3 /* mask pixel pointer (if applicable) */ - Y .req r4 /* temporary */ - STRIDE_D .req r5 /* temporary */ - STRIDE_S .req r6 /* temporary */ - STRIDE_M .req r7 /* temporary */ - .else - Y .req r1 /* lines to go */ - DST .req r2 /* destination pixel pointer */ - STRIDE_D .req r3 /* destination stride (bytes, minus width) */ - SRC .req r4 /* source pixel pointer */ - STRIDE_S .req r5 /* source stride (bytes, minus width) */ - MASK .req r6 /* mask pixel pointer (if applicable) */ - STRIDE_M .req r7 /* mask stride (bytes, minus width) */ - .endif - WK0 .req r8 /* pixel data registers */ - WK1 .req r9 - WK2 .req r10 - WK3 .req r11 - SCRATCH .req r12 - ORIG_W .req r14 /* width (pixels) */ - - push {r4-r11, lr} /* save all registers */ - - .if !SINGLE_SCANLINE - subs Y, Y, #1 - blo 199f - .endif - -#ifdef DEBUG_PARAMS - sub sp, sp, #9*4 -#endif - - .if !SINGLE_SCANLINE - .if src_bpp > 0 - ldr SRC, [sp, #ARGS_STACK_OFFSET] - ldr STRIDE_S, [sp, #ARGS_STACK_OFFSET+4] - .endif - .if mask_bpp > 0 - ldr MASK, [sp, #ARGS_STACK_OFFSET+8] - ldr STRIDE_M, [sp, #ARGS_STACK_OFFSET+12] - .endif - .endif - -#ifdef DEBUG_PARAMS - add Y, Y, #1 - stmia sp, {r0-r7,pc} - sub Y, Y, #1 -#endif - - init - - .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 - /* Reserve a word in which to store X during leading pixels */ - sub sp, sp, #4 - .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+4 - .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4 - .endif - - .if !SINGLE_SCANLINE - lsl STRIDE_D, #dst_bpp_shift /* stride in bytes */ - sub STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift - .if src_bpp > 0 - lsl STRIDE_S, #src_bpp_shift - sub STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift - .endif - .if mask_bpp > 0 - lsl STRIDE_M, #mask_bpp_shift - sub STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift - .endif - .endif - - /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */ - cmp X, #2*16*8/dst_w_bpp - 1 - blo 170f - .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */ - /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */ - cmp X, #(prefetch_distance+3)*pix_per_block - 1 - blo 160f - - /* Wide case */ - /* Adjust X so that the decrement instruction can also test for - * inner loop termination. We want it to stop when there are - * (prefetch_distance+1) complete blocks to go. 
*/ - sub X, X, #(prefetch_distance+2)*pix_per_block - .if !SINGLE_SCANLINE - mov ORIG_W, X - .if (flags) & FLAG_SPILL_LINE_VARS_WIDE - /* This is stmdb sp!,{} */ - .word 0xE92D0000 | LINE_SAVED_REGS - .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 - .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 - .endif - .endif -151: /* New line */ - newline - preload_leading_step1 src_bpp, WK1, SRC - preload_leading_step1 mask_bpp, WK2, MASK - .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 - preload_leading_step1 dst_r_bpp, WK3, DST - .endif - - ands WK0, DST, #15 - beq 154f - rsb WK0, WK0, #16 /* number of leading bytes until destination aligned */ - - preload_leading_step2 src_bpp, src_bpp_shift, WK1, SRC - preload_leading_step2 mask_bpp, mask_bpp_shift, WK2, MASK - .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 - preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST - .endif - - leading_15bytes process_head, process_tail - -154: /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */ - .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) - and SCRATCH, SRC, #31 - rsb SCRATCH, SCRATCH, #32*prefetch_distance - .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH) - and SCRATCH, MASK, #31 - rsb SCRATCH, SCRATCH, #32*prefetch_distance - .endif - .ifc "process_inner_loop","" - switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f - .else - switch_on_alignment wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f - .endif - -157: /* Check for another line */ - end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b - .if (!SINGLE_SCANLINE) && ((flags) & FLAG_SPILL_LINE_VARS_WIDE) - .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 - .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 - .endif - .endif - - .ltorg - -160: /* Medium case */ - .if !SINGLE_SCANLINE - mov ORIG_W, X - .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE - /* This is stmdb sp!,{} */ - .word 0xE92D0000 | LINE_SAVED_REGS - .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 - .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4 - .endif - .endif -161: /* New line */ - newline - preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */ - preload_line 0, mask_bpp, mask_bpp_shift, MASK - .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 - preload_line 0, dst_r_bpp, dst_bpp_shift, DST - .endif - - sub X, X, #128/dst_w_bpp /* simplifies inner loop termination */ - ands WK0, DST, #15 - beq 164f - rsb WK0, WK0, #16 /* number of leading bytes until destination aligned */ - - leading_15bytes process_head, process_tail - -164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */ - switch_on_alignment medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f - -167: /* Check for another line */ - end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b - - .ltorg - -170: /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */ - .if !SINGLE_SCANLINE - .if dst_w_bpp < 32 - mov ORIG_W, X - .endif - .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE - /* This is stmdb sp!,{} */ - .word 0xE92D0000 | LINE_SAVED_REGS - .endif - .endif -171: /* New line */ - newline - preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: 
WK0-WK1 */ - preload_line 1, mask_bpp, mask_bpp_shift, MASK - .if ((flags) & FLAG_NO_PRELOAD_DST) == 0 - preload_line 1, dst_r_bpp, dst_bpp_shift, DST - .endif - - .if dst_w_bpp == 8 - tst DST, #3 - beq 174f -172: subs X, X, #1 - blo 177f - process_head , 1, 0, 1, 1, 0 - process_tail , 1, 0 - .if !((flags) & FLAG_PROCESS_DOES_STORE) - pixst , 1, 0, DST - .endif - tst DST, #3 - bne 172b - .elseif dst_w_bpp == 16 - tst DST, #2 - beq 174f - subs X, X, #1 - blo 177f - process_head , 2, 0, 1, 1, 0 - process_tail , 2, 0 - .if !((flags) & FLAG_PROCESS_DOES_STORE) - pixst , 2, 0, DST - .endif - .endif - -174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */ - switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f - -177: /* Check for another line */ - end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one - .if (!SINGLE_SCANLINE) && ((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE) - .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 - .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4 - .endif - -197: - .if (!SINGLE_SCANLINE) && ((flags) & FLAG_SPILL_LINE_VARS) - add sp, sp, #LINE_SAVED_REG_COUNT*4 - .endif -198: - .if (flags) & FLAG_PROCESS_CORRUPTS_WK0 - .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-4 - .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET-4 - add sp, sp, #4 - .endif - - cleanup - -#ifdef DEBUG_PARAMS - add sp, sp, #9*4 /* junk the debug copy of arguments */ -#endif -199: - pop {r4-r11, pc} /* exit */ - - .ltorg - - .unreq X - .unreq Y - .unreq DST - .unreq STRIDE_D - .unreq SRC - .unreq STRIDE_S - .unreq MASK - .unreq STRIDE_M - .unreq WK0 - .unreq WK1 - .unreq WK2 - .unreq WK3 - .unreq SCRATCH - .unreq ORIG_W - .endfunc -.endm - -.macro generate_composite_function fname, \ - src_bpp_, \ - mask_bpp_, \ - dst_w_bpp_, \ - flags_, \ - prefetch_distance_, \ - init, \ - newline, \ - cleanup, \ - process_head, \ - process_tail, \ - process_inner_loop - .set SINGLE_SCANLINE, 0 -generate_composite_function_common \ - fname, src_bpp_, mask_bpp_, dst_w_bpp_, flags_, prefetch_distance_, \ - init, newline, cleanup, process_head, process_tail, process_inner_loop -.endm - -.macro generate_composite_function_single_scanline fname, \ - src_bpp_, \ - mask_bpp_, \ - dst_w_bpp_, \ - flags_, \ - prefetch_distance_, \ - init, \ - newline, \ - cleanup, \ - process_head, \ - process_tail, \ - process_inner_loop - .set SINGLE_SCANLINE, 1 -generate_composite_function_common \ - fname, src_bpp_, mask_bpp_, dst_w_bpp_, flags_, prefetch_distance_, \ - init, newline, cleanup, process_head, process_tail, process_inner_loop -.endm - -.macro line_saved_regs x:vararg - .set LINE_SAVED_REGS, 0 - .set LINE_SAVED_REG_COUNT, 0 - .irp SAVED_REG,x - .ifc "SAVED_REG","Y" - .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1) - .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 - .endif - .ifc "SAVED_REG","STRIDE_D" - .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3) - .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 - .endif - .ifc "SAVED_REG","STRIDE_S" - .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5) - .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 - .endif - .ifc "SAVED_REG","STRIDE_M" - .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7) - .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 - .endif - .ifc "SAVED_REG","ORIG_W" - .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14) - .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1 - .endif - .endr - .if SINGLE_SCANLINE - .set LINE_SAVED_REG_COUNT, 0 - 
.endif -.endm - -.macro nop_macro x:vararg -.endm
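
Similarly, the blend implemented by the deleted BlitARGBto565PixelAlphaARMSIMDAsm routine earlier in this patch (an ARGB8888 source over an RGB565 destination) can be sketched in scalar C. Again this is illustrative only, not part of the patch; the helper name blend_argb8888_over_rgb565 is made up here, and arithmetic right shift of negative signed values is assumed (as on ARM).

    #include <stdint.h>

    /* Scalar sketch of the deleted ARGB8888-over-RGB565 blend: alpha is
     * reduced to 5 bits, and each destination channel moves towards the
     * truncated source channel; division by 31 is approximated with the
     * same "*33 >> 10" trick the assembly uses. */
    static uint16_t blend_argb8888_over_rgb565(uint32_t src, uint16_t dst)
    {
        uint32_t alpha = src >> 27;       /* top 5 bits of the 8-bit alpha */

        if (alpha == 0) {                 /* source alpha below 8: no visible change */
            return dst;
        }
        if ((src >> 24) == 0xff) {        /* fully opaque: plain 8888 -> 565 truncation */
            return (uint16_t)(((src >> 8) & 0xf800) |
                              ((src >> 5) & 0x07e0) |
                              ((src >> 3) & 0x001f));
        }

        int32_t sr = (int32_t)((src >> 19) & 0x1f);
        int32_t sg = (int32_t)((src >> 10) & 0x3f);
        int32_t sb = (int32_t)((src >> 3) & 0x1f);
        int32_t dr = (dst >> 11) & 0x1f;
        int32_t dg = (dst >> 5) & 0x3f;
        int32_t db = dst & 0x1f;

        /* delta = ((s - d)*alpha + 16) * 33 >> 10  ~=  (s - d) * alpha / 31 */
        dr += (((sr - dr) * (int32_t)alpha + 16) * 33) >> 10;
        dg += (((sg - dg) * (int32_t)alpha + 16) * 33) >> 10;
        db += (((sb - db) * (int32_t)alpha + 16) * 33) >> 10;

        return (uint16_t)((dr << 11) | (dg << 5) | db);
    }

The 33/1024 factor is within about 0.1% of 1/31, which is why the assembly can avoid a per-channel divide while still keeping each blended channel within its 5- or 6-bit range.
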