Remove ARM32 assembly/pixman blitters

2023-09-13 10:10:14 -05:00 · 2023-09-13 10:10:14 -05:00 · 0f351cd6af
parent 3ecb927587
commit 0f351cd6af
10 changed files with 3 additions and 3401 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -292,7 +292,6 @@ dep_option(SDL_MMX                 "Use MMX assembly routines" ON "SDL_ASSEMBLY;
 dep_option(SDL_ALTIVEC             "Use Altivec assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_POWERPC32 OR SDL_CPU_POWERPC64" OFF)
 dep_option(SDL_ARMSIMD             "Use SIMD assembly blitters on ARM" OFF "SDL_ASSEMBLY;SDL_CPU_ARM32" OFF)
 dep_option(SDL_ARMNEON             "Use NEON assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_ARM32 OR SDL_CPU_ARM64" OFF)
 dep_option(SDL_ARMNEON_BLITTERS    "Use NEON assembly blitters on ARM32" OFF "SDL_VIDEO;SDL_ASSEMBLY;SDL_ARMNEON;SDL_CPU_ARM32" OFF)
 dep_option(SDL_LSX                 "Use LSX assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_LOONGARCH64" OFF)
 dep_option(SDL_LASX                "Use LASX assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_LOONGARCH64" OFF)
@ -883,67 +882,6 @@ if(SDL_ASSEMBLY)
      endif()
    endif()
    if(SDL_ARMSIMD)
      cmake_push_check_state()
      string(APPEND CMAKE_REQUIRED_FLAGS " -x assembler-with-cpp")
      list(APPEND CMAKE_REQUIRED_LINK_OPTIONS -x none)
      check_c_source_compiles("
        .text
        .arch armv6
        .object_arch armv4
        .arm
        .altmacro
        #ifndef __ARM_EABI__
        #error EABI is required (to be sure that calling conventions are compatible)
        #endif
        main:
        .global main
        pld [r0]
        uqadd8 r0, r0, r0
      " ARMSIMD_FOUND)
      cmake_pop_check_state()
      if(ARMSIMD_FOUND)
        set(HAVE_ARMSIMD TRUE)
        set(SDL_ARM_SIMD_BLITTERS 1)
        enable_language(ASM)
        sdl_glob_sources("${SDL3_SOURCE_DIR}/src/video/arm/pixman-arm-simd*.S")
        set_property(SOURCE ${ARMSIMD_SOURCES} APPEND PROPERTY COMPILE_OPTIONS -x assembler-with-cpp)
        set(WARN_ABOUT_ARM_SIMD_ASM_MIT TRUE)
      endif()
    endif()
    if(SDL_ARMNEON_BLITTERS)
      cmake_push_check_state()
      string(APPEND CMAKE_REQUIRED_FLAGS " -x assembler-with-cpp")
      list(APPEND CMAKE_REQUIRED_LINK_OPTIONS -x none)
      check_c_source_compiles("
        .text
        .fpu neon
        .arch armv7a
        .object_arch armv4
        .eabi_attribute 10, 0
        .arm
        .altmacro
        #ifndef __ARM_EABI__
        #error EABI is required (to be sure that calling conventions are compatible)
        #endif
        main:
        .global main
        pld [r0]
        vmovn.u16 d0, q0
      " COMPILER_SUPPORTS_ARMNEON_ASSEMBLY)
      cmake_pop_check_state()
      if(COMPILER_SUPPORTS_ARMNEON_ASSEMBLY)
        set(HAVE_ARMNEON_BLITTERS TRUE)
        set(SDL_ARM_NEON_BLITTERS 1)
        enable_language(ASM)
        sdl_glob_sources("${SDL3_SOURCE_DIR}/src/video/arm/pixman-arm-neon*.S")
        set_property(SOURCE ${ARMNEON_SOURCES} APPEND PROPERTY COMPILE_OPTIONS -x assembler-with-cpp)
        set(WARN_ABOUT_ARM_NEON_ASM_MIT TRUE)
      endif()
    endif()
    if(SDL_ARMNEON)
      check_c_source_compiles("
        #include <arm_neon.h>
--- a/cmake/3rdparty.cmake
+++ b/cmake/3rdparty.cmake
@ -25,10 +25,6 @@ function(get_clang_tidy_ignored_files OUTVAR)
      # HIDAPI Steam controller
      "controller_constants.h"
      "controller_structs.h"
      # Nokia Pixman
      "pixman-arm-asm.h"
      "pixman-arm-neon-asm.h"
      "pixman-arm-simd-asm.h"
      # YUV2RGB
      "yuv_rgb.c"
      "yuv_rgb_lsx_func.h"
--- a/src/video/SDL_blit.h
+++ b/src/video/SDL_blit.h
@ -23,12 +23,6 @@
 #ifndef SDL_blit_h_
 #define SDL_blit_h_
 /* pixman ARM blitters are 32 bit only : */
 #if defined(__aarch64__) || defined(_M_ARM64)
 #undef SDL_ARM_SIMD_BLITTERS
 #undef SDL_ARM_NEON_BLITTERS
 #endif
 /* Table to do pixel byte expansion */
 extern const Uint8 *SDL_expand_byte[9];
 extern const Uint16 SDL_expand_byte_10[];
--- a/src/video/SDL_blit_A.c
+++ b/src/video/SDL_blit_A.c
@ -421,66 +421,6 @@ static void SDL_TARGETING("mmx") BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
 #endif /* SDL_MMX_INTRINSICS */
 #ifdef SDL_ARM_SIMD_BLITTERS
 void BlitARGBto565PixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
 static void BlitARGBto565PixelAlphaARMSIMD(SDL_BlitInfo *info)
 {
    int32_t width = info->dst_w;
    int32_t height = info->dst_h;
    uint16_t *dstp = (uint16_t *)info->dst;
    int32_t dststride = width + (info->dst_skip >> 1);
    uint32_t *srcp = (uint32_t *)info->src;
    int32_t srcstride = width + (info->src_skip >> 2);
    BlitARGBto565PixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
 }
 void BlitRGBtoRGBPixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
 static void BlitRGBtoRGBPixelAlphaARMSIMD(SDL_BlitInfo *info)
 {
    int32_t width = info->dst_w;
    int32_t height = info->dst_h;
    uint32_t *dstp = (uint32_t *)info->dst;
    int32_t dststride = width + (info->dst_skip >> 2);
    uint32_t *srcp = (uint32_t *)info->src;
    int32_t srcstride = width + (info->src_skip >> 2);
    BlitRGBtoRGBPixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
 }
 #endif
 #ifdef SDL_ARM_NEON_BLITTERS
 void BlitARGBto565PixelAlphaARMNEONAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
 static void BlitARGBto565PixelAlphaARMNEON(SDL_BlitInfo *info)
 {
    int32_t width = info->dst_w;
    int32_t height = info->dst_h;
    uint16_t *dstp = (uint16_t *)info->dst;
    int32_t dststride = width + (info->dst_skip >> 1);
    uint32_t *srcp = (uint32_t *)info->src;
    int32_t srcstride = width + (info->src_skip >> 2);
    BlitARGBto565PixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride);
 }
 void BlitRGBtoRGBPixelAlphaARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
 static void BlitRGBtoRGBPixelAlphaARMNEON(SDL_BlitInfo *info)
 {
    int32_t width = info->dst_w;
    int32_t height = info->dst_h;
    uint32_t *dstp = (uint32_t *)info->dst;
    int32_t dststride = width + (info->dst_skip >> 2);
    uint32_t *srcp = (uint32_t *)info->src;
    int32_t srcstride = width + (info->src_skip >> 2);
    BlitRGBtoRGBPixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride);
 }
 #endif
 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
 static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
 {
@ -1274,21 +1214,7 @@ SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface)
            }
        case 2:
-#if defined(SDL_ARM_NEON_BLITTERS) || defined(SDL_ARM_SIMD_BLITTERS)
+            if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000 && sf->Gmask == 0xff00 && ((sf->Rmask == 0xff && df->Rmask == 0x1f) || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
            if (sf->bytes_per_pixel == 4 && sf->Amask == 0xff000000 && sf->Gmask == 0xff00 && df->Gmask == 0x7e0 && ((sf->Rmask == 0xff && df->Rmask == 0x1f) || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
 #ifdef SDL_ARM_NEON_BLITTERS
                if (SDL_HasNEON()) {
                    return BlitARGBto565PixelAlphaARMNEON;
                }
 #endif
 #ifdef SDL_ARM_SIMD_BLITTERS
                if (SDL_HasARMSIMD()) {
                    return BlitARGBto565PixelAlphaARMSIMD;
                }
 #endif
            }
 #endif
            if (sf->bytes_per_pixel == 4 && sf->Amask == 0xff000000 && sf->Gmask == 0xff00 && ((sf->Rmask == 0xff && df->Rmask == 0x1f) || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
                if (df->Gmask == 0x7e0) {
                    return BlitARGBto565PixelAlpha;
                } else if (df->Gmask == 0x3e0) {
@ -1311,18 +1237,6 @@ SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface)
                    }
                }
 #endif /* SDL_MMX_INTRINSICS */
                if (sf->Amask == 0xff000000) {
 #ifdef SDL_ARM_NEON_BLITTERS
                    if (SDL_HasNEON()) {
                        return BlitRGBtoRGBPixelAlphaARMNEON;
                    }
 #endif
 #ifdef SDL_ARM_SIMD_BLITTERS
                    if (SDL_HasARMSIMD()) {
                        return BlitRGBtoRGBPixelAlphaARMSIMD;
                    }
 #endif
                }
            }
            return BlitNtoNPixelAlpha;
--- a/src/video/SDL_fillrect.c
+++ b/src/video/SDL_fillrect.c
@ -247,54 +247,6 @@ int SDL_FillSurfaceRect(SDL_Surface *dst, const SDL_Rect *rect, Uint32 color)
    return SDL_FillSurfaceRects(dst, rect, 1, color);
 }
 #ifdef SDL_ARM_NEON_BLITTERS
 void FillSurfaceRect8ARMNEONAsm(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
 void FillSurfaceRect16ARMNEONAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint16_t src);
 void FillSurfaceRect32ARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t src);
 static void fill_8_neon(Uint8 *pixels, int pitch, Uint32 color, int w, int h)
 {
    FillSurfaceRect8ARMNEONAsm(w, h, (uint8_t *)pixels, pitch >> 0, color);
    return;
 }
 static void fill_16_neon(Uint8 *pixels, int pitch, Uint32 color, int w, int h)
 {
    FillSurfaceRect16ARMNEONAsm(w, h, (uint16_t *)pixels, pitch >> 1, color);
    return;
 }
 static void fill_32_neon(Uint8 *pixels, int pitch, Uint32 color, int w, int h)
 {
    FillSurfaceRect32ARMNEONAsm(w, h, (uint32_t *)pixels, pitch >> 2, color);
    return;
 }
 #endif
 #ifdef SDL_ARM_SIMD_BLITTERS
 void FillSurfaceRect8ARMSIMDAsm(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
 void FillSurfaceRect16ARMSIMDAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint16_t src);
 void FillSurfaceRect32ARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t src);
 static void fill_8_simd(Uint8 *pixels, int pitch, Uint32 color, int w, int h)
 {
    FillSurfaceRect8ARMSIMDAsm(w, h, (uint8_t *)pixels, pitch >> 0, color);
    return;
 }
 static void fill_16_simd(Uint8 *pixels, int pitch, Uint32 color, int w, int h)
 {
    FillSurfaceRect16ARMSIMDAsm(w, h, (uint16_t *)pixels, pitch >> 1, color);
    return;
 }
 static void fill_32_simd(Uint8 *pixels, int pitch, Uint32 color, int w, int h)
 {
    FillSurfaceRect32ARMSIMDAsm(w, h, (uint32_t *)pixels, pitch >> 2, color);
    return;
 }
 #endif
 int SDL_FillSurfaceRects(SDL_Surface *dst, const SDL_Rect *rects, int count,
                  Uint32 color)
 {
@ -339,39 +291,8 @@ int SDL_FillSurfaceRects(SDL_Surface *dst, const SDL_Rect *rects, int count,
        return SDL_SetError("SDL_FillSurfaceRects(): Unsupported surface format");
    }
-#ifdef SDL_ARM_NEON_BLITTERS
+    if (fill_function == NULL) {
-    if (SDL_HasNEON() && dst->format->bytes_per_pixel != 3 && !fill_function) {
+        switch (dst->format->BytesPerPixel) {
        switch (dst->format->bytes_per_pixel) {
        case 1:
            fill_function = fill_8_neon;
            break;
        case 2:
            fill_function = fill_16_neon;
            break;
        case 4:
            fill_function = fill_32_neon;
            break;
        }
    }
 #endif
 #ifdef SDL_ARM_SIMD_BLITTERS
    if (SDL_HasARMSIMD() && dst->format->bytes_per_pixel != 3 && !fill_function) {
        switch (dst->format->bytes_per_pixel) {
        case 1:
            fill_function = fill_8_simd;
            break;
        case 2:
            fill_function = fill_16_simd;
            break;
        case 4:
            fill_function = fill_32_simd;
            break;
        }
    }
 #endif
    if (!fill_function) {
        switch (dst->format->bytes_per_pixel) {
        case 1:
        {
            color |= (color << 8);
--- a/src/video/arm/pixman-arm-asm.h
+++ b/src/video/arm/pixman-arm-asm.h
@ -1,36 +0,0 @@
 /*
 * Copyright © 2010 Nokia Corporation
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Mozilla Corporation not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  Mozilla Corporation makes no
 * representations about the suitability of this software for any purpose.  It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
 *
 */
 /* Supplementary macro for setting function attributes */
 .macro pixman_asm_function fname
 	.func fname
 	.global fname
 #ifdef __ELF__
 	.hidden fname
 	.type fname, %function
 #endif
 fname:
 .endm
--- a/src/video/arm/pixman-arm-neon-asm.S
+++ b/src/video/arm/pixman-arm-neon-asm.S
@ -1,375 +0,0 @@
 /*
 * Copyright © 2009 Nokia Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
 */
 /*
 * Copyright (c) 2018 RISC OS Open Ltd
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */
 /* Prevent the stack from becoming executable for no reason... */
 #if defined(__linux__) && defined(__ELF__)
 .section .note.GNU-stack,"",%progbits
 #endif
    .text
    .fpu neon
    .arch armv7a
    .object_arch armv4
    .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
    .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
    .arm
    .altmacro
    .p2align 2
 #include "pixman-arm-asm.h"
 #include "pixman-arm-neon-asm.h"
 /* Global configuration options and preferences */
 /*
 * The code can optionally make use of unaligned memory accesses to improve
 * performance of handling leading/trailing pixels for each scanline.
 * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
 * example in linux if unaligned memory accesses are not configured to
 * generate.exceptions.
 */
 .set RESPECT_STRICT_ALIGNMENT, 1
 /*
 * Set default prefetch type. There is a choice between the following options:
 *
 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
 * as NOP to workaround some HW bugs or for whatever other reason)
 *
 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
 * advanced prefetch introduces heavy overhead)
 *
 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
 * which can run ARM and NEON instructions simultaneously so that extra ARM
 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
 *
 * Note: some types of function can't support advanced prefetch and fallback
 *       to simple one (those which handle 24bpp pixels)
 */
 .set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
 /* Prefetch distance in pixels for simple prefetch */
 .set PREFETCH_DISTANCE_SIMPLE, 64
 /******************************************************************************/
 /* We can actually do significantly better than the Pixman macros, at least for
 * the case of fills, by using a carefully scheduled inner loop. Cortex-A53
 * shows an improvement of up to 78% in ideal cases (large fills to L1 cache).
 */
 .macro generate_fillrect_function name, bpp, log2Bpp
 /*
 * void name(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
 * On entry:
 * a1 = width, pixels
 * a2 = height, rows
 * a3 = pointer to top-left destination pixel
 * a4 = stride, pixels
 * [sp] = pixel value to fill with
 * Within the function:
 * v1 = width remaining
 * v2 = vst offset
 * v3 = alternate pointer
 * ip = data ARM register
 */
 pixman_asm_function name
    vld1.\bpp   {d0[],d1[]}, [sp]
    sub         a4, a1
    vld1.\bpp   {d2[],d3[]}, [sp]
    cmp         a1, #(15+64) >> \log2Bpp
    push        {v1-v3,lr}
    vmov        ip, s0
    blo         51f
    /* Long-row case */
    mov         v2, #64
 1:  mov         v1, a1
    ands        v3, a3, #15
    beq         2f
    /* Leading pixels */
    rsb         v3, v3, #16  /* number of leading bytes until 16-byte aligned */
    sub         v1, v1, v3, lsr #\log2Bpp
    rbit        v3, v3
 .if bpp <= 16
 .if bpp == 8
    tst         a3, #1       /* bit 0 unaffected by rsb so can avoid register interlock */
    strneb      ip, [a3], #1
    tst         v3, #1<<30
 .else
    tst         a3, #2       /* bit 1 unaffected by rsb (assuming halfword alignment) so can avoid register interlock */
 .endif
    strneh      ip, [a3], #2
 .endif
    movs        v3, v3, lsl #3
    vstmcs      a3!, {s0}
    vstmmi      a3!, {d0}
 2:  sub         v1, v1, #64 >> \log2Bpp /* simplifies inner loop termination */
    add         v3, a3, #32
    /* Inner loop */
 3:  vst1.\bpp   {q0-q1}, [a3 :128], v2
    subs        v1, v1, #64 >> \log2Bpp
    vst1.\bpp   {q0-q1}, [v3 :128], v2
    bhs         3b
    /* Trailing pixels */
 4:  movs        v1, v1, lsl #27 + \log2Bpp
    bcc         5f
    vst1.\bpp   {q0-q1}, [a3 :128]!
 5:  bpl         6f
    vst1.\bpp   {q0}, [a3 :128]!
 6:  movs        v1, v1, lsl #2
    vstmcs      a3!, {d0}
    vstmmi      a3!, {s0}
 .if bpp <= 16
    movs        v1, v1, lsl #2
    strcsh      ip, [a3], #2
 .if bpp == 8
    strmib      ip, [a3], #1
 .endif
 .endif
    subs        a2, a2, #1
    add         a3, a3, a4, lsl #\log2Bpp
    bhi         1b
    pop         {v1-v3,pc}
    /* Short-row case */
 51: movs        v1, a1
 .if bpp == 8
    tst         a3, #3
    beq         53f
 52: subs        v1, v1, #1
    blo         57f
    strb        ip, [a3], #1
    tst         a3, #3
    bne         52b
 .elseif bpp == 16
    tstne       a3, #2
    subne       v1, v1, #1
    strneh      ip, [a3], #2
 .endif
 53: cmp         v1, #32 >> \log2Bpp
    bcc         54f
    vst1.\bpp   {q0-q1}, [a3]!
    sub         v1, v1, #32 >> \log2Bpp
    /* Trailing pixels */
 54: movs        v1, v1, lsl #27 + \log2Bpp
    bcc         55f
    vst1.\bpp   {q0-q1}, [a3]!
 55: bpl         56f
    vst1.\bpp   {q0}, [a3]!
 56: movs        v1, v1, lsl #2
    vstmcs      a3!, {d0}
    vstmmi      a3!, {s0}
 .if bpp <= 16
    movs        v1, v1, lsl #2
    strcsh      ip, [a3], #2
 .if bpp == 8
    strmib      ip, [a3], #1
 .endif
 .endif
    subs        a2, a2, #1
    add         a3, a3, a4, lsl #\log2Bpp
    bhi         51b
 57: pop         {v1-v3,pc}
 .endfunc
 .endm
 generate_fillrect_function FillSurfaceRect32ARMNEONAsm, 32, 2
 generate_fillrect_function FillSurfaceRect16ARMNEONAsm, 16, 1
 generate_fillrect_function FillSurfaceRect8ARMNEONAsm,  8,  0
 /******************************************************************************/
 .macro RGBtoRGBPixelAlpha_process_pixblock_head
    vmvn        d30, d3  /* get inverted source alpha */
    vmov        d31, d7  /* dest alpha is always unchanged */
    vmull.u8    q14, d0, d3
    vmlal.u8    q14, d4, d30
    vmull.u8    q0, d1, d3
    vmlal.u8    q0, d5, d30
    vmull.u8    q1, d2, d3
    vmlal.u8    q1, d6, d30
    vrshr.u16   q2, q14, #8
    vrshr.u16   q3, q0, #8
    vraddhn.u16 d28, q14, q2
    vrshr.u16   q2, q1, #8
    vraddhn.u16 d29, q0, q3
    vraddhn.u16 d30, q1, q2
 .endm
 .macro RGBtoRGBPixelAlpha_process_pixblock_tail
    /* nothing */
 .endm
 .macro RGBtoRGBPixelAlpha_process_pixblock_tail_head
    vld4.8      {d0-d3}, [SRC]!
                                    PF add PF_X, PF_X, #8
        vst4.8      {d28-d31}, [DST_W :128]!
                                    PF tst PF_CTL, #0xF
    vld4.8      {d4-d7}, [DST_R :128]!
                                    PF addne PF_X, PF_X, #8
    vmvn        d30, d3  /* get inverted source alpha */
    vmov        d31, d7  /* dest alpha is always unchanged */
    vmull.u8    q14, d0, d3
                                    PF subne PF_CTL, PF_CTL, #1
    vmlal.u8    q14, d4, d30
                                    PF cmp PF_X, ORIG_W
    vmull.u8    q0, d1, d3
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmlal.u8    q0, d5, d30
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vmull.u8    q1, d2, d3
                                    PF subge PF_X, PF_X, ORIG_W
    vmlal.u8    q1, d6, d30
                                    PF subges PF_CTL, PF_CTL, #0x10
    vrshr.u16   q2, q14, #8
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vrshr.u16   q3, q0, #8
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vraddhn.u16 d28, q14, q2
    vrshr.u16   q2, q1, #8
    vraddhn.u16 d29, q0, q3
    vraddhn.u16 d30, q1, q2
 .endm
 generate_composite_function \
    BlitRGBtoRGBPixelAlphaARMNEONAsm, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    RGBtoRGBPixelAlpha_process_pixblock_head, \
    RGBtoRGBPixelAlpha_process_pixblock_tail, \
    RGBtoRGBPixelAlpha_process_pixblock_tail_head
 /******************************************************************************/
 .macro ARGBto565PixelAlpha_process_pixblock_head
    vmvn        d6, d3
    vshr.u8     d1, #2
    vshr.u8     d3, #3
    vshr.u8     d0, #3
    vshrn.u16   d7, q2, #3
    vshrn.u16   d25, q2, #8
    vbic.i16    q2, #0xe0
    vshr.u8     d6, #3
    vshr.u8     d7, #2
    vshr.u8     d2, #3
    vmovn.u16   d24, q2
    vshr.u8     d25, #3
    vmull.u8    q13, d1, d3
    vmlal.u8    q13, d7, d6
    vmull.u8    q14, d0, d3
    vmlal.u8    q14, d24, d6
    vmull.u8    q15, d2, d3
    vmlal.u8    q15, d25, d6
 .endm
 .macro ARGBto565PixelAlpha_process_pixblock_tail
    vsra.u16    q13, #5
    vsra.u16    q14, #5
    vsra.u16    q15, #5
    vrshr.u16   q13, #5
    vrshr.u16   q14, #5
    vrshr.u16   q15, #5
    vsli.u16    q14, q13, #5
    vsli.u16    q14, q15, #11
 .endm
 .macro ARGBto565PixelAlpha_process_pixblock_tail_head
    vld4.8      {d0-d3}, [SRC]!
                                    PF add PF_X, PF_X, #8
        vsra.u16    q13, #5
                                    PF tst PF_CTL, #0xF
        vsra.u16    q14, #5
                                    PF addne PF_X, PF_X, #8
        vsra.u16    q15, #5
                                    PF subne PF_CTL, PF_CTL, #1
        vrshr.u16   q13, #5
                                    PF cmp PF_X, ORIG_W
        vrshr.u16   q14, #5
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
        vrshr.u16   q15, #5
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vld1.8      {d4-d5}, [DST_R]!
                                    PF subge PF_X, PF_X, ORIG_W
        vsli.u16    q14, q13, #5
                                    PF subges PF_CTL, PF_CTL, #0x10
        vsli.u16    q14, q15, #11
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
        vst1.8      {q14}, [DST_W :128]!
    vmvn        d6, d3
    vshr.u8     d1, #2
    vshr.u8     d3, #3
    vshr.u8     d0, #3
    vshrn.u16   d7, q2, #3
    vshrn.u16   d25, q2, #8
    vbic.i16    q2, #0xe0
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vshr.u8     d6, #3
    vshr.u8     d7, #2
    vshr.u8     d2, #3
    vmovn.u16   d24, q2
    vshr.u8     d25, #3
    vmull.u8    q13, d1, d3
    vmlal.u8    q13, d7, d6
    vmull.u8    q14, d0, d3
    vmlal.u8    q14, d24, d6
    vmull.u8    q15, d2, d3
    vmlal.u8    q15, d25, d6
 .endm
 generate_composite_function \
    BlitARGBto565PixelAlphaARMNEONAsm, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    6, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    ARGBto565PixelAlpha_process_pixblock_head, \
    ARGBto565PixelAlpha_process_pixblock_tail, \
    ARGBto565PixelAlpha_process_pixblock_tail_head
--- a/src/video/arm/pixman-arm-neon-asm.h
+++ b/src/video/arm/pixman-arm-neon-asm.h
--- a/src/video/arm/pixman-arm-simd-asm.S
+++ b/src/video/arm/pixman-arm-simd-asm.S
@ -1,532 +0,0 @@
 /*
 * Copyright (c) 2016 RISC OS Open Ltd
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */
 /* Prevent the stack from becoming executable */
 #if defined(__linux__) && defined(__ELF__)
 .section .note.GNU-stack,"",%progbits
 #endif
 	.text
 	.arch armv6
 	.object_arch armv4
 	.arm
 	.altmacro
 	.p2align 2
 #include "pixman-arm-asm.h"
 #include "pixman-arm-simd-asm.h"
 /* A head macro should do all processing which results in an output of up to
 * 16 bytes, as far as the final load instruction. The corresponding tail macro
 * should complete the processing of the up-to-16 bytes. The calling macro will
 * sometimes choose to insert a preload or a decrement of X between them.
 *   cond           ARM condition code for code block
 *   numbytes       Number of output bytes that should be generated this time
 *   firstreg       First WK register in which to place output
 *   unaligned_src  Whether to use non-wordaligned loads of source image
 *   unaligned_mask Whether to use non-wordaligned loads of mask image
 *   preload        If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
 */
 /******************************************************************************/
 .macro FillRect32_init
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
 .endm
 .macro FillRect16_init
        ldrh    SRC, [sp, #ARGS_STACK_OFFSET]
        orr     SRC, SRC, lsl #16
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
 .endm
 .macro FillRect8_init
        ldrb    SRC, [sp, #ARGS_STACK_OFFSET]
        orr     SRC, SRC, lsl #8
        orr     SRC, SRC, lsl #16
        mov     STRIDE_S, SRC
        mov     MASK, SRC
        mov     STRIDE_M, SRC
 .endm
 .macro FillRect_process_tail  cond, numbytes, firstreg
    WK4     .req    SRC
    WK5     .req    STRIDE_S
    WK6     .req    MASK
    WK7     .req    STRIDE_M
        pixst   cond, numbytes, 4, DST
    .unreq  WK4
    .unreq  WK5
    .unreq  WK6
    .unreq  WK7
 .endm
 generate_composite_function \
    FillSurfaceRect32ARMSIMDAsm, 0, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    0, /* prefetch distance doesn't apply */ \
    FillRect32_init \
    nop_macro, /* newline */ \
    nop_macro /* cleanup */ \
    nop_macro /* process head */ \
    FillRect_process_tail
 generate_composite_function \
    FillSurfaceRect16ARMSIMDAsm, 0, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    0, /* prefetch distance doesn't apply */ \
    FillRect16_init \
    nop_macro, /* newline */ \
    nop_macro /* cleanup */ \
    nop_macro /* process head */ \
    FillRect_process_tail
 generate_composite_function \
    FillSurfaceRect8ARMSIMDAsm, 0, 0, 8, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    0, /* prefetch distance doesn't apply */ \
    FillRect8_init \
    nop_macro, /* newline */ \
    nop_macro /* cleanup */ \
    nop_macro /* process head */ \
    FillRect_process_tail
 /******************************************************************************/
 /* This differs from the over_8888_8888 routine in Pixman in that the destination
 * alpha component is always left unchanged, and RGB components are not
 * premultiplied by alpha. It differs from BlitRGBtoRGBPixelAlpha in that
 * renormalisation is done by multiplying by 257/256 (with rounding) rather than
 * simply shifting right by 8 bits - removing the need to special-case alpha=0xff.
 */
 .macro RGBtoRGBPixelAlpha_init
        line_saved_regs STRIDE_S, ORIG_W
        mov     MASK, #0x80
 .endm
 .macro RGBtoRGBPixelAlpha_1pixel_translucent  s, d, tmp0, tmp1, tmp2, tmp3, half
        uxtb    tmp3, s
        uxtb    tmp0, d
        sub     tmp0, tmp3, tmp0
        uxtb    tmp3, s, ror #16
        uxtb    tmp1, d, ror #16
        sub     tmp1, tmp3, tmp1
        uxtb    tmp3, s, ror #8
        mov     s, s, lsr #24
        uxtb    tmp2, d, ror #8
        sub     tmp2, tmp3, tmp2
        smlabb  tmp0, tmp0, s, half
        smlabb  tmp1, tmp1, s, half
        smlabb  tmp2, tmp2, s, half
        add     tmp0, tmp0, asr #8
        add     tmp1, tmp1, asr #8
        add     tmp2, tmp2, asr #8
        pkhbt   tmp0, tmp0, tmp1, lsl #16
        and     tmp2, tmp2, #0xff00
        uxtb16  tmp0, tmp0, ror #8
        orr     tmp0, tmp0, tmp2
        uadd8   d, d, tmp0
 .endm
 .macro RGBtoRGBPixelAlpha_1pixel_opaque  s, d
        and     d, d, #0xff000000
        bic     s, s, #0xff000000
        orr     d, d, s
 .endm
 .macro RGBtoRGBPixelAlpha_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 .if numbytes == 16
        ldm     SRC!, {WK0, WK1}
        ldm     SRC!, {STRIDE_S, STRIDE_M}
        ldrd    WK2, WK3, [DST], #16
        orr     SCRATCH, WK0, WK1
        and     ORIG_W, WK0, WK1
        orr     SCRATCH, SCRATCH, STRIDE_S
        and     ORIG_W, ORIG_W, STRIDE_S
        orr     SCRATCH, SCRATCH, STRIDE_M
        and     ORIG_W, ORIG_W, STRIDE_M
        tst     SCRATCH, #0xff000000
 .elseif numbytes == 8
        ldm     SRC!, {WK0, WK1}
        ldm     DST!, {WK2, WK3}
        orr     SCRATCH, WK0, WK1
        and     ORIG_W, WK0, WK1
        tst     SCRATCH, #0xff000000
 .else // numbytes == 4
        ldr     WK0, [SRC], #4
        ldr     WK2, [DST], #4
        tst     WK0, #0xff000000
 .endif
 .endm
 .macro RGBtoRGBPixelAlpha_process_tail  cond, numbytes, firstreg
        beq     20f @ all transparent
 .if numbytes == 16
        cmp     ORIG_W, #0xff000000
        bhs     10f @ all opaque
        RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
        RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
        strd    WK2, WK3, [DST, #-16]
        ldrd    WK0, WK1, [SRC, #-8]
        ldrd    WK2, WK3, [DST, #-8]
        RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
        RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
        b       19f
 10:     RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
        RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3
        strd    WK2, WK3, [DST, #-16]
        ldrd    WK0, WK1, [SRC, #-8]
        ldrd    WK2, WK3, [DST, #-8]
        RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
        RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3
 19:     strd    WK2, WK3, [DST, #-8]
 .elseif numbytes == 8
        cmp     ORIG_W, #0xff000000
        bhs     10f @ all opaque
        RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
        RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
        b       19f
 10:     RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
        RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3
 19:     strd    WK2, WK3, [DST, #-8]
 .else // numbytes == 4
        cmp     WK0, #0xff000000
        bhs     10f @ opaque
        RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
        b       19f
 10:     RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
 19:     str     WK2, [DST, #-4]
 .endif
 20:
 .endm
 generate_composite_function \
    BlitRGBtoRGBPixelAlphaARMSIMDAsm, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
    2, /* prefetch distance */ \
    RGBtoRGBPixelAlpha_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    RGBtoRGBPixelAlpha_process_head, \
    RGBtoRGBPixelAlpha_process_tail
 /******************************************************************************/
 .macro ARGBto565PixelAlpha_init
        line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
        mov     MASK, #0x001f
        mov     STRIDE_M, #0x0010
        orr     MASK, MASK, MASK, lsl #16
        orr     STRIDE_M, STRIDE_M, STRIDE_M, lsl #16
 .endm
 .macro ARGBto565PixelAlpha_newline
        mov     STRIDE_S, #0x0200
 .endm
 /* On entry:
 * s1 holds 1 32bpp source pixel
 * d holds 1 16bpp destination pixel
 * rbmask, rbhalf, ghalf hold 0x001f001f, 0x00100010, 0x00000200 respectively
 * other registers are temporaries
 * On exit:
 * Constant registers preserved
 */
 .macro ARGBto565PixelAlpha_1pixel_translucent  s, d, rbmask, rbhalf, ghalf, alpha, rb, g, misc
        mov     alpha, s, lsr #27
        and     misc, s, #0xfc00
        and     g, d, #0x07e0
        pkhbt   rb, d, d, lsl #5
        rsb     misc, g, misc, lsr #5
        and     s, rbmask, s, lsr #3
        and     rb, rbmask, rb
        sub     s, s, rb
        smlabb  misc, misc, alpha, ghalf
        mla     s, s, alpha, rbhalf
        add     misc, misc, misc, lsl #5
        add     g, g, misc, asr #10
        add     s, s, s, lsl #5
        and     g, g, #0x07e0
        add     rb, rb, s, asr #10
        and     rb, rb, rbmask
        pkhbt   rb, rb, rb, lsl #11
        orr     d, rb, g
        orr     d, d, rb, lsr #16
 .endm
 /* On entry:
 * s1 holds 1 32bpp source pixel
 * d holds 1 16bpp destination pixel
 * rbmask holds 0x001f001f
 * On exit:
 * Constant registers preserved
 */
 .macro ARGBto565PixelAlpha_1pixel_opaque  s, d, rbmask
        and     d, rbmask, s, lsr #3
        and     s, s, #0xfc00
        orr     d, d, d, lsr #5
        orr     d, d, s, lsr #5
 .endm
 /* On entry:
 * s1, s2 hold 2 32bpp source pixels
 * d holds 2 16bpp destination pixels
 * rbmask, rbhalf, ghalf hold 0x001f001f, 0x00100010, 0x00000200 respectively
 * other registers are temporaries
 * On exit:
 * Constant registers preserved
 * Blended results have been written through destination pointer
 */
 .macro ARGBto565PixelAlpha_2pixels_translucent  s1, s2, d, rbmask, rbhalf, ghalf, alpha, rb, g, misc
        mov     alpha, s1, lsr #27
        and     misc, s1, #0xfc00
        and     g, d, #0x07e0
        pkhbt   rb, d, d, lsl #5
        rsb     misc, g, misc, lsr #5
        and     s1, rbmask, s1, lsr #3
        and     rb, rbmask, rb
        sub     s1, s1, rb
        smlabb  misc, misc, alpha, ghalf
        mla     s1, s1, alpha, rbhalf
          uxth    d, d, ror #16
        add     misc, misc, misc, lsl #5
          mov     alpha, s2, lsr #27
        add     g, g, misc, asr #10
        add     s1, s1, s1, lsl #5
        and     g, g, #0x07e0
        add     rb, rb, s1, asr #10
        and     rb, rb, rbmask
          and     misc, s2, #0xfc00
        pkhbt   rb, rb, rb, lsl #11
          and     s1, d, #0x07e0
          pkhbt   d, d, d, lsl #5
          rsb     misc, s1, misc, lsr #5
          and     s2, rbmask, s2, lsr #3
          and     d, rbmask, d
          sub     s2, s2, d
          smlabb  misc, misc, alpha, ghalf
          mla     s2, s2, alpha, rbhalf
        orr     alpha, rb, g
          add     misc, misc, misc, lsl #5
        orr     alpha, alpha, rb, lsr #16
          add     s1, s1, misc, asr #10
          add     s2, s2, s2, lsl #5
          and     s1, s1, #0x07e0
          add     d, d, s2, asr #10
          and     d, d, rbmask
        strh    alpha, [DST, #-4]
          pkhbt   d, d, d, lsl #11
          orr     alpha, d, s1
          orr     alpha, alpha, d, lsr #16
          strh    alpha, [DST, #-2]
 .endm
 /* On entry:
 * s1, s2 hold 2 32bpp source pixels
 * rbmask holds 0x001f001f
 * other registers are temporaries
 * On exit:
 * Constant registers preserved
 * Blended results have been written through destination pointer
 */
 .macro ARGBto565PixelAlpha_2pixels_opaque  s1, s2, d, rbmask, g
        and     g, s1, #0xfc00
        and     d, rbmask, s1, lsr #3
          and     s1, rbmask, s2, lsr #3
        orr     d, d, d, lsr #5
        orr     d, d, g, lsr #5
          and     g, s2, #0xfc00
        strh    d, [DST, #-4]
          orr     s1, s1, s1, lsr #5
          orr     s1, s1, g, lsr #5
          strh    s1, [DST, #-2]
 .endm
 .macro ARGBto565PixelAlpha_2pixels_head
        ldrd    WK0, WK1, [SRC], #8
        ldr     WK2, [DST], #4
        orr     SCRATCH, WK0, WK1
        and     ORIG_W, WK0, WK1
        tst     SCRATCH, #0xff000000
 .endm
 .macro ARGBto565PixelAlpha_2pixels_tail
        beq     20f @ all transparent
        cmp     ORIG_W, #0xff000000
        bhs     10f @ all opaque
        ARGBto565PixelAlpha_2pixels_translucent  WK0, WK1, WK2, MASK, STRIDE_M, STRIDE_S, STRIDE_D, WK3, SCRATCH, ORIG_W
        b       20f
 10:     ARGBto565PixelAlpha_2pixels_opaque  WK0, WK1, WK2, MASK, SCRATCH
 20:
 .endm
 .macro ARGBto565PixelAlpha_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 .if numbytes == 16
        ARGBto565PixelAlpha_2pixels_head
        ARGBto565PixelAlpha_2pixels_tail
        ARGBto565PixelAlpha_2pixels_head
        ARGBto565PixelAlpha_2pixels_tail
 .endif
 .if numbytes >= 8
        ARGBto565PixelAlpha_2pixels_head
        ARGBto565PixelAlpha_2pixels_tail
 .endif
 .if numbytes >= 4
        ARGBto565PixelAlpha_2pixels_head
 .else // numbytes == 2
        ldr     WK0, [SRC], #4
        ldrh    WK2, [DST], #2
        tst     WK0, #0xff000000
 .endif
 .endm
 .macro ARGBto565PixelAlpha_process_tail  cond, numbytes, firstreg
 .if numbytes >= 4
        ARGBto565PixelAlpha_2pixels_tail
 .else // numbytes == 2
        beq     20f @ all transparent
        cmp     WK0, #0xff000000
        bhs     10f @ opaque
        ARGBto565PixelAlpha_1pixel_translucent  WK0, WK2, MASK, STRIDE_M, STRIDE_S, STRIDE_D, WK3, SCRATCH, ORIG_W
        b       19f
 10:     ARGBto565PixelAlpha_1pixel_opaque WK0, WK2, MASK
 19:     strh    WK2, [DST, #-2]
 20:
 .endif
 .endm
 generate_composite_function \
    BlitARGBto565PixelAlphaARMSIMDAsm, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
    2, /* prefetch distance */ \
    ARGBto565PixelAlpha_init, \
    ARGBto565PixelAlpha_newline, \
    nop_macro, /* cleanup */ \
    ARGBto565PixelAlpha_process_head, \
    ARGBto565PixelAlpha_process_tail
 /******************************************************************************/
 .macro BGR888toRGB888_1pixel cond, reg, tmp
        uxtb16&cond  tmp, WK&reg, ror #8
        uxtb16&cond  WK&reg, WK&reg, ror #16
        orr&cond     WK&reg, WK&reg, tmp, lsl #8
 .endm
 .macro BGR888toRGB888_2pixels cond, reg1, reg2, tmp1, tmp2
        uxtb16&cond  tmp1, WK&reg1, ror #8
        uxtb16&cond  WK&reg1, WK&reg1, ror #16
        uxtb16&cond  tmp2, WK&reg2, ror #8
        uxtb16&cond  WK&reg2, WK&reg2, ror #16
        orr&cond     WK&reg1, WK&reg1, tmp1, lsl #8
        orr&cond     WK&reg2, WK&reg2, tmp2, lsl #8
 .endm
 .macro BGR888toRGB888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   cond, numbytes, firstreg, SRC, unaligned_src
 .endm
 .macro BGR888toRGB888_process_tail  cond, numbytes, firstreg
 .if numbytes >= 8
        BGR888toRGB888_2pixels cond, %(firstreg+0), %(firstreg+1), MASK, STRIDE_M
  .if numbytes == 16
        BGR888toRGB888_2pixels cond, %(firstreg+2), %(firstreg+3), MASK, STRIDE_M
  .endif
 .else @ numbytes == 4
        BGR888toRGB888_1pixel cond, %(firstreg+0), MASK
 .endif
 .endm
 generate_composite_function \
    Blit_XBGR8888_XRGB8888ARMSIMDAsm, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
    2, /* prefetch distance */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    BGR888toRGB888_process_head, \
    BGR888toRGB888_process_tail
 /******************************************************************************/
 .macro RGB444toRGB888_init
        ldr     MASK, =0x0f0f0f0f
        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
        msr     CPSR_s, #0x50000
 .endm
 .macro RGB444toRGB888_1pixel reg, mask, tmp
        pkhbt   WK&reg, WK&reg, WK&reg, lsl #12      @ 0000aaaarrrrggggaaaarrrrggggbbbb
        and     WK&reg, mask, WK&reg                 @ 0000aaaa0000gggg0000rrrr0000bbbb
        orr     WK&reg, WK&reg, WK&reg, lsl #4       @ aaaaaaaaggggggggrrrrrrrrbbbbbbbb
        pkhtb   tmp, WK&reg, WK&reg, asr #8          @ aaaaaaaaggggggggggggggggrrrrrrrr
        pkhbt   WK&reg, WK&reg, WK&reg, lsl #8       @ ggggggggrrrrrrrrrrrrrrrrbbbbbbbb
        sel     WK&reg, WK&reg, tmp                  @ aaaaaaaarrrrrrrrggggggggbbbbbbbb
 .endm
 .macro RGB444toRGB888_2pixels in, out1, out2, mask, tmp1, tmp2
        and     tmp1, mask, WK&in                    @ 0000RRRR0000BBBB0000rrrr0000bbbb
        and     tmp2, mask, WK&in, lsr #4            @ 0000AAAA0000GGGG0000aaaa0000gggg
        orr     tmp1, tmp1, tmp1, lsl #4             @ RRRRRRRRBBBBBBBBrrrrrrrrbbbbbbbb
        orr     tmp2, tmp2, tmp2, lsl #4             @ AAAAAAAAGGGGGGGGaaaaaaaagggggggg
        pkhtb   WK&out2, tmp2, tmp1, asr #16         @ AAAAAAAAGGGGGGGGRRRRRRRRBBBBBBBB
        pkhbt   WK&out1, tmp1, tmp2, lsl #16         @ aaaaaaaaggggggggrrrrrrrrbbbbbbbb
        pkhtb   tmp2, WK&out2, WK&out2, asr #8       @ AAAAAAAAGGGGGGGGGGGGGGGGRRRRRRRR
        pkhtb   tmp1, WK&out1, WK&out1, asr #8       @ aaaaaaaaggggggggggggggggrrrrrrrr
        pkhbt   WK&out1, WK&out1, WK&out1, lsl #8    @ ggggggggrrrrrrrrrrrrrrrrbbbbbbbb
        pkhbt   WK&out2, WK&out2, WK&out2, lsl #8    @ GGGGGGGGRRRRRRRRRRRRRRRRBBBBBBBB
        sel     WK&out1, WK&out1, tmp1               @ aaaaaaaarrrrrrrrggggggggbbbbbbbb
        sel     WK&out2, WK&out2, tmp2               @ AAAAAAAARRRRRRRRGGGGGGGGBBBBBBBB
 .endm
 .macro RGB444toRGB888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   cond, numbytes/2, firstreg, SRC, unaligned_src
 .endm
 .macro RGB444toRGB888_process_tail  cond, numbytes, firstreg
 .if numbytes >= 8
  .if numbytes == 16
        RGB444toRGB888_2pixels %(firstreg+1), %(firstreg+2), %(firstreg+3), MASK, STRIDE_M, SCRATCH
  .endif
        RGB444toRGB888_2pixels %(firstreg+0), %(firstreg+0), %(firstreg+1), MASK, STRIDE_M, SCRATCH
 .else @ numbytes == 4
        RGB444toRGB888_1pixel %(firstreg+0), MASK, SCRATCH
 .endif
 .endm
 generate_composite_function \
    Blit_RGB444_XRGB8888ARMSIMDAsm, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
    2, /* prefetch distance */ \
    RGB444toRGB888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    RGB444toRGB888_process_head, \
    RGB444toRGB888_process_tail
--- a/src/video/arm/pixman-arm-simd-asm.h
+++ b/src/video/arm/pixman-arm-simd-asm.h