FreeRDP/libfreerdp/primitives/prim_16to32bpp_opt.c

281 lines
7.8 KiB
C

/* prim_16to32bpp_opt.c
* 16-bit to 32-bit color conversion via SSE/Neon
* vi:ts=4 sw=4:
*
* (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#ifdef WITH_SSE2
#include <emmintrin.h>
#include <tmmintrin.h>
/* #elif defined(WITH_NEON) */
/* #include <arm_neon.h> */
#endif /* WITH_SSE2 */
#include "prim_internal.h"
#include "prim_16to32bpp.h"
#ifdef WITH_SSE2
/* ------------------------------------------------------------------------- */
/* Note: _no_invert and _invert could be coded with variables as shift
* amounts and a single routine, but tests showed that was much slower.
*/
/*
 * Expand 16-bit 5:6:5 pixels to 32-bit pixels, eight pixels per SSE pass.
 *
 * For every source pixel P the destination dword is built as
 *   bits  0.. 7: P[4:0]   widened to 8 bits (labelled "R" below)
 *   bits  8..15: P[10:5]  widened to 8 bits (labelled "G" below)
 *   bits 16..23: P[15:11] widened to 8 bits (labelled "B" below)
 *   bits 24..31: 0xFF when alpha is set, otherwise 0
 *
 * pSrc, srcStep   first source pixel and byte distance between source rows
 * pDst, dstStep   first destination pixel and byte distance between rows
 * width, height   size of the converted area in pixels
 * alpha           non-zero to fill the top byte of every pixel with 0xFF
 *
 * Pixels before the first 16-byte destination boundary of a row, and the
 * fewer-than-eight pixels left at the end of a row, are delegated to
 * general_RGB565ToARGB_16u32u_C3C4().
 *
 * Returns PRIMITIVES_SUCCESS, or whatever the general routine returns when
 * the whole conversion has to be handed to it (see the alignment check).
 */
static pstatus_t sse3_RGB565ToARGB_16u32u_C3C4_no_invert(
	const UINT16* pSrc, INT32 srcStep,
	UINT32* pDst, INT32 dstStep,
	UINT32 width, UINT32 height,
	BOOL alpha)
{
	const BYTE *src = (const BYTE *) pSrc;
	BYTE *dst = (BYTE *) pDst;
	UINT32 h;
	/* Bytes to skip after the last pixel of a row to reach the next row. */
	int srcRowBump = srcStep - (width * sizeof(UINT16));
	int dstRowBump = dstStep - (width * sizeof(UINT32));
	__m128i R0, R1, R2, R_FC00, R_0300, R_00F8, R_0007, R_alpha;

	/* The startup code below moves the destination forward in whole
	 * 4-byte pixels, so it can only land on a 16-byte boundary when every
	 * row begins on a 4-byte boundary.  If that is not the case the
	 * aligned stores in the main loop would fault, so let the general
	 * routine convert everything.
	 */
	if (((ULONG_PTR) pDst & 0x03) || (dstStep & 0x03))
	{
		return general_RGB565ToARGB_16u32u_C3C4(pSrc, srcStep,
			pDst, dstStep, width, height, alpha, FALSE);
	}

	R_FC00 = _mm_set1_epi16(0xFC00);
	R_0300 = _mm_set1_epi16(0x0300);
	R_00F8 = _mm_set1_epi16(0x00F8);
	R_0007 = _mm_set1_epi16(0x0007);

	/* Every 16-bit lane holds 0xFF00 (or 0); OR-ed into the upper word of
	 * each pixel this becomes the top byte of the destination dword.
	 */
	if (alpha) R_alpha = _mm_set1_epi32(0xFF00FF00U);
	else R_alpha = _mm_set1_epi32(0x00000000U);

	for (h = 0; h < height; h++)
	{
		UINT32 w = width;

		/* Get to 16-byte destination boundary for the dest. */
		if ((ULONG_PTR) dst & 0x0f)
		{
			UINT32 startup = (UINT32) ((16 - ((ULONG_PTR) dst & 0x0f)) / 4);

			if (startup > width) startup = width;

			general_RGB565ToARGB_16u32u_C3C4((const UINT16*) src, srcStep,
				(UINT32*) dst, dstStep, startup, 1, alpha, FALSE);
			src += startup * sizeof(UINT16);
			dst += startup * sizeof(UINT32);
			w -= startup;
		}

		/* The main loop handles eight pixels at a time. */
		while (w >= 8)
		{
			/* If off-stride, use the slower load. */
			if ((ULONG_PTR) src & 0x0f)
				R0 = _mm_lddqu_si128((const __m128i *) src);
			else
				R0 = _mm_load_si128((const __m128i *) src);

			src += (128/8);

			/* Do the lower two colors, which end up in the lower two bytes. */
			/* G = ((P<<5) & 0xFC00) | ((P>>1) & 0x0300) */
			R2 = _mm_slli_epi16(R0, 5);
			R2 = _mm_and_si128(R_FC00, R2);
			R1 = _mm_srli_epi16(R0, 1);
			R1 = _mm_and_si128(R_0300, R1);
			R2 = _mm_or_si128(R1, R2);

			/* R = ((P<<3) & 0x00F8) | ((P>>2) & 0x0007) */
			R1 = _mm_slli_epi16(R0, 3);
			R1 = _mm_and_si128(R_00F8, R1);
			R2 = _mm_or_si128(R1, R2);
			R1 = _mm_srli_epi16(R0, 2);
			R1 = _mm_and_si128(R_0007, R1);
			R2 = _mm_or_si128(R1, R2);		/* R2 = lowers */

			/* Handle the upper color. */
			/* B = ((P>>8) & 0x00F8) | ((P>>13) & 0x0007) */
			R1 = _mm_srli_epi16(R0, 8);
			R1 = _mm_and_si128(R_00F8, R1);
			R0 = _mm_srli_epi16(R0, 13);
			R0 = _mm_and_si128(R_0007, R0);
			R1 = _mm_or_si128(R0, R1);		/* R1 = uppers */

			/* Add alpha (or zero). */
			R1 = _mm_or_si128(R_alpha, R1);	/* + alpha */

			/* Unpack to intermix the AB and GR pieces: each 32-bit
			 * result is (upper word << 16) | lower word.
			 */
			R0 = _mm_unpackhi_epi16(R2, R1);
			R2 = _mm_unpacklo_epi16(R2, R1);

			/* Store the results: pixels 0-3 first, then pixels 4-7. */
			_mm_store_si128((__m128i *) dst, R2);  dst += (128/8);
			_mm_store_si128((__m128i *) dst, R0);  dst += (128/8);
			w -= 8;
		}

		/* Handle any remainder. */
		if (w > 0)
		{
			general_RGB565ToARGB_16u32u_C3C4((const UINT16*) src, srcStep,
				(UINT32*) dst, dstStep, w, 1, alpha, FALSE);
			src += w * sizeof(UINT16);
			dst += w * sizeof(UINT32);
		}

		/* Bump to the start of the next row. */
		src += srcRowBump;
		dst += dstRowBump;
	}

	return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/*
 * Expand 16-bit 5:6:5 pixels to 32-bit pixels with the two 5-bit channels
 * swapped, eight pixels per SSE pass.
 *
 * For every source pixel P the destination dword is built as
 *   bits  0.. 7: P[15:11] widened to 8 bits (labelled "B" below)
 *   bits  8..15: P[10:5]  widened to 8 bits (labelled "G" below)
 *   bits 16..23: P[4:0]   widened to 8 bits (labelled "R" below)
 *   bits 24..31: 0xFF when alpha is set, otherwise 0
 *
 * pSrc, srcStep   first source pixel and byte distance between source rows
 * pDst, dstStep   first destination pixel and byte distance between rows
 * width, height   size of the converted area in pixels
 * alpha           non-zero to fill the top byte of every pixel with 0xFF
 *
 * Pixels before the first 16-byte destination boundary of a row, and the
 * fewer-than-eight pixels left at the end of a row, are delegated to
 * general_RGB565ToARGB_16u32u_C3C4().
 *
 * Returns PRIMITIVES_SUCCESS, or whatever the general routine returns when
 * the whole conversion has to be handed to it (see the alignment check).
 */
static pstatus_t sse3_RGB565ToARGB_16u32u_C3C4_invert(
	const UINT16* pSrc, INT32 srcStep,
	UINT32* pDst, INT32 dstStep,
	UINT32 width, UINT32 height,
	BOOL alpha)
{
	const BYTE *src = (const BYTE *) pSrc;
	BYTE *dst = (BYTE *) pDst;
	UINT32 h;
	/* Bytes to skip after the last pixel of a row to reach the next row. */
	int srcRowBump = srcStep - (width * sizeof(UINT16));
	int dstRowBump = dstStep - (width * sizeof(UINT32));
	__m128i R0, R1, R2, R_FC00, R_0300, R_00F8, R_0007, R_alpha;

	/* The startup code below moves the destination forward in whole
	 * 4-byte pixels, so it can only land on a 16-byte boundary when every
	 * row begins on a 4-byte boundary.  If that is not the case the
	 * aligned stores in the main loop would fault, so let the general
	 * routine convert everything.
	 */
	if (((ULONG_PTR) pDst & 0x03) || (dstStep & 0x03))
	{
		return general_RGB565ToARGB_16u32u_C3C4(pSrc, srcStep,
			pDst, dstStep, width, height, alpha, TRUE);
	}

	R_FC00 = _mm_set1_epi16(0xFC00);
	R_0300 = _mm_set1_epi16(0x0300);
	R_00F8 = _mm_set1_epi16(0x00F8);
	R_0007 = _mm_set1_epi16(0x0007);

	/* Every 16-bit lane holds 0xFF00 (or 0); OR-ed into the upper word of
	 * each pixel this becomes the top byte of the destination dword.
	 */
	if (alpha) R_alpha = _mm_set1_epi32(0xFF00FF00U);
	else R_alpha = _mm_set1_epi32(0x00000000U);

	for (h = 0; h < height; h++)
	{
		UINT32 w = width;

		/* Get to 16-byte destination boundary for the dest. */
		if ((ULONG_PTR) dst & 0x0f)
		{
			UINT32 startup = (UINT32) ((16 - ((ULONG_PTR) dst & 0x0f)) / 4);

			if (startup > width) startup = width;

			general_RGB565ToARGB_16u32u_C3C4((const UINT16*) src, srcStep,
				(UINT32*) dst, dstStep, startup, 1, alpha, TRUE);
			src += startup * sizeof(UINT16);
			dst += startup * sizeof(UINT32);
			w -= startup;
		}

		/* The main loop handles eight pixels at a time. */
		while (w >= 8)
		{
			/* Off-stride, slower load. */
			if ((ULONG_PTR) src & 0x0f)
				R0 = _mm_lddqu_si128((const __m128i *) src);
			else
				R0 = _mm_load_si128((const __m128i *) src);

			src += (128/8);

			/* Do the lower two colors, which end up in the lower two bytes. */
			/* G = ((P<<5) & 0xFC00) | ((P>>1) & 0x0300) */
			R2 = _mm_slli_epi16(R0, 5);
			R2 = _mm_and_si128(R_FC00, R2);
			R1 = _mm_srli_epi16(R0, 1);
			R1 = _mm_and_si128(R_0300, R1);
			R2 = _mm_or_si128(R1, R2);

			/* B = ((P>>8) & 0x00F8) | ((P>>13) & 0x0007) */
			R1 = _mm_srli_epi16(R0, 8);
			R1 = _mm_and_si128(R_00F8, R1);
			R2 = _mm_or_si128(R1, R2);
			R1 = _mm_srli_epi16(R0, 13);
			R1 = _mm_and_si128(R_0007, R1);
			R2 = _mm_or_si128(R1, R2);		/* R2 = lowers */

			/* Handle the upper color. */
			/* R = ((P<<3) & 0x00F8) | ((P>>2) & 0x0007) */
			R1 = _mm_slli_epi16(R0, 3);
			R1 = _mm_and_si128(R_00F8, R1);
			R0 = _mm_srli_epi16(R0, 2);
			R0 = _mm_and_si128(R_0007, R0);
			R1 = _mm_or_si128(R0, R1);		/* R1 = uppers */

			/* Add alpha (or zero). */
			R1 = _mm_or_si128(R_alpha, R1);	/* + alpha */

			/* Unpack to intermix the AR and GB pieces: each 32-bit
			 * result is (upper word << 16) | lower word.
			 */
			R0 = _mm_unpackhi_epi16(R2, R1);
			R2 = _mm_unpacklo_epi16(R2, R1);

			/* Store the results: pixels 0-3 first, then pixels 4-7. */
			_mm_store_si128((__m128i *) dst, R2);  dst += (128/8);
			_mm_store_si128((__m128i *) dst, R0);  dst += (128/8);
			w -= 8;
		}

		/* Handle any remainder. */
		if (w > 0)
		{
			general_RGB565ToARGB_16u32u_C3C4((const UINT16*) src, srcStep,
				(UINT32*) dst, dstStep, w, 1, alpha, TRUE);
			src += w * sizeof(UINT16);
			dst += w * sizeof(UINT32);
		}

		/* Bump to the start of the next row. */
		src += srcRowBump;
		dst += dstRowBump;
	}

	return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/*
 * SSE entry point for the 16-bit to 32-bit conversion.
 *
 * Picks one of the two specialised routines according to `invert` and
 * forwards every other argument unchanged; the chosen routine's status is
 * returned as-is.
 */
pstatus_t sse3_RGB565ToARGB_16u32u_C3C4(
	const UINT16* pSrc, INT32 srcStep,
	UINT32* pDst, INT32 dstStep,
	UINT32 width, UINT32 height,
	BOOL alpha, BOOL invert)
{
	if (!invert)
	{
		return sse3_RGB565ToARGB_16u32u_C3C4_no_invert(pSrc, srcStep,
			pDst, dstStep, width, height, alpha);
	}

	return sse3_RGB565ToARGB_16u32u_C3C4_invert(pSrc, srcStep,
		pDst, dstStep, width, height, alpha);
}
#endif /* WITH_SSE2 */
/* ------------------------------------------------------------------------- */
/*
 * Install the optimized 16-bit to 32-bit converter into `prims`.
 *
 * When built with WITH_SSE2 and the processor reports SSE3 support, the
 * RGB565ToARGB_16u32u_C3C4 slot is pointed at the SSE implementation.
 * In every other case `prims` is left untouched.
 */
void primitives_init_16to32bpp_opt(
	primitives_t *prims)
{
#ifdef WITH_SSE2
	if (!IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
	{
		return;
	}

	prims->RGB565ToARGB_16u32u_C3C4 = sse3_RGB565ToARGB_16u32u_C3C4;
#endif
}