281 lines
7.8 KiB
C
281 lines
7.8 KiB
C
|
/* prim_16to32bpp_opt.c
|
||
|
* 16-bit to 32-bit color conversion via SSE/Neon
|
||
|
* vi:ts=4 sw=4:
|
||
|
*
|
||
|
* (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
|
||
|
*
|
||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
* you may not use this file except in compliance with the License.
|
||
|
* You may obtain a copy of the License at
|
||
|
*
|
||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||
|
*
|
||
|
* Unless required by applicable law or agreed to in writing, software
|
||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
* See the License for the specific language governing permissions and
|
||
|
* limitations under the License.
|
||
|
*/
|
||
|
|
||
|
#ifdef HAVE_CONFIG_H
|
||
|
#include "config.h"
|
||
|
#endif
|
||
|
|
||
|
#include <freerdp/types.h>
|
||
|
#include <freerdp/primitives.h>
|
||
|
#include <winpr/sysinfo.h>
|
||
|
|
||
|
#ifdef WITH_SSE2
|
||
|
#include <emmintrin.h>
|
||
|
#include <tmmintrin.h>
|
||
|
/* #elif defined(WITH_NEON) */
|
||
|
/* #include <arm_neon.h> */
|
||
|
#endif /* WITH_SSE2 */
|
||
|
|
||
|
#include "prim_internal.h"
|
||
|
#include "prim_16to32bpp.h"
|
||
|
|
||
|
#ifdef WITH_SSE2
|
||
|
/* ------------------------------------------------------------------------- */
|
||
|
/* Note: _no_invert and _invert could be coded with variables as shift
|
||
|
* amounts and a single routine, but tests showed that was much slower.
|
||
|
*/
|
||
|
static pstatus_t sse3_RGB565ToARGB_16u32u_C3C4_no_invert(
|
||
|
const UINT16* pSrc, INT32 srcStep,
|
||
|
UINT32* pDst, INT32 dstStep,
|
||
|
UINT32 width, UINT32 height,
|
||
|
BOOL alpha)
|
||
|
{
|
||
|
const BYTE *src = (const BYTE *) pSrc;
|
||
|
BYTE *dst = (BYTE *) pDst;
|
||
|
int h;
|
||
|
int srcRowBump = srcStep - (width * sizeof(UINT16));
|
||
|
int dstRowBump = dstStep - (width * sizeof(UINT32));
|
||
|
__m128i R0, R1, R2, R_FC00, R_0300, R_00F8, R_0007, R_alpha;
|
||
|
|
||
|
R_FC00 = _mm_set1_epi16(0xFC00);
|
||
|
R_0300 = _mm_set1_epi16(0x0300);
|
||
|
R_00F8 = _mm_set1_epi16(0x00F8);
|
||
|
R_0007 = _mm_set1_epi16(0x0007);
|
||
|
if (alpha) R_alpha = _mm_set1_epi32(0xFF00FF00U);
|
||
|
else R_alpha = _mm_set1_epi32(0x00000000U);
|
||
|
|
||
|
for (h=0; h<height; h++)
|
||
|
{
|
||
|
int w = width;
|
||
|
|
||
|
/* Get to 16-byte destination boundary for the dest. */
|
||
|
if ((ULONG_PTR) dst & 0x0f)
|
||
|
{
|
||
|
int startup = (16 - ((ULONG_PTR) dst & 0x0f)) / 4;
|
||
|
if (startup > width) startup = width;
|
||
|
general_RGB565ToARGB_16u32u_C3C4((const UINT16*) src, srcStep,
|
||
|
(UINT32*) dst, dstStep, startup, 1, alpha, FALSE);
|
||
|
src += startup * sizeof(UINT16);
|
||
|
dst += startup * sizeof(UINT32);
|
||
|
w -= startup;
|
||
|
}
|
||
|
|
||
|
/* The main loop handles eight pixels at a time. */
|
||
|
while (w >= 8)
|
||
|
{
|
||
|
/* If off-stride, use the slower load. */
|
||
|
if ((ULONG_PTR) src & 0x0f)
|
||
|
R0 = _mm_lddqu_si128((__m128i *) src);
|
||
|
else
|
||
|
R0 = _mm_load_si128((__m128i *) src);
|
||
|
src += (128/8);
|
||
|
|
||
|
/* Do the lower two colors, which end up in the lower two bytes. */
|
||
|
/* G = ((P<<5) & 0xFC00) | ((P>>1) & 0x0300) */
|
||
|
R2 = _mm_slli_epi16(R0, 5);
|
||
|
R2 = _mm_and_si128(R_FC00, R2);
|
||
|
|
||
|
R1 = _mm_srli_epi16(R0, 1);
|
||
|
R1 = _mm_and_si128(R_0300, R1);
|
||
|
R2 = _mm_or_si128(R1, R2);
|
||
|
|
||
|
/* R = ((P<<3) & 0x00F8) | ((P>>2) & 0x0007) */
|
||
|
R1 = _mm_slli_epi16(R0, 3);
|
||
|
R1 = _mm_and_si128(R_00F8, R1);
|
||
|
R2 = _mm_or_si128(R1, R2);
|
||
|
|
||
|
R1 = _mm_srli_epi16(R0, 2);
|
||
|
R1 = _mm_and_si128(R_0007, R1);
|
||
|
R2 = _mm_or_si128(R1, R2); /* R2 = lowers */
|
||
|
|
||
|
/* Handle the upper color. */
|
||
|
/* B = ((P<<8) & 0x00F8) | ((P<<13) & 0x0007) */
|
||
|
R1 = _mm_srli_epi16(R0, 8);
|
||
|
R1 = _mm_and_si128(R_00F8, R1);
|
||
|
|
||
|
R0 = _mm_srli_epi16(R0, 13);
|
||
|
R0 = _mm_and_si128(R_0007, R0);
|
||
|
R1 = _mm_or_si128(R0, R1); /* R1 = uppers */
|
||
|
|
||
|
/* Add alpha (or zero) . */
|
||
|
R1 = _mm_or_si128(R_alpha, R1); /* + alpha */
|
||
|
|
||
|
/* Unpack to intermix the AB and GR pieces. */
|
||
|
R0 = _mm_unpackhi_epi16(R2, R1);
|
||
|
R2 = _mm_unpacklo_epi16(R2, R1);
|
||
|
|
||
|
/* Store the results. */
|
||
|
_mm_store_si128((__m128i *) dst, R2); dst += (128/8);
|
||
|
_mm_store_si128((__m128i *) dst, R0); dst += (128/8);
|
||
|
w -= 8;
|
||
|
}
|
||
|
|
||
|
/* Handle any remainder. */
|
||
|
if (w > 0)
|
||
|
{
|
||
|
general_RGB565ToARGB_16u32u_C3C4((const UINT16*) src, srcStep,
|
||
|
(UINT32*) dst, dstStep, w, 1, alpha, FALSE);
|
||
|
src += w * sizeof(UINT16);
|
||
|
dst += w * sizeof(UINT32);
|
||
|
}
|
||
|
|
||
|
/* Bump to the start of the next row. */
|
||
|
src += srcRowBump;
|
||
|
dst += dstRowBump;
|
||
|
}
|
||
|
|
||
|
return PRIMITIVES_SUCCESS;
|
||
|
}
|
||
|
|
||
|
/* ------------------------------------------------------------------------- */
|
||
|
static pstatus_t sse3_RGB565ToARGB_16u32u_C3C4_invert(
|
||
|
const UINT16* pSrc, INT32 srcStep,
|
||
|
UINT32* pDst, INT32 dstStep,
|
||
|
UINT32 width, UINT32 height,
|
||
|
BOOL alpha)
|
||
|
{
|
||
|
const BYTE *src = (const BYTE *) pSrc;
|
||
|
BYTE *dst = (BYTE *) pDst;
|
||
|
int h;
|
||
|
int srcRowBump = srcStep - (width * sizeof(UINT16));
|
||
|
int dstRowBump = dstStep - (width * sizeof(UINT32));
|
||
|
__m128i R0, R1, R2, R_FC00, R_0300, R_00F8, R_0007, R_alpha;
|
||
|
|
||
|
R_FC00 = _mm_set1_epi16(0xFC00);
|
||
|
R_0300 = _mm_set1_epi16(0x0300);
|
||
|
R_00F8 = _mm_set1_epi16(0x00F8);
|
||
|
R_0007 = _mm_set1_epi16(0x0007);
|
||
|
if (alpha) R_alpha = _mm_set1_epi32(0xFF00FF00U);
|
||
|
else R_alpha = _mm_set1_epi32(0x00000000U);
|
||
|
|
||
|
for (h=0; h<height; h++)
|
||
|
{
|
||
|
int w = width;
|
||
|
|
||
|
/* Get to 16-byte destination boundary for the dest. */
|
||
|
if ((ULONG_PTR) dst & 0x0f)
|
||
|
{
|
||
|
int startup = (16 - ((ULONG_PTR) dst & 0x0f)) / 4;
|
||
|
if (startup > width) startup = width;
|
||
|
general_RGB565ToARGB_16u32u_C3C4((const UINT16*) src, srcStep,
|
||
|
(UINT32*) dst, dstStep, startup, 1, alpha, TRUE);
|
||
|
src += startup * sizeof(UINT16);
|
||
|
dst += startup * sizeof(UINT32);
|
||
|
w -= startup;
|
||
|
}
|
||
|
|
||
|
/* The main loop handles eight pixels at a time. */
|
||
|
while (w >= 8)
|
||
|
{
|
||
|
/* Off-stride, slower load. */
|
||
|
if ((ULONG_PTR) src & 0x0f)
|
||
|
R0 = _mm_lddqu_si128((__m128i *) src);
|
||
|
else
|
||
|
R0 = _mm_load_si128((__m128i *) src);
|
||
|
src += (128/8);
|
||
|
|
||
|
/* Do the lower two colors, which end up in the lower two bytes. */
|
||
|
/* G = ((P<<5) & 0xFC00) | ((P>>1) & 0x0300) */
|
||
|
R2 = _mm_slli_epi16(R0, 5);
|
||
|
R2 = _mm_and_si128(R_FC00, R2);
|
||
|
|
||
|
R1 = _mm_srli_epi16(R0, 1);
|
||
|
R1 = _mm_and_si128(R_0300, R1);
|
||
|
R2 = _mm_or_si128(R1, R2);
|
||
|
|
||
|
/* B = ((P>>8) & 0x00F8) | ((P>>13) & 0x0007) */
|
||
|
R1 = _mm_srli_epi16(R0, 8);
|
||
|
R1 = _mm_and_si128(R_00F8, R1);
|
||
|
R2 = _mm_or_si128(R1, R2);
|
||
|
|
||
|
R1 = _mm_srli_epi16(R0, 13);
|
||
|
R1 = _mm_and_si128(R_0007, R1);
|
||
|
R2 = _mm_or_si128(R1, R2); /* R2 = lowers */
|
||
|
|
||
|
/* Handle the upper color. */
|
||
|
/* R = ((P<<3) & 0x00F8) | ((P>>13) & 0x0007) */
|
||
|
R1 = _mm_slli_epi16(R0, 3);
|
||
|
R1 = _mm_and_si128(R_00F8, R1);
|
||
|
|
||
|
R0 = _mm_srli_epi16(R0, 2);
|
||
|
R0 = _mm_and_si128(R_0007, R0);
|
||
|
R1 = _mm_or_si128(R0, R1); /* R1 = uppers */
|
||
|
|
||
|
/* Add alpha (or zero) . */
|
||
|
R1 = _mm_or_si128(R_alpha, R1); /* + alpha */
|
||
|
|
||
|
/* Unpack to intermix the AR and GB pieces. */
|
||
|
R0 = _mm_unpackhi_epi16(R2, R1);
|
||
|
R2 = _mm_unpacklo_epi16(R2, R1);
|
||
|
|
||
|
/* Store the results. */
|
||
|
_mm_store_si128((__m128i *) dst, R2); dst += (128/8);
|
||
|
_mm_store_si128((__m128i *) dst, R0); dst += (128/8);
|
||
|
w -= 8;
|
||
|
}
|
||
|
|
||
|
/* Handle any remainder. */
|
||
|
if (w > 0)
|
||
|
{
|
||
|
general_RGB565ToARGB_16u32u_C3C4((const UINT16*) src, srcStep,
|
||
|
(UINT32*) dst, dstStep, w, 1, alpha, TRUE);
|
||
|
src += w * sizeof(UINT16);
|
||
|
dst += w * sizeof(UINT32);
|
||
|
}
|
||
|
|
||
|
/* Bump to the start of the next row. */
|
||
|
src += srcRowBump;
|
||
|
dst += dstRowBump;
|
||
|
}
|
||
|
|
||
|
return PRIMITIVES_SUCCESS;
|
||
|
}
|
||
|
|
||
|
/* ------------------------------------------------------------------------- */
|
||
|
pstatus_t sse3_RGB565ToARGB_16u32u_C3C4(
|
||
|
const UINT16* pSrc, INT32 srcStep,
|
||
|
UINT32* pDst, INT32 dstStep,
|
||
|
UINT32 width, UINT32 height,
|
||
|
BOOL alpha, BOOL invert)
|
||
|
{
|
||
|
if (invert)
|
||
|
{
|
||
|
return sse3_RGB565ToARGB_16u32u_C3C4_invert(pSrc, srcStep,
|
||
|
pDst, dstStep, width, height, alpha);
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
return sse3_RGB565ToARGB_16u32u_C3C4_no_invert(pSrc, srcStep,
|
||
|
pDst, dstStep, width, height, alpha);
|
||
|
}
|
||
|
}
|
||
|
#endif /* WITH_SSE2 */
|
||
|
|
||
|
/* ------------------------------------------------------------------------- */
|
||
|
void primitives_init_16to32bpp_opt(
|
||
|
primitives_t *prims)
|
||
|
{
|
||
|
#ifdef WITH_SSE2
|
||
|
if (IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
|
||
|
{
|
||
|
prims->RGB565ToARGB_16u32u_C3C4 = sse3_RGB565ToARGB_16u32u_C3C4;
|
||
|
}
|
||
|
#endif
|
||
|
}
|