Merge pull request #992 from mfleisz/master

primitives: separating optimized functions into their own .c files
This commit is contained in:
Marc-André Moreau 2013-02-22 04:06:04 -08:00
commit 9e1e8f11f1
31 changed files with 1675 additions and 1195 deletions

View File

@ -71,7 +71,7 @@ if(WITH_NEON)
if(ANDROID)
set(ANDROID_CPU_FEATURES_PATH "${ANDROID_NDK}/sources/android/cpufeatures")
include_directories(${ANDROID_CPU_FEATURES_PATH})
set(${MODULE_PREFIX}_NEON_SRCS ${${MODULE_PREFIX}_NEON_SRCS}
set(${MODULE_PREFIX}_SRCS ${${MODULE_PREFIX}_SRCS}
${ANDROID_CPU_FEATURES_PATH}/cpu-features.c
${ANDROID_CPU_FEATURES_PATH}/cpu-features.h)
endif()

View File

@ -37,6 +37,7 @@
#include <freerdp/codec/rfx.h>
#include <freerdp/constants.h>
#include <freerdp/primitives.h>
#include "rfx_constants.h"
#include "rfx_types.h"
@ -45,13 +46,8 @@
#include "rfx_quantization.h"
#include "rfx_dwt.h"
#ifdef WITH_SSE2
#include "rfx_sse2.h"
#endif
#ifdef WITH_NEON
#include "rfx_neon.h"
#endif
#ifndef RFX_INIT_SIMD
#define RFX_INIT_SIMD(_rfx_context) do { } while (0)
@ -209,6 +205,11 @@ RFX_CONTEXT* rfx_context_new(void)
if (context->priv->UseThreads)
{
/* Call primitives_get here in order to avoid race conditions when using primitives_get */
/* from multiple threads. This call will initialize all function pointers correctly */
/* before any decoding threads are started */
primitives_get();
context->priv->ThreadPool = CreateThreadpool(NULL);
InitializeThreadpoolEnvironment(&context->priv->ThreadPoolEnv);
SetThreadpoolCallbackPool(&context->priv->ThreadPoolEnv, context->priv->ThreadPool);
@ -232,6 +233,8 @@ RFX_CONTEXT* rfx_context_new(void)
context->dwt_2d_decode = rfx_dwt_2d_decode;
context->dwt_2d_encode = rfx_dwt_2d_encode;
RFX_INIT_SIMD(context);
return context;
}

View File

@ -40,9 +40,7 @@
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_quantization_decode_block_NEON(INT16 * buffer, const int buffer_size, const UINT32 factor)
{
if (factor <= 6)
return;
int16x8_t quantFactors = vdupq_n_s16(factor - 6);
int16x8_t quantFactors = vdupq_n_s16(factor);
int16x8_t* buf = (int16x8_t*)buffer;
int16x8_t* buf_end = (int16x8_t*)(buffer + buffer_size);
@ -59,16 +57,18 @@ rfx_quantization_decode_block_NEON(INT16 * buffer, const int buffer_size, const
void
rfx_quantization_decode_NEON(INT16 * buffer, const UINT32 * quantization_values)
{
rfx_quantization_decode_block_NEON(buffer, 1024, quantization_values[8]); /* HL1 */
rfx_quantization_decode_block_NEON(buffer + 1024, 1024, quantization_values[7]); /* LH1 */
rfx_quantization_decode_block_NEON(buffer + 2048, 1024, quantization_values[9]); /* HH1 */
rfx_quantization_decode_block_NEON(buffer + 3072, 256, quantization_values[5]); /* HL2 */
rfx_quantization_decode_block_NEON(buffer + 3328, 256, quantization_values[4]); /* LH2 */
rfx_quantization_decode_block_NEON(buffer + 3584, 256, quantization_values[6]); /* HH2 */
rfx_quantization_decode_block_NEON(buffer + 3840, 64, quantization_values[2]); /* HL3 */
rfx_quantization_decode_block_NEON(buffer + 3904, 64, quantization_values[1]); /* LH3 */
rfx_quantization_decode_block_NEON(buffer + 3968, 64, quantization_values[3]); /* HH3 */
rfx_quantization_decode_block_NEON(buffer + 4032, 64, quantization_values[0]); /* LL3 */
rfx_quantization_decode_block_NEON(buffer, 4096, 5);
rfx_quantization_decode_block_NEON(buffer, 1024, quantization_values[8] - 6); /* HL1 */
rfx_quantization_decode_block_NEON(buffer + 1024, 1024, quantization_values[7] - 6); /* LH1 */
rfx_quantization_decode_block_NEON(buffer + 2048, 1024, quantization_values[9] - 6); /* HH1 */
rfx_quantization_decode_block_NEON(buffer + 3072, 256, quantization_values[5] - 6); /* HL2 */
rfx_quantization_decode_block_NEON(buffer + 3328, 256, quantization_values[4] - 6); /* LH2 */
rfx_quantization_decode_block_NEON(buffer + 3584, 256, quantization_values[6] - 6); /* HH2 */
rfx_quantization_decode_block_NEON(buffer + 3840, 64, quantization_values[2] - 6); /* HL3 */
rfx_quantization_decode_block_NEON(buffer + 3904, 64, quantization_values[1] - 6); /* LH3 */
rfx_quantization_decode_block_NEON(buffer + 3968, 64, quantization_values[3] - 6); /* HH3 */
rfx_quantization_decode_block_NEON(buffer + 4032, 64, quantization_values[0] - 6); /* LL3 */
}
@ -278,8 +278,11 @@ int isNeonSupported()
}
return FALSE;
#else
#elif defined(__APPLE)
/* assume NEON support on iOS devices */
return TRUE;
#else
return FALSE;
#endif
}

View File

@ -22,8 +22,6 @@
#include <freerdp/codec/rfx.h>
#if defined(__ARM_NEON__)
void rfx_init_neon(RFX_CONTEXT * context);
#ifndef RFX_INIT_SIMD
@ -32,7 +30,5 @@ void rfx_init_neon(RFX_CONTEXT * context);
#endif
#endif
#endif // __ARM_NEON__
#endif /* __RFX_NEON_H */

View File

@ -24,8 +24,10 @@
void rfx_init_sse2(RFX_CONTEXT* context);
#ifndef RFX_INIT_SIMD
#define RFX_INIT_SIMD(_rfx_context) rfx_init_sse2(_rfx_context)
#ifdef WITH_SSE2
#ifndef RFX_INIT_SIMD
#define RFX_INIT_SIMD(_rfx_context) rfx_init_sse2(_rfx_context)
#endif
#endif
#endif /* __RFX_SSE2_H */

View File

@ -28,6 +28,15 @@ set(${MODULE_PREFIX}_SRCS
primitives.c
prim_internal.h)
set(${MODULE_PREFIX}_OPT_SRCS
prim_add_opt.c
prim_andor_opt.c
prim_alphaComp_opt.c
prim_colors_opt.c
prim_set_opt.c
prim_shift_opt.c
prim_sign_opt.c)
add_definitions(-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE})
### IPP Variable debugging
@ -63,7 +72,9 @@ if(ANDROID)
${ANDROID_CPU_FEATURES_PATH}/cpu-features.h)
endif()
set_property(SOURCE ${${MODULE_PREFIX}_SRCS} PROPERTY COMPILE_FLAGS ${OPTIMIZATION})
set_property(SOURCE ${${MODULE_PREFIX}_OPT_SRCS} PROPERTY COMPILE_FLAGS ${OPTIMIZATION})
set(${MODULE_PREFIX}_SRCS ${${MODULE_PREFIX}_SRCS} ${${MODULE_PREFIX}_OPT_SRCS})
add_complex_library(MODULE ${MODULE_NAME} TYPE "OBJECT"
MONOLITHIC ${MONOLITHIC_BUILD}

View File

@ -18,27 +18,16 @@
#include "config.h"
#endif
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_SSE2
#include <emmintrin.h>
#include <pmmintrin.h>
#endif /* WITH_SSE2 */
#ifdef WITH_IPP
#include <ipps.h>
#endif /* WITH_IPP */
#include "prim_internal.h"
#include "prim_templates.h"
#include "prim_add.h"
/* ----------------------------------------------------------------------------
* 16-bit signed add with saturation (under and over).
*/
PRIM_STATIC pstatus_t general_add_16s(
pstatus_t general_add_16s(
const INT16 *pSrc1,
const INT16 *pSrc2,
INT16 *pDst,
@ -55,29 +44,14 @@ PRIM_STATIC pstatus_t general_add_16s(
return PRIMITIVES_SUCCESS;
}
#ifdef WITH_SSE2
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
/* ------------------------------------------------------------------------- */
SSE3_SSD_ROUTINE(sse3_add_16s, INT16, general_add_16s,
_mm_adds_epi16, general_add_16s(sptr1++, sptr2++, dptr++, 1))
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif
/* ------------------------------------------------------------------------- */
void primitives_init_add(
const primitives_hints_t *hints,
primitives_t *prims)
{
prims->add_16s = general_add_16s;
#ifdef WITH_IPP
prims->add_16s = (__add_16s_t) ippsAdd_16s;
#elif defined(WITH_SSE2)
if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE)) /* for LDDQU */
{
prims->add_16s = sse3_add_16s;
}
#endif
primitives_init_add_opt(hints, prims);
}
/* ------------------------------------------------------------------------- */

View File

@ -0,0 +1,30 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Add operations.
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License. Algorithms used by
* this code may be covered by patents by HP, Microsoft, or other parties.
*
*/
#ifdef __GNUC__
# pragma once
#endif
#ifndef __PRIM_ADD_H_INCLUDED__
#define __PRIM_ADD_H_INCLUDED__
pstatus_t general_add_16s(const INT16 *pSrc1, const INT16 *pSrc2, INT16 *pDst, INT32 len);
void primitives_init_add_opt(const primitives_hints_t *hints, primitives_t *prims);
#endif /* !__PRIM_ADD_H_INCLUDED__ */

View File

@ -0,0 +1,62 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized add operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_SSE2
#include <emmintrin.h>
#include <pmmintrin.h>
#endif /* WITH_SSE2 */
#ifdef WITH_IPP
#include <ipps.h>
#endif /* WITH_IPP */
#include "prim_internal.h"
#include "prim_templates.h"
#include "prim_add.h"
#ifdef WITH_SSE2
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
/* ------------------------------------------------------------------------- */
/* Expands to the SSE3 add routine sse3_add_16s: vector body uses
 * _mm_adds_epi16, and unaligned leftovers fall through to general_add_16s
 * one element at a time (see SSE3_SSD_ROUTINE in prim_templates.h). */
SSE3_SSD_ROUTINE(sse3_add_16s, INT16, general_add_16s,
_mm_adds_epi16, general_add_16s(sptr1++, sptr2++, dptr++, 1))
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif
/* ------------------------------------------------------------------------- */
/* Install the best available add_16s implementation into prims.
 * Precedence: IPP (when built in) > SSE3 (when the CPU reports SSE2+SSE3)
 * > otherwise the pointer already set by the caller is left untouched. */
void primitives_init_add_opt(
const primitives_hints_t *hints,
primitives_t *prims)
{
#ifdef WITH_IPP
prims->add_16s = (__add_16s_t) ippsAdd_16s;
#elif defined(WITH_SSE2)
/* SSE3 is required in addition to SSE2 because the routine loads with
 * LDDQU (an SSE3 instruction). */
if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE)) /* for LDDQU */
{
prims->add_16s = sse3_add_16s;
}
#endif
}

View File

@ -24,21 +24,11 @@
#include "config.h"
#endif
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include "prim_internal.h"
#ifdef WITH_SSE2
#include <emmintrin.h>
#include <pmmintrin.h>
#endif /* WITH_SSE2 */
#ifdef WITH_IPP
#include <ippi.h>
#endif /* WITH_IPP */
#include "prim_alphaComp.h"
#define ALPHA(_k_) (((_k_) & 0xFF000000U) >> 24)
#define RED(_k_) (((_k_) & 0x00FF0000U) >> 16)
@ -46,7 +36,7 @@
#define BLU(_k_) (((_k_) & 0x000000FFU))
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_alphaComp_argb(
pstatus_t general_alphaComp_argb(
const BYTE *pSrc1, INT32 src1Step,
const BYTE *pSrc2, INT32 src2Step,
BYTE *pDst, INT32 dstStep,
@ -111,188 +101,12 @@ PRIM_STATIC pstatus_t general_alphaComp_argb(
return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
#ifdef WITH_SSE2
#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
PRIM_STATIC pstatus_t sse2_alphaComp_argb(
const BYTE *pSrc1, INT32 src1Step,
const BYTE *pSrc2, INT32 src2Step,
BYTE *pDst, INT32 dstStep,
INT32 width, INT32 height)
{
const UINT32 *sptr1 = (const UINT32 *) pSrc1;
const UINT32 *sptr2 = (const UINT32 *) pSrc2;
UINT32 *dptr;
int linebytes, src1Jump, src2Jump, dstJump, y;
__m128i xmm0, xmm1;
if ((width <= 0) || (height <= 0)) return PRIMITIVES_SUCCESS;
if (width < 4) /* pointless if too small */
{
return general_alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step,
pDst, dstStep, width, height);
}
dptr = (UINT32 *) pDst;
linebytes = width * sizeof(UINT32);
src1Jump = (src1Step - linebytes) / sizeof(UINT32);
src2Jump = (src2Step - linebytes) / sizeof(UINT32);
dstJump = (dstStep - linebytes) / sizeof(UINT32);
xmm0 = _mm_set1_epi32(0);
xmm1 = _mm_set1_epi16(1);
for (y=0; y<height; ++y)
{
int pixels = width;
int count;
/* Get to the 16-byte boundary now. */
int leadIn = 0;
switch ((ULONG_PTR) dptr & 0x0f)
{
case 0:
leadIn = 0;
break;
case 4:
leadIn = 3;
break;
case 8:
leadIn = 2;
break;
case 12:
leadIn = 1;
break;
default:
/* We'll never hit a 16-byte boundary, so do the whole
* thing the slow way.
*/
leadIn = width;
break;
}
if (leadIn)
{
general_alphaComp_argb((const BYTE *) sptr1,
src1Step, (const BYTE *) sptr2, src2Step,
(BYTE *) dptr, dstStep, leadIn, 1);
sptr1 += leadIn;
sptr2 += leadIn;
dptr += leadIn;
pixels -= leadIn;
}
/* Use SSE registers to do 4 pixels at a time. */
count = pixels >> 2;
pixels -= count << 2;
while (count--)
{
__m128i xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
/* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */
xmm2 = LOAD_SI128(sptr1); sptr1 += 4;
/* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */
xmm3 = LOAD_SI128(sptr2); sptr2 += 4;
/* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */
xmm4 = _mm_unpackhi_epi8(xmm2, xmm0);
/* 00Bf00Gf00Bf00Af00Be00Ge00Re00Ae */
xmm5 = _mm_unpackhi_epi8(xmm3, xmm0);
/* subtract */
xmm6 = _mm_subs_epi16(xmm4, xmm5);
/* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */
xmm4 = _mm_shufflelo_epi16(xmm4, 0xff);
/* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */
xmm4 = _mm_shufflehi_epi16(xmm4, 0xff);
/* Add one to alphas */
xmm4 = _mm_adds_epi16(xmm4, xmm1);
/* Multiply and take low word */
xmm4 = _mm_mullo_epi16(xmm4, xmm6);
/* Shift 8 right */
xmm4 = _mm_srai_epi16(xmm4, 8);
/* Add xmm5 */
xmm4 = _mm_adds_epi16(xmm4, xmm5);
/* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */
/* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */
xmm5 = _mm_unpacklo_epi8(xmm2, xmm0);
/* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */
xmm6 = _mm_unpacklo_epi8(xmm3, xmm0);
/* subtract */
xmm7 = _mm_subs_epi16(xmm5, xmm6);
/* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */
xmm5 = _mm_shufflelo_epi16(xmm5, 0xff);
/* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */
xmm5 = _mm_shufflehi_epi16(xmm5, 0xff);
/* Add one to alphas */
xmm5 = _mm_adds_epi16(xmm5, xmm1);
/* Multiply and take low word */
xmm5 = _mm_mullo_epi16(xmm5, xmm7);
/* Shift 8 right */
xmm5 = _mm_srai_epi16(xmm5, 8);
/* Add xmm6 */
xmm5 = _mm_adds_epi16(xmm5, xmm6);
/* 00Bl00Gl00Rl00Al00Bk00Gk00Rk0ABk */
/* Must mask off remainders or pack gets confused */
xmm3 = _mm_set1_epi16(0x00ffU);
xmm4 = _mm_and_si128(xmm4, xmm3);
xmm5 = _mm_and_si128(xmm5, xmm3);
/* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */
xmm5 = _mm_packus_epi16(xmm5, xmm4);
_mm_store_si128((__m128i *) dptr, xmm5); dptr += 4;
}
/* Finish off the remainder. */
if (pixels)
{
general_alphaComp_argb((const BYTE *) sptr1, src1Step,
(const BYTE *) sptr2, src2Step,
(BYTE *) dptr, dstStep, pixels, 1);
sptr1 += pixels;
sptr2 += pixels;
dptr += pixels;
}
/* Jump to next row. */
sptr1 += src1Jump;
sptr2 += src2Jump;
dptr += dstJump;
}
return PRIMITIVES_SUCCESS;
}
#endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif
#ifdef WITH_IPP
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t ipp_alphaComp_argb(
const BYTE *pSrc1, INT32 src1Step,
const BYTE *pSrc2, INT32 src2Step,
BYTE *pDst, INT32 dstStep,
INT32 width, INT32 height)
{
IppiSize sz;
sz.width = width;
sz.height = height;
return ippiAlphaComp_8u_AC4R(pSrc1, src1Step, pSrc2, src2Step,
pDst, dstStep, sz, ippAlphaOver);
}
#endif
/* ------------------------------------------------------------------------- */
void primitives_init_alphaComp(const primitives_hints_t* hints, primitives_t* prims)
{
prims->alphaComp_argb = general_alphaComp_argb;
#ifdef WITH_IPP
prims->alphaComp_argb = ipp_alphaComp_argb;
#elif defined(WITH_SSE2)
if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE)) /* for LDDQU */
{
prims->alphaComp_argb = sse2_alphaComp_argb;
}
#endif
primitives_init_alphaComp_opt(hints, prims);
}
/* ------------------------------------------------------------------------- */
@ -300,3 +114,4 @@ void primitives_deinit_alphaComp(primitives_t *prims)
{
/* Nothing to do. */
}

View File

@ -0,0 +1,30 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Alpha blending routines.
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License. Algorithms used by
* this code may be covered by patents by HP, Microsoft, or other parties.
*
*/
#ifdef __GNUC__
# pragma once
#endif
#ifndef __PRIM_ALPHACOMP_H_INCLUDED__
#define __PRIM_ALPHACOMP_H_INCLUDED__
pstatus_t general_alphaComp_argb(const BYTE *pSrc1, INT32 src1Step, const BYTE *pSrc2, INT32 src2Step, BYTE *pDst, INT32 dstStep, INT32 width, INT32 height);
void primitives_init_alphaComp_opt(const primitives_hints_t* hints, primitives_t* prims);
#endif /* !__PRIM_ALPHACOMP_H_INCLUDED__ */

View File

@ -0,0 +1,225 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized alpha blending routines.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
* Note: this code assumes the second operand is fully opaque,
* e.g.
* newval = alpha1*val1 + (1-alpha1)*val2
* rather than
* newval = alpha1*val1 + (1-alpha1)*alpha2*val2
* The IPP gives other options.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_SSE2
#include <emmintrin.h>
#include <pmmintrin.h>
#endif /* WITH_SSE2 */
#ifdef WITH_IPP
#include <ippi.h>
#endif /* WITH_IPP */
#include "prim_internal.h"
#include "prim_alphaComp.h"
/* ------------------------------------------------------------------------- */
#ifdef WITH_SSE2
#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
/* SSE2 alpha composition of 32bpp pixels (src1 over src2 -> dst), four
 * pixels per vector iteration. Narrow images (< 4 px), the unaligned
 * lead-in of each row, and trailing pixels are delegated to
 * general_alphaComp_argb. Steps are in bytes.
 * NOTE(review): per the file header, this assumes the second operand is
 * fully opaque: newval = alpha1*val1 + (1-alpha1)*val2. */
pstatus_t sse2_alphaComp_argb(
const BYTE *pSrc1, INT32 src1Step,
const BYTE *pSrc2, INT32 src2Step,
BYTE *pDst, INT32 dstStep,
INT32 width, INT32 height)
{
const UINT32 *sptr1 = (const UINT32 *) pSrc1;
const UINT32 *sptr2 = (const UINT32 *) pSrc2;
UINT32 *dptr;
int linebytes, src1Jump, src2Jump, dstJump, y;
__m128i xmm0, xmm1;
/* Degenerate region: nothing to composite. */
if ((width <= 0) || (height <= 0)) return PRIMITIVES_SUCCESS;
if (width < 4) /* pointless if too small */
{
return general_alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step,
pDst, dstStep, width, height);
}
dptr = (UINT32 *) pDst;
/* Per-row "jump" = gap between end of one row's pixels and the start of
 * the next, measured in 32-bit pixels. */
linebytes = width * sizeof(UINT32);
src1Jump = (src1Step - linebytes) / sizeof(UINT32);
src2Jump = (src2Step - linebytes) / sizeof(UINT32);
dstJump = (dstStep - linebytes) / sizeof(UINT32);
xmm0 = _mm_set1_epi32(0);   /* zero: used to unpack bytes to 16-bit lanes */
xmm1 = _mm_set1_epi16(1);   /* +1 bias added to each alpha before multiply */
for (y=0; y<height; ++y)
{
int pixels = width;
int count;
/* Get to the 16-byte boundary now. */
int leadIn = 0;
switch ((ULONG_PTR) dptr & 0x0f)
{
case 0:
leadIn = 0;
break;
case 4:
leadIn = 3;
break;
case 8:
leadIn = 2;
break;
case 12:
leadIn = 1;
break;
default:
/* We'll never hit a 16-byte boundary, so do the whole
* thing the slow way.
*/
leadIn = width;
break;
}
if (leadIn)
{
general_alphaComp_argb((const BYTE *) sptr1,
src1Step, (const BYTE *) sptr2, src2Step,
(BYTE *) dptr, dstStep, leadIn, 1);
sptr1 += leadIn;
sptr2 += leadIn;
dptr += leadIn;
pixels -= leadIn;
}
/* Use SSE registers to do 4 pixels at a time. */
count = pixels >> 2;
pixels -= count << 2;
while (count--)
{
__m128i xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
/* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */
xmm2 = LOAD_SI128(sptr1); sptr1 += 4;
/* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */
xmm3 = LOAD_SI128(sptr2); sptr2 += 4;
/* High half first: two pixels widened to 16-bit lanes. */
/* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */
xmm4 = _mm_unpackhi_epi8(xmm2, xmm0);
/* 00Bf00Gf00Bf00Af00Be00Ge00Re00Ae */
xmm5 = _mm_unpackhi_epi8(xmm3, xmm0);
/* subtract */
xmm6 = _mm_subs_epi16(xmm4, xmm5);
/* Broadcast each pixel's alpha across its four channel lanes. */
/* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */
xmm4 = _mm_shufflelo_epi16(xmm4, 0xff);
/* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */
xmm4 = _mm_shufflehi_epi16(xmm4, 0xff);
/* Add one to alphas */
xmm4 = _mm_adds_epi16(xmm4, xmm1);
/* Multiply and take low word */
xmm4 = _mm_mullo_epi16(xmm4, xmm6);
/* Shift 8 right */
xmm4 = _mm_srai_epi16(xmm4, 8);
/* Add xmm5 */
xmm4 = _mm_adds_epi16(xmm4, xmm5);
/* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */
/* Now the low half: the other two pixels, same blend. */
/* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */
xmm5 = _mm_unpacklo_epi8(xmm2, xmm0);
/* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */
xmm6 = _mm_unpacklo_epi8(xmm3, xmm0);
/* subtract */
xmm7 = _mm_subs_epi16(xmm5, xmm6);
/* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */
xmm5 = _mm_shufflelo_epi16(xmm5, 0xff);
/* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */
xmm5 = _mm_shufflehi_epi16(xmm5, 0xff);
/* Add one to alphas */
xmm5 = _mm_adds_epi16(xmm5, xmm1);
/* Multiply and take low word */
xmm5 = _mm_mullo_epi16(xmm5, xmm7);
/* Shift 8 right */
xmm5 = _mm_srai_epi16(xmm5, 8);
/* Add xmm6 */
xmm5 = _mm_adds_epi16(xmm5, xmm6);
/* 00Bl00Gl00Rl00Al00Bk00Gk00Rk0ABk */
/* Must mask off remainders or pack gets confused */
xmm3 = _mm_set1_epi16(0x00ffU);
xmm4 = _mm_and_si128(xmm4, xmm3);
xmm5 = _mm_and_si128(xmm5, xmm3);
/* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */
xmm5 = _mm_packus_epi16(xmm5, xmm4);
/* Destination is 16-byte aligned here thanks to the lead-in. */
_mm_store_si128((__m128i *) dptr, xmm5); dptr += 4;
}
/* Finish off the remainder. */
if (pixels)
{
general_alphaComp_argb((const BYTE *) sptr1, src1Step,
(const BYTE *) sptr2, src2Step,
(BYTE *) dptr, dstStep, pixels, 1);
sptr1 += pixels;
sptr2 += pixels;
dptr += pixels;
}
/* Jump to next row. */
sptr1 += src1Jump;
sptr2 += src2Jump;
dptr += dstJump;
}
return PRIMITIVES_SUCCESS;
}
#endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif
#ifdef WITH_IPP
/* ------------------------------------------------------------------------- */
/* Thin IPP wrapper: composite pSrc1 over pSrc2 into pDst with
 * ippiAlphaComp_8u_AC4R in ippAlphaOver mode; the width/height pair is
 * passed to IPP as the region of interest. Steps are in bytes. */
pstatus_t ipp_alphaComp_argb(
const BYTE *pSrc1, INT32 src1Step,
const BYTE *pSrc2, INT32 src2Step,
BYTE *pDst, INT32 dstStep,
INT32 width, INT32 height)
{
IppiSize roi;
roi.height = height;
roi.width = width;
return ippiAlphaComp_8u_AC4R(pSrc1, src1Step,
pSrc2, src2Step,
pDst, dstStep,
roi, ippAlphaOver);
}
#endif
/* ------------------------------------------------------------------------- */
/* Install the best available alphaComp_argb implementation into prims.
 * Precedence: IPP (when built in) > SSE2 (when the CPU reports SSE2+SSE3)
 * > otherwise the pointer already set by the caller is left untouched. */
void primitives_init_alphaComp_opt(const primitives_hints_t* hints, primitives_t* prims)
{
#ifdef WITH_IPP
prims->alphaComp_argb = ipp_alphaComp_argb;
#elif defined(WITH_SSE2)
/* SSE3 is required in addition to SSE2 for the LDDQU load instruction. */
if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE)) /* for LDDQU */
{
prims->alphaComp_argb = sse2_alphaComp_argb;
}
#endif
}

View File

@ -17,27 +17,16 @@
#include "config.h"
#endif
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_SSE2
#include <emmintrin.h>
#include <pmmintrin.h>
#endif /* WITH_SSE2 */
#ifdef WITH_IPP
#include <ipps.h>
#endif /* WITH_IPP */
#include "prim_internal.h"
#include "prim_templates.h"
#include "prim_andor.h"
/* ----------------------------------------------------------------------------
* 32-bit AND with a constant.
*/
PRIM_STATIC pstatus_t general_andC_32u(
pstatus_t general_andC_32u(
const UINT32 *pSrc,
UINT32 val,
UINT32 *pDst,
@ -55,7 +44,7 @@ PRIM_STATIC pstatus_t general_andC_32u(
/* ----------------------------------------------------------------------------
* 32-bit OR with a constant.
*/
PRIM_STATIC pstatus_t general_orC_32u(
pstatus_t general_orC_32u(
const UINT32 *pSrc,
UINT32 val,
UINT32 *pDst,
@ -70,16 +59,6 @@ PRIM_STATIC pstatus_t general_orC_32u(
return PRIMITIVES_SUCCESS;
}
#ifdef WITH_SSE2
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
/* ------------------------------------------------------------------------- */
SSE3_SCD_PRE_ROUTINE(sse3_andC_32u, UINT32, general_andC_32u,
_mm_and_si128, *dptr++ = *sptr++ & val)
SSE3_SCD_PRE_ROUTINE(sse3_orC_32u, UINT32, general_orC_32u,
_mm_or_si128, *dptr++ = *sptr++ | val)
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif
/* ------------------------------------------------------------------------- */
void primitives_init_andor(
const primitives_hints_t *hints,
@ -89,17 +68,7 @@ void primitives_init_andor(
prims->andC_32u = general_andC_32u;
prims->orC_32u = general_orC_32u;
#if defined(WITH_IPP)
prims->andC_32u = (__andC_32u_t) ippsAndC_32u;
prims->orC_32u = (__orC_32u_t) ippsOrC_32u;
#elif defined(WITH_SSE2)
if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE))
{
prims->andC_32u = sse3_andC_32u;
prims->orC_32u = sse3_orC_32u;
}
#endif
primitives_init_andor_opt(hints, prims);
}
/* ------------------------------------------------------------------------- */
@ -108,3 +77,4 @@ void primitives_deinit_andor(
{
/* Nothing to do. */
}

View File

@ -0,0 +1,31 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Logical operations.
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License. Algorithms used by
* this code may be covered by patents by HP, Microsoft, or other parties.
*
*/
#ifdef __GNUC__
# pragma once
#endif
#ifndef __PRIM_ANDOR_H_INCLUDED__
#define __PRIM_ANDOR_H_INCLUDED__
pstatus_t general_andC_32u(const UINT32 *pSrc, UINT32 val, UINT32 *pDst, INT32 len);
pstatus_t general_orC_32u(const UINT32 *pSrc, UINT32 val, UINT32 *pDst, INT32 len);
void primitives_init_andor_opt(const primitives_hints_t *hints, primitives_t *prims);
#endif /* !__PRIM_ANDOR_H_INCLUDED__ */

View File

@ -0,0 +1,61 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized Logical operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_SSE2
#include <emmintrin.h>
#include <pmmintrin.h>
#endif /* WITH_SSE2 */
#ifdef WITH_IPP
#include <ipps.h>
#endif /* WITH_IPP */
#include "prim_internal.h"
#include "prim_templates.h"
#include "prim_andor.h"
#ifdef WITH_SSE2
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
/* ------------------------------------------------------------------------- */
/* Expand the SSE3 constant-AND and constant-OR routines: the vector body
 * uses _mm_and_si128 / _mm_or_si128, and unaligned leftovers run the
 * scalar expression one element at a time (see SSE3_SCD_PRE_ROUTINE in
 * prim_templates.h). */
SSE3_SCD_PRE_ROUTINE(sse3_andC_32u, UINT32, general_andC_32u,
_mm_and_si128, *dptr++ = *sptr++ & val)
SSE3_SCD_PRE_ROUTINE(sse3_orC_32u, UINT32, general_orC_32u,
_mm_or_si128, *dptr++ = *sptr++ | val)
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif
/* ------------------------------------------------------------------------- */
/* Install the best available andC_32u/orC_32u implementations into prims.
 * Precedence: IPP (when built in) > SSE3 (when the CPU reports SSE2+SSE3)
 * > otherwise the pointers already set by the caller are left untouched. */
void primitives_init_andor_opt(const primitives_hints_t *hints, primitives_t *prims)
{
#if defined(WITH_IPP)
prims->andC_32u = (__andC_32u_t) ippsAndC_32u;
prims->orC_32u = (__orC_32u_t) ippsOrC_32u;
#elif defined(WITH_SSE2)
/* NOTE(review): SSE3 presumably required for LDDQU loads, as in the
 * sibling add/alphaComp initializers — confirm against the template. */
if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE))
{
prims->andC_32u = sse3_andC_32u;
prims->orC_32u = sse3_orC_32u;
}
#endif
}

View File

@ -21,16 +21,11 @@
#include "config.h"
#endif
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_SSE2
#include <emmintrin.h>
#elif defined(WITH_NEON)
#include <arm_neon.h>
#endif /* WITH_SSE2 else WITH_NEON */
#include "prim_internal.h"
#include "prim_templates.h"
#include "prim_colors.h"
#ifndef MINMAX
#define MINMAX(_v_, _l_, _h_) \
@ -38,7 +33,7 @@
#endif /* !MINMAX */
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_yCbCrToRGB_16s16s_P3P3(
pstatus_t general_yCbCrToRGB_16s16s_P3P3(
const INT16 *pSrc[3], INT32 srcStep,
INT16 *pDst[3], INT32 dstStep,
const prim_size_t *roi) /* region of interest */
@ -119,7 +114,7 @@ PRIM_STATIC pstatus_t general_yCbCrToRGB_16s16s_P3P3(
}
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_RGBToYCbCr_16s16s_P3P3(
pstatus_t general_RGBToYCbCr_16s16s_P3P3(
const INT16 *pSrc[3], INT32 srcStep,
INT16 *pDst[3], INT32 dstStep,
const prim_size_t *roi) /* region of interest */
@ -187,7 +182,7 @@ PRIM_STATIC pstatus_t general_RGBToYCbCr_16s16s_P3P3(
}
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_RGBToRGB_16s8u_P3AC4R(
pstatus_t general_RGBToRGB_16s8u_P3AC4R(
const INT16 *pSrc[3], /* 16-bit R,G, and B arrays */
int srcStep, /* bytes between rows in source data */
BYTE *pDst, /* 32-bit interleaved ARGB (ABGR?) data */
@ -219,514 +214,6 @@ PRIM_STATIC pstatus_t general_RGBToRGB_16s8u_P3AC4R(
return PRIMITIVES_SUCCESS;
}
#ifdef WITH_SSE2
#ifdef __GNUC__
# define GNU_INLINE \
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
#else
# define GNU_INLINE
#endif
#define CACHE_LINE_BYTES 64
#define _mm_between_epi16(_val, _min, _max) \
do { _val = _mm_min_epi16(_max, _mm_max_epi16(_val, _min)); } while (0)
#ifdef DO_PREFETCH
/*---------------------------------------------------------------------------*/
static inline void GNU_INLINE _mm_prefetch_buffer(
char * buffer,
int num_bytes)
{
__m128i * buf = (__m128i*) buffer;
unsigned int i;
for (i = 0; i < (num_bytes / sizeof(__m128i));
i+=(CACHE_LINE_BYTES / sizeof(__m128i)))
{
_mm_prefetch((char*)(&buf[i]), _MM_HINT_NTA);
}
}
#endif /* DO_PREFETCH */
/*---------------------------------------------------------------------------*/
/* Convert 16-bit signed Y, Cb and Cr planes (pSrc[0..2]) to 16-bit signed
 * R, G and B planes (pDst[0..2]) using SSE2 fixed-point arithmetic.
 * The conversion factors are scaled by 2^14; see the inner-loop comment.
 * srcStep/dstStep are byte strides between rows; roi gives width/height
 * in pixels.  Falls back to general_yCbCrToRGB_16s16s_P3P3 when the
 * alignment/width/stride preconditions below are not met.
 * Always returns PRIMITIVES_SUCCESS.
 */
PRIM_STATIC pstatus_t sse2_yCbCrToRGB_16s16s_P3P3(
const INT16 *pSrc[3],
int srcStep,
INT16 *pDst[3],
int dstStep,
const prim_size_t *roi) /* region of interest */
{
__m128i zero, max, r_cr, g_cb, g_cr, b_cb, c4096;
__m128i *y_buf, *cb_buf, *cr_buf, *r_buf, *g_buf, *b_buf;
int srcbump, dstbump, yp, imax;
/* Require 16-byte aligned planes, a width that is a multiple of 8
 * pixels, and strides that keep every row 16-byte aligned. */
if (((ULONG_PTR) (pSrc[0]) & 0x0f)
|| ((ULONG_PTR) (pSrc[1]) & 0x0f)
|| ((ULONG_PTR) (pSrc[2]) & 0x0f)
|| ((ULONG_PTR) (pDst[0]) & 0x0f)
|| ((ULONG_PTR) (pDst[1]) & 0x0f)
|| ((ULONG_PTR) (pDst[2]) & 0x0f)
|| (roi->width & 0x07)
|| (srcStep & 127)
|| (dstStep & 127))
{
/* We can't maintain 16-byte alignment. */
return general_yCbCrToRGB_16s16s_P3P3(pSrc, srcStep,
pDst, dstStep, roi);
}
zero = _mm_setzero_si128();
max = _mm_set1_epi16(255);
y_buf = (__m128i*) (pSrc[0]);
cb_buf = (__m128i*) (pSrc[1]);
cr_buf = (__m128i*) (pSrc[2]);
r_buf = (__m128i*) (pDst[0]);
g_buf = (__m128i*) (pDst[1]);
b_buf = (__m128i*) (pDst[2]);
r_cr = _mm_set1_epi16(22986); /* 1.403 << 14 */
g_cb = _mm_set1_epi16(-5636); /* -0.344 << 14 */
g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */
b_cb = _mm_set1_epi16(28999); /* 1.770 << 14 */
c4096 = _mm_set1_epi16(4096);
/* Row strides expressed in __m128i (16-byte) units. */
srcbump = srcStep / sizeof(__m128i);
dstbump = dstStep / sizeof(__m128i);
#ifdef DO_PREFETCH
/* Prefetch Y's, Cb's, and Cr's. */
for (yp=0; yp<roi->height; yp++)
{
int i;
for (i=0; i<roi->width * sizeof(INT16) / sizeof(__m128i);
i += (CACHE_LINE_BYTES / sizeof(__m128i)))
{
_mm_prefetch((char*)(&y_buf[i]), _MM_HINT_NTA);
_mm_prefetch((char*)(&cb_buf[i]), _MM_HINT_NTA);
_mm_prefetch((char*)(&cr_buf[i]), _MM_HINT_NTA);
}
y_buf += srcbump;
cb_buf += srcbump;
cr_buf += srcbump;
}
y_buf = (__m128i*) (pSrc[0]);
cb_buf = (__m128i*) (pSrc[1]);
cr_buf = (__m128i*) (pSrc[2]);
#endif /* DO_PREFETCH */
/* Number of 8-pixel (__m128i) chunks per row. */
imax = roi->width * sizeof(INT16) / sizeof(__m128i);
for (yp=0; yp<roi->height; ++yp)
{
int i;
for (i=0; i<imax; i++)
{
/* In order to use SSE2 signed 16-bit integer multiplication
 * we need to convert the floating point factors to signed int
 * without losing information.
 * The result of this multiplication is 32 bit and we have two
 * SSE instructions that return either the hi or lo word.
 * Thus we will multiply the factors by the highest possible 2^n,
 * take the upper 16 bits of the signed 32-bit result
 * (_mm_mulhi_epi16) and correct this result by multiplying
 * it by 2^(16-n).
 *
 * For the given factors in the conversion matrix the best
 * possible n is 14.
 *
 * Example for calculating r:
 * r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
 * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
 * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
 * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
 */
/* y = (y_r_buf[i] + 4096) >> 2 */
__m128i y, cb, cr, r, g, b;
y = _mm_load_si128(y_buf + i);
y = _mm_add_epi16(y, c4096);
y = _mm_srai_epi16(y, 2);
/* cb = cb_g_buf[i]; */
cb = _mm_load_si128(cb_buf + i);
/* cr = cr_b_buf[i]; */
cr = _mm_load_si128(cr_buf + i);
/* (y + HIWORD(cr*22986)) >> 3 */
r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
r = _mm_srai_epi16(r, 3);
/* r_buf[i] = MINMAX(r, 0, 255); */
_mm_between_epi16(r, zero, max);
_mm_store_si128(r_buf + i, r);
/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
g = _mm_srai_epi16(g, 3);
/* g_buf[i] = MINMAX(g, 0, 255); */
_mm_between_epi16(g, zero, max);
_mm_store_si128(g_buf + i, g);
/* (y + HIWORD(cb*28999)) >> 3 */
b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
b = _mm_srai_epi16(b, 3);
/* b_buf[i] = MINMAX(b, 0, 255); */
_mm_between_epi16(b, zero, max);
_mm_store_si128(b_buf + i, b);
}
/* Advance sources (Y/Cb/Cr) and destinations (R/G/B) one row. */
y_buf += srcbump;
cb_buf += srcbump;
cr_buf += srcbump;
r_buf += dstbump;
g_buf += dstbump;
b_buf += dstbump;
}
return PRIMITIVES_SUCCESS;
}
/*---------------------------------------------------------------------------*/
/* The encoded YCbCr coefficients are represented as 11.5 fixed-point
 * numbers. See the general code above.
 */
/* Convert 16-bit signed R, G and B planes (pSrc[0..2]) to 16-bit signed
 * Y, Cb and Cr planes (pDst[0..2]) using SSE2 fixed-point arithmetic.
 * Conversion factors are scaled by 2^15 and applied via _mm_mulhi_epi16;
 * see the inner-loop comment.  srcStep/dstStep are byte strides between
 * rows.  Falls back to general_RGBToYCbCr_16s16s_P3P3 when the
 * alignment/width/stride preconditions below are not met.
 * Always returns PRIMITIVES_SUCCESS.
 */
PRIM_STATIC pstatus_t sse2_RGBToYCbCr_16s16s_P3P3(
	const INT16 *pSrc[3],
	int srcStep,
	INT16 *pDst[3],
	int dstStep,
	const prim_size_t *roi)	/* region of interest */
{
	__m128i min, max, y_r, y_g, y_b, cb_r, cb_g, cb_b, cr_r, cr_g, cr_b;
	__m128i *r_buf, *g_buf, *b_buf, *y_buf, *cb_buf, *cr_buf;
	int srcbump, dstbump, yp, imax;

	/* Require 16-byte aligned planes, a width that is a multiple of 8
	 * pixels, and strides that keep every row 16-byte aligned. */
	if (((ULONG_PTR) (pSrc[0]) & 0x0f)
			|| ((ULONG_PTR) (pSrc[1]) & 0x0f)
			|| ((ULONG_PTR) (pSrc[2]) & 0x0f)
			|| ((ULONG_PTR) (pDst[0]) & 0x0f)
			|| ((ULONG_PTR) (pDst[1]) & 0x0f)
			|| ((ULONG_PTR) (pDst[2]) & 0x0f)
			|| (roi->width & 0x07)
			|| (srcStep & 127)
			|| (dstStep & 127))
	{
		/* We can't maintain 16-byte alignment. */
		return general_RGBToYCbCr_16s16s_P3P3(pSrc, srcStep,
			pDst, dstStep, roi);
	}

	min = _mm_set1_epi16(-128 << 5);
	max = _mm_set1_epi16(127 << 5);
	r_buf  = (__m128i*) (pSrc[0]);
	g_buf  = (__m128i*) (pSrc[1]);
	b_buf  = (__m128i*) (pSrc[2]);
	y_buf  = (__m128i*) (pDst[0]);
	cb_buf = (__m128i*) (pDst[1]);
	cr_buf = (__m128i*) (pDst[2]);
	y_r  = _mm_set1_epi16(9798);   /*  0.299000 << 15 */
	y_g  = _mm_set1_epi16(19235);  /*  0.587000 << 15 */
	y_b  = _mm_set1_epi16(3735);   /*  0.114000 << 15 */
	cb_r = _mm_set1_epi16(-5535);  /* -0.168935 << 15 */
	cb_g = _mm_set1_epi16(-10868); /* -0.331665 << 15 */
	cb_b = _mm_set1_epi16(16403);  /*  0.500590 << 15 */
	cr_r = _mm_set1_epi16(16377);  /*  0.499813 << 15 */
	cr_g = _mm_set1_epi16(-13714); /* -0.418531 << 15 */
	cr_b = _mm_set1_epi16(-2663);  /* -0.081282 << 15 */
	/* Row strides expressed in __m128i (16-byte) units. */
	srcbump = srcStep / sizeof(__m128i);
	dstbump = dstStep / sizeof(__m128i);

#ifdef DO_PREFETCH
	/* Prefetch RGB's. */
	for (yp=0; yp<roi->height; yp++)
	{
		int i;
		for (i=0; i<roi->width * sizeof(INT16) / sizeof(__m128i);
			i += (CACHE_LINE_BYTES / sizeof(__m128i)))
		{
			_mm_prefetch((char*)(&r_buf[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&g_buf[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&b_buf[i]), _MM_HINT_NTA);
		}
		r_buf += srcbump;
		g_buf += srcbump;
		b_buf += srcbump;
	}
	r_buf = (__m128i*) (pSrc[0]);
	g_buf = (__m128i*) (pSrc[1]);
	b_buf = (__m128i*) (pSrc[2]);
#endif /* DO_PREFETCH */

	/* Number of 8-pixel (__m128i) chunks per row. */
	imax = roi->width * sizeof(INT16) / sizeof(__m128i);
	for (yp=0; yp<roi->height; ++yp)
	{
		int i;
		for (i=0; i<imax; i++)
		{
			/* In order to use SSE2 signed 16-bit integer multiplication we
			 * need to convert the floating point factors to signed int
			 * without losing information. The result of this multiplication
			 * is 32 bit and using SSE2 we get either the product's hi or lo
			 * word. Thus we will multiply the factors by the highest
			 * possible 2^n and take the upper 16 bits of the signed 32-bit
			 * result (_mm_mulhi_epi16). Since the final result needs to
			 * be scaled by << 5 and also in order to keep the precision
			 * within the upper 16 bits we will also have to scale the RGB
			 * values used in the multiplication by << 5+(16-n).
			 */
			__m128i r, g, b, y, cb, cr;
			/* BUG FIX: read the red plane from r_buf (the source); the
			 * previous code loaded from y_buf (the destination), which
			 * only produced correct results for in-place conversion
			 * where source and destination planes alias. */
			r = _mm_load_si128(r_buf+i);
			g = _mm_load_si128(g_buf+i);
			b = _mm_load_si128(b_buf+i);
			/* r<<6; g<<6; b<<6 */
			r = _mm_slli_epi16(r, 6);
			g = _mm_slli_epi16(g, 6);
			b = _mm_slli_epi16(b, 6);
			/* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
			y = _mm_mulhi_epi16(r, y_r);
			y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
			y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
			y = _mm_add_epi16(y, min);
			/* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
			_mm_between_epi16(y, min, max);
			_mm_store_si128(y_buf+i, y);
			/* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
			cb = _mm_mulhi_epi16(r, cb_r);
			cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
			cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
			/* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
			_mm_between_epi16(cb, min, max);
			_mm_store_si128(cb_buf+i, cb);
			/* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
			cr = _mm_mulhi_epi16(r, cr_r);
			cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
			cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
			/* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
			_mm_between_epi16(cr, min, max);
			_mm_store_si128(cr_buf+i, cr);
		}
		/* BUG FIX: advance source planes (R/G/B) by the source stride
		 * and destination planes (Y/Cb/Cr) by the destination stride;
		 * these were swapped, which only worked when srcStep == dstStep. */
		r_buf  += srcbump;
		g_buf  += srcbump;
		b_buf  += srcbump;
		y_buf  += dstbump;
		cb_buf += dstbump;
		cr_buf += dstbump;
	}
	return PRIMITIVES_SUCCESS;
}
/*---------------------------------------------------------------------------*/
#define LOAD128(_src_) \
_mm_load_si128((__m128i *) _src_)
#define STORE128(_dst_, _src_) \
_mm_store_si128((__m128i *) _dst_, _src_)
#define PUNPCKLBW(_dst_, _src_) \
_dst_ = _mm_unpacklo_epi8(_src_, _dst_)
#define PUNPCKHBW(_dst_, _src_) \
_dst_ = _mm_unpackhi_epi8(_src_, _dst_)
#define PUNPCKLWD(_dst_, _src_) \
_dst_ = _mm_unpacklo_epi16(_src_, _dst_)
#define PUNPCKHWD(_dst_, _src_) \
_dst_ = _mm_unpackhi_epi16(_src_, _dst_)
#define PACKUSWB(_dst_, _src_) \
_dst_ = _mm_packus_epi16(_dst_, _src_)
#define PREFETCH(_ptr_) \
_mm_prefetch((const void *) _ptr_, _MM_HINT_T0)
#define XMM_ALL_ONES \
_mm_set1_epi32(0xFFFFFFFFU)
/* Interleave three 16-bit R, G and B planes into packed 8-bit 32-bpp
 * pixels.  Each output pixel is stored as bytes B,G,R,0xFF in memory
 * (a little-endian 0xFFRRGGBB word) per the unpack trace below; the
 * alpha byte is forced to 0xFF.  Source samples are saturated to
 * [0,255] by the PACKUSWB steps.
 * Falls back to the general version when 16-byte alignment cannot be
 * maintained or the width is not a multiple of 16.
 * NOTE(review): the do/while consumes 16 pixels per pass, so a zero
 * roi->width would wrap around -- assumed nonzero; confirm at call sites.
 */
PRIM_STATIC pstatus_t sse2_RGBToRGB_16s8u_P3AC4R(
const INT16 *pSrc[3], /* 16-bit R,G, and B arrays */
INT32 srcStep, /* bytes between rows in source data */
BYTE *pDst, /* 32-bit interleaved ARGB (ABGR?) data */
INT32 dstStep, /* bytes between rows in dest data */
const prim_size_t *roi) /* region of interest */
{
const UINT16 *r = (const UINT16 *) (pSrc[0]);
const UINT16 *g = (const UINT16 *) (pSrc[1]);
const UINT16 *b = (const UINT16 *) (pSrc[2]);
BYTE *out;
int srcbump, dstbump, y;
/* Ensure 16-byte alignment on all pointers,
 * that width is a multiple of 16,
 * and that the next row will also remain aligned.
 * Since this is usually used for 64x64 aligned arrays,
 * these checks should presumably pass.
 */
if ((((ULONG_PTR) (pSrc[0]) & 0x0f) != 0)
|| (((ULONG_PTR) (pSrc[1]) & 0x0f) != 0)
|| (((ULONG_PTR) (pSrc[2]) & 0x0f) != 0)
|| (((ULONG_PTR) pDst & 0x0f) != 0)
|| (roi->width & 0x0f)
|| (srcStep & 0x0f)
|| (dstStep & 0x0f))
{
return general_RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, roi);
}
out = (BYTE *) pDst;
/* Per-row padding: source bump in UINT16 units, dest bump in bytes. */
srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
dstbump = (dstStep - (roi->width * sizeof(UINT32)));
for (y=0; y<roi->height; ++y)
{
int width = roi->width;
do {
__m128i R0, R1, R2, R3, R4;
/* The comments below pretend these are 8-byte registers
 * rather than 16-byte, for readability.
 */
R0 = LOAD128(b); b += 8; /* R0 = 00B300B200B100B0 */
R1 = LOAD128(b); b += 8; /* R1 = 00B700B600B500B4 */
PACKUSWB(R0,R1); /* R0 = B7B6B5B4B3B2B1B0 */
R1 = LOAD128(g); g += 8; /* R1 = 00G300G200G100G0 */
R2 = LOAD128(g); g += 8; /* R2 = 00G700G600G500G4 */
PACKUSWB(R1,R2); /* R1 = G7G6G5G4G3G2G1G0 */
R2 = R1; /* R2 = G7G6G5G4G3G2G1G0 */
PUNPCKLBW(R2,R0); /* R2 = G3B3G2B2G1B1G0B0 */
PUNPCKHBW(R1,R0); /* R1 = G7B7G6B7G5B5G4B4 */
R0 = LOAD128(r); r += 8; /* R0 = 00R300R200R100R0 */
R3 = LOAD128(r); r += 8; /* R3 = 00R700R600R500R4 */
PACKUSWB(R0,R3); /* R0 = R7R6R5R4R3R2R1R0 */
R3 = XMM_ALL_ONES; /* R3 = FFFFFFFFFFFFFFFF */
R4 = R3; /* R4 = FFFFFFFFFFFFFFFF */
PUNPCKLBW(R4,R0); /* R4 = FFR3FFR2FFR1FFR0 */
PUNPCKHBW(R3,R0); /* R3 = FFR7FFR6FFR5FFR4 */
R0 = R4; /* R0 = R4 */
PUNPCKLWD(R0,R2); /* R0 = FFR1G1B1FFR0G0B0 */
PUNPCKHWD(R4,R2); /* R4 = FFR3G3B3FFR2G2B2 */
R2 = R3; /* R2 = R3 */
PUNPCKLWD(R2,R1); /* R2 = FFR5G5B5FFR4G4B4 */
PUNPCKHWD(R3,R1); /* R3 = FFR7G7B7FFR6G6B6 */
STORE128(out, R0); out += 16; /* FFR1G1B1FFR0G0B0 */
STORE128(out, R4); out += 16; /* FFR3G3B3FFR2G2B2 */
STORE128(out, R2); out += 16; /* FFR5G5B5FFR4G4B4 */
STORE128(out, R3); out += 16; /* FFR7G7B7FFR6G6B6 */
} while (width -= 16);
/* Jump to next row. */
r += srcbump;
g += srcbump;
b += srcbump;
out += dstbump;
}
return PRIMITIVES_SUCCESS;
}
#endif /* WITH_SSE2 */
/*---------------------------------------------------------------------------*/
#ifdef WITH_NEON
/* Convert 16-bit signed Y, Cb and Cr planes (pSrc[0..2]) to 16-bit signed
 * R, G and B planes (pDst[0..2]) using NEON.
 * Same fixed-point scheme as the SSE2 version (factors scaled by 2^14);
 * vqdmulhq_s16 doubles the product, hence the extra right shift by 1.
 * NOTE(review): unlike the SSE2 path there is no alignment/size guard
 * here (see the TODO below) -- callers must satisfy the preconditions.
 */
PRIM_STATIC pstatus_t neon_yCbCrToRGB_16s16s_P3P3(
const INT16 *pSrc[3],
int srcStep,
INT16 *pDst[3],
int dstStep,
const prim_size_t *roi) /* region of interest */
{
/* TODO: If necessary, check alignments and call the general version. */
int16x8_t zero = vdupq_n_s16(0);
int16x8_t max = vdupq_n_s16(255);
int16x8_t r_cr = vdupq_n_s16(22986); // 1.403 << 14
int16x8_t g_cb = vdupq_n_s16(-5636); // -0.344 << 14
int16x8_t g_cr = vdupq_n_s16(-11698); // -0.714 << 14
int16x8_t b_cb = vdupq_n_s16(28999); // 1.770 << 14
int16x8_t c4096 = vdupq_n_s16(4096);
int16x8_t* y_buf = (int16x8_t*) pSrc[0];
int16x8_t* cb_buf = (int16x8_t*) pSrc[1];
int16x8_t* cr_buf = (int16x8_t*) pSrc[2];
int16x8_t* r_buf = (int16x8_t*) pDst[0];
int16x8_t* g_buf = (int16x8_t*) pDst[1];
int16x8_t* b_buf = (int16x8_t*) pDst[2];
/* Row strides expressed in int16x8_t (16-byte) units. */
int srcbump = srcStep / sizeof(int16x8_t);
int dstbump = dstStep / sizeof(int16x8_t);
int yp;
/* Number of 8-pixel chunks per row. */
int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t);
for (yp=0; yp<roi->height; ++yp)
{
int i;
for (i=0; i<imax; i++)
{
/*
In order to use NEON signed 16-bit integer multiplication we need to convert
the floating point factors to signed int without losing information.
The result of this multiplication is 32 bit and we have a NEON instruction
that returns the hi word of the saturated double.
Thus we will multiply the factors by the highest possible 2^n, take the
upper 16 bits of the signed 32-bit result (vqdmulhq_s16 followed by a right
shift by 1 to reverse the doubling) and correct this result by multiplying it
by 2^(16-n).
For the given factors in the conversion matrix the best possible n is 14.
Example for calculating r:
r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
*/
/* y = (y_buf[i] + 4096) >> 2 */
int16x8_t y = vld1q_s16((INT16*) &y_buf[i]);
y = vaddq_s16(y, c4096);
y = vshrq_n_s16(y, 2);
/* cb = cb_buf[i]; */
int16x8_t cb = vld1q_s16((INT16*)&cb_buf[i]);
/* cr = cr_buf[i]; */
int16x8_t cr = vld1q_s16((INT16*) &cr_buf[i]);
/* (y + HIWORD(cr*22986)) >> 3 */
int16x8_t r = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cr, r_cr), 1));
r = vshrq_n_s16(r, 3);
/* r_buf[i] = MINMAX(r, 0, 255); */
r = vminq_s16(vmaxq_s16(r, zero), max);
vst1q_s16((INT16*)&r_buf[i], r);
/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
int16x8_t g = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, g_cb), 1));
g = vaddq_s16(g, vshrq_n_s16(vqdmulhq_s16(cr, g_cr), 1));
g = vshrq_n_s16(g, 3);
/* g_buf[i] = MINMAX(g, 0, 255); */
g = vminq_s16(vmaxq_s16(g, zero), max);
vst1q_s16((INT16*)&g_buf[i], g);
/* (y + HIWORD(cb*28999)) >> 3 */
int16x8_t b = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, b_cb), 1));
b = vshrq_n_s16(b, 3);
/* b_buf[i] = MINMAX(b, 0, 255); */
b = vminq_s16(vmaxq_s16(b, zero), max);
vst1q_s16((INT16*)&b_buf[i], b);
}
/* Advance sources (Y/Cb/Cr) and destinations (R/G/B) one row. */
y_buf += srcbump;
cb_buf += srcbump;
cr_buf += srcbump;
r_buf += dstbump;
g_buf += dstbump;
b_buf += dstbump;
}
return PRIMITIVES_SUCCESS;
}
#endif /* WITH_NEON */
/* I don't see a direct IPP version of this, since the input is INT16
* YCbCr. It may be possible via Deinterleave and then YCbCrToRGB_<mod>.
* But that would likely be slower.
*/
/* ------------------------------------------------------------------------- */
void primitives_init_colors(const primitives_hints_t* hints, primitives_t* prims)
{
@ -734,19 +221,7 @@ void primitives_init_colors(const primitives_hints_t* hints, primitives_t* prims
prims->yCbCrToRGB_16s16s_P3P3 = general_yCbCrToRGB_16s16s_P3P3;
prims->RGBToYCbCr_16s16s_P3P3 = general_RGBToYCbCr_16s16s_P3P3;
#if defined(WITH_SSE2)
if (hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
{
prims->RGBToRGB_16s8u_P3AC4R = sse2_RGBToRGB_16s8u_P3AC4R;
prims->yCbCrToRGB_16s16s_P3P3 = sse2_yCbCrToRGB_16s16s_P3P3;
prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3;
}
#elif defined(WITH_NEON)
if (hints->arm_flags & PRIM_ARM_NEON_AVAILABLE)
{
prims->yCbCrToRGB_16s16s_P3P3 = neon_yCbCrToRGB_16s16s_P3P3;
}
#endif /* WITH_SSE2 */
primitives_init_colors_opt(hints, prims);
}
/* ------------------------------------------------------------------------- */
@ -754,3 +229,4 @@ void primitives_deinit_colors(primitives_t* prims)
{
/* Nothing to do. */
}

View File

@ -0,0 +1,32 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Color conversion operations.
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License. Algorithms used by
* this code may be covered by patents by HP, Microsoft, or other parties.
*
*/
#ifdef __GNUC__
# pragma once
#endif
#ifndef __PRIM_COLORS_H_INCLUDED__
#define __PRIM_COLORS_H_INCLUDED__
/* Portable (general) C implementations of the colour-conversion
 * primitives; the optimized SSE2/NEON variants fall back to these when
 * their alignment/size preconditions are not met.
 */
pstatus_t general_yCbCrToRGB_16s16s_P3P3(const INT16 *pSrc[3], INT32 srcStep, INT16 *pDst[3], INT32 dstStep, const prim_size_t *roi);
pstatus_t general_RGBToYCbCr_16s16s_P3P3(const INT16 *pSrc[3], INT32 srcStep, INT16 *pDst[3], INT32 dstStep, const prim_size_t *roi);
pstatus_t general_RGBToRGB_16s8u_P3AC4R(const INT16 *pSrc[3], int srcStep, BYTE *pDst, int dstStep, const prim_size_t *roi);
/* Installs the optimized colour-conversion entry points into prims when
 * the CPU supports them (see prim_colors_opt.c). */
void primitives_init_colors_opt(const primitives_hints_t* hints, primitives_t* prims);
#endif /* !__PRIM_COLORS_H_INCLUDED__ */

View File

@ -0,0 +1,561 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized Color conversion operations.
* vi:ts=4 sw=4:
*
* Copyright 2011 Stephen Erisman
* Copyright 2011 Norbert Federa <nfedera@thinstuff.com>
* Copyright 2011 Martin Fleisz <mfleisz@thinstuff.com>
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_SSE2
#include <emmintrin.h>
#elif defined(WITH_NEON)
#include <arm_neon.h>
#endif /* WITH_SSE2 else WITH_NEON */
#include "prim_internal.h"
#include "prim_templates.h"
#include "prim_colors.h"
#ifdef WITH_SSE2
#ifdef __GNUC__
# define GNU_INLINE \
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
#else
# define GNU_INLINE
#endif
#define CACHE_LINE_BYTES 64
#define _mm_between_epi16(_val, _min, _max) \
do { _val = _mm_min_epi16(_max, _mm_max_epi16(_val, _min)); } while (0)
#ifdef DO_PREFETCH
/*---------------------------------------------------------------------------*/
/* Prefetch a buffer into cache (non-temporal hint), one cache line
 * (CACHE_LINE_BYTES) at a time.  Only compiled when DO_PREFETCH is set.
 */
static inline void GNU_INLINE _mm_prefetch_buffer(
char * buffer,
int num_bytes)
{
__m128i * buf = (__m128i*) buffer;
unsigned int i;
for (i = 0; i < (num_bytes / sizeof(__m128i));
i+=(CACHE_LINE_BYTES / sizeof(__m128i)))
{
_mm_prefetch((char*)(&buf[i]), _MM_HINT_NTA);
}
}
#endif /* DO_PREFETCH */
/*---------------------------------------------------------------------------*/
/* Convert 16-bit signed Y, Cb and Cr planes (pSrc[0..2]) to 16-bit signed
 * R, G and B planes (pDst[0..2]) using SSE2 fixed-point arithmetic.
 * The conversion factors are scaled by 2^14; see the inner-loop comment.
 * srcStep/dstStep are byte strides between rows; roi gives width/height
 * in pixels.  Falls back to general_yCbCrToRGB_16s16s_P3P3 when the
 * alignment/width/stride preconditions below are not met.
 * Always returns PRIMITIVES_SUCCESS.
 */
pstatus_t sse2_yCbCrToRGB_16s16s_P3P3(
const INT16 *pSrc[3],
int srcStep,
INT16 *pDst[3],
int dstStep,
const prim_size_t *roi) /* region of interest */
{
__m128i zero, max, r_cr, g_cb, g_cr, b_cb, c4096;
__m128i *y_buf, *cb_buf, *cr_buf, *r_buf, *g_buf, *b_buf;
int srcbump, dstbump, yp, imax;
/* Require 16-byte aligned planes, a width that is a multiple of 8
 * pixels, and strides that keep every row 16-byte aligned. */
if (((ULONG_PTR) (pSrc[0]) & 0x0f)
|| ((ULONG_PTR) (pSrc[1]) & 0x0f)
|| ((ULONG_PTR) (pSrc[2]) & 0x0f)
|| ((ULONG_PTR) (pDst[0]) & 0x0f)
|| ((ULONG_PTR) (pDst[1]) & 0x0f)
|| ((ULONG_PTR) (pDst[2]) & 0x0f)
|| (roi->width & 0x07)
|| (srcStep & 127)
|| (dstStep & 127))
{
/* We can't maintain 16-byte alignment. */
return general_yCbCrToRGB_16s16s_P3P3(pSrc, srcStep,
pDst, dstStep, roi);
}
zero = _mm_setzero_si128();
max = _mm_set1_epi16(255);
y_buf = (__m128i*) (pSrc[0]);
cb_buf = (__m128i*) (pSrc[1]);
cr_buf = (__m128i*) (pSrc[2]);
r_buf = (__m128i*) (pDst[0]);
g_buf = (__m128i*) (pDst[1]);
b_buf = (__m128i*) (pDst[2]);
r_cr = _mm_set1_epi16(22986); /* 1.403 << 14 */
g_cb = _mm_set1_epi16(-5636); /* -0.344 << 14 */
g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */
b_cb = _mm_set1_epi16(28999); /* 1.770 << 14 */
c4096 = _mm_set1_epi16(4096);
/* Row strides expressed in __m128i (16-byte) units. */
srcbump = srcStep / sizeof(__m128i);
dstbump = dstStep / sizeof(__m128i);
#ifdef DO_PREFETCH
/* Prefetch Y's, Cb's, and Cr's. */
for (yp=0; yp<roi->height; yp++)
{
int i;
for (i=0; i<roi->width * sizeof(INT16) / sizeof(__m128i);
i += (CACHE_LINE_BYTES / sizeof(__m128i)))
{
_mm_prefetch((char*)(&y_buf[i]), _MM_HINT_NTA);
_mm_prefetch((char*)(&cb_buf[i]), _MM_HINT_NTA);
_mm_prefetch((char*)(&cr_buf[i]), _MM_HINT_NTA);
}
y_buf += srcbump;
cb_buf += srcbump;
cr_buf += srcbump;
}
y_buf = (__m128i*) (pSrc[0]);
cb_buf = (__m128i*) (pSrc[1]);
cr_buf = (__m128i*) (pSrc[2]);
#endif /* DO_PREFETCH */
/* Number of 8-pixel (__m128i) chunks per row. */
imax = roi->width * sizeof(INT16) / sizeof(__m128i);
for (yp=0; yp<roi->height; ++yp)
{
int i;
for (i=0; i<imax; i++)
{
/* In order to use SSE2 signed 16-bit integer multiplication
 * we need to convert the floating point factors to signed int
 * without losing information.
 * The result of this multiplication is 32 bit and we have two
 * SSE instructions that return either the hi or lo word.
 * Thus we will multiply the factors by the highest possible 2^n,
 * take the upper 16 bits of the signed 32-bit result
 * (_mm_mulhi_epi16) and correct this result by multiplying
 * it by 2^(16-n).
 *
 * For the given factors in the conversion matrix the best
 * possible n is 14.
 *
 * Example for calculating r:
 * r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
 * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
 * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
 * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
 */
/* y = (y_r_buf[i] + 4096) >> 2 */
__m128i y, cb, cr, r, g, b;
y = _mm_load_si128(y_buf + i);
y = _mm_add_epi16(y, c4096);
y = _mm_srai_epi16(y, 2);
/* cb = cb_g_buf[i]; */
cb = _mm_load_si128(cb_buf + i);
/* cr = cr_b_buf[i]; */
cr = _mm_load_si128(cr_buf + i);
/* (y + HIWORD(cr*22986)) >> 3 */
r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
r = _mm_srai_epi16(r, 3);
/* r_buf[i] = MINMAX(r, 0, 255); */
_mm_between_epi16(r, zero, max);
_mm_store_si128(r_buf + i, r);
/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
g = _mm_srai_epi16(g, 3);
/* g_buf[i] = MINMAX(g, 0, 255); */
_mm_between_epi16(g, zero, max);
_mm_store_si128(g_buf + i, g);
/* (y + HIWORD(cb*28999)) >> 3 */
b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
b = _mm_srai_epi16(b, 3);
/* b_buf[i] = MINMAX(b, 0, 255); */
_mm_between_epi16(b, zero, max);
_mm_store_si128(b_buf + i, b);
}
/* Advance sources (Y/Cb/Cr) and destinations (R/G/B) one row. */
y_buf += srcbump;
cb_buf += srcbump;
cr_buf += srcbump;
r_buf += dstbump;
g_buf += dstbump;
b_buf += dstbump;
}
return PRIMITIVES_SUCCESS;
}
/*---------------------------------------------------------------------------*/
/* The encoded YCbCr coefficients are represented as 11.5 fixed-point
 * numbers. See the general code above.
 */
/* Convert 16-bit signed R, G and B planes (pSrc[0..2]) to 16-bit signed
 * Y, Cb and Cr planes (pDst[0..2]) using SSE2 fixed-point arithmetic.
 * Conversion factors are scaled by 2^15 and applied via _mm_mulhi_epi16;
 * see the inner-loop comment.  srcStep/dstStep are byte strides between
 * rows.  Falls back to general_RGBToYCbCr_16s16s_P3P3 when the
 * alignment/width/stride preconditions below are not met.
 * Always returns PRIMITIVES_SUCCESS.
 */
pstatus_t sse2_RGBToYCbCr_16s16s_P3P3(
	const INT16 *pSrc[3],
	int srcStep,
	INT16 *pDst[3],
	int dstStep,
	const prim_size_t *roi)	/* region of interest */
{
	__m128i min, max, y_r, y_g, y_b, cb_r, cb_g, cb_b, cr_r, cr_g, cr_b;
	__m128i *r_buf, *g_buf, *b_buf, *y_buf, *cb_buf, *cr_buf;
	int srcbump, dstbump, yp, imax;

	/* Require 16-byte aligned planes, a width that is a multiple of 8
	 * pixels, and strides that keep every row 16-byte aligned. */
	if (((ULONG_PTR) (pSrc[0]) & 0x0f)
			|| ((ULONG_PTR) (pSrc[1]) & 0x0f)
			|| ((ULONG_PTR) (pSrc[2]) & 0x0f)
			|| ((ULONG_PTR) (pDst[0]) & 0x0f)
			|| ((ULONG_PTR) (pDst[1]) & 0x0f)
			|| ((ULONG_PTR) (pDst[2]) & 0x0f)
			|| (roi->width & 0x07)
			|| (srcStep & 127)
			|| (dstStep & 127))
	{
		/* We can't maintain 16-byte alignment. */
		return general_RGBToYCbCr_16s16s_P3P3(pSrc, srcStep,
			pDst, dstStep, roi);
	}

	min = _mm_set1_epi16(-128 << 5);
	max = _mm_set1_epi16(127 << 5);
	r_buf  = (__m128i*) (pSrc[0]);
	g_buf  = (__m128i*) (pSrc[1]);
	b_buf  = (__m128i*) (pSrc[2]);
	y_buf  = (__m128i*) (pDst[0]);
	cb_buf = (__m128i*) (pDst[1]);
	cr_buf = (__m128i*) (pDst[2]);
	y_r  = _mm_set1_epi16(9798);   /*  0.299000 << 15 */
	y_g  = _mm_set1_epi16(19235);  /*  0.587000 << 15 */
	y_b  = _mm_set1_epi16(3735);   /*  0.114000 << 15 */
	cb_r = _mm_set1_epi16(-5535);  /* -0.168935 << 15 */
	cb_g = _mm_set1_epi16(-10868); /* -0.331665 << 15 */
	cb_b = _mm_set1_epi16(16403);  /*  0.500590 << 15 */
	cr_r = _mm_set1_epi16(16377);  /*  0.499813 << 15 */
	cr_g = _mm_set1_epi16(-13714); /* -0.418531 << 15 */
	cr_b = _mm_set1_epi16(-2663);  /* -0.081282 << 15 */
	/* Row strides expressed in __m128i (16-byte) units. */
	srcbump = srcStep / sizeof(__m128i);
	dstbump = dstStep / sizeof(__m128i);

#ifdef DO_PREFETCH
	/* Prefetch RGB's. */
	for (yp=0; yp<roi->height; yp++)
	{
		int i;
		for (i=0; i<roi->width * sizeof(INT16) / sizeof(__m128i);
			i += (CACHE_LINE_BYTES / sizeof(__m128i)))
		{
			_mm_prefetch((char*)(&r_buf[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&g_buf[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&b_buf[i]), _MM_HINT_NTA);
		}
		r_buf += srcbump;
		g_buf += srcbump;
		b_buf += srcbump;
	}
	r_buf = (__m128i*) (pSrc[0]);
	g_buf = (__m128i*) (pSrc[1]);
	b_buf = (__m128i*) (pSrc[2]);
#endif /* DO_PREFETCH */

	/* Number of 8-pixel (__m128i) chunks per row. */
	imax = roi->width * sizeof(INT16) / sizeof(__m128i);
	for (yp=0; yp<roi->height; ++yp)
	{
		int i;
		for (i=0; i<imax; i++)
		{
			/* In order to use SSE2 signed 16-bit integer multiplication we
			 * need to convert the floating point factors to signed int
			 * without losing information. The result of this multiplication
			 * is 32 bit and using SSE2 we get either the product's hi or lo
			 * word. Thus we will multiply the factors by the highest
			 * possible 2^n and take the upper 16 bits of the signed 32-bit
			 * result (_mm_mulhi_epi16). Since the final result needs to
			 * be scaled by << 5 and also in order to keep the precision
			 * within the upper 16 bits we will also have to scale the RGB
			 * values used in the multiplication by << 5+(16-n).
			 */
			__m128i r, g, b, y, cb, cr;
			/* BUG FIX: read the red plane from r_buf (the source); the
			 * previous code loaded from y_buf (the destination), which
			 * only produced correct results for in-place conversion
			 * where source and destination planes alias. */
			r = _mm_load_si128(r_buf+i);
			g = _mm_load_si128(g_buf+i);
			b = _mm_load_si128(b_buf+i);
			/* r<<6; g<<6; b<<6 */
			r = _mm_slli_epi16(r, 6);
			g = _mm_slli_epi16(g, 6);
			b = _mm_slli_epi16(b, 6);
			/* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
			y = _mm_mulhi_epi16(r, y_r);
			y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
			y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
			y = _mm_add_epi16(y, min);
			/* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
			_mm_between_epi16(y, min, max);
			_mm_store_si128(y_buf+i, y);
			/* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
			cb = _mm_mulhi_epi16(r, cb_r);
			cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
			cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
			/* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
			_mm_between_epi16(cb, min, max);
			_mm_store_si128(cb_buf+i, cb);
			/* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
			cr = _mm_mulhi_epi16(r, cr_r);
			cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
			cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
			/* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
			_mm_between_epi16(cr, min, max);
			_mm_store_si128(cr_buf+i, cr);
		}
		/* BUG FIX: advance source planes (R/G/B) by the source stride
		 * and destination planes (Y/Cb/Cr) by the destination stride;
		 * these were swapped, which only worked when srcStep == dstStep. */
		r_buf  += srcbump;
		g_buf  += srcbump;
		b_buf  += srcbump;
		y_buf  += dstbump;
		cb_buf += dstbump;
		cr_buf += dstbump;
	}
	return PRIMITIVES_SUCCESS;
}
/*---------------------------------------------------------------------------*/
#define LOAD128(_src_) \
_mm_load_si128((__m128i *) _src_)
#define STORE128(_dst_, _src_) \
_mm_store_si128((__m128i *) _dst_, _src_)
#define PUNPCKLBW(_dst_, _src_) \
_dst_ = _mm_unpacklo_epi8(_src_, _dst_)
#define PUNPCKHBW(_dst_, _src_) \
_dst_ = _mm_unpackhi_epi8(_src_, _dst_)
#define PUNPCKLWD(_dst_, _src_) \
_dst_ = _mm_unpacklo_epi16(_src_, _dst_)
#define PUNPCKHWD(_dst_, _src_) \
_dst_ = _mm_unpackhi_epi16(_src_, _dst_)
#define PACKUSWB(_dst_, _src_) \
_dst_ = _mm_packus_epi16(_dst_, _src_)
#define PREFETCH(_ptr_) \
_mm_prefetch((const void *) _ptr_, _MM_HINT_T0)
#define XMM_ALL_ONES \
_mm_set1_epi32(0xFFFFFFFFU)
/* Interleave three 16-bit R, G and B planes into packed 8-bit 32-bpp
 * pixels.  Each output pixel is stored as bytes B,G,R,0xFF in memory
 * (a little-endian 0xFFRRGGBB word) per the unpack trace below; the
 * alpha byte is forced to 0xFF.  Source samples are saturated to
 * [0,255] by the PACKUSWB steps.
 * Falls back to the general version when 16-byte alignment cannot be
 * maintained or the width is not a multiple of 16.
 * NOTE(review): the do/while consumes 16 pixels per pass, so a zero
 * roi->width would wrap around -- assumed nonzero; confirm at call sites.
 */
pstatus_t sse2_RGBToRGB_16s8u_P3AC4R(
const INT16 *pSrc[3], /* 16-bit R,G, and B arrays */
INT32 srcStep, /* bytes between rows in source data */
BYTE *pDst, /* 32-bit interleaved ARGB (ABGR?) data */
INT32 dstStep, /* bytes between rows in dest data */
const prim_size_t *roi) /* region of interest */
{
const UINT16 *r = (const UINT16 *) (pSrc[0]);
const UINT16 *g = (const UINT16 *) (pSrc[1]);
const UINT16 *b = (const UINT16 *) (pSrc[2]);
BYTE *out;
int srcbump, dstbump, y;
/* Ensure 16-byte alignment on all pointers,
 * that width is a multiple of 16,
 * and that the next row will also remain aligned.
 * Since this is usually used for 64x64 aligned arrays,
 * these checks should presumably pass.
 */
if ((((ULONG_PTR) (pSrc[0]) & 0x0f) != 0)
|| (((ULONG_PTR) (pSrc[1]) & 0x0f) != 0)
|| (((ULONG_PTR) (pSrc[2]) & 0x0f) != 0)
|| (((ULONG_PTR) pDst & 0x0f) != 0)
|| (roi->width & 0x0f)
|| (srcStep & 0x0f)
|| (dstStep & 0x0f))
{
return general_RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, roi);
}
out = (BYTE *) pDst;
/* Per-row padding: source bump in UINT16 units, dest bump in bytes. */
srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
dstbump = (dstStep - (roi->width * sizeof(UINT32)));
for (y=0; y<roi->height; ++y)
{
int width = roi->width;
do {
__m128i R0, R1, R2, R3, R4;
/* The comments below pretend these are 8-byte registers
 * rather than 16-byte, for readability.
 */
R0 = LOAD128(b); b += 8; /* R0 = 00B300B200B100B0 */
R1 = LOAD128(b); b += 8; /* R1 = 00B700B600B500B4 */
PACKUSWB(R0,R1); /* R0 = B7B6B5B4B3B2B1B0 */
R1 = LOAD128(g); g += 8; /* R1 = 00G300G200G100G0 */
R2 = LOAD128(g); g += 8; /* R2 = 00G700G600G500G4 */
PACKUSWB(R1,R2); /* R1 = G7G6G5G4G3G2G1G0 */
R2 = R1; /* R2 = G7G6G5G4G3G2G1G0 */
PUNPCKLBW(R2,R0); /* R2 = G3B3G2B2G1B1G0B0 */
PUNPCKHBW(R1,R0); /* R1 = G7B7G6B7G5B5G4B4 */
R0 = LOAD128(r); r += 8; /* R0 = 00R300R200R100R0 */
R3 = LOAD128(r); r += 8; /* R3 = 00R700R600R500R4 */
PACKUSWB(R0,R3); /* R0 = R7R6R5R4R3R2R1R0 */
R3 = XMM_ALL_ONES; /* R3 = FFFFFFFFFFFFFFFF */
R4 = R3; /* R4 = FFFFFFFFFFFFFFFF */
PUNPCKLBW(R4,R0); /* R4 = FFR3FFR2FFR1FFR0 */
PUNPCKHBW(R3,R0); /* R3 = FFR7FFR6FFR5FFR4 */
R0 = R4; /* R0 = R4 */
PUNPCKLWD(R0,R2); /* R0 = FFR1G1B1FFR0G0B0 */
PUNPCKHWD(R4,R2); /* R4 = FFR3G3B3FFR2G2B2 */
R2 = R3; /* R2 = R3 */
PUNPCKLWD(R2,R1); /* R2 = FFR5G5B5FFR4G4B4 */
PUNPCKHWD(R3,R1); /* R3 = FFR7G7B7FFR6G6B6 */
STORE128(out, R0); out += 16; /* FFR1G1B1FFR0G0B0 */
STORE128(out, R4); out += 16; /* FFR3G3B3FFR2G2B2 */
STORE128(out, R2); out += 16; /* FFR5G5B5FFR4G4B4 */
STORE128(out, R3); out += 16; /* FFR7G7B7FFR6G6B6 */
} while (width -= 16);
/* Jump to next row. */
r += srcbump;
g += srcbump;
b += srcbump;
out += dstbump;
}
return PRIMITIVES_SUCCESS;
}
#endif /* WITH_SSE2 */
/*---------------------------------------------------------------------------*/
#ifdef WITH_NEON
/* Convert 16-bit signed Y, Cb and Cr planes (pSrc[0..2]) to 16-bit signed
 * R, G and B planes (pDst[0..2]) using NEON.
 * Same fixed-point scheme as the SSE2 version (factors scaled by 2^14);
 * vqdmulhq_s16 doubles the product, hence the extra right shift by 1.
 * NOTE(review): unlike the SSE2 path there is no alignment/size guard
 * here (see the TODO below) -- callers must satisfy the preconditions.
 */
pstatus_t neon_yCbCrToRGB_16s16s_P3P3(
const INT16 *pSrc[3],
int srcStep,
INT16 *pDst[3],
int dstStep,
const prim_size_t *roi) /* region of interest */
{
/* TODO: If necessary, check alignments and call the general version. */
int16x8_t zero = vdupq_n_s16(0);
int16x8_t max = vdupq_n_s16(255);
int16x8_t r_cr = vdupq_n_s16(22986); // 1.403 << 14
int16x8_t g_cb = vdupq_n_s16(-5636); // -0.344 << 14
int16x8_t g_cr = vdupq_n_s16(-11698); // -0.714 << 14
int16x8_t b_cb = vdupq_n_s16(28999); // 1.770 << 14
int16x8_t c4096 = vdupq_n_s16(4096);
int16x8_t* y_buf = (int16x8_t*) pSrc[0];
int16x8_t* cb_buf = (int16x8_t*) pSrc[1];
int16x8_t* cr_buf = (int16x8_t*) pSrc[2];
int16x8_t* r_buf = (int16x8_t*) pDst[0];
int16x8_t* g_buf = (int16x8_t*) pDst[1];
int16x8_t* b_buf = (int16x8_t*) pDst[2];
/* Row strides expressed in int16x8_t (16-byte) units. */
int srcbump = srcStep / sizeof(int16x8_t);
int dstbump = dstStep / sizeof(int16x8_t);
int yp;
/* Number of 8-pixel chunks per row. */
int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t);
for (yp=0; yp<roi->height; ++yp)
{
int i;
for (i=0; i<imax; i++)
{
/*
In order to use NEON signed 16-bit integer multiplication we need to convert
the floating point factors to signed int without losing information.
The result of this multiplication is 32 bit and we have a NEON instruction
that returns the hi word of the saturated double.
Thus we will multiply the factors by the highest possible 2^n, take the
upper 16 bits of the signed 32-bit result (vqdmulhq_s16 followed by a right
shift by 1 to reverse the doubling) and correct this result by multiplying it
by 2^(16-n).
For the given factors in the conversion matrix the best possible n is 14.
Example for calculating r:
r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
*/
/* y = (y_buf[i] + 4096) >> 2 */
int16x8_t y = vld1q_s16((INT16*) &y_buf[i]);
y = vaddq_s16(y, c4096);
y = vshrq_n_s16(y, 2);
/* cb = cb_buf[i]; */
int16x8_t cb = vld1q_s16((INT16*)&cb_buf[i]);
/* cr = cr_buf[i]; */
int16x8_t cr = vld1q_s16((INT16*) &cr_buf[i]);
/* (y + HIWORD(cr*22986)) >> 3 */
int16x8_t r = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cr, r_cr), 1));
r = vshrq_n_s16(r, 3);
/* r_buf[i] = MINMAX(r, 0, 255); */
r = vminq_s16(vmaxq_s16(r, zero), max);
vst1q_s16((INT16*)&r_buf[i], r);
/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
int16x8_t g = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, g_cb), 1));
g = vaddq_s16(g, vshrq_n_s16(vqdmulhq_s16(cr, g_cr), 1));
g = vshrq_n_s16(g, 3);
/* g_buf[i] = MINMAX(g, 0, 255); */
g = vminq_s16(vmaxq_s16(g, zero), max);
vst1q_s16((INT16*)&g_buf[i], g);
/* (y + HIWORD(cb*28999)) >> 3 */
int16x8_t b = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, b_cb), 1));
b = vshrq_n_s16(b, 3);
/* b_buf[i] = MINMAX(b, 0, 255); */
b = vminq_s16(vmaxq_s16(b, zero), max);
vst1q_s16((INT16*)&b_buf[i], b);
}
/* Advance sources (Y/Cb/Cr) and destinations (R/G/B) one row. */
y_buf += srcbump;
cb_buf += srcbump;
cr_buf += srcbump;
r_buf += dstbump;
g_buf += dstbump;
b_buf += dstbump;
}
return PRIMITIVES_SUCCESS;
}
#endif /* WITH_NEON */
/* I don't see a direct IPP version of this, since the input is INT16
* YCbCr. It may be possible via Deinterleave and then YCbCrToRGB_<mod>.
* But that would likely be slower.
*/
/* ------------------------------------------------------------------------- */
/* Install optimized color-conversion primitives into 'prims', overriding
 * the general versions the caller has already installed.  Selection is
 * compile-time (#if/#elif): an SSE2 build never considers NEON and vice
 * versa; with neither enabled, 'prims' is left untouched. */
void primitives_init_colors_opt(const primitives_hints_t* hints, primitives_t* prims)
{
#if defined(WITH_SSE2)
	if (hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
	{
		prims->RGBToRGB_16s8u_P3AC4R = sse2_RGBToRGB_16s8u_P3AC4R;
		prims->yCbCrToRGB_16s16s_P3P3 = sse2_yCbCrToRGB_16s16s_P3P3;
		prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3;
	}
#elif defined(WITH_NEON)
	/* Only the yCbCr->RGB direction has a NEON implementation here. */
	if (hints->arm_flags & PRIM_ARM_NEON_AVAILABLE)
	{
		prims->yCbCrToRGB_16s16s_P3P3 = neon_yCbCrToRGB_16s16s_P3P3;
	}
#endif /* WITH_SSE2 / WITH_NEON */
}

View File

@ -72,7 +72,7 @@ static BOOL memory_regions_overlap_2d(
}
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_copy_8u(
pstatus_t general_copy_8u(
const BYTE *pSrc,
BYTE *pDst,
INT32 len)
@ -94,7 +94,7 @@ PRIM_STATIC pstatus_t general_copy_8u(
* The addresses are assumed to have been already offset to the upper-left
* corners of the source and destination region of interest.
*/
PRIM_STATIC pstatus_t general_copy_8u_AC4r(
pstatus_t general_copy_8u_AC4r(
const BYTE *pSrc, INT32 srcStep,
BYTE *pDst, INT32 dstStep,
INT32 width, INT32 height)

View File

@ -27,15 +27,6 @@
#include <freerdp/primitives.h>
/* Normally the internal entrypoints should be static, but a benchmark
* program may want to access them directly and turn this off.
*/
#ifndef PRIM_STATIC
# define PRIM_STATIC static
#else
# undef PRIM_STATIC
# define PRIM_STATIC
#endif /* !PRIM_STATIC */
/* Use lddqu for unaligned; load for 16-byte aligned. */
#define LOAD_SI128(_ptr_) \

View File

@ -19,18 +19,15 @@
#endif
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_SSE2
# include <emmintrin.h>
#endif /* WITH_SSE2 */
#ifdef WITH_IPP
# include <ipps.h>
#endif /* WITH_IPP */
#include "prim_internal.h"
#include "prim_set.h"
/* ========================================================================= */
PRIM_STATIC pstatus_t general_set_8u(
pstatus_t general_set_8u(
BYTE val,
BYTE *pDst,
INT32 len)
@ -40,7 +37,7 @@ PRIM_STATIC pstatus_t general_set_8u(
}
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_zero(
pstatus_t general_zero(
void *pDst,
size_t len)
{
@ -48,75 +45,8 @@ PRIM_STATIC pstatus_t general_zero(
return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
#ifdef WITH_SSE2
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
PRIM_STATIC pstatus_t sse2_set_8u(
BYTE val,
BYTE *pDst,
INT32 len)
{
BYTE byte, *dptr;
__m128i xmm0;
size_t count;
if (len < 16) return general_set_8u(val, pDst, len);
byte = val;
dptr = (BYTE *) pDst;
/* Seek 16-byte alignment. */
while ((ULONG_PTR) dptr & 0x0f)
{
*dptr++ = byte;
if (--len == 0) return PRIMITIVES_SUCCESS;
}
xmm0 = _mm_set1_epi8(byte);
/* Cover 256-byte chunks via SSE register stores. */
count = len >> 8;
len -= count << 8;
/* Do 256-byte chunks using one XMM register. */
while (count--)
{
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
}
/* Cover 16-byte chunks via SSE register stores. */
count = len >> 4;
len -= count << 4;
/* Do 16-byte chunks using one XMM register. */
while (count--)
{
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
}
/* Do leftover bytes. */
while (len--) *dptr++ = byte;
return PRIMITIVES_SUCCESS;
}
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif /* WITH_SSE2 */
/* ========================================================================= */
PRIM_STATIC pstatus_t general_set_32s(
pstatus_t general_set_32s(
INT32 val,
INT32 *pDst,
INT32 len)
@ -148,7 +78,7 @@ PRIM_STATIC pstatus_t general_set_32s(
}
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_set_32u(
pstatus_t general_set_32u(
UINT32 val,
UINT32 *pDst,
INT32 len)
@ -179,104 +109,6 @@ PRIM_STATIC pstatus_t general_set_32u(
return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
#ifdef WITH_SSE2
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
PRIM_STATIC pstatus_t sse2_set_32u(
UINT32 val,
UINT32 *pDst,
INT32 len)
{
UINT32 *dptr = (UINT32 *) pDst;
__m128i xmm0;
size_t count;
/* If really short, just do it here. */
if (len < 32)
{
while (len--) *dptr++ = val;
return PRIMITIVES_SUCCESS;
}
/* Assure we can reach 16-byte alignment. */
if (((ULONG_PTR) dptr & 0x03) != 0)
{
return general_set_32u(val, pDst, len);
}
/* Seek 16-byte alignment. */
while ((ULONG_PTR) dptr & 0x0f)
{
*dptr++ = val;
if (--len == 0) return PRIMITIVES_SUCCESS;
}
xmm0 = _mm_set1_epi32(val);
/* Cover 256-byte chunks via SSE register stores. */
count = len >> 6;
len -= count << 6;
/* Do 256-byte chunks using one XMM register. */
while (count--)
{
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
}
/* Cover 16-byte chunks via SSE register stores. */
count = len >> 2;
len -= count << 2;
/* Do 16-byte chunks using one XMM register. */
while (count--)
{
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
}
/* Do leftover bytes. */
while (len--) *dptr++ = val;
return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t sse2_set_32s(
INT32 val,
INT32 *pDst,
INT32 len)
{
UINT32 uval = *((UINT32 *) &val);
return sse2_set_32u(uval, (UINT32 *) pDst, len);
}
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif /* WITH_SSE2 */
#ifdef WITH_IPP
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t ipp_wrapper_set_32u(
UINT32 val,
UINT32 *pDst,
INT32 len)
{
/* A little type conversion, then use the signed version. */
INT32 sval = *((INT32 *) &val);
return ippsSet_32s(sval, (INT32 *) pDst, len);
}
#endif
/* ------------------------------------------------------------------------- */
void primitives_init_set(
const primitives_hints_t *hints,
@ -288,20 +120,7 @@ void primitives_init_set(
prims->set_32u = general_set_32u;
prims->zero = general_zero;
/* Pick tuned versions if possible. */
#ifdef WITH_IPP
prims->set_8u = (__set_8u_t) ippsSet_8u;
prims->set_32s = (__set_32s_t) ippsSet_32s;
prims->set_32u = (__set_32u_t) ipp_wrapper_set_32u;
prims->zero = (__zero_t) ippsZero_8u;
#elif defined(WITH_SSE2)
if (hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
{
prims->set_8u = sse2_set_8u;
prims->set_32s = sse2_set_32s;
prims->set_32u = sse2_set_32u;
}
#endif
primitives_init_set_opt(hints, prims);
}
/* ------------------------------------------------------------------------- */
@ -310,3 +129,4 @@ void primitives_deinit_set(
{
/* Nothing to do. */
}

View File

@ -0,0 +1,34 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Routines to set a chunk of memory to a constant.
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License. Algorithms used by
* this code may be covered by patents by HP, Microsoft, or other parties.
*
*/
#ifdef __GNUC__
# pragma once
#endif
#ifndef __PRIM_SET_H_INCLUDED__
#define __PRIM_SET_H_INCLUDED__
/* General (scalar) implementations.  The optimized variants also call
 * these as fallbacks for short or misaligned buffers. */
pstatus_t general_set_8u(BYTE val, BYTE *pDst, INT32 len);
pstatus_t general_zero(void *pDst, size_t len);
pstatus_t general_set_32s(INT32 val, INT32 *pDst, INT32 len);
pstatus_t general_set_32u(UINT32 val, UINT32 *pDst, INT32 len);
/* Overwrites the function pointers in 'prims' with tuned versions
 * (IPP or SSE2) when the build and the CPU support them. */
void primitives_init_set_opt(const primitives_hints_t *hints, primitives_t *prims);
#endif /* !__PRIM_SET_H_INCLUDED__ */

View File

@ -0,0 +1,218 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized routines to set a chunk of memory to a constant.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_SSE2
# include <emmintrin.h>
#endif /* WITH_SSE2 */
#ifdef WITH_IPP
# include <ipps.h>
#endif /* WITH_IPP */
#include "prim_internal.h"
#include "prim_set.h"
/* ========================================================================= */
#ifdef WITH_SSE2
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
/* Fill 'len' bytes at pDst with 'val' using 16-byte aligned SSE2 stores.
 * Strategy:
 *   1. fall back to general_set_8u() for very short runs,
 *   2. scalar stores until dptr reaches 16-byte alignment,
 *   3. 256-byte chunks (16 aligned stores unrolled per iteration),
 *   4. remaining 16-byte chunks,
 *   5. scalar tail. */
pstatus_t sse2_set_8u(
	BYTE val,
	BYTE *pDst,
	INT32 len)
{
	BYTE byte, *dptr;
	__m128i xmm0;
	size_t count;

	/* Too short to amortize the alignment seek and SIMD setup. */
	if (len < 16) return general_set_8u(val, pDst, len);
	byte  = val;
	dptr = (BYTE *) pDst;

	/* Seek 16-byte alignment. */
	while ((ULONG_PTR) dptr & 0x0f)
	{
		*dptr++ = byte;
		if (--len == 0) return PRIMITIVES_SUCCESS;
	}

	xmm0 = _mm_set1_epi8(byte);

	/* Cover 256-byte chunks via SSE register stores. */
	count = len >> 8;
	len -= count << 8;

	/* Do 256-byte chunks using one XMM register. */
	while (count--)
	{
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
	}

	/* Cover 16-byte chunks via SSE register stores. */
	count = len >> 4;
	len -= count << 4;

	/* Do 16-byte chunks using one XMM register. */
	while (count--)
	{
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
	}

	/* Do leftover bytes. */
	while (len--) *dptr++ = byte;

	return PRIMITIVES_SUCCESS;
}
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif /* WITH_SSE2 */
/* ------------------------------------------------------------------------- */
#ifdef WITH_SSE2
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
/* Fill 'len' 32-bit words at pDst with 'val' using 16-byte aligned SSE2
 * stores.  Same staging as sse2_set_8u, but stepping in whole words:
 * scalar lead-in to 16-byte alignment, 256-byte unrolled chunks
 * (64 words), 16-byte chunks (4 words), scalar tail. */
pstatus_t sse2_set_32u(
	UINT32 val,
	UINT32 *pDst,
	INT32 len)
{
	UINT32 *dptr = (UINT32 *) pDst;
	__m128i xmm0;
	size_t count;

	/* If really short, just do it here. */
	if (len < 32)
	{
		while (len--) *dptr++ = val;
		return PRIMITIVES_SUCCESS;
	}

	/* Assure we can reach 16-byte alignment.  If the pointer is not even
	 * 4-byte aligned, stepping by whole words can never reach a 16-byte
	 * boundary, so punt to the general version. */
	if (((ULONG_PTR) dptr & 0x03) != 0)
	{
		return general_set_32u(val, pDst, len);
	}

	/* Seek 16-byte alignment. */
	while ((ULONG_PTR) dptr & 0x0f)
	{
		*dptr++ = val;
		if (--len == 0) return PRIMITIVES_SUCCESS;
	}

	xmm0 = _mm_set1_epi32(val);

	/* Cover 256-byte (64-word) chunks via SSE register stores. */
	count = len >> 6;
	len -= count << 6;

	/* Do 256-byte chunks using one XMM register. */
	while (count--)
	{
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
	}

	/* Cover 16-byte (4-word) chunks via SSE register stores. */
	count = len >> 2;
	len -= count << 2;

	/* Do 16-byte chunks using one XMM register. */
	while (count--)
	{
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
	}

	/* Do leftover words. */
	while (len--) *dptr++ = val;

	return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/* Signed 32-bit fill: reinterpret the value's bit pattern as unsigned
 * and delegate to sse2_set_32u, which does the aligned SIMD work. */
pstatus_t sse2_set_32s(
	INT32 val,
	INT32 *pDst,
	INT32 len)
{
	UINT32 uval;

	memcpy(&uval, &val, sizeof(uval));
	return sse2_set_32u(uval, (UINT32 *) pDst, len);
}
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif /* WITH_SSE2 */
#ifdef WITH_IPP
/* ------------------------------------------------------------------------- */
/* Adapter for IPP: ippsSet_32s wants a signed value and destination, so
 * reinterpret the unsigned bit pattern as signed before delegating. */
pstatus_t ipp_wrapper_set_32u(
	UINT32 val,
	UINT32 *pDst,
	INT32 len)
{
	INT32 sval;

	memcpy(&sval, &val, sizeof(sval));
	return ippsSet_32s(sval, (INT32 *) pDst, len);
}
#endif
/* ------------------------------------------------------------------------- */
/* Install tuned set/zero primitives over the general versions that
 * primitives_init_set() has already placed in 'prims'.  Compile-time
 * preference: IPP first, then SSE2 (gated on the CPU hint); otherwise
 * the general routines remain in effect. */
void primitives_init_set_opt(const primitives_hints_t *hints, primitives_t *prims)
{
	/* Pick tuned versions if possible. */
#ifdef WITH_IPP
	/* The casts adapt IPP's prototypes to the primitives_t slots. */
	prims->set_8u = (__set_8u_t) ippsSet_8u;
	prims->set_32s = (__set_32s_t) ippsSet_32s;
	prims->set_32u = (__set_32u_t) ipp_wrapper_set_32u;
	prims->zero = (__zero_t) ippsZero_8u;
#elif defined(WITH_SSE2)
	if (hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
	{
		prims->set_8u = sse2_set_8u;
		prims->set_32s = sse2_set_32s;
		prims->set_32u = sse2_set_32u;
	}
#endif
}

View File

@ -17,25 +17,15 @@
#include "config.h"
#endif
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_SSE2
#include <emmintrin.h>
#include <pmmintrin.h>
#endif /* WITH_SSE2 */
#ifdef WITH_IPP
#include <ipps.h>
#endif /* WITH_IPP */
#include "prim_internal.h"
#include "prim_templates.h"
#include "prim_shift.h"
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_lShiftC_16s(
pstatus_t general_lShiftC_16s(
const INT16 *pSrc,
INT32 val,
INT16 *pDst,
@ -47,7 +37,7 @@ PRIM_STATIC pstatus_t general_lShiftC_16s(
}
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_rShiftC_16s(
pstatus_t general_rShiftC_16s(
const INT16 *pSrc,
INT32 val,
INT16 *pDst,
@ -59,7 +49,7 @@ PRIM_STATIC pstatus_t general_rShiftC_16s(
}
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_lShiftC_16u(
pstatus_t general_lShiftC_16u(
const UINT16 *pSrc,
INT32 val,
UINT16 *pDst,
@ -71,7 +61,7 @@ PRIM_STATIC pstatus_t general_lShiftC_16u(
}
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_rShiftC_16u(
pstatus_t general_rShiftC_16u(
const UINT16 *pSrc,
INT32 val,
UINT16 *pDst,
@ -82,25 +72,8 @@ PRIM_STATIC pstatus_t general_rShiftC_16u(
return PRIMITIVES_SUCCESS;
}
#ifdef WITH_SSE2
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
/* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, general_lShiftC_16s,
_mm_slli_epi16, *dptr++ = *sptr++ << val)
/* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, general_rShiftC_16s,
_mm_srai_epi16, *dptr++ = *sptr++ >> val)
/* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, general_lShiftC_16u,
_mm_slli_epi16, *dptr++ = *sptr++ << val)
/* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, general_rShiftC_16u,
_mm_srli_epi16, *dptr++ = *sptr++ >> val)
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_shiftC_16s(
pstatus_t general_shiftC_16s(
const INT16 *pSrc,
INT32 val,
INT16 *pDst,
@ -115,7 +88,7 @@ PRIM_STATIC pstatus_t general_shiftC_16s(
}
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_shiftC_16u(
pstatus_t general_shiftC_16u(
const UINT16 *pSrc,
INT32 val,
UINT16 *pDst,
@ -129,11 +102,6 @@ PRIM_STATIC pstatus_t general_shiftC_16u(
else return prims->lShiftC_16u(pSrc, val, pDst, len);
}
/* Note: the IPP version will have to call ippLShiftC_16s or ippRShiftC_16s
* depending on the sign of val. To avoid using the deprecated inplace
* routines, a wrapper can use the src for the dest.
*/
/* ------------------------------------------------------------------------- */
void primitives_init_shift(
const primitives_hints_t *hints,
@ -144,24 +112,12 @@ void primitives_init_shift(
prims->rShiftC_16s = general_rShiftC_16s;
prims->lShiftC_16u = general_lShiftC_16u;
prims->rShiftC_16u = general_rShiftC_16u;
#if defined(WITH_IPP)
prims->lShiftC_16s = (__lShiftC_16s_t) ippsLShiftC_16s;
prims->rShiftC_16s = (__rShiftC_16s_t) ippsRShiftC_16s;
prims->lShiftC_16u = (__lShiftC_16u_t) ippsLShiftC_16u;
prims->rShiftC_16u = (__rShiftC_16u_t) ippsRShiftC_16u;
#elif defined(WITH_SSE2)
if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE))
{
prims->lShiftC_16s = sse2_lShiftC_16s;
prims->rShiftC_16s = sse2_rShiftC_16s;
prims->lShiftC_16u = sse2_lShiftC_16u;
prims->rShiftC_16u = sse2_rShiftC_16u;
}
#endif
/* Wrappers */
prims->shiftC_16s = general_shiftC_16s;
prims->shiftC_16u = general_shiftC_16u;
primitives_init_shift_opt(hints, prims);
}
/* ------------------------------------------------------------------------- */

View File

@ -0,0 +1,35 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Shift operations.
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License. Algorithms used by
* this code may be covered by patents by HP, Microsoft, or other parties.
*
*/
#ifdef __GNUC__
# pragma once
#endif
#ifndef __PRIM_SHIFT_H_INCLUDED__
#define __PRIM_SHIFT_H_INCLUDED__
pstatus_t general_lShiftC_16s(const INT16 *pSrc, INT32 val, INT16 *pDst, INT32 len);
pstatus_t general_rShiftC_16s(const INT16 *pSrc, INT32 val, INT16 *pDst, INT32 len);
pstatus_t general_lShiftC_16u(const UINT16 *pSrc, INT32 val, UINT16 *pDst, INT32 len);
pstatus_t general_rShiftC_16u(const UINT16 *pSrc, INT32 val, UINT16 *pDst, INT32 len);
pstatus_t general_shiftC_16s(const INT16 *pSrc, INT32 val, INT16 *pDst, INT32 len);
pstatus_t general_shiftC_16u(const UINT16 *pSrc, INT32 val, UINT16 *pDst, INT32 len);
void primitives_init_shift_opt(const primitives_hints_t *hints, primitives_t *prims);
#endif /* !__PRIM_SHIFT_H_INCLUDED__ */

View File

@ -0,0 +1,79 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Shift operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_SSE2
#include <emmintrin.h>
#include <pmmintrin.h>
#endif /* WITH_SSE2 */
#ifdef WITH_IPP
#include <ipps.h>
#endif /* WITH_IPP */
#include "prim_internal.h"
#include "prim_templates.h"
#include "prim_shift.h"
#ifdef WITH_SSE2
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
/* Each SSE3_SCD_ROUTINE invocation instantiates a shift-by-constant
 * routine: (name, element type, general fallback, SSE intrinsic applied
 * per vector, scalar expression for leftover elements).
 * NOTE(review): the macro body lives in prim_templates.h — confirm the
 * exact fallback/alignment behavior there. */
/* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, general_lShiftC_16s,
	_mm_slli_epi16, *dptr++ = *sptr++ << val)
/* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, general_rShiftC_16s,
	_mm_srai_epi16, *dptr++ = *sptr++ >> val)
/* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, general_lShiftC_16u,
	_mm_slli_epi16, *dptr++ = *sptr++ << val)
/* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, general_rShiftC_16u,
	_mm_srli_epi16, *dptr++ = *sptr++ >> val)
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif /* WITH_SSE2 */
/* Note: the IPP version will have to call ippLShiftC_16s or ippRShiftC_16s
* depending on the sign of val. To avoid using the deprecated inplace
* routines, a wrapper can use the src for the dest.
*/
/* ------------------------------------------------------------------------- */
/* Install optimized constant-shift routines over the general versions
 * already set up by the caller.  IPP is preferred at compile time; the
 * SSE path requires both SSE2 and SSE3 (the unaligned loads presumably
 * use lddqu — see LOAD_SI128 in prim_internal.h). */
void primitives_init_shift_opt(const primitives_hints_t *hints, primitives_t *prims)
{
#if defined(WITH_IPP)
	/* Casts adapt the IPP prototypes to the primitives_t slots. */
	prims->lShiftC_16s = (__lShiftC_16s_t) ippsLShiftC_16s;
	prims->rShiftC_16s = (__rShiftC_16s_t) ippsRShiftC_16s;
	prims->lShiftC_16u = (__lShiftC_16u_t) ippsLShiftC_16u;
	prims->rShiftC_16u = (__rShiftC_16u_t) ippsRShiftC_16u;
#elif defined(WITH_SSE2)
	if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
		&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE))
	{
		prims->lShiftC_16s = sse2_lShiftC_16s;
		prims->rShiftC_16s = sse2_rShiftC_16s;
		prims->lShiftC_16u = sse2_lShiftC_16u;
		prims->rShiftC_16u = sse2_rShiftC_16u;
	}
#endif
}

View File

@ -17,22 +17,16 @@
#include "config.h"
#endif
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_SSE2
#include <emmintrin.h>
#include <tmmintrin.h>
#endif /* WITH_SSE2 */
#include "prim_internal.h"
#include "prim_sign.h"
/* ----------------------------------------------------------------------------
* Set pDst to the sign-value of the 16-bit values in pSrc (-1, 0, or 1).
*/
PRIM_STATIC pstatus_t general_sign_16s(
pstatus_t general_sign_16s(
const INT16 *pSrc,
INT16 *pDst,
INT32 len)
@ -46,110 +40,6 @@ PRIM_STATIC pstatus_t general_sign_16s(
return PRIMITIVES_SUCCESS;
}
#ifdef WITH_SSE2
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t ssse3_sign_16s(
const INT16 *pSrc,
INT16 *pDst,
INT32 len)
{
const INT16 *sptr = (const INT16 *) pSrc;
INT16 *dptr = (INT16 *) pDst;
size_t count;
if (len < 16)
{
return general_sign_16s(pSrc, pDst, len);
}
/* Check for 16-byte alignment (eventually). */
if ((ULONG_PTR) pDst & 0x01)
{
return general_sign_16s(pSrc, pDst, len);
}
/* Seek 16-byte alignment. */
while ((ULONG_PTR) dptr & 0x0f)
{
INT16 src = *sptr++;
*dptr++ = (src < 0) ? (-1) : ((src > 0) ? 1 : 0);
if (--len == 0) return PRIMITIVES_SUCCESS;
}
/* Do 32-short chunks using 8 XMM registers. */
count = len >> 5; /* / 32 */
len -= count << 5; /* * 32 */
if ((ULONG_PTR) sptr & 0x0f)
{
/* Unaligned */
while (count--)
{
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
xmm0 = _mm_set1_epi16(0x0001U);
xmm1 = _mm_set1_epi16(0x0001U);
xmm2 = _mm_set1_epi16(0x0001U);
xmm3 = _mm_set1_epi16(0x0001U);
xmm4 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8;
xmm5 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8;
xmm6 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8;
xmm7 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8;
xmm0 = _mm_sign_epi16(xmm0, xmm4);
xmm1 = _mm_sign_epi16(xmm1, xmm5);
xmm2 = _mm_sign_epi16(xmm2, xmm6);
xmm3 = _mm_sign_epi16(xmm3, xmm7);
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 8;
_mm_store_si128((__m128i *) dptr, xmm1); dptr += 8;
_mm_store_si128((__m128i *) dptr, xmm2); dptr += 8;
_mm_store_si128((__m128i *) dptr, xmm3); dptr += 8;
}
}
else
{
/* Aligned */
while (count--)
{
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
xmm0 = _mm_set1_epi16(0x0001U);
xmm1 = _mm_set1_epi16(0x0001U);
xmm2 = _mm_set1_epi16(0x0001U);
xmm3 = _mm_set1_epi16(0x0001U);
xmm4 = _mm_load_si128((__m128i *) sptr); sptr += 8;
xmm5 = _mm_load_si128((__m128i *) sptr); sptr += 8;
xmm6 = _mm_load_si128((__m128i *) sptr); sptr += 8;
xmm7 = _mm_load_si128((__m128i *) sptr); sptr += 8;
xmm0 = _mm_sign_epi16(xmm0, xmm4);
xmm1 = _mm_sign_epi16(xmm1, xmm5);
xmm2 = _mm_sign_epi16(xmm2, xmm6);
xmm3 = _mm_sign_epi16(xmm3, xmm7);
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 8;
_mm_store_si128((__m128i *) dptr, xmm1); dptr += 8;
_mm_store_si128((__m128i *) dptr, xmm2); dptr += 8;
_mm_store_si128((__m128i *) dptr, xmm3); dptr += 8;
}
}
/* Do 8-short chunks using two XMM registers. */
count = len >> 3;
len -= count << 3;
while (count--)
{
__m128i xmm0 = _mm_set1_epi16(0x0001U);
__m128i xmm1 = LOAD_SI128(sptr); sptr += 8;
xmm0 = _mm_sign_epi16(xmm0, xmm1);
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 8;
}
/* Do leftovers. */
while (len--)
{
INT16 src = *sptr++;
*dptr++ = (src < 0) ? -1 : ((src > 0) ? 1 : 0);
}
return PRIMITIVES_SUCCESS;
}
#endif /* WITH_SSE2 */
/* ------------------------------------------------------------------------- */
void primitives_init_sign(
const primitives_hints_t *hints,
@ -157,15 +47,8 @@ void primitives_init_sign(
{
/* Start with the default. */
prims->sign_16s = general_sign_16s;
/* Pick tuned versions if possible. */
/* I didn't spot an IPP version of this. */
#if defined(WITH_SSE2)
if ((hints->x86_flags & PRIM_X86_SSSE3_AVAILABLE)
&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE))
{
prims->sign_16s = ssse3_sign_16s;
}
#endif
primitives_init_sign_opt(hints, prims);
}
/* ------------------------------------------------------------------------- */
@ -174,3 +57,4 @@ void primitives_deinit_sign(
{
/* Nothing to do. */
}

View File

@ -0,0 +1,30 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Sign operations.
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License. Algorithms used by
* this code may be covered by patents by HP, Microsoft, or other parties.
*
*/
#ifdef __GNUC__
# pragma once
#endif
#ifndef __PRIM_SIGN_H_INCLUDED__
#define __PRIM_SIGN_H_INCLUDED__
pstatus_t general_sign_16s(const INT16 *pSrc, INT16 *pDst, INT32 len);
void primitives_init_sign_opt(const primitives_hints_t *hints, primitives_t *prims);
#endif /* !__PRIM_SIGN_H_INCLUDED__ */

View File

@ -0,0 +1,149 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized sign operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_SSE2
#include <emmintrin.h>
#include <tmmintrin.h>
#endif /* WITH_SSE2 */
#include "prim_internal.h"
#include "prim_sign.h"
#ifdef WITH_SSE2
/* ------------------------------------------------------------------------- */
/* SSSE3 implementation of the 16-bit sign primitive:
 * pDst[i] = -1, 0, or +1 according to the sign of pSrc[i].
 * Relies on _mm_sign_epi16(ones, x), which yields exactly that mapping
 * when the first operand is all-ones. Falls back to general_sign_16s for
 * short buffers or destinations that can never reach 16-byte alignment. */
pstatus_t ssse3_sign_16s(
	const INT16 *pSrc,
	INT16 *pDst,
	INT32 len)
{
	const INT16 *sptr = (const INT16 *) pSrc;
	INT16 *dptr = (INT16 *) pDst;
	size_t count;

	/* SIMD setup isn't worth it for tiny buffers. */
	if (len < 16)
	{
		return general_sign_16s(pSrc, pDst, len);
	}

	/* An odd-byte destination can never become 16-byte aligned by stepping
	 * whole INT16s, so the aligned stores below would fault; bail out.
	 * (Comment in original: "Check for 16-byte alignment (eventually)".) */
	if ((ULONG_PTR) pDst & 0x01)
	{
		return general_sign_16s(pSrc, pDst, len);
	}

	/* Process elements one at a time until dptr is 16-byte aligned, so the
	 * vector loop may use aligned stores. */
	while ((ULONG_PTR) dptr & 0x0f)
	{
		INT16 src = *sptr++;
		*dptr++ = (src < 0) ? (-1) : ((src > 0) ? 1 : 0);
		if (--len == 0) return PRIMITIVES_SUCCESS;
	}

	/* Do 32-short chunks using 8 XMM registers. */
	count = len >> 5;		/* / 32 */
	len -= count << 5;		/* * 32 */
	if ((ULONG_PTR) sptr & 0x0f)
	{
		/* Unaligned source: lddqu loads (SSE3) tolerate any alignment. */
		while (count--)
		{
			__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
			/* All-ones operands so _mm_sign_epi16 produces -1/0/+1. */
			xmm0 = _mm_set1_epi16(0x0001U);
			xmm1 = _mm_set1_epi16(0x0001U);
			xmm2 = _mm_set1_epi16(0x0001U);
			xmm3 = _mm_set1_epi16(0x0001U);
			xmm4 = _mm_lddqu_si128((__m128i *) sptr);	sptr += 8;
			xmm5 = _mm_lddqu_si128((__m128i *) sptr);	sptr += 8;
			xmm6 = _mm_lddqu_si128((__m128i *) sptr);	sptr += 8;
			xmm7 = _mm_lddqu_si128((__m128i *) sptr);	sptr += 8;
			xmm0 = _mm_sign_epi16(xmm0, xmm4);
			xmm1 = _mm_sign_epi16(xmm1, xmm5);
			xmm2 = _mm_sign_epi16(xmm2, xmm6);
			xmm3 = _mm_sign_epi16(xmm3, xmm7);
			/* dptr was 16-byte aligned above, so aligned stores are safe. */
			_mm_store_si128((__m128i *) dptr, xmm0);	dptr += 8;
			_mm_store_si128((__m128i *) dptr, xmm1);	dptr += 8;
			_mm_store_si128((__m128i *) dptr, xmm2);	dptr += 8;
			_mm_store_si128((__m128i *) dptr, xmm3);	dptr += 8;
		}
	}
	else
	{
		/* Aligned source: plain aligned loads. */
		while (count--)
		{
			__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
			xmm0 = _mm_set1_epi16(0x0001U);
			xmm1 = _mm_set1_epi16(0x0001U);
			xmm2 = _mm_set1_epi16(0x0001U);
			xmm3 = _mm_set1_epi16(0x0001U);
			xmm4 = _mm_load_si128((__m128i *) sptr);	sptr += 8;
			xmm5 = _mm_load_si128((__m128i *) sptr);	sptr += 8;
			xmm6 = _mm_load_si128((__m128i *) sptr);	sptr += 8;
			xmm7 = _mm_load_si128((__m128i *) sptr);	sptr += 8;
			xmm0 = _mm_sign_epi16(xmm0, xmm4);
			xmm1 = _mm_sign_epi16(xmm1, xmm5);
			xmm2 = _mm_sign_epi16(xmm2, xmm6);
			xmm3 = _mm_sign_epi16(xmm3, xmm7);
			_mm_store_si128((__m128i *) dptr, xmm0);	dptr += 8;
			_mm_store_si128((__m128i *) dptr, xmm1);	dptr += 8;
			_mm_store_si128((__m128i *) dptr, xmm2);	dptr += 8;
			_mm_store_si128((__m128i *) dptr, xmm3);	dptr += 8;
		}
	}

	/* Do 8-short chunks using two XMM registers.
	 * LOAD_SI128 (prim_internal.h) presumably picks the right load for the
	 * source's alignment — TODO confirm against prim_internal.h. */
	count = len >> 3;
	len -= count << 3;
	while (count--)
	{
		__m128i xmm0 = _mm_set1_epi16(0x0001U);
		__m128i xmm1 = LOAD_SI128(sptr);	sptr += 8;
		xmm0 = _mm_sign_epi16(xmm0, xmm1);
		_mm_store_si128((__m128i *) dptr, xmm0);	dptr += 8;
	}

	/* Do leftovers (fewer than 8 elements) scalar. */
	while (len--)
	{
		INT16 src = *sptr++;
		*dptr++ = (src < 0) ? -1 : ((src > 0) ? 1 : 0);
	}

	return PRIMITIVES_SUCCESS;
}
#endif /* WITH_SSE2 */
/* ------------------------------------------------------------------------- */
/* ------------------------------------------------------------------------- */
/* Hook the tuned sign_16s into the primitives table when the CPU supports
 * it; otherwise the table keeps whatever (generic) entry it already has.
 * No IPP variant of this operation is known. */
void primitives_init_sign_opt(const primitives_hints_t *hints, primitives_t *prims)
{
#if defined(WITH_SSE2)
	/* ssse3_sign_16s needs both SSE3 (lddqu) and SSSE3 (psignw). */
	const UINT32 required =
		PRIM_X86_SSSE3_AVAILABLE | PRIM_X86_SSE3_AVAILABLE;

	if ((hints->x86_flags & required) == required)
		prims->sign_16s = ssse3_sign_16s;
#endif
}

View File

@ -44,7 +44,7 @@
* SCD = Source, Constant, Destination
*/
#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
PRIM_STATIC pstatus_t _name_(const _type_ *pSrc, INT32 val, _type_ *pDst, INT32 len) \
pstatus_t _name_(const _type_ *pSrc, INT32 val, _type_ *pDst, INT32 len) \
{ \
int shifts; \
UINT32 offBeatMask; \
@ -188,7 +188,7 @@ PRIM_STATIC pstatus_t _name_(const _type_ *pSrc, INT32 val, _type_ *pDst, INT32
* PRE = preload xmm0 with the constant.
*/
#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
PRIM_STATIC pstatus_t _name_(const _type_ *pSrc, _type_ val, _type_ *pDst, INT32 len) \
pstatus_t _name_(const _type_ *pSrc, _type_ val, _type_ *pDst, INT32 len) \
{ \
int shifts; \
UINT32 offBeatMask; \
@ -293,7 +293,7 @@ PRIM_STATIC pstatus_t _name_(const _type_ *pSrc, _type_ val, _type_ *pDst, INT32
* SSD = Source1, Source2, Destination
*/
#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
PRIM_STATIC pstatus_t _name_(const _type_ *pSrc1, const _type_ *pSrc2, _type_ *pDst, INT32 len) \
pstatus_t _name_(const _type_ *pSrc1, const _type_ *pSrc2, _type_ *pDst, INT32 len) \
{ \
int shifts; \
UINT32 offBeatMask; \

View File

@ -150,7 +150,7 @@ static void set_hints(primitives_hints_t* hints)
#elif defined(_M_ARM)
static UINT32 androidNeon(void)
static UINT32 getNeonSupport(void)
{
#ifdef __ANDROID__
if (android_getCpuFamily() != ANDROID_CPU_FAMILY_ARM) return 0;
@ -164,7 +164,9 @@ static UINT32 androidNeon(void)
return PRIM_ARM_NEON_AVAILABLE;
}
}
/* else */
#elif defined(__APPLE)
/* assume NEON support on iOS devices */
return PRIM_ARM_NEON_AVAILABLE;
#endif
return 0;
}
@ -172,7 +174,7 @@ static UINT32 androidNeon(void)
static void set_hints(primitives_hints_t* hints)
{
/* ARM: TODO */
hints->arm_flags |= androidNeon();
hints->arm_flags |= getNeonSupport();
}
#else