Merge pull request #992 from mfleisz/master

primitives: separating optimized functions into their own .c files
This commit is contained in:
Marc-André Moreau 2013-02-22 04:06:04 -08:00
commit 9e1e8f11f1
31 changed files with 1675 additions and 1195 deletions

View File

@ -71,7 +71,7 @@ if(WITH_NEON)
if(ANDROID)
set(ANDROID_CPU_FEATURES_PATH "${ANDROID_NDK}/sources/android/cpufeatures")
include_directories(${ANDROID_CPU_FEATURES_PATH})
set(${MODULE_PREFIX}_NEON_SRCS ${${MODULE_PREFIX}_NEON_SRCS}
set(${MODULE_PREFIX}_SRCS ${${MODULE_PREFIX}_SRCS}
${ANDROID_CPU_FEATURES_PATH}/cpu-features.c
${ANDROID_CPU_FEATURES_PATH}/cpu-features.h)
endif()

View File

@ -37,6 +37,7 @@
#include <freerdp/codec/rfx.h>
#include <freerdp/constants.h>
#include <freerdp/primitives.h>
#include "rfx_constants.h"
#include "rfx_types.h"
@ -45,13 +46,8 @@
#include "rfx_quantization.h"
#include "rfx_dwt.h"
#ifdef WITH_SSE2
#include "rfx_sse2.h"
#endif
#ifdef WITH_NEON
#include "rfx_neon.h"
#endif
#ifndef RFX_INIT_SIMD
#define RFX_INIT_SIMD(_rfx_context) do { } while (0)
@ -209,6 +205,11 @@ RFX_CONTEXT* rfx_context_new(void)
if (context->priv->UseThreads)
{
/* Call primitives_get here in order to avoid race conditions when using primitives_get */
/* from multiple threads. This call will initialize all function pointers correctly */
/* before any decoding threads are started */
primitives_get();
context->priv->ThreadPool = CreateThreadpool(NULL);
InitializeThreadpoolEnvironment(&context->priv->ThreadPoolEnv);
SetThreadpoolCallbackPool(&context->priv->ThreadPoolEnv, context->priv->ThreadPool);
@ -232,6 +233,8 @@ RFX_CONTEXT* rfx_context_new(void)
context->dwt_2d_decode = rfx_dwt_2d_decode;
context->dwt_2d_encode = rfx_dwt_2d_encode;
RFX_INIT_SIMD(context);
return context;
}

View File

@ -40,9 +40,7 @@
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_quantization_decode_block_NEON(INT16 * buffer, const int buffer_size, const UINT32 factor)
{
if (factor <= 6)
return;
int16x8_t quantFactors = vdupq_n_s16(factor - 6);
int16x8_t quantFactors = vdupq_n_s16(factor);
int16x8_t* buf = (int16x8_t*)buffer;
int16x8_t* buf_end = (int16x8_t*)(buffer + buffer_size);
@ -59,16 +57,18 @@ rfx_quantization_decode_block_NEON(INT16 * buffer, const int buffer_size, const
void
rfx_quantization_decode_NEON(INT16 * buffer, const UINT32 * quantization_values)
{
rfx_quantization_decode_block_NEON(buffer, 1024, quantization_values[8]); /* HL1 */
rfx_quantization_decode_block_NEON(buffer + 1024, 1024, quantization_values[7]); /* LH1 */
rfx_quantization_decode_block_NEON(buffer + 2048, 1024, quantization_values[9]); /* HH1 */
rfx_quantization_decode_block_NEON(buffer + 3072, 256, quantization_values[5]); /* HL2 */
rfx_quantization_decode_block_NEON(buffer + 3328, 256, quantization_values[4]); /* LH2 */
rfx_quantization_decode_block_NEON(buffer + 3584, 256, quantization_values[6]); /* HH2 */
rfx_quantization_decode_block_NEON(buffer + 3840, 64, quantization_values[2]); /* HL3 */
rfx_quantization_decode_block_NEON(buffer + 3904, 64, quantization_values[1]); /* LH3 */
rfx_quantization_decode_block_NEON(buffer + 3968, 64, quantization_values[3]); /* HH3 */
rfx_quantization_decode_block_NEON(buffer + 4032, 64, quantization_values[0]); /* LL3 */
rfx_quantization_decode_block_NEON(buffer, 4096, 5);
rfx_quantization_decode_block_NEON(buffer, 1024, quantization_values[8] - 6); /* HL1 */
rfx_quantization_decode_block_NEON(buffer + 1024, 1024, quantization_values[7] - 6); /* LH1 */
rfx_quantization_decode_block_NEON(buffer + 2048, 1024, quantization_values[9] - 6); /* HH1 */
rfx_quantization_decode_block_NEON(buffer + 3072, 256, quantization_values[5] - 6); /* HL2 */
rfx_quantization_decode_block_NEON(buffer + 3328, 256, quantization_values[4] - 6); /* LH2 */
rfx_quantization_decode_block_NEON(buffer + 3584, 256, quantization_values[6] - 6); /* HH2 */
rfx_quantization_decode_block_NEON(buffer + 3840, 64, quantization_values[2] - 6); /* HL3 */
rfx_quantization_decode_block_NEON(buffer + 3904, 64, quantization_values[1] - 6); /* LH3 */
rfx_quantization_decode_block_NEON(buffer + 3968, 64, quantization_values[3] - 6); /* HH3 */
rfx_quantization_decode_block_NEON(buffer + 4032, 64, quantization_values[0] - 6); /* LL3 */
}
@ -278,8 +278,11 @@ int isNeonSupported()
}
return FALSE;
#else
#elif defined(__APPLE)
/* assume NEON support on iOS devices */
return TRUE;
#else
return FALSE;
#endif
}

View File

@ -22,8 +22,6 @@
#include <freerdp/codec/rfx.h>
#if defined(__ARM_NEON__)
void rfx_init_neon(RFX_CONTEXT * context);
#ifndef RFX_INIT_SIMD
@ -32,7 +30,5 @@ void rfx_init_neon(RFX_CONTEXT * context);
#endif
#endif
#endif // __ARM_NEON__
#endif /* __RFX_NEON_H */

View File

@ -24,8 +24,10 @@
void rfx_init_sse2(RFX_CONTEXT* context);
#ifndef RFX_INIT_SIMD
#define RFX_INIT_SIMD(_rfx_context) rfx_init_sse2(_rfx_context)
#ifdef WITH_SSE2
#ifndef RFX_INIT_SIMD
#define RFX_INIT_SIMD(_rfx_context) rfx_init_sse2(_rfx_context)
#endif
#endif
#endif /* __RFX_SSE2_H */

View File

@ -28,6 +28,15 @@ set(${MODULE_PREFIX}_SRCS
primitives.c
prim_internal.h)
set(${MODULE_PREFIX}_OPT_SRCS
prim_add_opt.c
prim_andor_opt.c
prim_alphaComp_opt.c
prim_colors_opt.c
prim_set_opt.c
prim_shift_opt.c
prim_sign_opt.c)
add_definitions(-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE})
### IPP Variable debugging
@ -63,7 +72,9 @@ if(ANDROID)
${ANDROID_CPU_FEATURES_PATH}/cpu-features.h)
endif()
set_property(SOURCE ${${MODULE_PREFIX}_SRCS} PROPERTY COMPILE_FLAGS ${OPTIMIZATION})
set_property(SOURCE ${${MODULE_PREFIX}_OPT_SRCS} PROPERTY COMPILE_FLAGS ${OPTIMIZATION})
set(${MODULE_PREFIX}_SRCS ${${MODULE_PREFIX}_SRCS} ${${MODULE_PREFIX}_OPT_SRCS})
add_complex_library(MODULE ${MODULE_NAME} TYPE "OBJECT"
MONOLITHIC ${MONOLITHIC_BUILD}

View File

@ -18,27 +18,16 @@
#include "config.h"
#endif
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_SSE2
#include <emmintrin.h>
#include <pmmintrin.h>
#endif /* WITH_SSE2 */
#ifdef WITH_IPP
#include <ipps.h>
#endif /* WITH_IPP */
#include "prim_internal.h"
#include "prim_templates.h"
#include "prim_add.h"
/* ----------------------------------------------------------------------------
* 16-bit signed add with saturation (under and over).
*/
PRIM_STATIC pstatus_t general_add_16s(
pstatus_t general_add_16s(
const INT16 *pSrc1,
const INT16 *pSrc2,
INT16 *pDst,
@ -55,29 +44,14 @@ PRIM_STATIC pstatus_t general_add_16s(
return PRIMITIVES_SUCCESS;
}
#ifdef WITH_SSE2
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
/* ------------------------------------------------------------------------- */
SSE3_SSD_ROUTINE(sse3_add_16s, INT16, general_add_16s,
_mm_adds_epi16, general_add_16s(sptr1++, sptr2++, dptr++, 1))
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif
/* ------------------------------------------------------------------------- */
void primitives_init_add(
const primitives_hints_t *hints,
primitives_t *prims)
{
prims->add_16s = general_add_16s;
#ifdef WITH_IPP
prims->add_16s = (__add_16s_t) ippsAdd_16s;
#elif defined(WITH_SSE2)
if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE)) /* for LDDQU */
{
prims->add_16s = sse3_add_16s;
}
#endif
primitives_init_add_opt(hints, prims);
}
/* ------------------------------------------------------------------------- */

View File

@ -0,0 +1,30 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Add operations.
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License. Algorithms used by
* this code may be covered by patents by HP, Microsoft, or other parties.
*
*/
#ifdef __GNUC__
# pragma once
#endif
#ifndef __PRIM_ADD_H_INCLUDED__
#define __PRIM_ADD_H_INCLUDED__
pstatus_t general_add_16s(const INT16 *pSrc1, const INT16 *pSrc2, INT16 *pDst, INT32 len);
void primitives_init_add_opt(const primitives_hints_t *hints, primitives_t *prims);
#endif /* !__PRIM_ADD_H_INCLUDED__ */

View File

@ -0,0 +1,62 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized add operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_SSE2
#include <emmintrin.h>
#include <pmmintrin.h>
#endif /* WITH_SSE2 */
#ifdef WITH_IPP
#include <ipps.h>
#endif /* WITH_IPP */
#include "prim_internal.h"
#include "prim_templates.h"
#include "prim_add.h"
#ifdef WITH_SSE2
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
/* ------------------------------------------------------------------------- */
/* Expands to the SSE3 add routine sse3_add_16s: vector body uses
 * _mm_adds_epi16, and unaligned leftovers fall through to general_add_16s
 * one element at a time (see SSE3_SSD_ROUTINE in prim_templates.h). */
SSE3_SSD_ROUTINE(sse3_add_16s, INT16, general_add_16s,
_mm_adds_epi16, general_add_16s(sptr1++, sptr2++, dptr++, 1))
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif
/* ------------------------------------------------------------------------- */
/* Install the best available add_16s implementation into prims.
 * Precedence: IPP (when built in) > SSE3 (when the CPU reports SSE2+SSE3)
 * > otherwise the pointer already set by the caller is left untouched. */
void primitives_init_add_opt(
const primitives_hints_t *hints,
primitives_t *prims)
{
#ifdef WITH_IPP
prims->add_16s = (__add_16s_t) ippsAdd_16s;
#elif defined(WITH_SSE2)
/* SSE3 is required in addition to SSE2 because the routine loads with
 * LDDQU (an SSE3 instruction). */
if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE)) /* for LDDQU */
{
prims->add_16s = sse3_add_16s;
}
#endif
}

View File

@ -24,21 +24,11 @@
#include "config.h"
#endif
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include "prim_internal.h"
#ifdef WITH_SSE2
#include <emmintrin.h>
#include <pmmintrin.h>
#endif /* WITH_SSE2 */
#ifdef WITH_IPP
#include <ippi.h>
#endif /* WITH_IPP */
#include "prim_alphaComp.h"
#define ALPHA(_k_) (((_k_) & 0xFF000000U) >> 24)
#define RED(_k_) (((_k_) & 0x00FF0000U) >> 16)
@ -46,7 +36,7 @@
#define BLU(_k_) (((_k_) & 0x000000FFU))
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_alphaComp_argb(
pstatus_t general_alphaComp_argb(
const BYTE *pSrc1, INT32 src1Step,
const BYTE *pSrc2, INT32 src2Step,
BYTE *pDst, INT32 dstStep,
@ -111,188 +101,12 @@ PRIM_STATIC pstatus_t general_alphaComp_argb(
return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
#ifdef WITH_SSE2
#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
PRIM_STATIC pstatus_t sse2_alphaComp_argb(
const BYTE *pSrc1, INT32 src1Step,
const BYTE *pSrc2, INT32 src2Step,
BYTE *pDst, INT32 dstStep,
INT32 width, INT32 height)
{
const UINT32 *sptr1 = (const UINT32 *) pSrc1;
const UINT32 *sptr2 = (const UINT32 *) pSrc2;
UINT32 *dptr;
int linebytes, src1Jump, src2Jump, dstJump, y;
__m128i xmm0, xmm1;
if ((width <= 0) || (height <= 0)) return PRIMITIVES_SUCCESS;
if (width < 4) /* pointless if too small */
{
return general_alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step,
pDst, dstStep, width, height);
}
dptr = (UINT32 *) pDst;
linebytes = width * sizeof(UINT32);
src1Jump = (src1Step - linebytes) / sizeof(UINT32);
src2Jump = (src2Step - linebytes) / sizeof(UINT32);
dstJump = (dstStep - linebytes) / sizeof(UINT32);
xmm0 = _mm_set1_epi32(0);
xmm1 = _mm_set1_epi16(1);
for (y=0; y<height; ++y)
{
int pixels = width;
int count;
/* Get to the 16-byte boundary now. */
int leadIn = 0;
switch ((ULONG_PTR) dptr & 0x0f)
{
case 0:
leadIn = 0;
break;
case 4:
leadIn = 3;
break;
case 8:
leadIn = 2;
break;
case 12:
leadIn = 1;
break;
default:
/* We'll never hit a 16-byte boundary, so do the whole
* thing the slow way.
*/
leadIn = width;
break;
}
if (leadIn)
{
general_alphaComp_argb((const BYTE *) sptr1,
src1Step, (const BYTE *) sptr2, src2Step,
(BYTE *) dptr, dstStep, leadIn, 1);
sptr1 += leadIn;
sptr2 += leadIn;
dptr += leadIn;
pixels -= leadIn;
}
/* Use SSE registers to do 4 pixels at a time. */
count = pixels >> 2;
pixels -= count << 2;
while (count--)
{
__m128i xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
/* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */
xmm2 = LOAD_SI128(sptr1); sptr1 += 4;
/* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */
xmm3 = LOAD_SI128(sptr2); sptr2 += 4;
/* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */
xmm4 = _mm_unpackhi_epi8(xmm2, xmm0);
/* 00Bf00Gf00Bf00Af00Be00Ge00Re00Ae */
xmm5 = _mm_unpackhi_epi8(xmm3, xmm0);
/* subtract */
xmm6 = _mm_subs_epi16(xmm4, xmm5);
/* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */
xmm4 = _mm_shufflelo_epi16(xmm4, 0xff);
/* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */
xmm4 = _mm_shufflehi_epi16(xmm4, 0xff);
/* Add one to alphas */
xmm4 = _mm_adds_epi16(xmm4, xmm1);
/* Multiply and take low word */
xmm4 = _mm_mullo_epi16(xmm4, xmm6);
/* Shift 8 right */
xmm4 = _mm_srai_epi16(xmm4, 8);
/* Add xmm5 */
xmm4 = _mm_adds_epi16(xmm4, xmm5);
/* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */
/* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */
xmm5 = _mm_unpacklo_epi8(xmm2, xmm0);
/* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */
xmm6 = _mm_unpacklo_epi8(xmm3, xmm0);
/* subtract */
xmm7 = _mm_subs_epi16(xmm5, xmm6);
/* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */
xmm5 = _mm_shufflelo_epi16(xmm5, 0xff);
/* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */
xmm5 = _mm_shufflehi_epi16(xmm5, 0xff);
/* Add one to alphas */
xmm5 = _mm_adds_epi16(xmm5, xmm1);
/* Multiply and take low word */
xmm5 = _mm_mullo_epi16(xmm5, xmm7);
/* Shift 8 right */
xmm5 = _mm_srai_epi16(xmm5, 8);
/* Add xmm6 */
xmm5 = _mm_adds_epi16(xmm5, xmm6);
/* 00Bl00Gl00Rl00Al00Bk00Gk00Rk0ABk */
/* Must mask off remainders or pack gets confused */
xmm3 = _mm_set1_epi16(0x00ffU);
xmm4 = _mm_and_si128(xmm4, xmm3);
xmm5 = _mm_and_si128(xmm5, xmm3);
/* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */
xmm5 = _mm_packus_epi16(xmm5, xmm4);
_mm_store_si128((__m128i *) dptr, xmm5); dptr += 4;
}
/* Finish off the remainder. */
if (pixels)
{
general_alphaComp_argb((const BYTE *) sptr1, src1Step,
(const BYTE *) sptr2, src2Step,
(BYTE *) dptr, dstStep, pixels, 1);
sptr1 += pixels;
sptr2 += pixels;
dptr += pixels;
}
/* Jump to next row. */
sptr1 += src1Jump;
sptr2 += src2Jump;
dptr += dstJump;
}
return PRIMITIVES_SUCCESS;
}
#endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif
#ifdef WITH_IPP
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t ipp_alphaComp_argb(
const BYTE *pSrc1, INT32 src1Step,
const BYTE *pSrc2, INT32 src2Step,
BYTE *pDst, INT32 dstStep,
INT32 width, INT32 height)
{
IppiSize sz;
sz.width = width;
sz.height = height;
return ippiAlphaComp_8u_AC4R(pSrc1, src1Step, pSrc2, src2Step,
pDst, dstStep, sz, ippAlphaOver);
}
#endif
/* ------------------------------------------------------------------------- */
void primitives_init_alphaComp(const primitives_hints_t* hints, primitives_t* prims)
{
prims->alphaComp_argb = general_alphaComp_argb;
#ifdef WITH_IPP
prims->alphaComp_argb = ipp_alphaComp_argb;
#elif defined(WITH_SSE2)
if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE)) /* for LDDQU */
{
prims->alphaComp_argb = sse2_alphaComp_argb;
}
#endif
primitives_init_alphaComp_opt(hints, prims);
}
/* ------------------------------------------------------------------------- */
@ -300,3 +114,4 @@ void primitives_deinit_alphaComp(primitives_t *prims)
{
/* Nothing to do. */
}

View File

@ -0,0 +1,30 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Alpha blending routines.
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License. Algorithms used by
* this code may be covered by patents by HP, Microsoft, or other parties.
*
*/
#ifdef __GNUC__
# pragma once
#endif
#ifndef __PRIM_ALPHACOMP_H_INCLUDED__
#define __PRIM_ALPHACOMP_H_INCLUDED__
pstatus_t general_alphaComp_argb(const BYTE *pSrc1, INT32 src1Step, const BYTE *pSrc2, INT32 src2Step, BYTE *pDst, INT32 dstStep, INT32 width, INT32 height);
void primitives_init_alphaComp_opt(const primitives_hints_t* hints, primitives_t* prims);
#endif /* !__PRIM_ALPHACOMP_H_INCLUDED__ */

View File

@ -0,0 +1,225 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized alpha blending routines.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
* Note: this code assumes the second operand is fully opaque,
* e.g.
* newval = alpha1*val1 + (1-alpha1)*val2
* rather than
* newval = alpha1*val1 + (1-alpha1)*alpha2*val2
* The IPP gives other options.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_SSE2
#include <emmintrin.h>
#include <pmmintrin.h>
#endif /* WITH_SSE2 */
#ifdef WITH_IPP
#include <ippi.h>
#endif /* WITH_IPP */
#include "prim_internal.h"
#include "prim_alphaComp.h"
/* ------------------------------------------------------------------------- */
#ifdef WITH_SSE2
#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
/* SSE2 alpha composition of 32bpp pixels (src1 over src2 -> dst), four
 * pixels per vector iteration. Narrow images (< 4 px), the unaligned
 * lead-in of each row, and trailing pixels are delegated to
 * general_alphaComp_argb. Steps are in bytes.
 * NOTE(review): per the file header, this assumes the second operand is
 * fully opaque: newval = alpha1*val1 + (1-alpha1)*val2. */
pstatus_t sse2_alphaComp_argb(
const BYTE *pSrc1, INT32 src1Step,
const BYTE *pSrc2, INT32 src2Step,
BYTE *pDst, INT32 dstStep,
INT32 width, INT32 height)
{
const UINT32 *sptr1 = (const UINT32 *) pSrc1;
const UINT32 *sptr2 = (const UINT32 *) pSrc2;
UINT32 *dptr;
int linebytes, src1Jump, src2Jump, dstJump, y;
__m128i xmm0, xmm1;
/* Degenerate region: nothing to composite. */
if ((width <= 0) || (height <= 0)) return PRIMITIVES_SUCCESS;
if (width < 4) /* pointless if too small */
{
return general_alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step,
pDst, dstStep, width, height);
}
dptr = (UINT32 *) pDst;
/* Per-row "jump" = gap between end of one row's pixels and the start of
 * the next, measured in 32-bit pixels. */
linebytes = width * sizeof(UINT32);
src1Jump = (src1Step - linebytes) / sizeof(UINT32);
src2Jump = (src2Step - linebytes) / sizeof(UINT32);
dstJump = (dstStep - linebytes) / sizeof(UINT32);
xmm0 = _mm_set1_epi32(0);   /* zero: used to unpack bytes to 16-bit lanes */
xmm1 = _mm_set1_epi16(1);   /* +1 bias added to each alpha before multiply */
for (y=0; y<height; ++y)
{
int pixels = width;
int count;
/* Get to the 16-byte boundary now. */
int leadIn = 0;
switch ((ULONG_PTR) dptr & 0x0f)
{
case 0:
leadIn = 0;
break;
case 4:
leadIn = 3;
break;
case 8:
leadIn = 2;
break;
case 12:
leadIn = 1;
break;
default:
/* We'll never hit a 16-byte boundary, so do the whole
* thing the slow way.
*/
leadIn = width;
break;
}
if (leadIn)
{
general_alphaComp_argb((const BYTE *) sptr1,
src1Step, (const BYTE *) sptr2, src2Step,
(BYTE *) dptr, dstStep, leadIn, 1);
sptr1 += leadIn;
sptr2 += leadIn;
dptr += leadIn;
pixels -= leadIn;
}
/* Use SSE registers to do 4 pixels at a time. */
count = pixels >> 2;
pixels -= count << 2;
while (count--)
{
__m128i xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
/* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */
xmm2 = LOAD_SI128(sptr1); sptr1 += 4;
/* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */
xmm3 = LOAD_SI128(sptr2); sptr2 += 4;
/* High half first: two pixels widened to 16-bit lanes. */
/* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */
xmm4 = _mm_unpackhi_epi8(xmm2, xmm0);
/* 00Bf00Gf00Bf00Af00Be00Ge00Re00Ae */
xmm5 = _mm_unpackhi_epi8(xmm3, xmm0);
/* subtract */
xmm6 = _mm_subs_epi16(xmm4, xmm5);
/* Broadcast each pixel's alpha across its four channel lanes. */
/* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */
xmm4 = _mm_shufflelo_epi16(xmm4, 0xff);
/* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */
xmm4 = _mm_shufflehi_epi16(xmm4, 0xff);
/* Add one to alphas */
xmm4 = _mm_adds_epi16(xmm4, xmm1);
/* Multiply and take low word */
xmm4 = _mm_mullo_epi16(xmm4, xmm6);
/* Shift 8 right */
xmm4 = _mm_srai_epi16(xmm4, 8);
/* Add xmm5 */
xmm4 = _mm_adds_epi16(xmm4, xmm5);
/* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */
/* Now the low half: the other two pixels, same blend. */
/* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */
xmm5 = _mm_unpacklo_epi8(xmm2, xmm0);
/* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */
xmm6 = _mm_unpacklo_epi8(xmm3, xmm0);
/* subtract */
xmm7 = _mm_subs_epi16(xmm5, xmm6);
/* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */
xmm5 = _mm_shufflelo_epi16(xmm5, 0xff);
/* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */
xmm5 = _mm_shufflehi_epi16(xmm5, 0xff);
/* Add one to alphas */
xmm5 = _mm_adds_epi16(xmm5, xmm1);
/* Multiply and take low word */
xmm5 = _mm_mullo_epi16(xmm5, xmm7);
/* Shift 8 right */
xmm5 = _mm_srai_epi16(xmm5, 8);
/* Add xmm6 */
xmm5 = _mm_adds_epi16(xmm5, xmm6);
/* 00Bl00Gl00Rl00Al00Bk00Gk00Rk0ABk */
/* Must mask off remainders or pack gets confused */
xmm3 = _mm_set1_epi16(0x00ffU);
xmm4 = _mm_and_si128(xmm4, xmm3);
xmm5 = _mm_and_si128(xmm5, xmm3);
/* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */
xmm5 = _mm_packus_epi16(xmm5, xmm4);
/* Destination is 16-byte aligned here thanks to the lead-in. */
_mm_store_si128((__m128i *) dptr, xmm5); dptr += 4;
}
/* Finish off the remainder. */
if (pixels)
{
general_alphaComp_argb((const BYTE *) sptr1, src1Step,
(const BYTE *) sptr2, src2Step,
(BYTE *) dptr, dstStep, pixels, 1);
sptr1 += pixels;
sptr2 += pixels;
dptr += pixels;
}
/* Jump to next row. */
sptr1 += src1Jump;
sptr2 += src2Jump;
dptr += dstJump;
}
return PRIMITIVES_SUCCESS;
}
#endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif
#ifdef WITH_IPP
/* ------------------------------------------------------------------------- */
/* Thin IPP wrapper: composite pSrc1 over pSrc2 into pDst with
 * ippiAlphaComp_8u_AC4R in ippAlphaOver mode; the width/height pair is
 * passed to IPP as the region of interest. Steps are in bytes. */
pstatus_t ipp_alphaComp_argb(
const BYTE *pSrc1, INT32 src1Step,
const BYTE *pSrc2, INT32 src2Step,
BYTE *pDst, INT32 dstStep,
INT32 width, INT32 height)
{
IppiSize roi;
roi.height = height;
roi.width = width;
return ippiAlphaComp_8u_AC4R(pSrc1, src1Step,
pSrc2, src2Step,
pDst, dstStep,
roi, ippAlphaOver);
}
#endif
/* ------------------------------------------------------------------------- */
/* Install the best available alphaComp_argb implementation into prims.
 * Precedence: IPP (when built in) > SSE2 (when the CPU reports SSE2+SSE3)
 * > otherwise the pointer already set by the caller is left untouched. */
void primitives_init_alphaComp_opt(const primitives_hints_t* hints, primitives_t* prims)
{
#ifdef WITH_IPP
prims->alphaComp_argb = ipp_alphaComp_argb;
#elif defined(WITH_SSE2)
/* SSE3 is required in addition to SSE2 for the LDDQU load instruction. */
if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE)) /* for LDDQU */
{
prims->alphaComp_argb = sse2_alphaComp_argb;
}
#endif
}

View File

@ -17,27 +17,16 @@
#include "config.h"
#endif
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_SSE2
#include <emmintrin.h>
#include <pmmintrin.h>
#endif /* WITH_SSE2 */
#ifdef WITH_IPP
#include <ipps.h>
#endif /* WITH_IPP */
#include "prim_internal.h"
#include "prim_templates.h"
#include "prim_andor.h"
/* ----------------------------------------------------------------------------
* 32-bit AND with a constant.
*/
PRIM_STATIC pstatus_t general_andC_32u(
pstatus_t general_andC_32u(
const UINT32 *pSrc,
UINT32 val,
UINT32 *pDst,
@ -55,7 +44,7 @@ PRIM_STATIC pstatus_t general_andC_32u(
/* ----------------------------------------------------------------------------
* 32-bit OR with a constant.
*/
PRIM_STATIC pstatus_t general_orC_32u(
pstatus_t general_orC_32u(
const UINT32 *pSrc,
UINT32 val,
UINT32 *pDst,
@ -70,16 +59,6 @@ PRIM_STATIC pstatus_t general_orC_32u(
return PRIMITIVES_SUCCESS;
}
#ifdef WITH_SSE2
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
/* ------------------------------------------------------------------------- */
SSE3_SCD_PRE_ROUTINE(sse3_andC_32u, UINT32, general_andC_32u,
_mm_and_si128, *dptr++ = *sptr++ & val)
SSE3_SCD_PRE_ROUTINE(sse3_orC_32u, UINT32, general_orC_32u,
_mm_or_si128, *dptr++ = *sptr++ | val)
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif
/* ------------------------------------------------------------------------- */
void primitives_init_andor(
const primitives_hints_t *hints,
@ -89,17 +68,7 @@ void primitives_init_andor(
prims->andC_32u = general_andC_32u;
prims->orC_32u = general_orC_32u;
#if defined(WITH_IPP)
prims->andC_32u = (__andC_32u_t) ippsAndC_32u;
prims->orC_32u = (__orC_32u_t) ippsOrC_32u;
#elif defined(WITH_SSE2)
if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE))
{
prims->andC_32u = sse3_andC_32u;
prims->orC_32u = sse3_orC_32u;
}
#endif
primitives_init_andor_opt(hints, prims);
}
/* ------------------------------------------------------------------------- */
@ -108,3 +77,4 @@ void primitives_deinit_andor(
{
/* Nothing to do. */
}

View File

@ -0,0 +1,31 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Logical operations.
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License. Algorithms used by
* this code may be covered by patents by HP, Microsoft, or other parties.
*
*/
#ifdef __GNUC__
# pragma once
#endif
#ifndef __PRIM_ANDOR_H_INCLUDED__
#define __PRIM_ANDOR_H_INCLUDED__
pstatus_t general_andC_32u(const UINT32 *pSrc, UINT32 val, UINT32 *pDst, INT32 len);
pstatus_t general_orC_32u(const UINT32 *pSrc, UINT32 val, UINT32 *pDst, INT32 len);
void primitives_init_andor_opt(const primitives_hints_t *hints, primitives_t *prims);
#endif /* !__PRIM_ANDOR_H_INCLUDED__ */

View File

@ -0,0 +1,61 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized Logical operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_SSE2
#include <emmintrin.h>
#include <pmmintrin.h>
#endif /* WITH_SSE2 */
#ifdef WITH_IPP
#include <ipps.h>
#endif /* WITH_IPP */
#include "prim_internal.h"
#include "prim_templates.h"
#include "prim_andor.h"
#ifdef WITH_SSE2
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
/* ------------------------------------------------------------------------- */
/* Expand the SSE3 constant-AND and constant-OR routines: the vector body
 * uses _mm_and_si128 / _mm_or_si128, and unaligned leftovers run the
 * scalar expression one element at a time (see SSE3_SCD_PRE_ROUTINE in
 * prim_templates.h). */
SSE3_SCD_PRE_ROUTINE(sse3_andC_32u, UINT32, general_andC_32u,
_mm_and_si128, *dptr++ = *sptr++ & val)
SSE3_SCD_PRE_ROUTINE(sse3_orC_32u, UINT32, general_orC_32u,
_mm_or_si128, *dptr++ = *sptr++ | val)
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif
/* ------------------------------------------------------------------------- */
/* Install the best available andC_32u/orC_32u implementations into prims.
 * Precedence: IPP (when built in) > SSE3 (when the CPU reports SSE2+SSE3)
 * > otherwise the pointers already set by the caller are left untouched. */
void primitives_init_andor_opt(const primitives_hints_t *hints, primitives_t *prims)
{
#if defined(WITH_IPP)
prims->andC_32u = (__andC_32u_t) ippsAndC_32u;
prims->orC_32u = (__orC_32u_t) ippsOrC_32u;
#elif defined(WITH_SSE2)
/* NOTE(review): SSE3 presumably required for LDDQU loads, as in the
 * sibling add/alphaComp initializers — confirm against the template. */
if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE))
{
prims->andC_32u = sse3_andC_32u;
prims->orC_32u = sse3_orC_32u;
}
#endif
}

View File

@ -21,16 +21,11 @@
#include "config.h"
#endif
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_SSE2
#include <emmintrin.h>
#elif defined(WITH_NEON)
#include <arm_neon.h>
#endif /* WITH_SSE2 else WITH_NEON */
#include "prim_internal.h"
#include "prim_templates.h"
#include "prim_colors.h"
#ifndef MINMAX
#define MINMAX(_v_, _l_, _h_) \
@ -38,7 +33,7 @@
#endif /* !MINMAX */
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_yCbCrToRGB_16s16s_P3P3(
pstatus_t general_yCbCrToRGB_16s16s_P3P3(
const INT16 *pSrc[3], INT32 srcStep,
INT16 *pDst[3], INT32 dstStep,
const prim_size_t *roi) /* region of interest */
@ -119,7 +114,7 @@ PRIM_STATIC pstatus_t general_yCbCrToRGB_16s16s_P3P3(
}
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_RGBToYCbCr_16s16s_P3P3(
pstatus_t general_RGBToYCbCr_16s16s_P3P3(
const INT16 *pSrc[3], INT32 srcStep,
INT16 *pDst[3], INT32 dstStep,
const prim_size_t *roi) /* region of interest */
@ -187,7 +182,7 @@ PRIM_STATIC pstatus_t general_RGBToYCbCr_16s16s_P3P3(
}
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_RGBToRGB_16s8u_P3AC4R(
pstatus_t general_RGBToRGB_16s8u_P3AC4R(
const INT16 *pSrc[3], /* 16-bit R,G, and B arrays */
int srcStep, /* bytes between rows in source data */
BYTE *pDst, /* 32-bit interleaved ARGB (ABGR?) data */
@ -219,514 +214,6 @@ PRIM_STATIC pstatus_t general_RGBToRGB_16s8u_P3AC4R(
return PRIMITIVES_SUCCESS;
}
#ifdef WITH_SSE2
#ifdef __GNUC__
# define GNU_INLINE \
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
#else
# define GNU_INLINE
#endif
#define CACHE_LINE_BYTES 64
#define _mm_between_epi16(_val, _min, _max) \
do { _val = _mm_min_epi16(_max, _mm_max_epi16(_val, _min)); } while (0)
#ifdef DO_PREFETCH
/*---------------------------------------------------------------------------*/
static inline void GNU_INLINE _mm_prefetch_buffer(
char * buffer,
int num_bytes)
{
__m128i * buf = (__m128i*) buffer;
unsigned int i;
for (i = 0; i < (num_bytes / sizeof(__m128i));
i+=(CACHE_LINE_BYTES / sizeof(__m128i)))
{
_mm_prefetch((char*)(&buf[i]), _MM_HINT_NTA);
}
}
#endif /* DO_PREFETCH */
/*---------------------------------------------------------------------------*/
/* Convert 16-bit signed Y, Cb and Cr planes (pSrc[0..2]) to 16-bit signed
 * R, G and B planes (pDst[0..2]) using SSE2 fixed-point arithmetic.
 * The conversion factors are scaled by 2^14; see the inner-loop comment.
 * srcStep/dstStep are byte strides between rows; roi gives width/height
 * in pixels.  Falls back to general_yCbCrToRGB_16s16s_P3P3 when the
 * alignment/width/stride preconditions below are not met.
 * Always returns PRIMITIVES_SUCCESS.
 */
PRIM_STATIC pstatus_t sse2_yCbCrToRGB_16s16s_P3P3(
const INT16 *pSrc[3],
int srcStep,
INT16 *pDst[3],
int dstStep,
const prim_size_t *roi) /* region of interest */
{
__m128i zero, max, r_cr, g_cb, g_cr, b_cb, c4096;
__m128i *y_buf, *cb_buf, *cr_buf, *r_buf, *g_buf, *b_buf;
int srcbump, dstbump, yp, imax;
/* Require 16-byte aligned planes, a width that is a multiple of 8
 * pixels, and strides that keep every row 16-byte aligned. */
if (((ULONG_PTR) (pSrc[0]) & 0x0f)
|| ((ULONG_PTR) (pSrc[1]) & 0x0f)
|| ((ULONG_PTR) (pSrc[2]) & 0x0f)
|| ((ULONG_PTR) (pDst[0]) & 0x0f)
|| ((ULONG_PTR) (pDst[1]) & 0x0f)
|| ((ULONG_PTR) (pDst[2]) & 0x0f)
|| (roi->width & 0x07)
|| (srcStep & 127)
|| (dstStep & 127))
{
/* We can't maintain 16-byte alignment. */
return general_yCbCrToRGB_16s16s_P3P3(pSrc, srcStep,
pDst, dstStep, roi);
}
zero = _mm_setzero_si128();
max = _mm_set1_epi16(255);
y_buf = (__m128i*) (pSrc[0]);
cb_buf = (__m128i*) (pSrc[1]);
cr_buf = (__m128i*) (pSrc[2]);
r_buf = (__m128i*) (pDst[0]);
g_buf = (__m128i*) (pDst[1]);
b_buf = (__m128i*) (pDst[2]);
r_cr = _mm_set1_epi16(22986); /* 1.403 << 14 */
g_cb = _mm_set1_epi16(-5636); /* -0.344 << 14 */
g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */
b_cb = _mm_set1_epi16(28999); /* 1.770 << 14 */
c4096 = _mm_set1_epi16(4096);
/* Row strides expressed in __m128i (16-byte) units. */
srcbump = srcStep / sizeof(__m128i);
dstbump = dstStep / sizeof(__m128i);
#ifdef DO_PREFETCH
/* Prefetch Y's, Cb's, and Cr's. */
for (yp=0; yp<roi->height; yp++)
{
int i;
for (i=0; i<roi->width * sizeof(INT16) / sizeof(__m128i);
i += (CACHE_LINE_BYTES / sizeof(__m128i)))
{
_mm_prefetch((char*)(&y_buf[i]), _MM_HINT_NTA);
_mm_prefetch((char*)(&cb_buf[i]), _MM_HINT_NTA);
_mm_prefetch((char*)(&cr_buf[i]), _MM_HINT_NTA);
}
y_buf += srcbump;
cb_buf += srcbump;
cr_buf += srcbump;
}
y_buf = (__m128i*) (pSrc[0]);
cb_buf = (__m128i*) (pSrc[1]);
cr_buf = (__m128i*) (pSrc[2]);
#endif /* DO_PREFETCH */
/* Number of 8-pixel (__m128i) chunks per row. */
imax = roi->width * sizeof(INT16) / sizeof(__m128i);
for (yp=0; yp<roi->height; ++yp)
{
int i;
for (i=0; i<imax; i++)
{
/* In order to use SSE2 signed 16-bit integer multiplication
 * we need to convert the floating point factors to signed int
 * without losing information.
 * The result of this multiplication is 32 bit and we have two
 * SSE instructions that return either the hi or lo word.
 * Thus we will multiply the factors by the highest possible 2^n,
 * take the upper 16 bits of the signed 32-bit result
 * (_mm_mulhi_epi16) and correct this result by multiplying
 * it by 2^(16-n).
 *
 * For the given factors in the conversion matrix the best
 * possible n is 14.
 *
 * Example for calculating r:
 * r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
 * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
 * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
 * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
 */
/* y = (y_r_buf[i] + 4096) >> 2 */
__m128i y, cb, cr, r, g, b;
y = _mm_load_si128(y_buf + i);
y = _mm_add_epi16(y, c4096);
y = _mm_srai_epi16(y, 2);
/* cb = cb_g_buf[i]; */
cb = _mm_load_si128(cb_buf + i);
/* cr = cr_b_buf[i]; */
cr = _mm_load_si128(cr_buf + i);
/* (y + HIWORD(cr*22986)) >> 3 */
r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
r = _mm_srai_epi16(r, 3);
/* r_buf[i] = MINMAX(r, 0, 255); */
_mm_between_epi16(r, zero, max);
_mm_store_si128(r_buf + i, r);
/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
g = _mm_srai_epi16(g, 3);
/* g_buf[i] = MINMAX(g, 0, 255); */
_mm_between_epi16(g, zero, max);
_mm_store_si128(g_buf + i, g);
/* (y + HIWORD(cb*28999)) >> 3 */
b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
b = _mm_srai_epi16(b, 3);
/* b_buf[i] = MINMAX(b, 0, 255); */
_mm_between_epi16(b, zero, max);
_mm_store_si128(b_buf + i, b);
}
/* Advance sources (Y/Cb/Cr) and destinations (R/G/B) one row. */
y_buf += srcbump;
cb_buf += srcbump;
cr_buf += srcbump;
r_buf += dstbump;
g_buf += dstbump;
b_buf += dstbump;
}
return PRIMITIVES_SUCCESS;
}
/*---------------------------------------------------------------------------*/
/* The encoded YCbCr coefficients are represented as 11.5 fixed-point
 * numbers. See the general code above.
 */
/* Convert 16-bit signed R, G and B planes (pSrc[0..2]) to 16-bit signed
 * Y, Cb and Cr planes (pDst[0..2]) using SSE2 fixed-point arithmetic.
 * Conversion factors are scaled by 2^15 and applied via _mm_mulhi_epi16;
 * see the inner-loop comment.  srcStep/dstStep are byte strides between
 * rows.  Falls back to general_RGBToYCbCr_16s16s_P3P3 when the
 * alignment/width/stride preconditions below are not met.
 * Always returns PRIMITIVES_SUCCESS.
 */
PRIM_STATIC pstatus_t sse2_RGBToYCbCr_16s16s_P3P3(
	const INT16 *pSrc[3],
	int srcStep,
	INT16 *pDst[3],
	int dstStep,
	const prim_size_t *roi)	/* region of interest */
{
	__m128i min, max, y_r, y_g, y_b, cb_r, cb_g, cb_b, cr_r, cr_g, cr_b;
	__m128i *r_buf, *g_buf, *b_buf, *y_buf, *cb_buf, *cr_buf;
	int srcbump, dstbump, yp, imax;

	/* Require 16-byte aligned planes, a width that is a multiple of 8
	 * pixels, and strides that keep every row 16-byte aligned. */
	if (((ULONG_PTR) (pSrc[0]) & 0x0f)
			|| ((ULONG_PTR) (pSrc[1]) & 0x0f)
			|| ((ULONG_PTR) (pSrc[2]) & 0x0f)
			|| ((ULONG_PTR) (pDst[0]) & 0x0f)
			|| ((ULONG_PTR) (pDst[1]) & 0x0f)
			|| ((ULONG_PTR) (pDst[2]) & 0x0f)
			|| (roi->width & 0x07)
			|| (srcStep & 127)
			|| (dstStep & 127))
	{
		/* We can't maintain 16-byte alignment. */
		return general_RGBToYCbCr_16s16s_P3P3(pSrc, srcStep,
			pDst, dstStep, roi);
	}

	min = _mm_set1_epi16(-128 << 5);
	max = _mm_set1_epi16(127 << 5);
	r_buf  = (__m128i*) (pSrc[0]);
	g_buf  = (__m128i*) (pSrc[1]);
	b_buf  = (__m128i*) (pSrc[2]);
	y_buf  = (__m128i*) (pDst[0]);
	cb_buf = (__m128i*) (pDst[1]);
	cr_buf = (__m128i*) (pDst[2]);
	y_r  = _mm_set1_epi16(9798);   /*  0.299000 << 15 */
	y_g  = _mm_set1_epi16(19235);  /*  0.587000 << 15 */
	y_b  = _mm_set1_epi16(3735);   /*  0.114000 << 15 */
	cb_r = _mm_set1_epi16(-5535);  /* -0.168935 << 15 */
	cb_g = _mm_set1_epi16(-10868); /* -0.331665 << 15 */
	cb_b = _mm_set1_epi16(16403);  /*  0.500590 << 15 */
	cr_r = _mm_set1_epi16(16377);  /*  0.499813 << 15 */
	cr_g = _mm_set1_epi16(-13714); /* -0.418531 << 15 */
	cr_b = _mm_set1_epi16(-2663);  /* -0.081282 << 15 */
	/* Row strides expressed in __m128i (16-byte) units. */
	srcbump = srcStep / sizeof(__m128i);
	dstbump = dstStep / sizeof(__m128i);

#ifdef DO_PREFETCH
	/* Prefetch RGB's. */
	for (yp=0; yp<roi->height; yp++)
	{
		int i;
		for (i=0; i<roi->width * sizeof(INT16) / sizeof(__m128i);
			i += (CACHE_LINE_BYTES / sizeof(__m128i)))
		{
			_mm_prefetch((char*)(&r_buf[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&g_buf[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&b_buf[i]), _MM_HINT_NTA);
		}
		r_buf += srcbump;
		g_buf += srcbump;
		b_buf += srcbump;
	}
	r_buf = (__m128i*) (pSrc[0]);
	g_buf = (__m128i*) (pSrc[1]);
	b_buf = (__m128i*) (pSrc[2]);
#endif /* DO_PREFETCH */

	/* Number of 8-pixel (__m128i) chunks per row. */
	imax = roi->width * sizeof(INT16) / sizeof(__m128i);
	for (yp=0; yp<roi->height; ++yp)
	{
		int i;
		for (i=0; i<imax; i++)
		{
			/* In order to use SSE2 signed 16-bit integer multiplication we
			 * need to convert the floating point factors to signed int
			 * without losing information. The result of this multiplication
			 * is 32 bit and using SSE2 we get either the product's hi or lo
			 * word. Thus we will multiply the factors by the highest
			 * possible 2^n and take the upper 16 bits of the signed 32-bit
			 * result (_mm_mulhi_epi16). Since the final result needs to
			 * be scaled by << 5 and also in order to keep the precision
			 * within the upper 16 bits we will also have to scale the RGB
			 * values used in the multiplication by << 5+(16-n).
			 */
			__m128i r, g, b, y, cb, cr;
			/* BUG FIX: read the red plane from r_buf (the source); the
			 * previous code loaded from y_buf (the destination), which
			 * only produced correct results for in-place conversion
			 * where source and destination planes alias. */
			r = _mm_load_si128(r_buf+i);
			g = _mm_load_si128(g_buf+i);
			b = _mm_load_si128(b_buf+i);
			/* r<<6; g<<6; b<<6 */
			r = _mm_slli_epi16(r, 6);
			g = _mm_slli_epi16(g, 6);
			b = _mm_slli_epi16(b, 6);
			/* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
			y = _mm_mulhi_epi16(r, y_r);
			y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
			y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
			y = _mm_add_epi16(y, min);
			/* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
			_mm_between_epi16(y, min, max);
			_mm_store_si128(y_buf+i, y);
			/* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
			cb = _mm_mulhi_epi16(r, cb_r);
			cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
			cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
			/* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
			_mm_between_epi16(cb, min, max);
			_mm_store_si128(cb_buf+i, cb);
			/* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
			cr = _mm_mulhi_epi16(r, cr_r);
			cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
			cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
			/* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
			_mm_between_epi16(cr, min, max);
			_mm_store_si128(cr_buf+i, cr);
		}
		/* BUG FIX: advance source planes (R/G/B) by the source stride
		 * and destination planes (Y/Cb/Cr) by the destination stride;
		 * these were swapped, which only worked when srcStep == dstStep. */
		r_buf  += srcbump;
		g_buf  += srcbump;
		b_buf  += srcbump;
		y_buf  += dstbump;
		cb_buf += dstbump;
		cr_buf += dstbump;
	}
	return PRIMITIVES_SUCCESS;
}
/*---------------------------------------------------------------------------*/
#define LOAD128(_src_) \
_mm_load_si128((__m128i *) _src_)
#define STORE128(_dst_, _src_) \
_mm_store_si128((__m128i *) _dst_, _src_)
#define PUNPCKLBW(_dst_, _src_) \
_dst_ = _mm_unpacklo_epi8(_src_, _dst_)
#define PUNPCKHBW(_dst_, _src_) \
_dst_ = _mm_unpackhi_epi8(_src_, _dst_)
#define PUNPCKLWD(_dst_, _src_) \
_dst_ = _mm_unpacklo_epi16(_src_, _dst_)
#define PUNPCKHWD(_dst_, _src_) \
_dst_ = _mm_unpackhi_epi16(_src_, _dst_)
#define PACKUSWB(_dst_, _src_) \
_dst_ = _mm_packus_epi16(_dst_, _src_)
#define PREFETCH(_ptr_) \
_mm_prefetch((const void *) _ptr_, _MM_HINT_T0)
#define XMM_ALL_ONES \
_mm_set1_epi32(0xFFFFFFFFU)
/* Interleave three 16-bit R, G and B planes into packed 8-bit 32-bpp
 * pixels.  Each output pixel is stored as bytes B,G,R,0xFF in memory
 * (a little-endian 0xFFRRGGBB word) per the unpack trace below; the
 * alpha byte is forced to 0xFF.  Source samples are saturated to
 * [0,255] by the PACKUSWB steps.
 * Falls back to the general version when 16-byte alignment cannot be
 * maintained or the width is not a multiple of 16.
 * NOTE(review): the do/while consumes 16 pixels per pass, so a zero
 * roi->width would wrap around -- assumed nonzero; confirm at call sites.
 */
PRIM_STATIC pstatus_t sse2_RGBToRGB_16s8u_P3AC4R(
const INT16 *pSrc[3], /* 16-bit R,G, and B arrays */
INT32 srcStep, /* bytes between rows in source data */
BYTE *pDst, /* 32-bit interleaved ARGB (ABGR?) data */
INT32 dstStep, /* bytes between rows in dest data */
const prim_size_t *roi) /* region of interest */
{
const UINT16 *r = (const UINT16 *) (pSrc[0]);
const UINT16 *g = (const UINT16 *) (pSrc[1]);
const UINT16 *b = (const UINT16 *) (pSrc[2]);
BYTE *out;
int srcbump, dstbump, y;
/* Ensure 16-byte alignment on all pointers,
 * that width is a multiple of 16,
 * and that the next row will also remain aligned.
 * Since this is usually used for 64x64 aligned arrays,
 * these checks should presumably pass.
 */
if ((((ULONG_PTR) (pSrc[0]) & 0x0f) != 0)
|| (((ULONG_PTR) (pSrc[1]) & 0x0f) != 0)
|| (((ULONG_PTR) (pSrc[2]) & 0x0f) != 0)
|| (((ULONG_PTR) pDst & 0x0f) != 0)
|| (roi->width & 0x0f)
|| (srcStep & 0x0f)
|| (dstStep & 0x0f))
{
return general_RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, roi);
}
out = (BYTE *) pDst;
/* Per-row padding: source bump in UINT16 units, dest bump in bytes. */
srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
dstbump = (dstStep - (roi->width * sizeof(UINT32)));
for (y=0; y<roi->height; ++y)
{
int width = roi->width;
do {
__m128i R0, R1, R2, R3, R4;
/* The comments below pretend these are 8-byte registers
 * rather than 16-byte, for readability.
 */
R0 = LOAD128(b); b += 8; /* R0 = 00B300B200B100B0 */
R1 = LOAD128(b); b += 8; /* R1 = 00B700B600B500B4 */
PACKUSWB(R0,R1); /* R0 = B7B6B5B4B3B2B1B0 */
R1 = LOAD128(g); g += 8; /* R1 = 00G300G200G100G0 */
R2 = LOAD128(g); g += 8; /* R2 = 00G700G600G500G4 */
PACKUSWB(R1,R2); /* R1 = G7G6G5G4G3G2G1G0 */
R2 = R1; /* R2 = G7G6G5G4G3G2G1G0 */
PUNPCKLBW(R2,R0); /* R2 = G3B3G2B2G1B1G0B0 */
PUNPCKHBW(R1,R0); /* R1 = G7B7G6B7G5B5G4B4 */
R0 = LOAD128(r); r += 8; /* R0 = 00R300R200R100R0 */
R3 = LOAD128(r); r += 8; /* R3 = 00R700R600R500R4 */
PACKUSWB(R0,R3); /* R0 = R7R6R5R4R3R2R1R0 */
R3 = XMM_ALL_ONES; /* R3 = FFFFFFFFFFFFFFFF */
R4 = R3; /* R4 = FFFFFFFFFFFFFFFF */
PUNPCKLBW(R4,R0); /* R4 = FFR3FFR2FFR1FFR0 */
PUNPCKHBW(R3,R0); /* R3 = FFR7FFR6FFR5FFR4 */
R0 = R4; /* R0 = R4 */
PUNPCKLWD(R0,R2); /* R0 = FFR1G1B1FFR0G0B0 */
PUNPCKHWD(R4,R2); /* R4 = FFR3G3B3FFR2G2B2 */
R2 = R3; /* R2 = R3 */
PUNPCKLWD(R2,R1); /* R2 = FFR5G5B5FFR4G4B4 */
PUNPCKHWD(R3,R1); /* R3 = FFR7G7B7FFR6G6B6 */
STORE128(out, R0); out += 16; /* FFR1G1B1FFR0G0B0 */
STORE128(out, R4); out += 16; /* FFR3G3B3FFR2G2B2 */
STORE128(out, R2); out += 16; /* FFR5G5B5FFR4G4B4 */
STORE128(out, R3); out += 16; /* FFR7G7B7FFR6G6B6 */
} while (width -= 16);
/* Jump to next row. */
r += srcbump;
g += srcbump;
b += srcbump;
out += dstbump;
}
return PRIMITIVES_SUCCESS;
}
#endif /* WITH_SSE2 */
/*---------------------------------------------------------------------------*/
#ifdef WITH_NEON
/* Convert 16-bit signed Y, Cb and Cr planes (pSrc[0..2]) to 16-bit signed
 * R, G and B planes (pDst[0..2]) using NEON.
 * Same fixed-point scheme as the SSE2 version (factors scaled by 2^14);
 * vqdmulhq_s16 doubles the product, hence the extra right shift by 1.
 * NOTE(review): unlike the SSE2 path there is no alignment/size guard
 * here (see the TODO below) -- callers must satisfy the preconditions.
 */
PRIM_STATIC pstatus_t neon_yCbCrToRGB_16s16s_P3P3(
const INT16 *pSrc[3],
int srcStep,
INT16 *pDst[3],
int dstStep,
const prim_size_t *roi) /* region of interest */
{
/* TODO: If necessary, check alignments and call the general version. */
int16x8_t zero = vdupq_n_s16(0);
int16x8_t max = vdupq_n_s16(255);
int16x8_t r_cr = vdupq_n_s16(22986); // 1.403 << 14
int16x8_t g_cb = vdupq_n_s16(-5636); // -0.344 << 14
int16x8_t g_cr = vdupq_n_s16(-11698); // -0.714 << 14
int16x8_t b_cb = vdupq_n_s16(28999); // 1.770 << 14
int16x8_t c4096 = vdupq_n_s16(4096);
int16x8_t* y_buf = (int16x8_t*) pSrc[0];
int16x8_t* cb_buf = (int16x8_t*) pSrc[1];
int16x8_t* cr_buf = (int16x8_t*) pSrc[2];
int16x8_t* r_buf = (int16x8_t*) pDst[0];
int16x8_t* g_buf = (int16x8_t*) pDst[1];
int16x8_t* b_buf = (int16x8_t*) pDst[2];
/* Row strides expressed in int16x8_t (16-byte) units. */
int srcbump = srcStep / sizeof(int16x8_t);
int dstbump = dstStep / sizeof(int16x8_t);
int yp;
/* Number of 8-pixel chunks per row. */
int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t);
for (yp=0; yp<roi->height; ++yp)
{
int i;
for (i=0; i<imax; i++)
{
/*
In order to use NEON signed 16-bit integer multiplication we need to convert
the floating point factors to signed int without losing information.
The result of this multiplication is 32 bit and we have a NEON instruction
that returns the hi word of the saturated double.
Thus we will multiply the factors by the highest possible 2^n, take the
upper 16 bits of the signed 32-bit result (vqdmulhq_s16 followed by a right
shift by 1 to reverse the doubling) and correct this result by multiplying it
by 2^(16-n).
For the given factors in the conversion matrix the best possible n is 14.
Example for calculating r:
r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
*/
/* y = (y_buf[i] + 4096) >> 2 */
int16x8_t y = vld1q_s16((INT16*) &y_buf[i]);
y = vaddq_s16(y, c4096);
y = vshrq_n_s16(y, 2);
/* cb = cb_buf[i]; */
int16x8_t cb = vld1q_s16((INT16*)&cb_buf[i]);
/* cr = cr_buf[i]; */
int16x8_t cr = vld1q_s16((INT16*) &cr_buf[i]);
/* (y + HIWORD(cr*22986)) >> 3 */
int16x8_t r = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cr, r_cr), 1));
r = vshrq_n_s16(r, 3);
/* r_buf[i] = MINMAX(r, 0, 255); */
r = vminq_s16(vmaxq_s16(r, zero), max);
vst1q_s16((INT16*)&r_buf[i], r);
/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
int16x8_t g = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, g_cb), 1));
g = vaddq_s16(g, vshrq_n_s16(vqdmulhq_s16(cr, g_cr), 1));
g = vshrq_n_s16(g, 3);
/* g_buf[i] = MINMAX(g, 0, 255); */
g = vminq_s16(vmaxq_s16(g, zero), max);
vst1q_s16((INT16*)&g_buf[i], g);
/* (y + HIWORD(cb*28999)) >> 3 */
int16x8_t b = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, b_cb), 1));
b = vshrq_n_s16(b, 3);
/* b_buf[i] = MINMAX(b, 0, 255); */
b = vminq_s16(vmaxq_s16(b, zero), max);
vst1q_s16((INT16*)&b_buf[i], b);
}
/* Advance sources (Y/Cb/Cr) and destinations (R/G/B) one row. */
y_buf += srcbump;
cb_buf += srcbump;
cr_buf += srcbump;
r_buf += dstbump;
g_buf += dstbump;
b_buf += dstbump;
}
return PRIMITIVES_SUCCESS;
}
#endif /* WITH_NEON */
/* I don't see a direct IPP version of this, since the input is INT16
* YCbCr. It may be possible via Deinterleave and then YCbCrToRGB_<mod>.
* But that would likely be slower.
*/
/* ------------------------------------------------------------------------- */
void primitives_init_colors(const primitives_hints_t* hints, primitives_t* prims)
{
@ -734,19 +221,7 @@ void primitives_init_colors(const primitives_hints_t* hints, primitives_t* prims
prims->yCbCrToRGB_16s16s_P3P3 = general_yCbCrToRGB_16s16s_P3P3;
prims->RGBToYCbCr_16s16s_P3P3 = general_RGBToYCbCr_16s16s_P3P3;
#if defined(WITH_SSE2)
if (hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
{
prims->RGBToRGB_16s8u_P3AC4R = sse2_RGBToRGB_16s8u_P3AC4R;
prims->yCbCrToRGB_16s16s_P3P3 = sse2_yCbCrToRGB_16s16s_P3P3;
prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3;
}
#elif defined(WITH_NEON)
if (hints->arm_flags & PRIM_ARM_NEON_AVAILABLE)
{
prims->yCbCrToRGB_16s16s_P3P3 = neon_yCbCrToRGB_16s16s_P3P3;
}
#endif /* WITH_SSE2 */
primitives_init_colors_opt(hints, prims);
}
/* ------------------------------------------------------------------------- */
@ -754,3 +229,4 @@ void primitives_deinit_colors(primitives_t* prims)
{
/* Nothing to do. */
}

View File

@ -0,0 +1,32 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Color conversion operations.
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License. Algorithms used by
* this code may be covered by patents by HP, Microsoft, or other parties.
*
*/
#ifdef __GNUC__
# pragma once
#endif
#ifndef __PRIM_COLORS_H_INCLUDED__
#define __PRIM_COLORS_H_INCLUDED__
/* Portable (general) C implementations of the colour-conversion
 * primitives; the optimized SSE2/NEON variants fall back to these when
 * their alignment/size preconditions are not met.
 */
pstatus_t general_yCbCrToRGB_16s16s_P3P3(const INT16 *pSrc[3], INT32 srcStep, INT16 *pDst[3], INT32 dstStep, const prim_size_t *roi);
pstatus_t general_RGBToYCbCr_16s16s_P3P3(const INT16 *pSrc[3], INT32 srcStep, INT16 *pDst[3], INT32 dstStep, const prim_size_t *roi);
pstatus_t general_RGBToRGB_16s8u_P3AC4R(const INT16 *pSrc[3], int srcStep, BYTE *pDst, int dstStep, const prim_size_t *roi);
/* Installs the optimized colour-conversion entry points into prims when
 * the CPU supports them (see prim_colors_opt.c). */
void primitives_init_colors_opt(const primitives_hints_t* hints, primitives_t* prims);
#endif /* !__PRIM_COLORS_H_INCLUDED__ */

View File

@ -0,0 +1,561 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized Color conversion operations.
* vi:ts=4 sw=4:
*
* Copyright 2011 Stephen Erisman
* Copyright 2011 Norbert Federa <nfedera@thinstuff.com>
* Copyright 2011 Martin Fleisz <mfleisz@thinstuff.com>
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_SSE2
#include <emmintrin.h>
#elif defined(WITH_NEON)
#include <arm_neon.h>
#endif /* WITH_SSE2 else WITH_NEON */
#include "prim_internal.h"
#include "prim_templates.h"
#include "prim_colors.h"
#ifdef WITH_SSE2
#ifdef __GNUC__
# define GNU_INLINE \
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
#else
# define GNU_INLINE
#endif
#define CACHE_LINE_BYTES 64
#define _mm_between_epi16(_val, _min, _max) \
do { _val = _mm_min_epi16(_max, _mm_max_epi16(_val, _min)); } while (0)
#ifdef DO_PREFETCH
/*---------------------------------------------------------------------------*/
/* Prefetch a buffer into cache (non-temporal hint), one cache line
 * (CACHE_LINE_BYTES) at a time.  Only compiled when DO_PREFETCH is set.
 */
static inline void GNU_INLINE _mm_prefetch_buffer(
char * buffer,
int num_bytes)
{
__m128i * buf = (__m128i*) buffer;
unsigned int i;
for (i = 0; i < (num_bytes / sizeof(__m128i));
i+=(CACHE_LINE_BYTES / sizeof(__m128i)))
{
_mm_prefetch((char*)(&buf[i]), _MM_HINT_NTA);
}
}
#endif /* DO_PREFETCH */
/*---------------------------------------------------------------------------*/
/* Convert 16-bit signed Y, Cb and Cr planes (pSrc[0..2]) to 16-bit signed
 * R, G and B planes (pDst[0..2]) using SSE2 fixed-point arithmetic.
 * The conversion factors are scaled by 2^14; see the inner-loop comment.
 * srcStep/dstStep are byte strides between rows; roi gives width/height
 * in pixels.  Falls back to general_yCbCrToRGB_16s16s_P3P3 when the
 * alignment/width/stride preconditions below are not met.
 * Always returns PRIMITIVES_SUCCESS.
 */
pstatus_t sse2_yCbCrToRGB_16s16s_P3P3(
const INT16 *pSrc[3],
int srcStep,
INT16 *pDst[3],
int dstStep,
const prim_size_t *roi) /* region of interest */
{
__m128i zero, max, r_cr, g_cb, g_cr, b_cb, c4096;
__m128i *y_buf, *cb_buf, *cr_buf, *r_buf, *g_buf, *b_buf;
int srcbump, dstbump, yp, imax;
/* Require 16-byte aligned planes, a width that is a multiple of 8
 * pixels, and strides that keep every row 16-byte aligned. */
if (((ULONG_PTR) (pSrc[0]) & 0x0f)
|| ((ULONG_PTR) (pSrc[1]) & 0x0f)
|| ((ULONG_PTR) (pSrc[2]) & 0x0f)
|| ((ULONG_PTR) (pDst[0]) & 0x0f)
|| ((ULONG_PTR) (pDst[1]) & 0x0f)
|| ((ULONG_PTR) (pDst[2]) & 0x0f)
|| (roi->width & 0x07)
|| (srcStep & 127)
|| (dstStep & 127))
{
/* We can't maintain 16-byte alignment. */
return general_yCbCrToRGB_16s16s_P3P3(pSrc, srcStep,
pDst, dstStep, roi);
}
zero = _mm_setzero_si128();
max = _mm_set1_epi16(255);
y_buf = (__m128i*) (pSrc[0]);
cb_buf = (__m128i*) (pSrc[1]);
cr_buf = (__m128i*) (pSrc[2]);
r_buf = (__m128i*) (pDst[0]);
g_buf = (__m128i*) (pDst[1]);
b_buf = (__m128i*) (pDst[2]);
r_cr = _mm_set1_epi16(22986); /* 1.403 << 14 */
g_cb = _mm_set1_epi16(-5636); /* -0.344 << 14 */
g_cr = _mm_set1_epi16(-11698); /* -0.714 << 14 */
b_cb = _mm_set1_epi16(28999); /* 1.770 << 14 */
c4096 = _mm_set1_epi16(4096);
/* Row strides expressed in __m128i (16-byte) units. */
srcbump = srcStep / sizeof(__m128i);
dstbump = dstStep / sizeof(__m128i);
#ifdef DO_PREFETCH
/* Prefetch Y's, Cb's, and Cr's. */
for (yp=0; yp<roi->height; yp++)
{
int i;
for (i=0; i<roi->width * sizeof(INT16) / sizeof(__m128i);
i += (CACHE_LINE_BYTES / sizeof(__m128i)))
{
_mm_prefetch((char*)(&y_buf[i]), _MM_HINT_NTA);
_mm_prefetch((char*)(&cb_buf[i]), _MM_HINT_NTA);
_mm_prefetch((char*)(&cr_buf[i]), _MM_HINT_NTA);
}
y_buf += srcbump;
cb_buf += srcbump;
cr_buf += srcbump;
}
y_buf = (__m128i*) (pSrc[0]);
cb_buf = (__m128i*) (pSrc[1]);
cr_buf = (__m128i*) (pSrc[2]);
#endif /* DO_PREFETCH */
/* Number of 8-pixel (__m128i) chunks per row. */
imax = roi->width * sizeof(INT16) / sizeof(__m128i);
for (yp=0; yp<roi->height; ++yp)
{
int i;
for (i=0; i<imax; i++)
{
/* In order to use SSE2 signed 16-bit integer multiplication
 * we need to convert the floating point factors to signed int
 * without losing information.
 * The result of this multiplication is 32 bit and we have two
 * SSE instructions that return either the hi or lo word.
 * Thus we will multiply the factors by the highest possible 2^n,
 * take the upper 16 bits of the signed 32-bit result
 * (_mm_mulhi_epi16) and correct this result by multiplying
 * it by 2^(16-n).
 *
 * For the given factors in the conversion matrix the best
 * possible n is 14.
 *
 * Example for calculating r:
 * r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
 * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
 * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
 * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
 */
/* y = (y_r_buf[i] + 4096) >> 2 */
__m128i y, cb, cr, r, g, b;
y = _mm_load_si128(y_buf + i);
y = _mm_add_epi16(y, c4096);
y = _mm_srai_epi16(y, 2);
/* cb = cb_g_buf[i]; */
cb = _mm_load_si128(cb_buf + i);
/* cr = cr_b_buf[i]; */
cr = _mm_load_si128(cr_buf + i);
/* (y + HIWORD(cr*22986)) >> 3 */
r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
r = _mm_srai_epi16(r, 3);
/* r_buf[i] = MINMAX(r, 0, 255); */
_mm_between_epi16(r, zero, max);
_mm_store_si128(r_buf + i, r);
/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
g = _mm_srai_epi16(g, 3);
/* g_buf[i] = MINMAX(g, 0, 255); */
_mm_between_epi16(g, zero, max);
_mm_store_si128(g_buf + i, g);
/* (y + HIWORD(cb*28999)) >> 3 */
b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
b = _mm_srai_epi16(b, 3);
/* b_buf[i] = MINMAX(b, 0, 255); */
_mm_between_epi16(b, zero, max);
_mm_store_si128(b_buf + i, b);
}
/* Advance sources (Y/Cb/Cr) and destinations (R/G/B) one row. */
y_buf += srcbump;
cb_buf += srcbump;
cr_buf += srcbump;
r_buf += dstbump;
g_buf += dstbump;
b_buf += dstbump;
}
return PRIMITIVES_SUCCESS;
}
/*---------------------------------------------------------------------------*/
/* The encoded YCbCr coefficients are represented as 11.5 fixed-point
 * numbers. See the general code above.
 */
/* Convert 16-bit signed R, G and B planes (pSrc[0..2]) to 16-bit signed
 * Y, Cb and Cr planes (pDst[0..2]) using SSE2 fixed-point arithmetic.
 * Conversion factors are scaled by 2^15 and applied via _mm_mulhi_epi16;
 * see the inner-loop comment.  srcStep/dstStep are byte strides between
 * rows.  Falls back to general_RGBToYCbCr_16s16s_P3P3 when the
 * alignment/width/stride preconditions below are not met.
 * Always returns PRIMITIVES_SUCCESS.
 */
pstatus_t sse2_RGBToYCbCr_16s16s_P3P3(
	const INT16 *pSrc[3],
	int srcStep,
	INT16 *pDst[3],
	int dstStep,
	const prim_size_t *roi)	/* region of interest */
{
	__m128i min, max, y_r, y_g, y_b, cb_r, cb_g, cb_b, cr_r, cr_g, cr_b;
	__m128i *r_buf, *g_buf, *b_buf, *y_buf, *cb_buf, *cr_buf;
	int srcbump, dstbump, yp, imax;

	/* Require 16-byte aligned planes, a width that is a multiple of 8
	 * pixels, and strides that keep every row 16-byte aligned. */
	if (((ULONG_PTR) (pSrc[0]) & 0x0f)
			|| ((ULONG_PTR) (pSrc[1]) & 0x0f)
			|| ((ULONG_PTR) (pSrc[2]) & 0x0f)
			|| ((ULONG_PTR) (pDst[0]) & 0x0f)
			|| ((ULONG_PTR) (pDst[1]) & 0x0f)
			|| ((ULONG_PTR) (pDst[2]) & 0x0f)
			|| (roi->width & 0x07)
			|| (srcStep & 127)
			|| (dstStep & 127))
	{
		/* We can't maintain 16-byte alignment. */
		return general_RGBToYCbCr_16s16s_P3P3(pSrc, srcStep,
			pDst, dstStep, roi);
	}

	min = _mm_set1_epi16(-128 << 5);
	max = _mm_set1_epi16(127 << 5);
	r_buf  = (__m128i*) (pSrc[0]);
	g_buf  = (__m128i*) (pSrc[1]);
	b_buf  = (__m128i*) (pSrc[2]);
	y_buf  = (__m128i*) (pDst[0]);
	cb_buf = (__m128i*) (pDst[1]);
	cr_buf = (__m128i*) (pDst[2]);
	y_r  = _mm_set1_epi16(9798);   /*  0.299000 << 15 */
	y_g  = _mm_set1_epi16(19235);  /*  0.587000 << 15 */
	y_b  = _mm_set1_epi16(3735);   /*  0.114000 << 15 */
	cb_r = _mm_set1_epi16(-5535);  /* -0.168935 << 15 */
	cb_g = _mm_set1_epi16(-10868); /* -0.331665 << 15 */
	cb_b = _mm_set1_epi16(16403);  /*  0.500590 << 15 */
	cr_r = _mm_set1_epi16(16377);  /*  0.499813 << 15 */
	cr_g = _mm_set1_epi16(-13714); /* -0.418531 << 15 */
	cr_b = _mm_set1_epi16(-2663);  /* -0.081282 << 15 */
	/* Row strides expressed in __m128i (16-byte) units. */
	srcbump = srcStep / sizeof(__m128i);
	dstbump = dstStep / sizeof(__m128i);

#ifdef DO_PREFETCH
	/* Prefetch RGB's. */
	for (yp=0; yp<roi->height; yp++)
	{
		int i;
		for (i=0; i<roi->width * sizeof(INT16) / sizeof(__m128i);
			i += (CACHE_LINE_BYTES / sizeof(__m128i)))
		{
			_mm_prefetch((char*)(&r_buf[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&g_buf[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&b_buf[i]), _MM_HINT_NTA);
		}
		r_buf += srcbump;
		g_buf += srcbump;
		b_buf += srcbump;
	}
	r_buf = (__m128i*) (pSrc[0]);
	g_buf = (__m128i*) (pSrc[1]);
	b_buf = (__m128i*) (pSrc[2]);
#endif /* DO_PREFETCH */

	/* Number of 8-pixel (__m128i) chunks per row. */
	imax = roi->width * sizeof(INT16) / sizeof(__m128i);
	for (yp=0; yp<roi->height; ++yp)
	{
		int i;
		for (i=0; i<imax; i++)
		{
			/* In order to use SSE2 signed 16-bit integer multiplication we
			 * need to convert the floating point factors to signed int
			 * without losing information. The result of this multiplication
			 * is 32 bit and using SSE2 we get either the product's hi or lo
			 * word. Thus we will multiply the factors by the highest
			 * possible 2^n and take the upper 16 bits of the signed 32-bit
			 * result (_mm_mulhi_epi16). Since the final result needs to
			 * be scaled by << 5 and also in order to keep the precision
			 * within the upper 16 bits we will also have to scale the RGB
			 * values used in the multiplication by << 5+(16-n).
			 */
			__m128i r, g, b, y, cb, cr;
			/* BUG FIX: read the red plane from r_buf (the source); the
			 * previous code loaded from y_buf (the destination), which
			 * only produced correct results for in-place conversion
			 * where source and destination planes alias. */
			r = _mm_load_si128(r_buf+i);
			g = _mm_load_si128(g_buf+i);
			b = _mm_load_si128(b_buf+i);
			/* r<<6; g<<6; b<<6 */
			r = _mm_slli_epi16(r, 6);
			g = _mm_slli_epi16(g, 6);
			b = _mm_slli_epi16(b, 6);
			/* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
			y = _mm_mulhi_epi16(r, y_r);
			y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
			y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
			y = _mm_add_epi16(y, min);
			/* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
			_mm_between_epi16(y, min, max);
			_mm_store_si128(y_buf+i, y);
			/* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
			cb = _mm_mulhi_epi16(r, cb_r);
			cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
			cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
			/* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
			_mm_between_epi16(cb, min, max);
			_mm_store_si128(cb_buf+i, cb);
			/* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
			cr = _mm_mulhi_epi16(r, cr_r);
			cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
			cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
			/* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
			_mm_between_epi16(cr, min, max);
			_mm_store_si128(cr_buf+i, cr);
		}
		/* BUG FIX: advance source planes (R/G/B) by the source stride
		 * and destination planes (Y/Cb/Cr) by the destination stride;
		 * these were swapped, which only worked when srcStep == dstStep. */
		r_buf  += srcbump;
		g_buf  += srcbump;
		b_buf  += srcbump;
		y_buf  += dstbump;
		cb_buf += dstbump;
		cr_buf += dstbump;
	}
	return PRIMITIVES_SUCCESS;
}
/*---------------------------------------------------------------------------*/
#define LOAD128(_src_) \
_mm_load_si128((__m128i *) _src_)
#define STORE128(_dst_, _src_) \
_mm_store_si128((__m128i *) _dst_, _src_)
#define PUNPCKLBW(_dst_, _src_) \
_dst_ = _mm_unpacklo_epi8(_src_, _dst_)
#define PUNPCKHBW(_dst_, _src_) \
_dst_ = _mm_unpackhi_epi8(_src_, _dst_)
#define PUNPCKLWD(_dst_, _src_) \
_dst_ = _mm_unpacklo_epi16(_src_, _dst_)
#define PUNPCKHWD(_dst_, _src_) \
_dst_ = _mm_unpackhi_epi16(_src_, _dst_)
#define PACKUSWB(_dst_, _src_) \
_dst_ = _mm_packus_epi16(_dst_, _src_)
#define PREFETCH(_ptr_) \
_mm_prefetch((const void *) _ptr_, _MM_HINT_T0)
#define XMM_ALL_ONES \
_mm_set1_epi32(0xFFFFFFFFU)
/* Interleave three 16-bit R, G and B planes into packed 8-bit 32-bpp
 * pixels.  Each output pixel is stored as bytes B,G,R,0xFF in memory
 * (a little-endian 0xFFRRGGBB word) per the unpack trace below; the
 * alpha byte is forced to 0xFF.  Source samples are saturated to
 * [0,255] by the PACKUSWB steps.
 * Falls back to the general version when 16-byte alignment cannot be
 * maintained or the width is not a multiple of 16.
 * NOTE(review): the do/while consumes 16 pixels per pass, so a zero
 * roi->width would wrap around -- assumed nonzero; confirm at call sites.
 */
pstatus_t sse2_RGBToRGB_16s8u_P3AC4R(
const INT16 *pSrc[3], /* 16-bit R,G, and B arrays */
INT32 srcStep, /* bytes between rows in source data */
BYTE *pDst, /* 32-bit interleaved ARGB (ABGR?) data */
INT32 dstStep, /* bytes between rows in dest data */
const prim_size_t *roi) /* region of interest */
{
const UINT16 *r = (const UINT16 *) (pSrc[0]);
const UINT16 *g = (const UINT16 *) (pSrc[1]);
const UINT16 *b = (const UINT16 *) (pSrc[2]);
BYTE *out;
int srcbump, dstbump, y;
/* Ensure 16-byte alignment on all pointers,
 * that width is a multiple of 16,
 * and that the next row will also remain aligned.
 * Since this is usually used for 64x64 aligned arrays,
 * these checks should presumably pass.
 */
if ((((ULONG_PTR) (pSrc[0]) & 0x0f) != 0)
|| (((ULONG_PTR) (pSrc[1]) & 0x0f) != 0)
|| (((ULONG_PTR) (pSrc[2]) & 0x0f) != 0)
|| (((ULONG_PTR) pDst & 0x0f) != 0)
|| (roi->width & 0x0f)
|| (srcStep & 0x0f)
|| (dstStep & 0x0f))
{
return general_RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, roi);
}
out = (BYTE *) pDst;
/* Per-row padding: source bump in UINT16 units, dest bump in bytes. */
srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
dstbump = (dstStep - (roi->width * sizeof(UINT32)));
for (y=0; y<roi->height; ++y)
{
int width = roi->width;
do {
__m128i R0, R1, R2, R3, R4;
/* The comments below pretend these are 8-byte registers
 * rather than 16-byte, for readability.
 */
R0 = LOAD128(b); b += 8; /* R0 = 00B300B200B100B0 */
R1 = LOAD128(b); b += 8; /* R1 = 00B700B600B500B4 */
PACKUSWB(R0,R1); /* R0 = B7B6B5B4B3B2B1B0 */
R1 = LOAD128(g); g += 8; /* R1 = 00G300G200G100G0 */
R2 = LOAD128(g); g += 8; /* R2 = 00G700G600G500G4 */
PACKUSWB(R1,R2); /* R1 = G7G6G5G4G3G2G1G0 */
R2 = R1; /* R2 = G7G6G5G4G3G2G1G0 */
PUNPCKLBW(R2,R0); /* R2 = G3B3G2B2G1B1G0B0 */
PUNPCKHBW(R1,R0); /* R1 = G7B7G6B7G5B5G4B4 */
R0 = LOAD128(r); r += 8; /* R0 = 00R300R200R100R0 */
R3 = LOAD128(r); r += 8; /* R3 = 00R700R600R500R4 */
PACKUSWB(R0,R3); /* R0 = R7R6R5R4R3R2R1R0 */
R3 = XMM_ALL_ONES; /* R3 = FFFFFFFFFFFFFFFF */
R4 = R3; /* R4 = FFFFFFFFFFFFFFFF */
PUNPCKLBW(R4,R0); /* R4 = FFR3FFR2FFR1FFR0 */
PUNPCKHBW(R3,R0); /* R3 = FFR7FFR6FFR5FFR4 */
R0 = R4; /* R0 = R4 */
PUNPCKLWD(R0,R2); /* R0 = FFR1G1B1FFR0G0B0 */
PUNPCKHWD(R4,R2); /* R4 = FFR3G3B3FFR2G2B2 */
R2 = R3; /* R2 = R3 */
PUNPCKLWD(R2,R1); /* R2 = FFR5G5B5FFR4G4B4 */
PUNPCKHWD(R3,R1); /* R3 = FFR7G7B7FFR6G6B6 */
STORE128(out, R0); out += 16; /* FFR1G1B1FFR0G0B0 */
STORE128(out, R4); out += 16; /* FFR3G3B3FFR2G2B2 */
STORE128(out, R2); out += 16; /* FFR5G5B5FFR4G4B4 */
STORE128(out, R3); out += 16; /* FFR7G7B7FFR6G6B6 */
} while (width -= 16);
/* Jump to next row. */
r += srcbump;
g += srcbump;
b += srcbump;
out += dstbump;
}
return PRIMITIVES_SUCCESS;
}
#endif /* WITH_SSE2 */
/*---------------------------------------------------------------------------*/
#ifdef WITH_NEON
/* Convert 16-bit signed Y, Cb and Cr planes (pSrc[0..2]) to 16-bit signed
 * R, G and B planes (pDst[0..2]) using NEON.
 * Same fixed-point scheme as the SSE2 version (factors scaled by 2^14);
 * vqdmulhq_s16 doubles the product, hence the extra right shift by 1.
 * NOTE(review): unlike the SSE2 path there is no alignment/size guard
 * here (see the TODO below) -- callers must satisfy the preconditions.
 */
pstatus_t neon_yCbCrToRGB_16s16s_P3P3(
const INT16 *pSrc[3],
int srcStep,
INT16 *pDst[3],
int dstStep,
const prim_size_t *roi) /* region of interest */
{
/* TODO: If necessary, check alignments and call the general version. */
int16x8_t zero = vdupq_n_s16(0);
int16x8_t max = vdupq_n_s16(255);
int16x8_t r_cr = vdupq_n_s16(22986); // 1.403 << 14
int16x8_t g_cb = vdupq_n_s16(-5636); // -0.344 << 14
int16x8_t g_cr = vdupq_n_s16(-11698); // -0.714 << 14
int16x8_t b_cb = vdupq_n_s16(28999); // 1.770 << 14
int16x8_t c4096 = vdupq_n_s16(4096);
int16x8_t* y_buf = (int16x8_t*) pSrc[0];
int16x8_t* cb_buf = (int16x8_t*) pSrc[1];
int16x8_t* cr_buf = (int16x8_t*) pSrc[2];
int16x8_t* r_buf = (int16x8_t*) pDst[0];
int16x8_t* g_buf = (int16x8_t*) pDst[1];
int16x8_t* b_buf = (int16x8_t*) pDst[2];
/* Row strides expressed in int16x8_t (16-byte) units. */
int srcbump = srcStep / sizeof(int16x8_t);
int dstbump = dstStep / sizeof(int16x8_t);
int yp;
/* Number of 8-pixel chunks per row. */
int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t);
for (yp=0; yp<roi->height; ++yp)
{
int i;
for (i=0; i<imax; i++)
{
/*
In order to use NEON signed 16-bit integer multiplication we need to convert
the floating point factors to signed int without losing information.
The result of this multiplication is 32 bit and we have a NEON instruction
that returns the hi word of the saturated double.
Thus we will multiply the factors by the highest possible 2^n, take the
upper 16 bits of the signed 32-bit result (vqdmulhq_s16 followed by a right
shift by 1 to reverse the doubling) and correct this result by multiplying it
by 2^(16-n).
For the given factors in the conversion matrix the best possible n is 14.
Example for calculating r:
r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
*/
/* y = (y_buf[i] + 4096) >> 2 */
int16x8_t y = vld1q_s16((INT16*) &y_buf[i]);
y = vaddq_s16(y, c4096);
y = vshrq_n_s16(y, 2);
/* cb = cb_buf[i]; */
int16x8_t cb = vld1q_s16((INT16*)&cb_buf[i]);
/* cr = cr_buf[i]; */
int16x8_t cr = vld1q_s16((INT16*) &cr_buf[i]);
/* (y + HIWORD(cr*22986)) >> 3 */
int16x8_t r = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cr, r_cr), 1));
r = vshrq_n_s16(r, 3);
/* r_buf[i] = MINMAX(r, 0, 255); */
r = vminq_s16(vmaxq_s16(r, zero), max);
vst1q_s16((INT16*)&r_buf[i], r);
/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
int16x8_t g = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, g_cb), 1));
g = vaddq_s16(g, vshrq_n_s16(vqdmulhq_s16(cr, g_cr), 1));
g = vshrq_n_s16(g, 3);
/* g_buf[i] = MINMAX(g, 0, 255); */
g = vminq_s16(vmaxq_s16(g, zero), max);
vst1q_s16((INT16*)&g_buf[i], g);
/* (y + HIWORD(cb*28999)) >> 3 */
int16x8_t b = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, b_cb), 1));
b = vshrq_n_s16(b, 3);
/* b_buf[i] = MINMAX(b, 0, 255); */
b = vminq_s16(vmaxq_s16(b, zero), max);
vst1q_s16((INT16*)&b_buf[i], b);
}
/* Advance sources (Y/Cb/Cr) and destinations (R/G/B) one row. */
y_buf += srcbump;
cb_buf += srcbump;
cr_buf += srcbump;
r_buf += dstbump;
g_buf += dstbump;
b_buf += dstbump;
}
return PRIMITIVES_SUCCESS;
}
#endif /* WITH_NEON */
/* I don't see a direct IPP version of this, since the input is INT16
* YCbCr. It may be possible via Deinterleave and then YCbCrToRGB_<mod>.
* But that would likely be slower.
*/
/* ------------------------------------------------------------------------- */
/* Install optimized color-conversion primitives into 'prims', overriding
 * the general versions the caller has already installed.  Selection is
 * compile-time (#if/#elif): an SSE2 build never considers NEON and vice
 * versa; with neither enabled, 'prims' is left untouched. */
void primitives_init_colors_opt(const primitives_hints_t* hints, primitives_t* prims)
{
#if defined(WITH_SSE2)
	if (hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
	{
		prims->RGBToRGB_16s8u_P3AC4R = sse2_RGBToRGB_16s8u_P3AC4R;
		prims->yCbCrToRGB_16s16s_P3P3 = sse2_yCbCrToRGB_16s16s_P3P3;
		prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3;
	}
#elif defined(WITH_NEON)
	/* Only the yCbCr->RGB direction has a NEON implementation here. */
	if (hints->arm_flags & PRIM_ARM_NEON_AVAILABLE)
	{
		prims->yCbCrToRGB_16s16s_P3P3 = neon_yCbCrToRGB_16s16s_P3P3;
	}
#endif /* WITH_SSE2 / WITH_NEON */
}

View File

@ -72,7 +72,7 @@ static BOOL memory_regions_overlap_2d(
}
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_copy_8u(
pstatus_t general_copy_8u(
const BYTE *pSrc,
BYTE *pDst,
INT32 len)
@ -94,7 +94,7 @@ PRIM_STATIC pstatus_t general_copy_8u(
* The addresses are assumed to have been already offset to the upper-left
* corners of the source and destination region of interest.
*/
PRIM_STATIC pstatus_t general_copy_8u_AC4r(
pstatus_t general_copy_8u_AC4r(
const BYTE *pSrc, INT32 srcStep,
BYTE *pDst, INT32 dstStep,
INT32 width, INT32 height)

View File

@ -27,15 +27,6 @@
#include <freerdp/primitives.h>
/* Normally the internal entrypoints should be static, but a benchmark
* program may want to access them directly and turn this off.
*/
#ifndef PRIM_STATIC
# define PRIM_STATIC static
#else
# undef PRIM_STATIC
# define PRIM_STATIC
#endif /* !PRIM_STATIC */
/* Use lddqu for unaligned; load for 16-byte aligned. */
#define LOAD_SI128(_ptr_) \

View File

@ -19,18 +19,15 @@
#endif
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_SSE2
# include <emmintrin.h>
#endif /* WITH_SSE2 */
#ifdef WITH_IPP
# include <ipps.h>
#endif /* WITH_IPP */
#include "prim_internal.h"
#include "prim_set.h"
/* ========================================================================= */
PRIM_STATIC pstatus_t general_set_8u(
pstatus_t general_set_8u(
BYTE val,
BYTE *pDst,
INT32 len)
@ -40,7 +37,7 @@ PRIM_STATIC pstatus_t general_set_8u(
}
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_zero(
pstatus_t general_zero(
void *pDst,
size_t len)
{
@ -48,75 +45,8 @@ PRIM_STATIC pstatus_t general_zero(
return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
#ifdef WITH_SSE2
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
PRIM_STATIC pstatus_t sse2_set_8u(
BYTE val,
BYTE *pDst,
INT32 len)
{
BYTE byte, *dptr;
__m128i xmm0;
size_t count;
if (len < 16) return general_set_8u(val, pDst, len);
byte = val;
dptr = (BYTE *) pDst;
/* Seek 16-byte alignment. */
while ((ULONG_PTR) dptr & 0x0f)
{
*dptr++ = byte;
if (--len == 0) return PRIMITIVES_SUCCESS;
}
xmm0 = _mm_set1_epi8(byte);
/* Cover 256-byte chunks via SSE register stores. */
count = len >> 8;
len -= count << 8;
/* Do 256-byte chunks using one XMM register. */
while (count--)
{
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
}
/* Cover 16-byte chunks via SSE register stores. */
count = len >> 4;
len -= count << 4;
/* Do 16-byte chunks using one XMM register. */
while (count--)
{
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
}
/* Do leftover bytes. */
while (len--) *dptr++ = byte;
return PRIMITIVES_SUCCESS;
}
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif /* WITH_SSE2 */
/* ========================================================================= */
PRIM_STATIC pstatus_t general_set_32s(
pstatus_t general_set_32s(
INT32 val,
INT32 *pDst,
INT32 len)
@ -148,7 +78,7 @@ PRIM_STATIC pstatus_t general_set_32s(
}
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_set_32u(
pstatus_t general_set_32u(
UINT32 val,
UINT32 *pDst,
INT32 len)
@ -179,104 +109,6 @@ PRIM_STATIC pstatus_t general_set_32u(
return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
#ifdef WITH_SSE2
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
PRIM_STATIC pstatus_t sse2_set_32u(
UINT32 val,
UINT32 *pDst,
INT32 len)
{
UINT32 *dptr = (UINT32 *) pDst;
__m128i xmm0;
size_t count;
/* If really short, just do it here. */
if (len < 32)
{
while (len--) *dptr++ = val;
return PRIMITIVES_SUCCESS;
}
/* Assure we can reach 16-byte alignment. */
if (((ULONG_PTR) dptr & 0x03) != 0)
{
return general_set_32u(val, pDst, len);
}
/* Seek 16-byte alignment. */
while ((ULONG_PTR) dptr & 0x0f)
{
*dptr++ = val;
if (--len == 0) return PRIMITIVES_SUCCESS;
}
xmm0 = _mm_set1_epi32(val);
/* Cover 256-byte chunks via SSE register stores. */
count = len >> 6;
len -= count << 6;
/* Do 256-byte chunks using one XMM register. */
while (count--)
{
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
}
/* Cover 16-byte chunks via SSE register stores. */
count = len >> 2;
len -= count << 2;
/* Do 16-byte chunks using one XMM register. */
while (count--)
{
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
}
/* Do leftover bytes. */
while (len--) *dptr++ = val;
return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t sse2_set_32s(
INT32 val,
INT32 *pDst,
INT32 len)
{
UINT32 uval = *((UINT32 *) &val);
return sse2_set_32u(uval, (UINT32 *) pDst, len);
}
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif /* WITH_SSE2 */
#ifdef WITH_IPP
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t ipp_wrapper_set_32u(
UINT32 val,
UINT32 *pDst,
INT32 len)
{
/* A little type conversion, then use the signed version. */
INT32 sval = *((INT32 *) &val);
return ippsSet_32s(sval, (INT32 *) pDst, len);
}
#endif
/* ------------------------------------------------------------------------- */
void primitives_init_set(
const primitives_hints_t *hints,
@ -288,20 +120,7 @@ void primitives_init_set(
prims->set_32u = general_set_32u;
prims->zero = general_zero;
/* Pick tuned versions if possible. */
#ifdef WITH_IPP
prims->set_8u = (__set_8u_t) ippsSet_8u;
prims->set_32s = (__set_32s_t) ippsSet_32s;
prims->set_32u = (__set_32u_t) ipp_wrapper_set_32u;
prims->zero = (__zero_t) ippsZero_8u;
#elif defined(WITH_SSE2)
if (hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
{
prims->set_8u = sse2_set_8u;
prims->set_32s = sse2_set_32s;
prims->set_32u = sse2_set_32u;
}
#endif
primitives_init_set_opt(hints, prims);
}
/* ------------------------------------------------------------------------- */
@ -310,3 +129,4 @@ void primitives_deinit_set(
{
/* Nothing to do. */
}

View File

@ -0,0 +1,34 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Routines to set a chunk of memory to a constant.
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License. Algorithms used by
* this code may be covered by patents by HP, Microsoft, or other parties.
*
*/
#ifdef __GNUC__
# pragma once
#endif
#ifndef __PRIM_SET_H_INCLUDED__
#define __PRIM_SET_H_INCLUDED__
/* General (scalar) implementations.  The optimized variants also call
 * these as fallbacks for short or misaligned buffers. */
pstatus_t general_set_8u(BYTE val, BYTE *pDst, INT32 len);
pstatus_t general_zero(void *pDst, size_t len);
pstatus_t general_set_32s(INT32 val, INT32 *pDst, INT32 len);
pstatus_t general_set_32u(UINT32 val, UINT32 *pDst, INT32 len);
/* Overwrites the function pointers in 'prims' with tuned versions
 * (IPP or SSE2) when the build and the CPU support them. */
void primitives_init_set_opt(const primitives_hints_t *hints, primitives_t *prims);
#endif /* !__PRIM_SET_H_INCLUDED__ */

View File

@ -0,0 +1,218 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized routines to set a chunk of memory to a constant.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_SSE2
# include <emmintrin.h>
#endif /* WITH_SSE2 */
#ifdef WITH_IPP
# include <ipps.h>
#endif /* WITH_IPP */
#include "prim_internal.h"
#include "prim_set.h"
/* ========================================================================= */
#ifdef WITH_SSE2
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
/* Fill 'len' bytes at pDst with 'val' using 16-byte aligned SSE2 stores.
 * Strategy:
 *   1. fall back to general_set_8u() for very short runs,
 *   2. scalar stores until dptr reaches 16-byte alignment,
 *   3. 256-byte chunks (16 aligned stores unrolled per iteration),
 *   4. remaining 16-byte chunks,
 *   5. scalar tail. */
pstatus_t sse2_set_8u(
	BYTE val,
	BYTE *pDst,
	INT32 len)
{
	BYTE byte, *dptr;
	__m128i xmm0;
	size_t count;

	/* Too short to amortize the alignment seek and SIMD setup. */
	if (len < 16) return general_set_8u(val, pDst, len);
	byte  = val;
	dptr = (BYTE *) pDst;

	/* Seek 16-byte alignment. */
	while ((ULONG_PTR) dptr & 0x0f)
	{
		*dptr++ = byte;
		if (--len == 0) return PRIMITIVES_SUCCESS;
	}

	xmm0 = _mm_set1_epi8(byte);

	/* Cover 256-byte chunks via SSE register stores. */
	count = len >> 8;
	len -= count << 8;

	/* Do 256-byte chunks using one XMM register. */
	while (count--)
	{
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
	}

	/* Cover 16-byte chunks via SSE register stores. */
	count = len >> 4;
	len -= count << 4;

	/* Do 16-byte chunks using one XMM register. */
	while (count--)
	{
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 16;
	}

	/* Do leftover bytes. */
	while (len--) *dptr++ = byte;

	return PRIMITIVES_SUCCESS;
}
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif /* WITH_SSE2 */
/* ------------------------------------------------------------------------- */
#ifdef WITH_SSE2
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
/* Fill 'len' 32-bit words at pDst with 'val' using 16-byte aligned SSE2
 * stores.  Same staging as sse2_set_8u, but stepping in whole words:
 * scalar lead-in to 16-byte alignment, 256-byte unrolled chunks
 * (64 words), 16-byte chunks (4 words), scalar tail. */
pstatus_t sse2_set_32u(
	UINT32 val,
	UINT32 *pDst,
	INT32 len)
{
	UINT32 *dptr = (UINT32 *) pDst;
	__m128i xmm0;
	size_t count;

	/* If really short, just do it here. */
	if (len < 32)
	{
		while (len--) *dptr++ = val;
		return PRIMITIVES_SUCCESS;
	}

	/* Assure we can reach 16-byte alignment.  If the pointer is not even
	 * 4-byte aligned, stepping by whole words can never reach a 16-byte
	 * boundary, so punt to the general version. */
	if (((ULONG_PTR) dptr & 0x03) != 0)
	{
		return general_set_32u(val, pDst, len);
	}

	/* Seek 16-byte alignment. */
	while ((ULONG_PTR) dptr & 0x0f)
	{
		*dptr++ = val;
		if (--len == 0) return PRIMITIVES_SUCCESS;
	}

	xmm0 = _mm_set1_epi32(val);

	/* Cover 256-byte (64-word) chunks via SSE register stores. */
	count = len >> 6;
	len -= count << 6;

	/* Do 256-byte chunks using one XMM register. */
	while (count--)
	{
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
	}

	/* Cover 16-byte (4-word) chunks via SSE register stores. */
	count = len >> 2;
	len -= count << 2;

	/* Do 16-byte chunks using one XMM register. */
	while (count--)
	{
		_mm_store_si128((__m128i *) dptr, xmm0); dptr += 4;
	}

	/* Do leftover words. */
	while (len--) *dptr++ = val;

	return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/* Signed 32-bit fill: reinterpret the value's bit pattern as unsigned
 * and delegate to sse2_set_32u, which does the aligned SIMD work. */
pstatus_t sse2_set_32s(
	INT32 val,
	INT32 *pDst,
	INT32 len)
{
	UINT32 uval;

	memcpy(&uval, &val, sizeof(uval));
	return sse2_set_32u(uval, (UINT32 *) pDst, len);
}
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif /* WITH_SSE2 */
#ifdef WITH_IPP
/* ------------------------------------------------------------------------- */
/* Adapter for IPP: ippsSet_32s wants a signed value and destination, so
 * reinterpret the unsigned bit pattern as signed before delegating. */
pstatus_t ipp_wrapper_set_32u(
	UINT32 val,
	UINT32 *pDst,
	INT32 len)
{
	INT32 sval;

	memcpy(&sval, &val, sizeof(sval));
	return ippsSet_32s(sval, (INT32 *) pDst, len);
}
#endif
/* ------------------------------------------------------------------------- */
/* Install tuned set/zero primitives over the general versions that
 * primitives_init_set() has already placed in 'prims'.  Compile-time
 * preference: IPP first, then SSE2 (gated on the CPU hint); otherwise
 * the general routines remain in effect. */
void primitives_init_set_opt(const primitives_hints_t *hints, primitives_t *prims)
{
	/* Pick tuned versions if possible. */
#ifdef WITH_IPP
	/* The casts adapt IPP's prototypes to the primitives_t slots. */
	prims->set_8u = (__set_8u_t) ippsSet_8u;
	prims->set_32s = (__set_32s_t) ippsSet_32s;
	prims->set_32u = (__set_32u_t) ipp_wrapper_set_32u;
	prims->zero = (__zero_t) ippsZero_8u;
#elif defined(WITH_SSE2)
	if (hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
	{
		prims->set_8u = sse2_set_8u;
		prims->set_32s = sse2_set_32s;
		prims->set_32u = sse2_set_32u;
	}
#endif
}

View File

@ -17,25 +17,15 @@
#include "config.h"
#endif
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_SSE2
#include <emmintrin.h>
#include <pmmintrin.h>
#endif /* WITH_SSE2 */
#ifdef WITH_IPP
#include <ipps.h>
#endif /* WITH_IPP */
#include "prim_internal.h"
#include "prim_templates.h"
#include "prim_shift.h"
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_lShiftC_16s(
pstatus_t general_lShiftC_16s(
const INT16 *pSrc,
INT32 val,
INT16 *pDst,
@ -47,7 +37,7 @@ PRIM_STATIC pstatus_t general_lShiftC_16s(
}
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_rShiftC_16s(
pstatus_t general_rShiftC_16s(
const INT16 *pSrc,
INT32 val,
INT16 *pDst,
@ -59,7 +49,7 @@ PRIM_STATIC pstatus_t general_rShiftC_16s(
}
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_lShiftC_16u(
pstatus_t general_lShiftC_16u(
const UINT16 *pSrc,
INT32 val,
UINT16 *pDst,
@ -71,7 +61,7 @@ PRIM_STATIC pstatus_t general_lShiftC_16u(
}
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_rShiftC_16u(
pstatus_t general_rShiftC_16u(
const UINT16 *pSrc,
INT32 val,
UINT16 *pDst,
@ -82,25 +72,8 @@ PRIM_STATIC pstatus_t general_rShiftC_16u(
return PRIMITIVES_SUCCESS;
}
#ifdef WITH_SSE2
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
/* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, general_lShiftC_16s,
_mm_slli_epi16, *dptr++ = *sptr++ << val)
/* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, general_rShiftC_16s,
_mm_srai_epi16, *dptr++ = *sptr++ >> val)
/* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, general_lShiftC_16u,
_mm_slli_epi16, *dptr++ = *sptr++ << val)
/* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, general_rShiftC_16u,
_mm_srli_epi16, *dptr++ = *sptr++ >> val)
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_shiftC_16s(
pstatus_t general_shiftC_16s(
const INT16 *pSrc,
INT32 val,
INT16 *pDst,
@ -115,7 +88,7 @@ PRIM_STATIC pstatus_t general_shiftC_16s(
}
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t general_shiftC_16u(
pstatus_t general_shiftC_16u(
const UINT16 *pSrc,
INT32 val,
UINT16 *pDst,
@ -129,11 +102,6 @@ PRIM_STATIC pstatus_t general_shiftC_16u(
else return prims->lShiftC_16u(pSrc, val, pDst, len);
}
/* Note: the IPP version will have to call ippLShiftC_16s or ippRShiftC_16s
* depending on the sign of val. To avoid using the deprecated inplace
* routines, a wrapper can use the src for the dest.
*/
/* ------------------------------------------------------------------------- */
void primitives_init_shift(
const primitives_hints_t *hints,
@ -144,24 +112,12 @@ void primitives_init_shift(
prims->rShiftC_16s = general_rShiftC_16s;
prims->lShiftC_16u = general_lShiftC_16u;
prims->rShiftC_16u = general_rShiftC_16u;
#if defined(WITH_IPP)
prims->lShiftC_16s = (__lShiftC_16s_t) ippsLShiftC_16s;
prims->rShiftC_16s = (__rShiftC_16s_t) ippsRShiftC_16s;
prims->lShiftC_16u = (__lShiftC_16u_t) ippsLShiftC_16u;
prims->rShiftC_16u = (__rShiftC_16u_t) ippsRShiftC_16u;
#elif defined(WITH_SSE2)
if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE))
{
prims->lShiftC_16s = sse2_lShiftC_16s;
prims->rShiftC_16s = sse2_rShiftC_16s;
prims->lShiftC_16u = sse2_lShiftC_16u;
prims->rShiftC_16u = sse2_rShiftC_16u;
}
#endif
/* Wrappers */
prims->shiftC_16s = general_shiftC_16s;
prims->shiftC_16u = general_shiftC_16u;
primitives_init_shift_opt(hints, prims);
}
/* ------------------------------------------------------------------------- */

View File

@ -0,0 +1,35 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Shift operations.
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License. Algorithms used by
* this code may be covered by patents by HP, Microsoft, or other parties.
*
*/
#ifdef __GNUC__
# pragma once
#endif
#ifndef __PRIM_SHIFT_H_INCLUDED__
#define __PRIM_SHIFT_H_INCLUDED__
pstatus_t general_lShiftC_16s(const INT16 *pSrc, INT32 val, INT16 *pDst, INT32 len);
pstatus_t general_rShiftC_16s(const INT16 *pSrc, INT32 val, INT16 *pDst, INT32 len);
pstatus_t general_lShiftC_16u(const UINT16 *pSrc, INT32 val, UINT16 *pDst, INT32 len);
pstatus_t general_rShiftC_16u(const UINT16 *pSrc, INT32 val, UINT16 *pDst, INT32 len);
pstatus_t general_shiftC_16s(const INT16 *pSrc, INT32 val, INT16 *pDst, INT32 len);
pstatus_t general_shiftC_16u(const UINT16 *pSrc, INT32 val, UINT16 *pDst, INT32 len);
void primitives_init_shift_opt(const primitives_hints_t *hints, primitives_t *prims);
#endif /* !__PRIM_SHIFT_H_INCLUDED__ */

View File

@ -0,0 +1,79 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Shift operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_SSE2
#include <emmintrin.h>
#include <pmmintrin.h>
#endif /* WITH_SSE2 */
#ifdef WITH_IPP
#include <ipps.h>
#endif /* WITH_IPP */
#include "prim_internal.h"
#include "prim_templates.h"
#include "prim_shift.h"
#ifdef WITH_SSE2
# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
/* Each SSE3_SCD_ROUTINE invocation instantiates a shift-by-constant
 * routine: (name, element type, general fallback, SSE intrinsic applied
 * per vector, scalar expression for leftover elements).
 * NOTE(review): the macro body lives in prim_templates.h — confirm the
 * exact fallback/alignment behavior there. */
/* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, general_lShiftC_16s,
	_mm_slli_epi16, *dptr++ = *sptr++ << val)
/* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, general_rShiftC_16s,
	_mm_srai_epi16, *dptr++ = *sptr++ >> val)
/* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, general_lShiftC_16u,
	_mm_slli_epi16, *dptr++ = *sptr++ << val)
/* ------------------------------------------------------------------------- */
SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, general_rShiftC_16u,
	_mm_srli_epi16, *dptr++ = *sptr++ >> val)
# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
#endif /* WITH_SSE2 */
/* Note: the IPP version will have to call ippLShiftC_16s or ippRShiftC_16s
* depending on the sign of val. To avoid using the deprecated inplace
* routines, a wrapper can use the src for the dest.
*/
/* ------------------------------------------------------------------------- */
/* Install optimized constant-shift routines over the general versions
 * already set up by the caller.  IPP is preferred at compile time; the
 * SSE path requires both SSE2 and SSE3 (the unaligned loads presumably
 * use lddqu — see LOAD_SI128 in prim_internal.h). */
void primitives_init_shift_opt(const primitives_hints_t *hints, primitives_t *prims)
{
#if defined(WITH_IPP)
	/* Casts adapt the IPP prototypes to the primitives_t slots. */
	prims->lShiftC_16s = (__lShiftC_16s_t) ippsLShiftC_16s;
	prims->rShiftC_16s = (__rShiftC_16s_t) ippsRShiftC_16s;
	prims->lShiftC_16u = (__lShiftC_16u_t) ippsLShiftC_16u;
	prims->rShiftC_16u = (__rShiftC_16u_t) ippsRShiftC_16u;
#elif defined(WITH_SSE2)
	if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
		&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE))
	{
		prims->lShiftC_16s = sse2_lShiftC_16s;
		prims->rShiftC_16s = sse2_rShiftC_16s;
		prims->lShiftC_16u = sse2_lShiftC_16u;
		prims->rShiftC_16u = sse2_rShiftC_16u;
	}
#endif
}

View File

@ -17,22 +17,16 @@
#include "config.h"
#endif
#include <string.h>
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_SSE2
#include <emmintrin.h>
#include <tmmintrin.h>
#endif /* WITH_SSE2 */
#include "prim_internal.h"
#include "prim_sign.h"
/* ----------------------------------------------------------------------------
* Set pDst to the sign-value of the 16-bit values in pSrc (-1, 0, or 1).
*/
PRIM_STATIC pstatus_t general_sign_16s(
pstatus_t general_sign_16s(
const INT16 *pSrc,
INT16 *pDst,
INT32 len)
@ -46,110 +40,6 @@ PRIM_STATIC pstatus_t general_sign_16s(
return PRIMITIVES_SUCCESS;
}
#ifdef WITH_SSE2
/* ------------------------------------------------------------------------- */
PRIM_STATIC pstatus_t ssse3_sign_16s(
const INT16 *pSrc,
INT16 *pDst,
INT32 len)
{
const INT16 *sptr = (const INT16 *) pSrc;
INT16 *dptr = (INT16 *) pDst;
size_t count;
if (len < 16)
{
return general_sign_16s(pSrc, pDst, len);
}
/* Check for 16-byte alignment (eventually). */
if ((ULONG_PTR) pDst & 0x01)
{
return general_sign_16s(pSrc, pDst, len);
}
/* Seek 16-byte alignment. */
while ((ULONG_PTR) dptr & 0x0f)
{
INT16 src = *sptr++;
*dptr++ = (src < 0) ? (-1) : ((src > 0) ? 1 : 0);
if (--len == 0) return PRIMITIVES_SUCCESS;
}
/* Do 32-short chunks using 8 XMM registers. */
count = len >> 5; /* / 32 */
len -= count << 5; /* * 32 */
if ((ULONG_PTR) sptr & 0x0f)
{
/* Unaligned */
while (count--)
{
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
xmm0 = _mm_set1_epi16(0x0001U);
xmm1 = _mm_set1_epi16(0x0001U);
xmm2 = _mm_set1_epi16(0x0001U);
xmm3 = _mm_set1_epi16(0x0001U);
xmm4 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8;
xmm5 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8;
xmm6 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8;
xmm7 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8;
xmm0 = _mm_sign_epi16(xmm0, xmm4);
xmm1 = _mm_sign_epi16(xmm1, xmm5);
xmm2 = _mm_sign_epi16(xmm2, xmm6);
xmm3 = _mm_sign_epi16(xmm3, xmm7);
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 8;
_mm_store_si128((__m128i *) dptr, xmm1); dptr += 8;
_mm_store_si128((__m128i *) dptr, xmm2); dptr += 8;
_mm_store_si128((__m128i *) dptr, xmm3); dptr += 8;
}
}
else
{
/* Aligned */
while (count--)
{
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
xmm0 = _mm_set1_epi16(0x0001U);
xmm1 = _mm_set1_epi16(0x0001U);
xmm2 = _mm_set1_epi16(0x0001U);
xmm3 = _mm_set1_epi16(0x0001U);
xmm4 = _mm_load_si128((__m128i *) sptr); sptr += 8;
xmm5 = _mm_load_si128((__m128i *) sptr); sptr += 8;
xmm6 = _mm_load_si128((__m128i *) sptr); sptr += 8;
xmm7 = _mm_load_si128((__m128i *) sptr); sptr += 8;
xmm0 = _mm_sign_epi16(xmm0, xmm4);
xmm1 = _mm_sign_epi16(xmm1, xmm5);
xmm2 = _mm_sign_epi16(xmm2, xmm6);
xmm3 = _mm_sign_epi16(xmm3, xmm7);
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 8;
_mm_store_si128((__m128i *) dptr, xmm1); dptr += 8;
_mm_store_si128((__m128i *) dptr, xmm2); dptr += 8;
_mm_store_si128((__m128i *) dptr, xmm3); dptr += 8;
}
}
/* Do 8-short chunks using two XMM registers. */
count = len >> 3;
len -= count << 3;
while (count--)
{
__m128i xmm0 = _mm_set1_epi16(0x0001U);
__m128i xmm1 = LOAD_SI128(sptr); sptr += 8;
xmm0 = _mm_sign_epi16(xmm0, xmm1);
_mm_store_si128((__m128i *) dptr, xmm0); dptr += 8;
}
/* Do leftovers. */
while (len--)
{
INT16 src = *sptr++;
*dptr++ = (src < 0) ? -1 : ((src > 0) ? 1 : 0);
}
return PRIMITIVES_SUCCESS;
}
#endif /* WITH_SSE2 */
/* ------------------------------------------------------------------------- */
void primitives_init_sign(
const primitives_hints_t *hints,
@ -157,15 +47,8 @@ void primitives_init_sign(
{
/* Start with the default. */
prims->sign_16s = general_sign_16s;
/* Pick tuned versions if possible. */
/* I didn't spot an IPP version of this. */
#if defined(WITH_SSE2)
if ((hints->x86_flags & PRIM_X86_SSSE3_AVAILABLE)
&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE))
{
prims->sign_16s = ssse3_sign_16s;
}
#endif
primitives_init_sign_opt(hints, prims);
}
/* ------------------------------------------------------------------------- */
@ -174,3 +57,4 @@ void primitives_deinit_sign(
{
/* Nothing to do. */
}

View File

@ -0,0 +1,30 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Sign operations.
* vi:ts=4 sw=4
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License. Algorithms used by
* this code may be covered by patents by HP, Microsoft, or other parties.
*
*/
#ifdef __GNUC__
# pragma once
#endif
#ifndef __PRIM_SIGN_H_INCLUDED__
#define __PRIM_SIGN_H_INCLUDED__
pstatus_t general_sign_16s(const INT16 *pSrc, INT16 *pDst, INT32 len);
void primitives_init_sign_opt(const primitives_hints_t *hints, primitives_t *prims);
#endif /* !__PRIM_SIGN_H_INCLUDED__ */

View File

@ -0,0 +1,149 @@
/* FreeRDP: A Remote Desktop Protocol Client
* Optimized sign operations.
* vi:ts=4 sw=4:
*
* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#ifdef WITH_SSE2
#include <emmintrin.h>
#include <tmmintrin.h>
#endif /* WITH_SSE2 */
#include "prim_internal.h"
#include "prim_sign.h"
#ifdef WITH_SSE2
/* ------------------------------------------------------------------------- */
/* SSSE3 implementation of the 16-bit sign primitive:
 * pDst[i] = -1, 0, or +1 according to the sign of pSrc[i].
 * Relies on _mm_sign_epi16(ones, x), which yields exactly that mapping
 * when the first operand is all-ones. Falls back to general_sign_16s for
 * short buffers or destinations that can never reach 16-byte alignment. */
pstatus_t ssse3_sign_16s(
	const INT16 *pSrc,
	INT16 *pDst,
	INT32 len)
{
	const INT16 *sptr = (const INT16 *) pSrc;
	INT16 *dptr = (INT16 *) pDst;
	size_t count;

	/* SIMD setup isn't worth it for tiny buffers. */
	if (len < 16)
	{
		return general_sign_16s(pSrc, pDst, len);
	}

	/* An odd-byte destination can never become 16-byte aligned by stepping
	 * whole INT16s, so the aligned stores below would fault; bail out.
	 * (Comment in original: "Check for 16-byte alignment (eventually)".) */
	if ((ULONG_PTR) pDst & 0x01)
	{
		return general_sign_16s(pSrc, pDst, len);
	}

	/* Process elements one at a time until dptr is 16-byte aligned, so the
	 * vector loop may use aligned stores. */
	while ((ULONG_PTR) dptr & 0x0f)
	{
		INT16 src = *sptr++;
		*dptr++ = (src < 0) ? (-1) : ((src > 0) ? 1 : 0);
		if (--len == 0) return PRIMITIVES_SUCCESS;
	}

	/* Do 32-short chunks using 8 XMM registers. */
	count = len >> 5;		/* / 32 */
	len -= count << 5;		/* * 32 */
	if ((ULONG_PTR) sptr & 0x0f)
	{
		/* Unaligned source: lddqu loads (SSE3) tolerate any alignment. */
		while (count--)
		{
			__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
			/* All-ones operands so _mm_sign_epi16 produces -1/0/+1. */
			xmm0 = _mm_set1_epi16(0x0001U);
			xmm1 = _mm_set1_epi16(0x0001U);
			xmm2 = _mm_set1_epi16(0x0001U);
			xmm3 = _mm_set1_epi16(0x0001U);
			xmm4 = _mm_lddqu_si128((__m128i *) sptr);	sptr += 8;
			xmm5 = _mm_lddqu_si128((__m128i *) sptr);	sptr += 8;
			xmm6 = _mm_lddqu_si128((__m128i *) sptr);	sptr += 8;
			xmm7 = _mm_lddqu_si128((__m128i *) sptr);	sptr += 8;
			xmm0 = _mm_sign_epi16(xmm0, xmm4);
			xmm1 = _mm_sign_epi16(xmm1, xmm5);
			xmm2 = _mm_sign_epi16(xmm2, xmm6);
			xmm3 = _mm_sign_epi16(xmm3, xmm7);
			/* dptr was 16-byte aligned above, so aligned stores are safe. */
			_mm_store_si128((__m128i *) dptr, xmm0);	dptr += 8;
			_mm_store_si128((__m128i *) dptr, xmm1);	dptr += 8;
			_mm_store_si128((__m128i *) dptr, xmm2);	dptr += 8;
			_mm_store_si128((__m128i *) dptr, xmm3);	dptr += 8;
		}
	}
	else
	{
		/* Aligned source: plain aligned loads. */
		while (count--)
		{
			__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
			xmm0 = _mm_set1_epi16(0x0001U);
			xmm1 = _mm_set1_epi16(0x0001U);
			xmm2 = _mm_set1_epi16(0x0001U);
			xmm3 = _mm_set1_epi16(0x0001U);
			xmm4 = _mm_load_si128((__m128i *) sptr);	sptr += 8;
			xmm5 = _mm_load_si128((__m128i *) sptr);	sptr += 8;
			xmm6 = _mm_load_si128((__m128i *) sptr);	sptr += 8;
			xmm7 = _mm_load_si128((__m128i *) sptr);	sptr += 8;
			xmm0 = _mm_sign_epi16(xmm0, xmm4);
			xmm1 = _mm_sign_epi16(xmm1, xmm5);
			xmm2 = _mm_sign_epi16(xmm2, xmm6);
			xmm3 = _mm_sign_epi16(xmm3, xmm7);
			_mm_store_si128((__m128i *) dptr, xmm0);	dptr += 8;
			_mm_store_si128((__m128i *) dptr, xmm1);	dptr += 8;
			_mm_store_si128((__m128i *) dptr, xmm2);	dptr += 8;
			_mm_store_si128((__m128i *) dptr, xmm3);	dptr += 8;
		}
	}

	/* Do 8-short chunks using two XMM registers.
	 * LOAD_SI128 (prim_internal.h) presumably picks the right load for the
	 * source's alignment — TODO confirm against prim_internal.h. */
	count = len >> 3;
	len -= count << 3;
	while (count--)
	{
		__m128i xmm0 = _mm_set1_epi16(0x0001U);
		__m128i xmm1 = LOAD_SI128(sptr);	sptr += 8;
		xmm0 = _mm_sign_epi16(xmm0, xmm1);
		_mm_store_si128((__m128i *) dptr, xmm0);	dptr += 8;
	}

	/* Do leftovers (fewer than 8 elements) scalar. */
	while (len--)
	{
		INT16 src = *sptr++;
		*dptr++ = (src < 0) ? -1 : ((src > 0) ? 1 : 0);
	}

	return PRIMITIVES_SUCCESS;
}
#endif /* WITH_SSE2 */
/* ------------------------------------------------------------------------- */
/* ------------------------------------------------------------------------- */
/* Hook the tuned sign_16s into the primitives table when the CPU supports
 * it; otherwise the table keeps whatever (generic) entry it already has.
 * No IPP variant of this operation is known. */
void primitives_init_sign_opt(const primitives_hints_t *hints, primitives_t *prims)
{
#if defined(WITH_SSE2)
	/* ssse3_sign_16s needs both SSE3 (lddqu) and SSSE3 (psignw). */
	const UINT32 required =
		PRIM_X86_SSSE3_AVAILABLE | PRIM_X86_SSE3_AVAILABLE;

	if ((hints->x86_flags & required) == required)
		prims->sign_16s = ssse3_sign_16s;
#endif
}

View File

@ -44,7 +44,7 @@
* SCD = Source, Constant, Destination
*/
#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
PRIM_STATIC pstatus_t _name_(const _type_ *pSrc, INT32 val, _type_ *pDst, INT32 len) \
pstatus_t _name_(const _type_ *pSrc, INT32 val, _type_ *pDst, INT32 len) \
{ \
int shifts; \
UINT32 offBeatMask; \
@ -188,7 +188,7 @@ PRIM_STATIC pstatus_t _name_(const _type_ *pSrc, INT32 val, _type_ *pDst, INT32
* PRE = preload xmm0 with the constant.
*/
#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
PRIM_STATIC pstatus_t _name_(const _type_ *pSrc, _type_ val, _type_ *pDst, INT32 len) \
pstatus_t _name_(const _type_ *pSrc, _type_ val, _type_ *pDst, INT32 len) \
{ \
int shifts; \
UINT32 offBeatMask; \
@ -293,7 +293,7 @@ PRIM_STATIC pstatus_t _name_(const _type_ *pSrc, _type_ val, _type_ *pDst, INT32
* SSD = Source1, Source2, Destination
*/
#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
PRIM_STATIC pstatus_t _name_(const _type_ *pSrc1, const _type_ *pSrc2, _type_ *pDst, INT32 len) \
pstatus_t _name_(const _type_ *pSrc1, const _type_ *pSrc2, _type_ *pDst, INT32 len) \
{ \
int shifts; \
UINT32 offBeatMask; \

View File

@ -150,7 +150,7 @@ static void set_hints(primitives_hints_t* hints)
#elif defined(_M_ARM)
static UINT32 androidNeon(void)
static UINT32 getNeonSupport(void)
{
#ifdef __ANDROID__
if (android_getCpuFamily() != ANDROID_CPU_FAMILY_ARM) return 0;
@ -164,7 +164,9 @@ static UINT32 androidNeon(void)
return PRIM_ARM_NEON_AVAILABLE;
}
}
/* else */
#elif defined(__APPLE)
/* assume NEON support on iOS devices */
return PRIM_ARM_NEON_AVAILABLE;
#endif
return 0;
}
@ -172,7 +174,7 @@ static UINT32 androidNeon(void)
static void set_hints(primitives_hints_t* hints)
{
/* ARM: TODO */
hints->arm_flags |= androidNeon();
hints->arm_flags |= getNeonSupport();
}
#else