From 343d210a75f65a748f2003bcd8a5bd387bdea120 Mon Sep 17 00:00:00 2001
From: Martin Fleisz <mfleisz@thinstuff.at>
Date: Thu, 21 Feb 2013 02:45:10 -0800
Subject: [PATCH] primitives: separating optimized functions into their own .c
 files.

This way we use certain compiler flags (like -msse3) only on files
containing optimized code. This avoids problems that occured when
using these flags compiling generic code and running it on platforms
that don't support these optimizations (i.e. NEON optimization on
ARM platforms).
---
 libfreerdp/primitives/CMakeLists.txt       |  13 +-
 libfreerdp/primitives/prim_add.c           |  34 +-
 libfreerdp/primitives/prim_add.h           |  30 ++
 libfreerdp/primitives/prim_add_opt.c       |  62 +++
 libfreerdp/primitives/prim_alphaComp.c     | 195 +------
 libfreerdp/primitives/prim_alphaComp.h     |  30 ++
 libfreerdp/primitives/prim_alphaComp_opt.c | 225 +++++++++
 libfreerdp/primitives/prim_andor.c         |  40 +-
 libfreerdp/primitives/prim_andor.h         |  31 ++
 libfreerdp/primitives/prim_andor_opt.c     |  61 +++
 libfreerdp/primitives/prim_colors.c        | 538 +-------------------
 libfreerdp/primitives/prim_colors.h        |  32 ++
 libfreerdp/primitives/prim_colors_opt.c    | 561 +++++++++++++++++++++
 libfreerdp/primitives/prim_copy.c          |   4 +-
 libfreerdp/primitives/prim_internal.h      |   9 -
 libfreerdp/primitives/prim_set.c           | 198 +-------
 libfreerdp/primitives/prim_set.h           |  34 ++
 libfreerdp/primitives/prim_set_opt.c       | 218 ++++++++
 libfreerdp/primitives/prim_shift.c         |  66 +--
 libfreerdp/primitives/prim_shift.h         |  35 ++
 libfreerdp/primitives/prim_shift_opt.c     |  79 +++
 libfreerdp/primitives/prim_sign.c          | 126 +----
 libfreerdp/primitives/prim_sign.h          |  30 ++
 libfreerdp/primitives/prim_sign_opt.c      | 149 ++++++
 libfreerdp/primitives/prim_templates.h     |   6 +-
 25 files changed, 1640 insertions(+), 1166 deletions(-)
 create mode 100644 libfreerdp/primitives/prim_add.h
 create mode 100644 libfreerdp/primitives/prim_add_opt.c
 create mode 100644 libfreerdp/primitives/prim_alphaComp.h
 create mode 100644 libfreerdp/primitives/prim_alphaComp_opt.c
 create mode 100644 libfreerdp/primitives/prim_andor.h
 create mode 100644 libfreerdp/primitives/prim_andor_opt.c
 create mode 100644 libfreerdp/primitives/prim_colors.h
 create mode 100644 libfreerdp/primitives/prim_colors_opt.c
 create mode 100644 libfreerdp/primitives/prim_set.h
 create mode 100644 libfreerdp/primitives/prim_set_opt.c
 create mode 100644 libfreerdp/primitives/prim_shift.h
 create mode 100644 libfreerdp/primitives/prim_shift_opt.c
 create mode 100644 libfreerdp/primitives/prim_sign.h
 create mode 100644 libfreerdp/primitives/prim_sign_opt.c

diff --git a/libfreerdp/primitives/CMakeLists.txt b/libfreerdp/primitives/CMakeLists.txt
index 01dce882b..e8d1a7bfb 100644
--- a/libfreerdp/primitives/CMakeLists.txt
+++ b/libfreerdp/primitives/CMakeLists.txt
@@ -28,6 +28,15 @@ set(${MODULE_PREFIX}_SRCS
 	primitives.c
 	prim_internal.h)
 
+set(${MODULE_PREFIX}_OPT_SRCS
+	prim_add_opt.c
+	prim_andor_opt.c
+	prim_alphaComp_opt.c
+	prim_colors_opt.c
+	prim_set_opt.c
+	prim_shift_opt.c
+	prim_sign_opt.c)
+
 add_definitions(-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE})
 
 ### IPP Variable debugging
@@ -63,7 +72,9 @@ if(ANDROID)
 		${ANDROID_CPU_FEATURES_PATH}/cpu-features.h)
 endif()
 
-set_property(SOURCE ${${MODULE_PREFIX}_SRCS} PROPERTY COMPILE_FLAGS ${OPTIMIZATION})
+set_property(SOURCE ${${MODULE_PREFIX}_OPT_SRCS} PROPERTY COMPILE_FLAGS ${OPTIMIZATION})
+
+set(${MODULE_PREFIX}_SRCS ${${MODULE_PREFIX}_SRCS} ${${MODULE_PREFIX}_OPT_SRCS})
 
 add_complex_library(MODULE ${MODULE_NAME} TYPE "OBJECT"
 	MONOLITHIC ${MONOLITHIC_BUILD}
diff --git a/libfreerdp/primitives/prim_add.c b/libfreerdp/primitives/prim_add.c
index 17e504201..258bcc6ea 100644
--- a/libfreerdp/primitives/prim_add.c
+++ b/libfreerdp/primitives/prim_add.c
@@ -18,27 +18,16 @@
 #include "config.h"
 #endif
 
-#include <string.h>
-
 #include <freerdp/types.h>
 #include <freerdp/primitives.h>
 
-#ifdef WITH_SSE2
-#include <emmintrin.h>
-#include <pmmintrin.h>
-#endif /* WITH_SSE2 */
-
-#ifdef WITH_IPP
-#include <ipps.h>
-#endif /* WITH_IPP */
-
 #include "prim_internal.h"
-#include "prim_templates.h"
+#include "prim_add.h"
 
 /* ----------------------------------------------------------------------------
  * 16-bit signed add with saturation (under and over).
  */
-PRIM_STATIC pstatus_t general_add_16s(
+pstatus_t general_add_16s(
 	const INT16 *pSrc1,
 	const INT16 *pSrc2,
 	INT16 *pDst,
@@ -55,29 +44,14 @@ PRIM_STATIC pstatus_t general_add_16s(
 	return PRIMITIVES_SUCCESS;
 }
 
-#ifdef WITH_SSE2
-# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
-/* ------------------------------------------------------------------------- */
-SSE3_SSD_ROUTINE(sse3_add_16s, INT16, general_add_16s,
-	_mm_adds_epi16, general_add_16s(sptr1++, sptr2++, dptr++, 1))
-# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
-#endif
-
 /* ------------------------------------------------------------------------- */
 void primitives_init_add(
 	const primitives_hints_t *hints,
 	primitives_t *prims)
 {
 	prims->add_16s = general_add_16s;
-#ifdef WITH_IPP
-	prims->add_16s = (__add_16s_t) ippsAdd_16s;
-#elif defined(WITH_SSE2)
-	if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
-			&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE))	/* for LDDQU */
-	{
-		prims->add_16s = sse3_add_16s;
-	}
-#endif
+
+	primitives_init_add_opt(hints, prims);
 }
 
 /* ------------------------------------------------------------------------- */
diff --git a/libfreerdp/primitives/prim_add.h b/libfreerdp/primitives/prim_add.h
new file mode 100644
index 000000000..4ad460279
--- /dev/null
+++ b/libfreerdp/primitives/prim_add.h
@@ -0,0 +1,30 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Add operations.
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.  Algorithms used by
+ * this code may be covered by patents by HP, Microsoft, or other parties.
+ *
+ */
+
+#ifdef __GNUC__
+# pragma once
+#endif
+
+#ifndef __PRIM_ADD_H_INCLUDED__
+#define __PRIM_ADD_H_INCLUDED__
+
+pstatus_t general_add_16s(const INT16 *pSrc1, const INT16 *pSrc2, INT16 *pDst, INT32 len);
+
+void primitives_init_add_opt(const primitives_hints_t *hints, primitives_t *prims);
+
+#endif /* !__PRIM_ADD_H_INCLUDED__ */
+
diff --git a/libfreerdp/primitives/prim_add_opt.c b/libfreerdp/primitives/prim_add_opt.c
new file mode 100644
index 000000000..2de0b8fc6
--- /dev/null
+++ b/libfreerdp/primitives/prim_add_opt.c
@@ -0,0 +1,62 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Optimized add operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ * 
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+
+#ifdef WITH_SSE2
+#include <emmintrin.h>
+#include <pmmintrin.h>
+#endif /* WITH_SSE2 */
+
+#ifdef WITH_IPP
+#include <ipps.h>
+#endif /* WITH_IPP */
+
+#include "prim_internal.h"
+#include "prim_templates.h"
+#include "prim_add.h"
+
+
+#ifdef WITH_SSE2
+# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
+/* ------------------------------------------------------------------------- */
+SSE3_SSD_ROUTINE(sse3_add_16s, INT16, general_add_16s,
+	_mm_adds_epi16, general_add_16s(sptr1++, sptr2++, dptr++, 1))
+# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
+#endif
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_add_opt(
+	const primitives_hints_t *hints,
+	primitives_t *prims)
+{
+#ifdef WITH_IPP
+	prims->add_16s = (__add_16s_t) ippsAdd_16s;
+#elif defined(WITH_SSE2)
+	if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
+			&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE))	/* for LDDQU */
+	{
+		prims->add_16s = sse3_add_16s;
+	}
+#endif
+}
+
+
diff --git a/libfreerdp/primitives/prim_alphaComp.c b/libfreerdp/primitives/prim_alphaComp.c
index 7f1a19093..24f916770 100644
--- a/libfreerdp/primitives/prim_alphaComp.c
+++ b/libfreerdp/primitives/prim_alphaComp.c
@@ -24,21 +24,11 @@
 #include "config.h"
 #endif
 
-#include <string.h>
-
 #include <freerdp/types.h>
 #include <freerdp/primitives.h>
 
 #include "prim_internal.h"
-
-#ifdef WITH_SSE2
-#include <emmintrin.h>
-#include <pmmintrin.h>
-#endif /* WITH_SSE2 */
-
-#ifdef WITH_IPP
-#include <ippi.h>
-#endif /* WITH_IPP */
+#include "prim_alphaComp.h"
 
 #define ALPHA(_k_)	(((_k_) & 0xFF000000U) >> 24)
 #define RED(_k_)	(((_k_) & 0x00FF0000U) >> 16)
@@ -46,7 +36,7 @@
 #define BLU(_k_)	(((_k_) & 0x000000FFU))
 
 /* ------------------------------------------------------------------------- */
-PRIM_STATIC pstatus_t general_alphaComp_argb(
+pstatus_t general_alphaComp_argb(
 	const BYTE *pSrc1,  INT32 src1Step,
 	const BYTE *pSrc2,  INT32 src2Step,
 	BYTE *pDst,  INT32 dstStep,
@@ -111,188 +101,12 @@ PRIM_STATIC pstatus_t general_alphaComp_argb(
 	return PRIMITIVES_SUCCESS;
 }
 
-/* ------------------------------------------------------------------------- */
-#ifdef WITH_SSE2
-#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
-
-PRIM_STATIC pstatus_t sse2_alphaComp_argb(
-	const BYTE *pSrc1,  INT32 src1Step,
-	const BYTE *pSrc2,  INT32 src2Step,
-	BYTE *pDst,  INT32 dstStep,
-	INT32 width,  INT32 height)
-{
-	const UINT32 *sptr1 = (const UINT32 *) pSrc1;
-	const UINT32 *sptr2 = (const UINT32 *) pSrc2;
-	UINT32 *dptr;
-	int linebytes, src1Jump, src2Jump, dstJump, y;
-	__m128i xmm0, xmm1;
-
-	if ((width <= 0) || (height <= 0)) return PRIMITIVES_SUCCESS;
-
-	if (width < 4)     /* pointless if too small */
-	{
-		return general_alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step,
-			pDst, dstStep, width, height);
-	}
-	dptr = (UINT32 *) pDst;
-	linebytes = width * sizeof(UINT32);
-	src1Jump = (src1Step - linebytes) / sizeof(UINT32);
-	src2Jump = (src2Step - linebytes) / sizeof(UINT32);
-	dstJump  = (dstStep  - linebytes) / sizeof(UINT32);
-
-	xmm0 = _mm_set1_epi32(0);
-	xmm1 = _mm_set1_epi16(1);
-
-	for (y=0; y<height; ++y)
-	{
-		int pixels = width;
-		int count;
-
-		/* Get to the 16-byte boundary now. */
-		int leadIn = 0;
-		switch ((ULONG_PTR) dptr & 0x0f)
-		{
-			case 0:
-				leadIn = 0;
-				break;
-			case 4:
-				leadIn = 3;
-				break;
-			case 8:
-				leadIn = 2;
-				break;
-			case 12:
-				leadIn = 1;
-				break;
-			default:
-				/* We'll never hit a 16-byte boundary, so do the whole
-				 * thing the slow way.
-				 */
-				leadIn = width;
-				break;
-		}
-		if (leadIn)
-		{
-			general_alphaComp_argb((const BYTE *) sptr1,
-				src1Step, (const BYTE *) sptr2, src2Step,
-				(BYTE *) dptr, dstStep, leadIn, 1);
-			sptr1 += leadIn;
-			sptr2 += leadIn;
-			dptr  += leadIn;
-			pixels -= leadIn;
-		}
-
-		/* Use SSE registers to do 4 pixels at a time. */
-		count = pixels >> 2;
-		pixels -= count << 2;
-		while (count--)
-		{
-			__m128i xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-			/* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */
-			xmm2 = LOAD_SI128(sptr1); sptr1 += 4;
-			/* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */
-			xmm3 = LOAD_SI128(sptr2); sptr2 += 4;
-			/* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */
-			xmm4 = _mm_unpackhi_epi8(xmm2, xmm0);
-			/* 00Bf00Gf00Bf00Af00Be00Ge00Re00Ae */
-			xmm5 = _mm_unpackhi_epi8(xmm3, xmm0);
-			/* subtract */
-			xmm6 = _mm_subs_epi16(xmm4, xmm5);
-			/* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */
-			xmm4 = _mm_shufflelo_epi16(xmm4, 0xff);
-			/* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */
-			xmm4 = _mm_shufflehi_epi16(xmm4, 0xff);
-			/* Add one to alphas */
-			xmm4 = _mm_adds_epi16(xmm4, xmm1);
-			/* Multiply and take low word */
-			xmm4 = _mm_mullo_epi16(xmm4, xmm6);
-			/* Shift 8 right */
-			xmm4 = _mm_srai_epi16(xmm4, 8);
-			/* Add xmm5 */
-			xmm4 = _mm_adds_epi16(xmm4, xmm5);
-			/* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */
-
-			/* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */
-			xmm5 = _mm_unpacklo_epi8(xmm2, xmm0);
-			/* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */
-			xmm6 = _mm_unpacklo_epi8(xmm3, xmm0);
-			/* subtract */
-			xmm7 = _mm_subs_epi16(xmm5, xmm6);
-			/* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */
-			xmm5 = _mm_shufflelo_epi16(xmm5, 0xff);
-			/* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */
-			xmm5 = _mm_shufflehi_epi16(xmm5, 0xff);
-			/* Add one to alphas */
-			xmm5 = _mm_adds_epi16(xmm5, xmm1);
-			/* Multiply and take low word */
-			xmm5 = _mm_mullo_epi16(xmm5, xmm7);
-			/* Shift 8 right */
-			xmm5 = _mm_srai_epi16(xmm5, 8);
-			/* Add xmm6 */
-			xmm5 = _mm_adds_epi16(xmm5, xmm6);
-			/* 00Bl00Gl00Rl00Al00Bk00Gk00Rk0ABk */
-
-			/* Must mask off remainders or pack gets confused */
-			xmm3 = _mm_set1_epi16(0x00ffU);
-			xmm4 = _mm_and_si128(xmm4, xmm3);
-			xmm5 = _mm_and_si128(xmm5, xmm3);
-
-			/* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */
-			xmm5 = _mm_packus_epi16(xmm5, xmm4);
-			_mm_store_si128((__m128i *) dptr, xmm5);  dptr += 4;
-		}
-
-		/* Finish off the remainder. */
-		if (pixels)
-		{
-			general_alphaComp_argb((const BYTE *) sptr1, src1Step,
-				(const BYTE *) sptr2, src2Step,
-				(BYTE *) dptr, dstStep, pixels, 1);
-			sptr1 += pixels;
-			sptr2 += pixels;
-			dptr  += pixels;
-		}
-
-		/* Jump to next row. */
-		sptr1 += src1Jump;
-		sptr2 += src2Jump;
-		dptr  += dstJump;
-	}
-
-	return PRIMITIVES_SUCCESS;
-}
-#endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
-#endif
-
-#ifdef WITH_IPP
-/* ------------------------------------------------------------------------- */
-PRIM_STATIC pstatus_t ipp_alphaComp_argb(
-	const BYTE *pSrc1,  INT32 src1Step,
-	const BYTE *pSrc2,  INT32 src2Step,
-	BYTE *pDst,  INT32 dstStep,
-	INT32 width,  INT32 height)
-{
-	IppiSize sz;
-	sz.width  = width;
-	sz.height = height;
-	return ippiAlphaComp_8u_AC4R(pSrc1, src1Step, pSrc2, src2Step,
-		pDst, dstStep, sz, ippAlphaOver);
-}
-#endif
-
 /* ------------------------------------------------------------------------- */
 void primitives_init_alphaComp(const primitives_hints_t* hints, primitives_t* prims)
 {
 	prims->alphaComp_argb = general_alphaComp_argb;
-#ifdef WITH_IPP
-	prims->alphaComp_argb = ipp_alphaComp_argb;
-#elif defined(WITH_SSE2)
-	if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
-			&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE))	/* for LDDQU */
-	{
-		prims->alphaComp_argb = sse2_alphaComp_argb;
-	}
-#endif
+
+	primitives_init_alphaComp_opt(hints, prims);
 }
 
 /* ------------------------------------------------------------------------- */
@@ -300,3 +114,4 @@ void primitives_deinit_alphaComp(primitives_t *prims)
 {
 	/* Nothing to do. */
 }
+
diff --git a/libfreerdp/primitives/prim_alphaComp.h b/libfreerdp/primitives/prim_alphaComp.h
new file mode 100644
index 000000000..50591162d
--- /dev/null
+++ b/libfreerdp/primitives/prim_alphaComp.h
@@ -0,0 +1,30 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Alpha blending routines.
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.  Algorithms used by
+ * this code may be covered by patents by HP, Microsoft, or other parties.
+ *
+ */
+
+#ifdef __GNUC__
+# pragma once
+#endif
+
+#ifndef __PRIM_ALPHACOMP_H_INCLUDED__
+#define __PRIM_ALPHACOMP_H_INCLUDED__
+
+pstatus_t general_alphaComp_argb(const BYTE *pSrc1, INT32 src1Step, const BYTE *pSrc2, INT32 src2Step, BYTE *pDst, INT32 dstStep, INT32 width, INT32 height);
+
+void primitives_init_alphaComp_opt(const primitives_hints_t* hints, primitives_t* prims);
+
+#endif /* !__PRIM_ALPHACOMP_H_INCLUDED__ */
+
diff --git a/libfreerdp/primitives/prim_alphaComp_opt.c b/libfreerdp/primitives/prim_alphaComp_opt.c
new file mode 100644
index 000000000..5550fcbc1
--- /dev/null
+++ b/libfreerdp/primitives/prim_alphaComp_opt.c
@@ -0,0 +1,225 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Optimized alpha blending routines.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ *
+ * Note: this code assumes the second operand is fully opaque,
+ * e.g.
+ *   newval = alpha1*val1 + (1-alpha1)*val2
+ * rather than
+ *   newval = alpha1*val1 + (1-alpha1)*alpha2*val2
+ * The IPP gives other options.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+
+#ifdef WITH_SSE2
+#include <emmintrin.h>
+#include <pmmintrin.h>
+#endif /* WITH_SSE2 */
+
+#ifdef WITH_IPP
+#include <ippi.h>
+#endif /* WITH_IPP */
+
+#include "prim_internal.h"
+#include "prim_alphaComp.h"
+
+
+/* ------------------------------------------------------------------------- */
+#ifdef WITH_SSE2
+#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
+
+pstatus_t sse2_alphaComp_argb(
+	const BYTE *pSrc1,  INT32 src1Step,
+	const BYTE *pSrc2,  INT32 src2Step,
+	BYTE *pDst,  INT32 dstStep,
+	INT32 width,  INT32 height)
+{
+	const UINT32 *sptr1 = (const UINT32 *) pSrc1;
+	const UINT32 *sptr2 = (const UINT32 *) pSrc2;
+	UINT32 *dptr;
+	int linebytes, src1Jump, src2Jump, dstJump, y;
+	__m128i xmm0, xmm1;
+
+	if ((width <= 0) || (height <= 0)) return PRIMITIVES_SUCCESS;
+
+	if (width < 4)     /* pointless if too small */
+	{
+		return general_alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step,
+			pDst, dstStep, width, height);
+	}
+	dptr = (UINT32 *) pDst;
+	linebytes = width * sizeof(UINT32);
+	src1Jump = (src1Step - linebytes) / sizeof(UINT32);
+	src2Jump = (src2Step - linebytes) / sizeof(UINT32);
+	dstJump  = (dstStep  - linebytes) / sizeof(UINT32);
+
+	xmm0 = _mm_set1_epi32(0);
+	xmm1 = _mm_set1_epi16(1);
+
+	for (y=0; y<height; ++y)
+	{
+		int pixels = width;
+		int count;
+
+		/* Get to the 16-byte boundary now. */
+		int leadIn = 0;
+		switch ((ULONG_PTR) dptr & 0x0f)
+		{
+			case 0:
+				leadIn = 0;
+				break;
+			case 4:
+				leadIn = 3;
+				break;
+			case 8:
+				leadIn = 2;
+				break;
+			case 12:
+				leadIn = 1;
+				break;
+			default:
+				/* We'll never hit a 16-byte boundary, so do the whole
+				 * thing the slow way.
+				 */
+				leadIn = width;
+				break;
+		}
+		if (leadIn)
+		{
+			general_alphaComp_argb((const BYTE *) sptr1,
+				src1Step, (const BYTE *) sptr2, src2Step,
+				(BYTE *) dptr, dstStep, leadIn, 1);
+			sptr1 += leadIn;
+			sptr2 += leadIn;
+			dptr  += leadIn;
+			pixels -= leadIn;
+		}
+
+		/* Use SSE registers to do 4 pixels at a time. */
+		count = pixels >> 2;
+		pixels -= count << 2;
+		while (count--)
+		{
+			__m128i xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+			/* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */
+			xmm2 = LOAD_SI128(sptr1); sptr1 += 4;
+			/* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */
+			xmm3 = LOAD_SI128(sptr2); sptr2 += 4;
+			/* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */
+			xmm4 = _mm_unpackhi_epi8(xmm2, xmm0);
+			/* 00Bf00Gf00Bf00Af00Be00Ge00Re00Ae */
+			xmm5 = _mm_unpackhi_epi8(xmm3, xmm0);
+			/* subtract */
+			xmm6 = _mm_subs_epi16(xmm4, xmm5);
+			/* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */
+			xmm4 = _mm_shufflelo_epi16(xmm4, 0xff);
+			/* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */
+			xmm4 = _mm_shufflehi_epi16(xmm4, 0xff);
+			/* Add one to alphas */
+			xmm4 = _mm_adds_epi16(xmm4, xmm1);
+			/* Multiply and take low word */
+			xmm4 = _mm_mullo_epi16(xmm4, xmm6);
+			/* Shift 8 right */
+			xmm4 = _mm_srai_epi16(xmm4, 8);
+			/* Add xmm5 */
+			xmm4 = _mm_adds_epi16(xmm4, xmm5);
+			/* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */
+
+			/* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */
+			xmm5 = _mm_unpacklo_epi8(xmm2, xmm0);
+			/* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */
+			xmm6 = _mm_unpacklo_epi8(xmm3, xmm0);
+			/* subtract */
+			xmm7 = _mm_subs_epi16(xmm5, xmm6);
+			/* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */
+			xmm5 = _mm_shufflelo_epi16(xmm5, 0xff);
+			/* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */
+			xmm5 = _mm_shufflehi_epi16(xmm5, 0xff);
+			/* Add one to alphas */
+			xmm5 = _mm_adds_epi16(xmm5, xmm1);
+			/* Multiply and take low word */
+			xmm5 = _mm_mullo_epi16(xmm5, xmm7);
+			/* Shift 8 right */
+			xmm5 = _mm_srai_epi16(xmm5, 8);
+			/* Add xmm6 */
+			xmm5 = _mm_adds_epi16(xmm5, xmm6);
+			/* 00Bl00Gl00Rl00Al00Bk00Gk00Rk0ABk */
+
+			/* Must mask off remainders or pack gets confused */
+			xmm3 = _mm_set1_epi16(0x00ffU);
+			xmm4 = _mm_and_si128(xmm4, xmm3);
+			xmm5 = _mm_and_si128(xmm5, xmm3);
+
+			/* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */
+			xmm5 = _mm_packus_epi16(xmm5, xmm4);
+			_mm_store_si128((__m128i *) dptr, xmm5);  dptr += 4;
+		}
+
+		/* Finish off the remainder. */
+		if (pixels)
+		{
+			general_alphaComp_argb((const BYTE *) sptr1, src1Step,
+				(const BYTE *) sptr2, src2Step,
+				(BYTE *) dptr, dstStep, pixels, 1);
+			sptr1 += pixels;
+			sptr2 += pixels;
+			dptr  += pixels;
+		}
+
+		/* Jump to next row. */
+		sptr1 += src1Jump;
+		sptr2 += src2Jump;
+		dptr  += dstJump;
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+#endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
+#endif
+
+#ifdef WITH_IPP
+/* ------------------------------------------------------------------------- */
+pstatus_t ipp_alphaComp_argb(
+	const BYTE *pSrc1,  INT32 src1Step,
+	const BYTE *pSrc2,  INT32 src2Step,
+	BYTE *pDst,  INT32 dstStep,
+	INT32 width,  INT32 height)
+{
+	IppiSize sz;
+	sz.width  = width;
+	sz.height = height;
+	return ippiAlphaComp_8u_AC4R(pSrc1, src1Step, pSrc2, src2Step,
+		pDst, dstStep, sz, ippAlphaOver);
+}
+#endif
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_alphaComp_opt(const primitives_hints_t* hints, primitives_t* prims)
+{
+#ifdef WITH_IPP
+	prims->alphaComp_argb = ipp_alphaComp_argb;
+#elif defined(WITH_SSE2)
+	if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
+			&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE))	/* for LDDQU */
+	{
+		prims->alphaComp_argb = sse2_alphaComp_argb;
+	}
+#endif
+}
+
diff --git a/libfreerdp/primitives/prim_andor.c b/libfreerdp/primitives/prim_andor.c
index 358d6c823..0b8092ff2 100644
--- a/libfreerdp/primitives/prim_andor.c
+++ b/libfreerdp/primitives/prim_andor.c
@@ -17,27 +17,16 @@
 #include "config.h"
 #endif
 
-#include <string.h>
-
 #include <freerdp/types.h>
 #include <freerdp/primitives.h>
 
-#ifdef WITH_SSE2
-#include <emmintrin.h>
-#include <pmmintrin.h>
-#endif /* WITH_SSE2 */
-
-#ifdef WITH_IPP
-#include <ipps.h>
-#endif /* WITH_IPP */
-
 #include "prim_internal.h"
-#include "prim_templates.h"
+#include "prim_andor.h"
 
 /* ----------------------------------------------------------------------------
  * 32-bit AND with a constant.
  */
-PRIM_STATIC pstatus_t general_andC_32u(
+pstatus_t general_andC_32u(
 	const UINT32 *pSrc,
 	UINT32 val,
 	UINT32 *pDst,
@@ -55,7 +44,7 @@ PRIM_STATIC pstatus_t general_andC_32u(
 /* ----------------------------------------------------------------------------
  * 32-bit OR with a constant.
  */
-PRIM_STATIC pstatus_t general_orC_32u(
+pstatus_t general_orC_32u(
 	const UINT32 *pSrc,
 	UINT32 val,
 	UINT32 *pDst,
@@ -70,16 +59,6 @@ PRIM_STATIC pstatus_t general_orC_32u(
 	return PRIMITIVES_SUCCESS;
 }
 
-#ifdef WITH_SSE2
-# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
-/* ------------------------------------------------------------------------- */
-SSE3_SCD_PRE_ROUTINE(sse3_andC_32u, UINT32, general_andC_32u,
-	_mm_and_si128, *dptr++ = *sptr++ & val)
-SSE3_SCD_PRE_ROUTINE(sse3_orC_32u, UINT32, general_orC_32u,
-	_mm_or_si128, *dptr++ = *sptr++ | val)
-# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
-#endif
-
 /* ------------------------------------------------------------------------- */
 void primitives_init_andor(
 	const primitives_hints_t *hints,
@@ -89,17 +68,7 @@ void primitives_init_andor(
 	prims->andC_32u = general_andC_32u;
 	prims->orC_32u  = general_orC_32u;
 
-#if defined(WITH_IPP)
-	prims->andC_32u = (__andC_32u_t) ippsAndC_32u;
-	prims->orC_32u  = (__orC_32u_t) ippsOrC_32u;
-#elif defined(WITH_SSE2)
-	if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
-			&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE))
-	{
-		prims->andC_32u = sse3_andC_32u;
-		prims->orC_32u  = sse3_orC_32u;
-	}
-#endif
+	primitives_init_andor_opt(hints, prims);
 }
 
 /* ------------------------------------------------------------------------- */
@@ -108,3 +77,4 @@ void primitives_deinit_andor(
 {
 	/* Nothing to do. */
 }
+
diff --git a/libfreerdp/primitives/prim_andor.h b/libfreerdp/primitives/prim_andor.h
new file mode 100644
index 000000000..6a2e7ac46
--- /dev/null
+++ b/libfreerdp/primitives/prim_andor.h
@@ -0,0 +1,31 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Logical operations.
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.  Algorithms used by
+ * this code may be covered by patents by HP, Microsoft, or other parties.
+ *
+ */
+
+#ifdef __GNUC__
+# pragma once
+#endif
+
+#ifndef __PRIM_ANDOR_H_INCLUDED__
+#define __PRIM_ANDOR_H_INCLUDED__
+
+pstatus_t general_andC_32u(const UINT32 *pSrc, UINT32 val, UINT32 *pDst, INT32 len);
+pstatus_t general_orC_32u(const UINT32 *pSrc, UINT32 val, UINT32 *pDst, INT32 len);
+
+void primitives_init_andor_opt(const primitives_hints_t *hints,	primitives_t *prims);
+
+#endif /* !__PRIM_ANDOR_H_INCLUDED__ */
+
diff --git a/libfreerdp/primitives/prim_andor_opt.c b/libfreerdp/primitives/prim_andor_opt.c
new file mode 100644
index 000000000..1a7ebd2a0
--- /dev/null
+++ b/libfreerdp/primitives/prim_andor_opt.c
@@ -0,0 +1,61 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Optimized Logical operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+
+#ifdef WITH_SSE2
+#include <emmintrin.h>
+#include <pmmintrin.h>
+#endif /* WITH_SSE2 */
+
+#ifdef WITH_IPP
+#include <ipps.h>
+#endif /* WITH_IPP */
+
+#include "prim_internal.h"
+#include "prim_templates.h"
+#include "prim_andor.h"
+
+#ifdef WITH_SSE2
+# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
+/* ------------------------------------------------------------------------- */
+SSE3_SCD_PRE_ROUTINE(sse3_andC_32u, UINT32, general_andC_32u,
+	_mm_and_si128, *dptr++ = *sptr++ & val)
+SSE3_SCD_PRE_ROUTINE(sse3_orC_32u, UINT32, general_orC_32u,
+	_mm_or_si128, *dptr++ = *sptr++ | val)
+# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
+#endif
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_andor_opt(const primitives_hints_t *hints,	primitives_t *prims)
+{
+#if defined(WITH_IPP)
+	prims->andC_32u = (__andC_32u_t) ippsAndC_32u;
+	prims->orC_32u  = (__orC_32u_t) ippsOrC_32u;
+#elif defined(WITH_SSE2)
+	if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
+			&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE))
+	{
+		prims->andC_32u = sse3_andC_32u;
+		prims->orC_32u  = sse3_orC_32u;
+	}
+#endif
+}
+
diff --git a/libfreerdp/primitives/prim_colors.c b/libfreerdp/primitives/prim_colors.c
index 865f4767c..179e569b7 100644
--- a/libfreerdp/primitives/prim_colors.c
+++ b/libfreerdp/primitives/prim_colors.c
@@ -21,16 +21,11 @@
 #include "config.h"
 #endif
 
-#include <string.h>
 #include <freerdp/types.h>
 #include <freerdp/primitives.h>
-#ifdef WITH_SSE2
-#include <emmintrin.h>
-#elif defined(WITH_NEON)
-#include <arm_neon.h>
-#endif /* WITH_SSE2 else WITH_NEON */
+
 #include "prim_internal.h"
-#include "prim_templates.h"
+#include "prim_colors.h"
 
 #ifndef MINMAX
 #define MINMAX(_v_, _l_, _h_) \
@@ -38,7 +33,7 @@
 #endif /* !MINMAX */
 
 /* ------------------------------------------------------------------------- */
-PRIM_STATIC pstatus_t general_yCbCrToRGB_16s16s_P3P3(
+pstatus_t general_yCbCrToRGB_16s16s_P3P3(
 	const INT16 *pSrc[3],  INT32 srcStep,
 	INT16 *pDst[3],  INT32 dstStep,
 	const prim_size_t *roi)	/* region of interest */
@@ -119,7 +114,7 @@ PRIM_STATIC pstatus_t general_yCbCrToRGB_16s16s_P3P3(
 }
 
 /* ------------------------------------------------------------------------- */
-PRIM_STATIC pstatus_t general_RGBToYCbCr_16s16s_P3P3(
+pstatus_t general_RGBToYCbCr_16s16s_P3P3(
 	const INT16 *pSrc[3],  INT32 srcStep,
 	INT16 *pDst[3],  INT32 dstStep,
 	const prim_size_t *roi)	/* region of interest */
@@ -187,7 +182,7 @@ PRIM_STATIC pstatus_t general_RGBToYCbCr_16s16s_P3P3(
 }
 
 /* ------------------------------------------------------------------------- */
-PRIM_STATIC pstatus_t general_RGBToRGB_16s8u_P3AC4R(
+pstatus_t general_RGBToRGB_16s8u_P3AC4R(
 	const INT16 *pSrc[3],	/* 16-bit R,G, and B arrays */
 	int srcStep,			/* bytes between rows in source data */
 	BYTE *pDst,			/* 32-bit interleaved ARGB (ABGR?) data */
@@ -219,514 +214,6 @@ PRIM_STATIC pstatus_t general_RGBToRGB_16s8u_P3AC4R(
 	return PRIMITIVES_SUCCESS;
 }
 
-
-#ifdef WITH_SSE2
-
-#ifdef __GNUC__
-# define GNU_INLINE \
-	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
-#else
-# define GNU_INLINE
-#endif
-
-#define CACHE_LINE_BYTES	64
-
-#define _mm_between_epi16(_val, _min, _max) \
-	do { _val = _mm_min_epi16(_max, _mm_max_epi16(_val, _min)); } while (0)
-
-#ifdef DO_PREFETCH
-/*---------------------------------------------------------------------------*/
-static inline void GNU_INLINE _mm_prefetch_buffer(
-	char * buffer, 
-	int num_bytes)
-{
-	__m128i * buf = (__m128i*) buffer;
-	unsigned int i;
-	for (i = 0; i < (num_bytes / sizeof(__m128i)); 
-		i+=(CACHE_LINE_BYTES / sizeof(__m128i)))
-	{
-		_mm_prefetch((char*)(&buf[i]), _MM_HINT_NTA);
-	}
-}
-#endif /* DO_PREFETCH */
-
-/*---------------------------------------------------------------------------*/
-PRIM_STATIC pstatus_t sse2_yCbCrToRGB_16s16s_P3P3(
-	const INT16 *pSrc[3],
-	int srcStep,
-	INT16 *pDst[3],
-	int dstStep,
-	const prim_size_t *roi)	/* region of interest */
-{
-	__m128i zero, max, r_cr, g_cb, g_cr, b_cb, c4096;
-	__m128i *y_buf, *cb_buf, *cr_buf, *r_buf, *g_buf, *b_buf;
-	int srcbump, dstbump, yp, imax;
-
-	if (((ULONG_PTR) (pSrc[0]) & 0x0f)
-			|| ((ULONG_PTR) (pSrc[1]) & 0x0f)
-			|| ((ULONG_PTR) (pSrc[2]) & 0x0f)
-			|| ((ULONG_PTR) (pDst[0]) & 0x0f)
-			|| ((ULONG_PTR) (pDst[1]) & 0x0f)
-			|| ((ULONG_PTR) (pDst[2]) & 0x0f)
-			|| (roi->width & 0x07)
-			|| (srcStep & 127)
-			|| (dstStep & 127))
-	{
-		/* We can't maintain 16-byte alignment. */
-		return general_yCbCrToRGB_16s16s_P3P3(pSrc, srcStep,
-			pDst, dstStep, roi);
-	}
-
-	zero = _mm_setzero_si128();
-	max = _mm_set1_epi16(255);
-
-	y_buf  = (__m128i*) (pSrc[0]);
-	cb_buf = (__m128i*) (pSrc[1]);
-	cr_buf = (__m128i*) (pSrc[2]);
-	r_buf  = (__m128i*) (pDst[0]);
-	g_buf  = (__m128i*) (pDst[1]);
-	b_buf  = (__m128i*) (pDst[2]);
-
-	r_cr = _mm_set1_epi16(22986);	/*  1.403 << 14 */
-	g_cb = _mm_set1_epi16(-5636);	/* -0.344 << 14 */
-	g_cr = _mm_set1_epi16(-11698);	/* -0.714 << 14 */
-	b_cb = _mm_set1_epi16(28999);	/*  1.770 << 14 */
-	c4096 = _mm_set1_epi16(4096);
-	srcbump = srcStep / sizeof(__m128i);
-	dstbump = dstStep / sizeof(__m128i);
-
-#ifdef DO_PREFETCH
-	/* Prefetch Y's, Cb's, and Cr's. */
-	for (yp=0; yp<roi->height; yp++)
-	{
-		int i;
-		for (i=0; i<roi->width * sizeof(INT16) / sizeof(__m128i);
-			i += (CACHE_LINE_BYTES / sizeof(__m128i)))
-		{
-			_mm_prefetch((char*)(&y_buf[i]),  _MM_HINT_NTA);
-			_mm_prefetch((char*)(&cb_buf[i]), _MM_HINT_NTA);
-			_mm_prefetch((char*)(&cr_buf[i]), _MM_HINT_NTA);
-		}
-		y_buf  += srcbump;
-		cb_buf += srcbump;
-		cr_buf += srcbump;
-	}
-	y_buf  = (__m128i*) (pSrc[0]);
-	cb_buf = (__m128i*) (pSrc[1]);
-	cr_buf = (__m128i*) (pSrc[2]);
-#endif /* DO_PREFETCH */
-
-	imax = roi->width * sizeof(INT16) / sizeof(__m128i);
-	for (yp=0; yp<roi->height; ++yp)
-	{
-		int i;
-		for (i=0; i<imax; i++)
-		{
-			/* In order to use SSE2 signed 16-bit integer multiplication
-			 * we need to convert the floating point factors to signed int
-			 * without losing information.
-			 * The result of this multiplication is 32 bit and we have two
-			 * SSE instructions that return either the hi or lo word.
-			 * Thus we will multiply the factors by the highest possible 2^n,
-			 * take the upper 16 bits of the signed 32-bit result
-			 * (_mm_mulhi_epi16) and correct this result by multiplying
-			 * it by 2^(16-n).
-			 *
-			 * For the given factors in the conversion matrix the best
-			 * possible n is 14.
-			 *
-			 * Example for calculating r:
-			 * r = (y>>5) + 128 + (cr*1.403)>>5             // our base formula
-			 * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5   // see above
-			 * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5     // simplification
-			 * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
-			 */
-
-			/* y = (y_r_buf[i] + 4096) >> 2 */
-			__m128i y, cb, cr, r, g, b;
-			y = _mm_load_si128(y_buf + i);
-			y = _mm_add_epi16(y, c4096);
-			y = _mm_srai_epi16(y, 2);
-			/* cb = cb_g_buf[i]; */
-			cb = _mm_load_si128(cb_buf + i);
-			/* cr = cr_b_buf[i]; */
-			cr = _mm_load_si128(cr_buf + i);
-
-			/* (y + HIWORD(cr*22986)) >> 3 */
-			r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
-			r = _mm_srai_epi16(r, 3);
-
-			/* r_buf[i] = MINMAX(r, 0, 255); */
-			_mm_between_epi16(r, zero, max);
-			_mm_store_si128(r_buf + i, r);
-
-			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
-			g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
-			g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
-			g = _mm_srai_epi16(g, 3);
-
-			/* g_buf[i] = MINMAX(g, 0, 255); */
-			_mm_between_epi16(g, zero, max);
-			_mm_store_si128(g_buf + i, g);
-
-			/* (y + HIWORD(cb*28999)) >> 3 */
-			b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
-			b = _mm_srai_epi16(b, 3);
-			/* b_buf[i] = MINMAX(b, 0, 255); */
-			_mm_between_epi16(b, zero, max);
-			_mm_store_si128(b_buf + i, b);
-		}
-		y_buf  += srcbump;
-		cb_buf += srcbump;
-		cr_buf += srcbump;
-		r_buf += dstbump;
-		g_buf += dstbump;
-		b_buf += dstbump;
-	}
-
-	return PRIMITIVES_SUCCESS;
-}
-
-/*---------------------------------------------------------------------------*/
-/* The encodec YCbCr coeffectients are represented as 11.5 fixed-point
- * numbers. See the general code above.
- */
-PRIM_STATIC pstatus_t sse2_RGBToYCbCr_16s16s_P3P3(
-	const INT16 *pSrc[3],
-	int srcStep,
-	INT16 *pDst[3],
-	int dstStep,
-	const prim_size_t *roi)	/* region of interest */
-{
-	__m128i min, max, y_r, y_g, y_b, cb_r, cb_g, cb_b, cr_r, cr_g, cr_b;
-	__m128i *r_buf, *g_buf, *b_buf, *y_buf, *cb_buf, *cr_buf;
-	int srcbump, dstbump, yp, imax;
-
-	if (((ULONG_PTR) (pSrc[0]) & 0x0f)
-			|| ((ULONG_PTR) (pSrc[1]) & 0x0f)
-			|| ((ULONG_PTR) (pSrc[2]) & 0x0f)
-			|| ((ULONG_PTR) (pDst[0]) & 0x0f)
-			|| ((ULONG_PTR) (pDst[1]) & 0x0f)
-			|| ((ULONG_PTR) (pDst[2]) & 0x0f)
-			|| (roi->width & 0x07)
-			|| (srcStep & 127)
-			|| (dstStep & 127))
-	{
-		/* We can't maintain 16-byte alignment. */
-		return general_RGBToYCbCr_16s16s_P3P3(pSrc, srcStep,
-			pDst, dstStep, roi);
-	}
-
-	min = _mm_set1_epi16(-128 << 5);
-	max = _mm_set1_epi16(127 << 5);
-
-	r_buf  = (__m128i*) (pSrc[0]);
-	g_buf  = (__m128i*) (pSrc[1]);
-	b_buf  = (__m128i*) (pSrc[2]);
-	y_buf  = (__m128i*) (pDst[0]);
-	cb_buf = (__m128i*) (pDst[1]);
-	cr_buf = (__m128i*) (pDst[2]);
-
-	y_r  = _mm_set1_epi16(9798);   /*  0.299000 << 15 */
-	y_g  = _mm_set1_epi16(19235);  /*  0.587000 << 15 */
-	y_b  = _mm_set1_epi16(3735);   /*  0.114000 << 15 */
-	cb_r = _mm_set1_epi16(-5535);  /* -0.168935 << 15 */
-	cb_g = _mm_set1_epi16(-10868); /* -0.331665 << 15 */
-	cb_b = _mm_set1_epi16(16403);  /*  0.500590 << 15 */
-	cr_r = _mm_set1_epi16(16377);  /*  0.499813 << 15 */
-	cr_g = _mm_set1_epi16(-13714); /* -0.418531 << 15 */
-	cr_b = _mm_set1_epi16(-2663);  /* -0.081282 << 15 */
-
-	srcbump = srcStep / sizeof(__m128i);
-	dstbump = dstStep / sizeof(__m128i);
-
-#ifdef DO_PREFETCH
-	/* Prefetch RGB's. */
-	for (yp=0; yp<roi->height; yp++)
-	{
-		int i;
-		for (i=0; i<roi->width * sizeof(INT16) / sizeof(__m128i);
-			i += (CACHE_LINE_BYTES / sizeof(__m128i)))
-		{
-			_mm_prefetch((char*)(&r_buf[i]), _MM_HINT_NTA);
-			_mm_prefetch((char*)(&g_buf[i]), _MM_HINT_NTA);
-			_mm_prefetch((char*)(&b_buf[i]), _MM_HINT_NTA);
-		}
-		r_buf += srcbump;
-		g_buf += srcbump;
-		b_buf += srcbump;
-	}
-	r_buf = (__m128i*) (pSrc[0]);
-	g_buf = (__m128i*) (pSrc[1]);
-	b_buf = (__m128i*) (pSrc[2]);
-#endif /* DO_PREFETCH */
-
-	imax = roi->width * sizeof(INT16) / sizeof(__m128i);
-	for (yp=0; yp<roi->height; ++yp)
-	{
-		int i;
-		for (i=0; i<imax; i++)
-		{
-			/* In order to use SSE2 signed 16-bit integer multiplication we
-			 * need to convert the floating point factors to signed int
-			 * without loosing information.  The result of this multiplication
-			 * is 32 bit and using SSE2 we get either the product's hi or lo
-			 * word.  Thus we will multiply the factors by the highest
-			 * possible 2^n and take the upper 16 bits of the signed 32-bit
-			 * result (_mm_mulhi_epi16).  Since the final result needs to
-			 * be scaled by << 5 and also in in order to keep the precision
-			 * within the upper 16 bits we will also have to scale the RGB
-			 * values used in the multiplication by << 5+(16-n).
-			 */
-			__m128i r, g, b, y, cb, cr;
-			r = _mm_load_si128(y_buf+i);
-			g = _mm_load_si128(g_buf+i);
-			b = _mm_load_si128(b_buf+i);
-
-			/* r<<6; g<<6; b<<6 */
-			r = _mm_slli_epi16(r, 6);
-			g = _mm_slli_epi16(g, 6);
-			b = _mm_slli_epi16(b, 6);
-
-			/* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
-			y = _mm_mulhi_epi16(r, y_r);
-			y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
-			y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
-			y = _mm_add_epi16(y, min);
-			/* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
-			_mm_between_epi16(y, min, max);
-			_mm_store_si128(y_buf+i, y);
-
-			/* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
-			cb = _mm_mulhi_epi16(r, cb_r);
-			cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
-			cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
-			/* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
-			_mm_between_epi16(cb, min, max);
-			_mm_store_si128(cb_buf+i, cb);
-
-			/* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
-			cr = _mm_mulhi_epi16(r, cr_r);
-			cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
-			cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
-			/* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
-			_mm_between_epi16(cr, min, max);
-			_mm_store_si128(cr_buf+i, cr);
-		}
-		y_buf  += srcbump;
-		cb_buf += srcbump;
-		cr_buf += srcbump;
-		r_buf += dstbump;
-		g_buf += dstbump;
-		b_buf += dstbump;
-	}
-
-	return PRIMITIVES_SUCCESS;
-}
-
-/*---------------------------------------------------------------------------*/
-#define LOAD128(_src_) \
-	_mm_load_si128((__m128i *) _src_)
-#define STORE128(_dst_, _src_) \
-	_mm_store_si128((__m128i *) _dst_, _src_)
-#define PUNPCKLBW(_dst_, _src_) \
-	_dst_ = _mm_unpacklo_epi8(_src_, _dst_)
-#define PUNPCKHBW(_dst_, _src_) \
-	_dst_ = _mm_unpackhi_epi8(_src_, _dst_)
-#define PUNPCKLWD(_dst_, _src_) \
-	_dst_ = _mm_unpacklo_epi16(_src_, _dst_)
-#define PUNPCKHWD(_dst_, _src_) \
-	_dst_ = _mm_unpackhi_epi16(_src_, _dst_)
-#define PACKUSWB(_dst_, _src_) \
-	_dst_ = _mm_packus_epi16(_dst_, _src_)
-#define PREFETCH(_ptr_) \
-	_mm_prefetch((const void *) _ptr_, _MM_HINT_T0)
-#define XMM_ALL_ONES \
-	_mm_set1_epi32(0xFFFFFFFFU)
-
-PRIM_STATIC pstatus_t sse2_RGBToRGB_16s8u_P3AC4R(
-	const INT16 *pSrc[3],	/* 16-bit R,G, and B arrays */
-	INT32 srcStep,			/* bytes between rows in source data */
-	BYTE *pDst,				/* 32-bit interleaved ARGB (ABGR?) data */
-	INT32 dstStep,			/* bytes between rows in dest data */
-	const prim_size_t *roi)	/* region of interest */
-{
-	const UINT16 *r = (const UINT16 *) (pSrc[0]);
-	const UINT16 *g = (const UINT16 *) (pSrc[1]);
-	const UINT16 *b = (const UINT16 *) (pSrc[2]);
-	BYTE *out;
-	int srcbump, dstbump, y;
-
-	/* Ensure 16-byte alignment on all pointers,
-	 * that width is a multiple of 8,
-	 * and that the next row will also remain aligned.
-	 * Since this is usually used for 64x64 aligned arrays,
-	 * these checks should presumably pass.
-	 */
-	if ((((ULONG_PTR) (pSrc[0]) & 0x0f) != 0)
-			|| (((ULONG_PTR) (pSrc[1]) & 0x0f) != 0)
-			|| (((ULONG_PTR) (pSrc[2]) & 0x0f) != 0)
-			|| (((ULONG_PTR) pDst & 0x0f) != 0)
-			|| (roi->width & 0x0f)
-			|| (srcStep & 0x0f)
-			|| (dstStep & 0x0f))
-	{
-		return general_RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, roi);
-	}
-
-	out = (BYTE *) pDst;
-	srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
-	dstbump = (dstStep - (roi->width * sizeof(UINT32)));
-
-	for (y=0; y<roi->height; ++y)
-	{
-		int width = roi->width;
-		do {
-			__m128i R0, R1, R2, R3, R4;
-			/* The comments below pretend these are 8-byte registers
-			 * rather than 16-byte, for readability.
-			 */
-			R0 = LOAD128(b);  b += 8;		/* R0 = 00B300B200B100B0 */
-			R1 = LOAD128(b);  b += 8;		/* R1 = 00B700B600B500B4 */
-			PACKUSWB(R0,R1);				/* R0 = B7B6B5B4B3B2B1B0 */
-			R1 = LOAD128(g);  g += 8;		/* R1 = 00G300G200G100G0 */
-			R2 = LOAD128(g);  g += 8;		/* R2 = 00G700G600G500G4 */
-			PACKUSWB(R1,R2);				/* R1 = G7G6G5G4G3G2G1G0 */
-			R2 = R1;						/* R2 = G7G6G5G4G3G2G1G0 */
-			PUNPCKLBW(R2,R0);				/* R2 = G3B3G2B2G1B1G0B0 */
-			PUNPCKHBW(R1,R0);				/* R1 = G7B7G6B7G5B5G4B4 */
-			R0 = LOAD128(r);  r += 8;		/* R0 = 00R300R200R100R0 */
-			R3 = LOAD128(r);  r += 8;		/* R3 = 00R700R600R500R4 */
-			PACKUSWB(R0,R3);				/* R0 = R7R6R5R4R3R2R1R0 */
-			R3 = XMM_ALL_ONES;				/* R3 = FFFFFFFFFFFFFFFF */
-			R4 = R3;						/* R4 = FFFFFFFFFFFFFFFF */
-			PUNPCKLBW(R4,R0);				/* R4 = FFR3FFR2FFR1FFR0 */
-			PUNPCKHBW(R3,R0);				/* R3 = FFR7FFR6FFR5FFR4 */
-			R0 = R4;						/* R0 = R4               */
-			PUNPCKLWD(R0,R2);				/* R0 = FFR1G1B1FFR0G0B0 */
-			PUNPCKHWD(R4,R2);				/* R4 = FFR3G3B3FFR2G2B2 */
-			R2 = R3;						/* R2 = R3               */
-			PUNPCKLWD(R2,R1);				/* R2 = FFR5G5B5FFR4G4B4 */
-			PUNPCKHWD(R3,R1);				/* R3 = FFR7G7B7FFR6G6B6 */
-			STORE128(out, R0);  out += 16;	/* FFR1G1B1FFR0G0B0      */
-			STORE128(out, R4);  out += 16;	/* FFR3G3B3FFR2G2B2      */
-			STORE128(out, R2);  out += 16;	/* FFR5G5B5FFR4G4B4      */
-			STORE128(out, R3);  out += 16;	/* FFR7G7B7FFR6G6B6      */
-		} while (width -= 16);
-		/* Jump to next row. */
-		r += srcbump;
-		g += srcbump;
-		b += srcbump;
-		out += dstbump;
-	}
-	return PRIMITIVES_SUCCESS;
-}
-#endif /* WITH_SSE2 */
-
-/*---------------------------------------------------------------------------*/
-#ifdef WITH_NEON
-PRIM_STATIC pstatus_t neon_yCbCrToRGB_16s16s_P3P3(
-	const INT16 *pSrc[3],
-	int srcStep,
-	INT16 *pDst[3],
-	int dstStep,
-	const prim_size_t *roi)	/* region of interest */
-{
-	/* TODO: If necessary, check alignments and call the general version. */
-
-	int16x8_t zero = vdupq_n_s16(0);
-	int16x8_t max = vdupq_n_s16(255);
-
-	int16x8_t r_cr = vdupq_n_s16(22986);	//  1.403 << 14
-	int16x8_t g_cb = vdupq_n_s16(-5636);	// -0.344 << 14
-	int16x8_t g_cr = vdupq_n_s16(-11698);	// -0.714 << 14
-	int16x8_t b_cb = vdupq_n_s16(28999);	//  1.770 << 14
-	int16x8_t c4096 = vdupq_n_s16(4096);
-
-	int16x8_t* y_buf  = (int16x8_t*) pSrc[0];
-	int16x8_t* cb_buf = (int16x8_t*) pSrc[1];
-	int16x8_t* cr_buf = (int16x8_t*) pSrc[2];
-	int16x8_t* r_buf  = (int16x8_t*) pDst[0];
-	int16x8_t* g_buf  = (int16x8_t*) pDst[1];
-	int16x8_t* b_buf  = (int16x8_t*) pDst[2];
-
-	int srcbump = srcStep / sizeof(int16x8_t);
-	int dstbump = dstStep / sizeof(int16x8_t);
-	int yp;
-
-	int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t);
-	for (yp=0; yp<roi->height; ++yp)
-	{
-		int i;
-		for (i=0; i<imax; i++)
-		{
-			/*
-				In order to use NEON signed 16-bit integer multiplication we need to convert
-				the floating point factors to signed int without loosing information.
-				The result of this multiplication is 32 bit and we have a NEON instruction
-				that returns the hi word of the saturated double.
-				Thus we will multiply the factors by the highest possible 2^n, take the 
-				upper 16 bits of the signed 32-bit result (vqdmulhq_s16 followed by a right
-				shift by 1 to reverse the doubling) and correct	this result by multiplying it 
-				by 2^(16-n).
-				For the given factors in the conversion matrix the best possible n is 14.
-
-				Example for calculating r:
-				r = (y>>5) + 128 + (cr*1.403)>>5                       // our base formula
-				r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5      // see above
-				r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5             // simplification
-				r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
-			*/
-		
-			/* y = (y_buf[i] + 4096) >> 2 */
-			int16x8_t y = vld1q_s16((INT16*) &y_buf[i]);
-			y = vaddq_s16(y, c4096);
-			y = vshrq_n_s16(y, 2);
-			/* cb = cb_buf[i]; */
-			int16x8_t cb = vld1q_s16((INT16*)&cb_buf[i]);
-			/* cr = cr_buf[i]; */
-			int16x8_t cr = vld1q_s16((INT16*) &cr_buf[i]);
-
-			/* (y + HIWORD(cr*22986)) >> 3 */
-			int16x8_t r = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cr, r_cr), 1));
-			r = vshrq_n_s16(r, 3);
-			/* r_buf[i] = MINMAX(r, 0, 255); */
-			r = vminq_s16(vmaxq_s16(r, zero), max);
-			vst1q_s16((INT16*)&r_buf[i], r);
-
-			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
-			int16x8_t g = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, g_cb), 1));
-			g = vaddq_s16(g, vshrq_n_s16(vqdmulhq_s16(cr, g_cr), 1));
-			g = vshrq_n_s16(g, 3);
-			/* g_buf[i] = MINMAX(g, 0, 255); */
-			g = vminq_s16(vmaxq_s16(g, zero), max);
-			vst1q_s16((INT16*)&g_buf[i], g);
-
-			/* (y + HIWORD(cb*28999)) >> 3 */
-			int16x8_t b = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, b_cb), 1));
-			b = vshrq_n_s16(b, 3);
-			/* b_buf[i] = MINMAX(b, 0, 255); */
-			b = vminq_s16(vmaxq_s16(b, zero), max);
-			vst1q_s16((INT16*)&b_buf[i], b);
-		}
-
-		y_buf  += srcbump;
-		cb_buf += srcbump;
-		cr_buf += srcbump;
-		r_buf += dstbump;
-		g_buf += dstbump;
-		b_buf += dstbump;
-	}
-	return PRIMITIVES_SUCCESS;
-}
-#endif /* WITH_NEON */
-
-
-/* I don't see a direct IPP version of this, since the input is INT16
- * YCbCr.  It may be possible via  Deinterleave and then YCbCrToRGB_<mod>.
- * But that would likely be slower.
- */
-
 /* ------------------------------------------------------------------------- */
 void primitives_init_colors(const primitives_hints_t* hints, primitives_t* prims)
 {
@@ -734,19 +221,7 @@ void primitives_init_colors(const primitives_hints_t* hints, primitives_t* prims
 	prims->yCbCrToRGB_16s16s_P3P3 = general_yCbCrToRGB_16s16s_P3P3;
 	prims->RGBToYCbCr_16s16s_P3P3 = general_RGBToYCbCr_16s16s_P3P3;
 
-#if defined(WITH_SSE2)
-	if (hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
-	{
-		prims->RGBToRGB_16s8u_P3AC4R  = sse2_RGBToRGB_16s8u_P3AC4R;
-		prims->yCbCrToRGB_16s16s_P3P3 = sse2_yCbCrToRGB_16s16s_P3P3;
-		prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3;
-	}
-#elif defined(WITH_NEON)
-	if (hints->arm_flags & PRIM_ARM_NEON_AVAILABLE)
-	{
-		prims->yCbCrToRGB_16s16s_P3P3 = neon_yCbCrToRGB_16s16s_P3P3;
-	}
-#endif /* WITH_SSE2 */
+	primitives_init_colors_opt(hints, prims);
 }
 
 /* ------------------------------------------------------------------------- */
@@ -754,3 +229,4 @@ void primitives_deinit_colors(primitives_t* prims)
 {
 	/* Nothing to do. */
 }
+
diff --git a/libfreerdp/primitives/prim_colors.h b/libfreerdp/primitives/prim_colors.h
new file mode 100644
index 000000000..70f478547
--- /dev/null
+++ b/libfreerdp/primitives/prim_colors.h
@@ -0,0 +1,32 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Color conversion operations.
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.  Algorithms used by
+ * this code may be covered by patents by HP, Microsoft, or other parties.
+ *
+ */
+
+#ifdef __GNUC__
+# pragma once
+#endif
+
+#ifndef __PRIM_COLORS_H_INCLUDED__
+#define __PRIM_COLORS_H_INCLUDED__
+
+pstatus_t general_yCbCrToRGB_16s16s_P3P3(const INT16 *pSrc[3], INT32 srcStep, INT16 *pDst[3], INT32 dstStep, const prim_size_t *roi);
+pstatus_t general_RGBToYCbCr_16s16s_P3P3(const INT16 *pSrc[3], INT32 srcStep, INT16 *pDst[3], INT32 dstStep, const prim_size_t *roi);
+pstatus_t general_RGBToRGB_16s8u_P3AC4R(const INT16 *pSrc[3], int srcStep, BYTE *pDst, int dstStep, const prim_size_t *roi);
+
+void primitives_init_colors_opt(const primitives_hints_t* hints, primitives_t* prims);
+
+#endif /* !__PRIM_COLORS_H_INCLUDED__ */
+
diff --git a/libfreerdp/primitives/prim_colors_opt.c b/libfreerdp/primitives/prim_colors_opt.c
new file mode 100644
index 000000000..cfc87414e
--- /dev/null
+++ b/libfreerdp/primitives/prim_colors_opt.c
@@ -0,0 +1,561 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Optimized Color conversion operations.
+ * vi:ts=4 sw=4:
+ *
+ * Copyright 2011 Stephen Erisman
+ * Copyright 2011 Norbert Federa <nfedera@thinstuff.com>
+ * Copyright 2011 Martin Fleisz <mfleisz@thinstuff.com>
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+
+#ifdef WITH_SSE2
+#include <emmintrin.h>
+#elif defined(WITH_NEON)
+#include <arm_neon.h>
+#endif /* WITH_SSE2 else WITH_NEON */
+
+#include "prim_internal.h"
+#include "prim_templates.h"
+#include "prim_colors.h"
+
+#ifdef WITH_SSE2
+
+#ifdef __GNUC__
+# define GNU_INLINE \
+	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+#else
+# define GNU_INLINE
+#endif
+
+#define CACHE_LINE_BYTES	64
+
+#define _mm_between_epi16(_val, _min, _max) \
+	do { _val = _mm_min_epi16(_max, _mm_max_epi16(_val, _min)); } while (0)
+
+#ifdef DO_PREFETCH
+/*---------------------------------------------------------------------------*/
+static inline void GNU_INLINE _mm_prefetch_buffer(
+	char * buffer, 
+	int num_bytes)
+{
+	__m128i * buf = (__m128i*) buffer;
+	unsigned int i;
+	for (i = 0; i < (num_bytes / sizeof(__m128i)); 
+		i+=(CACHE_LINE_BYTES / sizeof(__m128i)))
+	{
+		_mm_prefetch((char*)(&buf[i]), _MM_HINT_NTA);
+	}
+}
+#endif /* DO_PREFETCH */
+
+/*---------------------------------------------------------------------------*/
+pstatus_t sse2_yCbCrToRGB_16s16s_P3P3(
+	const INT16 *pSrc[3],
+	int srcStep,
+	INT16 *pDst[3],
+	int dstStep,
+	const prim_size_t *roi)	/* region of interest */
+{
+	__m128i zero, max, r_cr, g_cb, g_cr, b_cb, c4096;
+	__m128i *y_buf, *cb_buf, *cr_buf, *r_buf, *g_buf, *b_buf;
+	int srcbump, dstbump, yp, imax;
+
+	if (((ULONG_PTR) (pSrc[0]) & 0x0f)
+			|| ((ULONG_PTR) (pSrc[1]) & 0x0f)
+			|| ((ULONG_PTR) (pSrc[2]) & 0x0f)
+			|| ((ULONG_PTR) (pDst[0]) & 0x0f)
+			|| ((ULONG_PTR) (pDst[1]) & 0x0f)
+			|| ((ULONG_PTR) (pDst[2]) & 0x0f)
+			|| (roi->width & 0x07)
+			|| (srcStep & 127)
+			|| (dstStep & 127))
+	{
+		/* We can't maintain 16-byte alignment. */
+		return general_yCbCrToRGB_16s16s_P3P3(pSrc, srcStep,
+			pDst, dstStep, roi);
+	}
+
+	zero = _mm_setzero_si128();
+	max = _mm_set1_epi16(255);
+
+	y_buf  = (__m128i*) (pSrc[0]);
+	cb_buf = (__m128i*) (pSrc[1]);
+	cr_buf = (__m128i*) (pSrc[2]);
+	r_buf  = (__m128i*) (pDst[0]);
+	g_buf  = (__m128i*) (pDst[1]);
+	b_buf  = (__m128i*) (pDst[2]);
+
+	r_cr = _mm_set1_epi16(22986);	/*  1.403 << 14 */
+	g_cb = _mm_set1_epi16(-5636);	/* -0.344 << 14 */
+	g_cr = _mm_set1_epi16(-11698);	/* -0.714 << 14 */
+	b_cb = _mm_set1_epi16(28999);	/*  1.770 << 14 */
+	c4096 = _mm_set1_epi16(4096);
+	srcbump = srcStep / sizeof(__m128i);
+	dstbump = dstStep / sizeof(__m128i);
+
+#ifdef DO_PREFETCH
+	/* Prefetch Y's, Cb's, and Cr's. */
+	for (yp=0; yp<roi->height; yp++)
+	{
+		int i;
+		for (i=0; i<roi->width * sizeof(INT16) / sizeof(__m128i);
+			i += (CACHE_LINE_BYTES / sizeof(__m128i)))
+		{
+			_mm_prefetch((char*)(&y_buf[i]),  _MM_HINT_NTA);
+			_mm_prefetch((char*)(&cb_buf[i]), _MM_HINT_NTA);
+			_mm_prefetch((char*)(&cr_buf[i]), _MM_HINT_NTA);
+		}
+		y_buf  += srcbump;
+		cb_buf += srcbump;
+		cr_buf += srcbump;
+	}
+	y_buf  = (__m128i*) (pSrc[0]);
+	cb_buf = (__m128i*) (pSrc[1]);
+	cr_buf = (__m128i*) (pSrc[2]);
+#endif /* DO_PREFETCH */
+
+	imax = roi->width * sizeof(INT16) / sizeof(__m128i);
+	for (yp=0; yp<roi->height; ++yp)
+	{
+		int i;
+		for (i=0; i<imax; i++)
+		{
+			/* In order to use SSE2 signed 16-bit integer multiplication
+			 * we need to convert the floating point factors to signed int
+			 * without losing information.
+			 * The result of this multiplication is 32 bit and we have two
+			 * SSE instructions that return either the hi or lo word.
+			 * Thus we will multiply the factors by the highest possible 2^n,
+			 * take the upper 16 bits of the signed 32-bit result
+			 * (_mm_mulhi_epi16) and correct this result by multiplying
+			 * it by 2^(16-n).
+			 *
+			 * For the given factors in the conversion matrix the best
+			 * possible n is 14.
+			 *
+			 * Example for calculating r:
+			 * r = (y>>5) + 128 + (cr*1.403)>>5             // our base formula
+			 * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5   // see above
+			 * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5     // simplification
+			 * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
+			 */
+
+			/* y = (y_r_buf[i] + 4096) >> 2 */
+			__m128i y, cb, cr, r, g, b;
+			y = _mm_load_si128(y_buf + i);
+			y = _mm_add_epi16(y, c4096);
+			y = _mm_srai_epi16(y, 2);
+			/* cb = cb_g_buf[i]; */
+			cb = _mm_load_si128(cb_buf + i);
+			/* cr = cr_b_buf[i]; */
+			cr = _mm_load_si128(cr_buf + i);
+
+			/* (y + HIWORD(cr*22986)) >> 3 */
+			r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
+			r = _mm_srai_epi16(r, 3);
+
+			/* r_buf[i] = MINMAX(r, 0, 255); */
+			_mm_between_epi16(r, zero, max);
+			_mm_store_si128(r_buf + i, r);
+
+			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
+			g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
+			g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
+			g = _mm_srai_epi16(g, 3);
+
+			/* g_buf[i] = MINMAX(g, 0, 255); */
+			_mm_between_epi16(g, zero, max);
+			_mm_store_si128(g_buf + i, g);
+
+			/* (y + HIWORD(cb*28999)) >> 3 */
+			b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
+			b = _mm_srai_epi16(b, 3);
+			/* b_buf[i] = MINMAX(b, 0, 255); */
+			_mm_between_epi16(b, zero, max);
+			_mm_store_si128(b_buf + i, b);
+		}
+		y_buf  += srcbump;
+		cb_buf += srcbump;
+		cr_buf += srcbump;
+		r_buf += dstbump;
+		g_buf += dstbump;
+		b_buf += dstbump;
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/*---------------------------------------------------------------------------*/
+/* The encodec YCbCr coeffectients are represented as 11.5 fixed-point
+ * numbers. See the general code above.
+ */
+pstatus_t sse2_RGBToYCbCr_16s16s_P3P3(
+	const INT16 *pSrc[3],
+	int srcStep,
+	INT16 *pDst[3],
+	int dstStep,
+	const prim_size_t *roi)	/* region of interest */
+{
+	__m128i min, max, y_r, y_g, y_b, cb_r, cb_g, cb_b, cr_r, cr_g, cr_b;
+	__m128i *r_buf, *g_buf, *b_buf, *y_buf, *cb_buf, *cr_buf;
+	int srcbump, dstbump, yp, imax;
+
+	if (((ULONG_PTR) (pSrc[0]) & 0x0f)
+			|| ((ULONG_PTR) (pSrc[1]) & 0x0f)
+			|| ((ULONG_PTR) (pSrc[2]) & 0x0f)
+			|| ((ULONG_PTR) (pDst[0]) & 0x0f)
+			|| ((ULONG_PTR) (pDst[1]) & 0x0f)
+			|| ((ULONG_PTR) (pDst[2]) & 0x0f)
+			|| (roi->width & 0x07)
+			|| (srcStep & 127)
+			|| (dstStep & 127))
+	{
+		/* We can't maintain 16-byte alignment. */
+		return general_RGBToYCbCr_16s16s_P3P3(pSrc, srcStep,
+			pDst, dstStep, roi);
+	}
+
+	min = _mm_set1_epi16(-128 << 5);
+	max = _mm_set1_epi16(127 << 5);
+
+	r_buf  = (__m128i*) (pSrc[0]);
+	g_buf  = (__m128i*) (pSrc[1]);
+	b_buf  = (__m128i*) (pSrc[2]);
+	y_buf  = (__m128i*) (pDst[0]);
+	cb_buf = (__m128i*) (pDst[1]);
+	cr_buf = (__m128i*) (pDst[2]);
+
+	y_r  = _mm_set1_epi16(9798);   /*  0.299000 << 15 */
+	y_g  = _mm_set1_epi16(19235);  /*  0.587000 << 15 */
+	y_b  = _mm_set1_epi16(3735);   /*  0.114000 << 15 */
+	cb_r = _mm_set1_epi16(-5535);  /* -0.168935 << 15 */
+	cb_g = _mm_set1_epi16(-10868); /* -0.331665 << 15 */
+	cb_b = _mm_set1_epi16(16403);  /*  0.500590 << 15 */
+	cr_r = _mm_set1_epi16(16377);  /*  0.499813 << 15 */
+	cr_g = _mm_set1_epi16(-13714); /* -0.418531 << 15 */
+	cr_b = _mm_set1_epi16(-2663);  /* -0.081282 << 15 */
+
+	srcbump = srcStep / sizeof(__m128i);
+	dstbump = dstStep / sizeof(__m128i);
+
+#ifdef DO_PREFETCH
+	/* Prefetch RGB's. */
+	for (yp=0; yp<roi->height; yp++)
+	{
+		int i;
+		for (i=0; i<roi->width * sizeof(INT16) / sizeof(__m128i);
+			i += (CACHE_LINE_BYTES / sizeof(__m128i)))
+		{
+			_mm_prefetch((char*)(&r_buf[i]), _MM_HINT_NTA);
+			_mm_prefetch((char*)(&g_buf[i]), _MM_HINT_NTA);
+			_mm_prefetch((char*)(&b_buf[i]), _MM_HINT_NTA);
+		}
+		r_buf += srcbump;
+		g_buf += srcbump;
+		b_buf += srcbump;
+	}
+	r_buf = (__m128i*) (pSrc[0]);
+	g_buf = (__m128i*) (pSrc[1]);
+	b_buf = (__m128i*) (pSrc[2]);
+#endif /* DO_PREFETCH */
+
+	imax = roi->width * sizeof(INT16) / sizeof(__m128i);
+	for (yp=0; yp<roi->height; ++yp)
+	{
+		int i;
+		for (i=0; i<imax; i++)
+		{
+			/* In order to use SSE2 signed 16-bit integer multiplication we
+			 * need to convert the floating point factors to signed int
+			 * without loosing information.  The result of this multiplication
+			 * is 32 bit and using SSE2 we get either the product's hi or lo
+			 * word.  Thus we will multiply the factors by the highest
+			 * possible 2^n and take the upper 16 bits of the signed 32-bit
+			 * result (_mm_mulhi_epi16).  Since the final result needs to
+			 * be scaled by << 5 and also in in order to keep the precision
+			 * within the upper 16 bits we will also have to scale the RGB
+			 * values used in the multiplication by << 5+(16-n).
+			 */
+			__m128i r, g, b, y, cb, cr;
+			r = _mm_load_si128(y_buf+i);
+			g = _mm_load_si128(g_buf+i);
+			b = _mm_load_si128(b_buf+i);
+
+			/* r<<6; g<<6; b<<6 */
+			r = _mm_slli_epi16(r, 6);
+			g = _mm_slli_epi16(g, 6);
+			b = _mm_slli_epi16(b, 6);
+
+			/* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
+			y = _mm_mulhi_epi16(r, y_r);
+			y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
+			y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
+			y = _mm_add_epi16(y, min);
+			/* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
+			_mm_between_epi16(y, min, max);
+			_mm_store_si128(y_buf+i, y);
+
+			/* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
+			cb = _mm_mulhi_epi16(r, cb_r);
+			cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
+			cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
+			/* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
+			_mm_between_epi16(cb, min, max);
+			_mm_store_si128(cb_buf+i, cb);
+
+			/* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
+			cr = _mm_mulhi_epi16(r, cr_r);
+			cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
+			cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
+			/* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
+			_mm_between_epi16(cr, min, max);
+			_mm_store_si128(cr_buf+i, cr);
+		}
+		y_buf  += srcbump;
+		cb_buf += srcbump;
+		cr_buf += srcbump;
+		r_buf += dstbump;
+		g_buf += dstbump;
+		b_buf += dstbump;
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/*---------------------------------------------------------------------------*/
+#define LOAD128(_src_) \
+	_mm_load_si128((__m128i *) _src_)
+#define STORE128(_dst_, _src_) \
+	_mm_store_si128((__m128i *) _dst_, _src_)
+#define PUNPCKLBW(_dst_, _src_) \
+	_dst_ = _mm_unpacklo_epi8(_src_, _dst_)
+#define PUNPCKHBW(_dst_, _src_) \
+	_dst_ = _mm_unpackhi_epi8(_src_, _dst_)
+#define PUNPCKLWD(_dst_, _src_) \
+	_dst_ = _mm_unpacklo_epi16(_src_, _dst_)
+#define PUNPCKHWD(_dst_, _src_) \
+	_dst_ = _mm_unpackhi_epi16(_src_, _dst_)
+#define PACKUSWB(_dst_, _src_) \
+	_dst_ = _mm_packus_epi16(_dst_, _src_)
+#define PREFETCH(_ptr_) \
+	_mm_prefetch((const void *) _ptr_, _MM_HINT_T0)
+#define XMM_ALL_ONES \
+	_mm_set1_epi32(0xFFFFFFFFU)
+
+pstatus_t sse2_RGBToRGB_16s8u_P3AC4R(
+	const INT16 *pSrc[3],	/* 16-bit R,G, and B arrays */
+	INT32 srcStep,			/* bytes between rows in source data */
+	BYTE *pDst,				/* 32-bit interleaved ARGB (ABGR?) data */
+	INT32 dstStep,			/* bytes between rows in dest data */
+	const prim_size_t *roi)	/* region of interest */
+{
+	const UINT16 *r = (const UINT16 *) (pSrc[0]);
+	const UINT16 *g = (const UINT16 *) (pSrc[1]);
+	const UINT16 *b = (const UINT16 *) (pSrc[2]);
+	BYTE *out;
+	int srcbump, dstbump, y;
+
+	/* Ensure 16-byte alignment on all pointers,
+	 * that width is a multiple of 8,
+	 * and that the next row will also remain aligned.
+	 * Since this is usually used for 64x64 aligned arrays,
+	 * these checks should presumably pass.
+	 */
+	if ((((ULONG_PTR) (pSrc[0]) & 0x0f) != 0)
+			|| (((ULONG_PTR) (pSrc[1]) & 0x0f) != 0)
+			|| (((ULONG_PTR) (pSrc[2]) & 0x0f) != 0)
+			|| (((ULONG_PTR) pDst & 0x0f) != 0)
+			|| (roi->width & 0x0f)
+			|| (srcStep & 0x0f)
+			|| (dstStep & 0x0f))
+	{
+		return general_RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, roi);
+	}
+
+	out = (BYTE *) pDst;
+	srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
+	dstbump = (dstStep - (roi->width * sizeof(UINT32)));
+
+	for (y=0; y<roi->height; ++y)
+	{
+		int width = roi->width;
+		do {
+			__m128i R0, R1, R2, R3, R4;
+			/* The comments below pretend these are 8-byte registers
+			 * rather than 16-byte, for readability.
+			 */
+			R0 = LOAD128(b);  b += 8;		/* R0 = 00B300B200B100B0 */
+			R1 = LOAD128(b);  b += 8;		/* R1 = 00B700B600B500B4 */
+			PACKUSWB(R0,R1);				/* R0 = B7B6B5B4B3B2B1B0 */
+			R1 = LOAD128(g);  g += 8;		/* R1 = 00G300G200G100G0 */
+			R2 = LOAD128(g);  g += 8;		/* R2 = 00G700G600G500G4 */
+			PACKUSWB(R1,R2);				/* R1 = G7G6G5G4G3G2G1G0 */
+			R2 = R1;						/* R2 = G7G6G5G4G3G2G1G0 */
+			PUNPCKLBW(R2,R0);				/* R2 = G3B3G2B2G1B1G0B0 */
+			PUNPCKHBW(R1,R0);				/* R1 = G7B7G6B7G5B5G4B4 */
+			R0 = LOAD128(r);  r += 8;		/* R0 = 00R300R200R100R0 */
+			R3 = LOAD128(r);  r += 8;		/* R3 = 00R700R600R500R4 */
+			PACKUSWB(R0,R3);				/* R0 = R7R6R5R4R3R2R1R0 */
+			R3 = XMM_ALL_ONES;				/* R3 = FFFFFFFFFFFFFFFF */
+			R4 = R3;						/* R4 = FFFFFFFFFFFFFFFF */
+			PUNPCKLBW(R4,R0);				/* R4 = FFR3FFR2FFR1FFR0 */
+			PUNPCKHBW(R3,R0);				/* R3 = FFR7FFR6FFR5FFR4 */
+			R0 = R4;						/* R0 = R4               */
+			PUNPCKLWD(R0,R2);				/* R0 = FFR1G1B1FFR0G0B0 */
+			PUNPCKHWD(R4,R2);				/* R4 = FFR3G3B3FFR2G2B2 */
+			R2 = R3;						/* R2 = R3               */
+			PUNPCKLWD(R2,R1);				/* R2 = FFR5G5B5FFR4G4B4 */
+			PUNPCKHWD(R3,R1);				/* R3 = FFR7G7B7FFR6G6B6 */
+			STORE128(out, R0);  out += 16;	/* FFR1G1B1FFR0G0B0      */
+			STORE128(out, R4);  out += 16;	/* FFR3G3B3FFR2G2B2      */
+			STORE128(out, R2);  out += 16;	/* FFR5G5B5FFR4G4B4      */
+			STORE128(out, R3);  out += 16;	/* FFR7G7B7FFR6G6B6      */
+		} while (width -= 16);
+		/* Jump to next row. */
+		r += srcbump;
+		g += srcbump;
+		b += srcbump;
+		out += dstbump;
+	}
+	return PRIMITIVES_SUCCESS;
+}
+#endif /* WITH_SSE2 */
+
+/*---------------------------------------------------------------------------*/
+#ifdef WITH_NEON
+pstatus_t neon_yCbCrToRGB_16s16s_P3P3(
+	const INT16 *pSrc[3],
+	int srcStep,
+	INT16 *pDst[3],
+	int dstStep,
+	const prim_size_t *roi)	/* region of interest */
+{
+	/* TODO: If necessary, check alignments and call the general version. */
+
+	int16x8_t zero = vdupq_n_s16(0);
+	int16x8_t max = vdupq_n_s16(255);
+
+	int16x8_t r_cr = vdupq_n_s16(22986);	//  1.403 << 14
+	int16x8_t g_cb = vdupq_n_s16(-5636);	// -0.344 << 14
+	int16x8_t g_cr = vdupq_n_s16(-11698);	// -0.714 << 14
+	int16x8_t b_cb = vdupq_n_s16(28999);	//  1.770 << 14
+	int16x8_t c4096 = vdupq_n_s16(4096);
+
+	int16x8_t* y_buf  = (int16x8_t*) pSrc[0];
+	int16x8_t* cb_buf = (int16x8_t*) pSrc[1];
+	int16x8_t* cr_buf = (int16x8_t*) pSrc[2];
+	int16x8_t* r_buf  = (int16x8_t*) pDst[0];
+	int16x8_t* g_buf  = (int16x8_t*) pDst[1];
+	int16x8_t* b_buf  = (int16x8_t*) pDst[2];
+
+	int srcbump = srcStep / sizeof(int16x8_t);
+	int dstbump = dstStep / sizeof(int16x8_t);
+	int yp;
+
+	int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t);
+	for (yp=0; yp<roi->height; ++yp)
+	{
+		int i;
+		for (i=0; i<imax; i++)
+		{
+			/*
+				In order to use NEON signed 16-bit integer multiplication we need to convert
+				the floating point factors to signed int without loosing information.
+				The result of this multiplication is 32 bit and we have a NEON instruction
+				that returns the hi word of the saturated double.
+				Thus we will multiply the factors by the highest possible 2^n, take the 
+				upper 16 bits of the signed 32-bit result (vqdmulhq_s16 followed by a right
+				shift by 1 to reverse the doubling) and correct	this result by multiplying it 
+				by 2^(16-n).
+				For the given factors in the conversion matrix the best possible n is 14.
+
+				Example for calculating r:
+				r = (y>>5) + 128 + (cr*1.403)>>5                       // our base formula
+				r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5      // see above
+				r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5             // simplification
+				r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
+			*/
+		
+			/* y = (y_buf[i] + 4096) >> 2 */
+			int16x8_t y = vld1q_s16((INT16*) &y_buf[i]);
+			y = vaddq_s16(y, c4096);
+			y = vshrq_n_s16(y, 2);
+			/* cb = cb_buf[i]; */
+			int16x8_t cb = vld1q_s16((INT16*)&cb_buf[i]);
+			/* cr = cr_buf[i]; */
+			int16x8_t cr = vld1q_s16((INT16*) &cr_buf[i]);
+
+			/* (y + HIWORD(cr*22986)) >> 3 */
+			int16x8_t r = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cr, r_cr), 1));
+			r = vshrq_n_s16(r, 3);
+			/* r_buf[i] = MINMAX(r, 0, 255); */
+			r = vminq_s16(vmaxq_s16(r, zero), max);
+			vst1q_s16((INT16*)&r_buf[i], r);
+
+			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
+			int16x8_t g = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, g_cb), 1));
+			g = vaddq_s16(g, vshrq_n_s16(vqdmulhq_s16(cr, g_cr), 1));
+			g = vshrq_n_s16(g, 3);
+			/* g_buf[i] = MINMAX(g, 0, 255); */
+			g = vminq_s16(vmaxq_s16(g, zero), max);
+			vst1q_s16((INT16*)&g_buf[i], g);
+
+			/* (y + HIWORD(cb*28999)) >> 3 */
+			int16x8_t b = vaddq_s16(y, vshrq_n_s16(vqdmulhq_s16(cb, b_cb), 1));
+			b = vshrq_n_s16(b, 3);
+			/* b_buf[i] = MINMAX(b, 0, 255); */
+			b = vminq_s16(vmaxq_s16(b, zero), max);
+			vst1q_s16((INT16*)&b_buf[i], b);
+		}
+
+		y_buf  += srcbump;
+		cb_buf += srcbump;
+		cr_buf += srcbump;
+		r_buf += dstbump;
+		g_buf += dstbump;
+		b_buf += dstbump;
+	}
+	return PRIMITIVES_SUCCESS;
+}
+#endif /* WITH_NEON */
+
+
+/* I don't see a direct IPP version of this, since the input is INT16
+ * YCbCr.  It may be possible via  Deinterleave and then YCbCrToRGB_<mod>.
+ * But that would likely be slower.
+ */
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_colors_opt(const primitives_hints_t* hints, primitives_t* prims)
+{
+#if defined(WITH_SSE2)
+	if (hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
+	{
+		prims->RGBToRGB_16s8u_P3AC4R  = sse2_RGBToRGB_16s8u_P3AC4R;
+		prims->yCbCrToRGB_16s16s_P3P3 = sse2_yCbCrToRGB_16s16s_P3P3;
+		prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3;
+	}
+#elif defined(WITH_NEON)
+	if (hints->arm_flags & PRIM_ARM_NEON_AVAILABLE)
+	{
+		prims->yCbCrToRGB_16s16s_P3P3 = neon_yCbCrToRGB_16s16s_P3P3;
+	}
+#endif /* WITH_SSE2 */
+}
+
diff --git a/libfreerdp/primitives/prim_copy.c b/libfreerdp/primitives/prim_copy.c
index 71303b1a6..4198f2d8f 100644
--- a/libfreerdp/primitives/prim_copy.c
+++ b/libfreerdp/primitives/prim_copy.c
@@ -72,7 +72,7 @@ static BOOL memory_regions_overlap_2d(
 }
 
 /* ------------------------------------------------------------------------- */
-PRIM_STATIC pstatus_t general_copy_8u(
+pstatus_t general_copy_8u(
 	const BYTE *pSrc,
 	BYTE *pDst,
 	INT32 len)
@@ -94,7 +94,7 @@ PRIM_STATIC pstatus_t general_copy_8u(
  * The addresses are assumed to have been already offset to the upper-left
  * corners of the source and destination region of interest.
  */
-PRIM_STATIC pstatus_t general_copy_8u_AC4r(
+pstatus_t general_copy_8u_AC4r(
 	const BYTE *pSrc,  INT32 srcStep,
 	BYTE *pDst,  INT32 dstStep,
 	INT32 width,  INT32 height)
diff --git a/libfreerdp/primitives/prim_internal.h b/libfreerdp/primitives/prim_internal.h
index 21df8cae0..001ab8562 100644
--- a/libfreerdp/primitives/prim_internal.h
+++ b/libfreerdp/primitives/prim_internal.h
@@ -27,15 +27,6 @@
 
 #include <freerdp/primitives.h>
 
-/* Normally the internal entrypoints should be static, but a benchmark
- * program may want to access them directly and turn this off.
- */
-#ifndef PRIM_STATIC
-# define PRIM_STATIC static
-#else
-# undef PRIM_STATIC
-# define PRIM_STATIC
-#endif /* !PRIM_STATIC */
 
 /* Use lddqu for unaligned; load for 16-byte aligned. */
 #define LOAD_SI128(_ptr_) \
diff --git a/libfreerdp/primitives/prim_set.c b/libfreerdp/primitives/prim_set.c
index 5b40ce00c..9176c8722 100644
--- a/libfreerdp/primitives/prim_set.c
+++ b/libfreerdp/primitives/prim_set.c
@@ -19,18 +19,15 @@
 #endif
 
 #include <string.h>
+
 #include <freerdp/types.h>
 #include <freerdp/primitives.h>
-#ifdef WITH_SSE2
-# include <emmintrin.h>
-#endif /* WITH_SSE2 */
-#ifdef WITH_IPP
-# include <ipps.h>
-#endif /* WITH_IPP */
+
 #include "prim_internal.h"
+#include "prim_set.h"
 
 /* ========================================================================= */
-PRIM_STATIC pstatus_t general_set_8u(
+pstatus_t general_set_8u(
 	BYTE val,
 	BYTE *pDst,
 	INT32 len)
@@ -40,7 +37,7 @@ PRIM_STATIC pstatus_t general_set_8u(
 }
 
 /* ------------------------------------------------------------------------- */
-PRIM_STATIC pstatus_t general_zero(
+pstatus_t general_zero(
 	void *pDst,
 	size_t len)
 {
@@ -48,75 +45,8 @@ PRIM_STATIC pstatus_t general_zero(
 	return PRIMITIVES_SUCCESS;
 }
 
-/* ------------------------------------------------------------------------- */
-#ifdef WITH_SSE2
-# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
-PRIM_STATIC pstatus_t sse2_set_8u(
-	BYTE val,
-	BYTE *pDst,
-	INT32 len)
-{
-	BYTE byte, *dptr;
-	__m128i xmm0;
-	size_t count;
-
-	if (len < 16) return general_set_8u(val, pDst, len);
-
-	byte  = val;
-	dptr = (BYTE *) pDst;
-
-	/* Seek 16-byte alignment. */
-	while ((ULONG_PTR) dptr & 0x0f)
-	{
-		*dptr++ = byte;
-		if (--len == 0) return PRIMITIVES_SUCCESS;
-	}
-
-	xmm0 = _mm_set1_epi8(byte);
-
-	/* Cover 256-byte chunks via SSE register stores. */
-	count = len >> 8;
-	len -= count << 8;
-	/* Do 256-byte chunks using one XMM register. */
-	while (count--)
-	{
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
-	}
-
-	/* Cover 16-byte chunks via SSE register stores. */
-	count = len >> 4;
-	len -= count << 4;
-	/* Do 16-byte chunks using one XMM register. */
-	while (count--)
-	{
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
-	}
-
-	/* Do leftover bytes. */
-	while (len--) *dptr++ = byte;
-
-	return PRIMITIVES_SUCCESS;
-}
-# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
-#endif /* WITH_SSE2 */
-
 /* ========================================================================= */
-PRIM_STATIC pstatus_t general_set_32s(
+pstatus_t general_set_32s(
 	INT32 val,
 	INT32 *pDst,
 	INT32 len)
@@ -148,7 +78,7 @@ PRIM_STATIC pstatus_t general_set_32s(
 }
 
 /* ------------------------------------------------------------------------- */
-PRIM_STATIC pstatus_t general_set_32u(
+pstatus_t general_set_32u(
 	UINT32 val,
 	UINT32 *pDst,
 	INT32 len)
@@ -179,104 +109,6 @@ PRIM_STATIC pstatus_t general_set_32u(
 	return PRIMITIVES_SUCCESS;
 }
 
-/* ------------------------------------------------------------------------- */
-#ifdef WITH_SSE2
-# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
-PRIM_STATIC pstatus_t sse2_set_32u(
-	UINT32 val,
-	UINT32 *pDst,
-	INT32 len)
-{
-	UINT32 *dptr = (UINT32 *) pDst;
-	__m128i xmm0;
-	size_t count;
-
-	/* If really short, just do it here. */
-	if (len < 32)
-	{
-		while (len--) *dptr++ = val;
-		return PRIMITIVES_SUCCESS;
-	}
-
-	/* Assure we can reach 16-byte alignment. */
-	if (((ULONG_PTR) dptr & 0x03) != 0)
-	{
-		return general_set_32u(val, pDst, len);
-	}
-
-	/* Seek 16-byte alignment. */
-	while ((ULONG_PTR) dptr & 0x0f)
-	{
-		*dptr++ = val;
-		if (--len == 0) return PRIMITIVES_SUCCESS;
-	}
-
-	xmm0 = _mm_set1_epi32(val);
-
-	/* Cover 256-byte chunks via SSE register stores. */
-	count = len >> 6;
-	len -= count << 6;
-	/* Do 256-byte chunks using one XMM register. */
-	while (count--)
-	{
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
-	}
-
-	/* Cover 16-byte chunks via SSE register stores. */
-	count = len >> 2;
-	len -= count << 2;
-	/* Do 16-byte chunks using one XMM register. */
-	while (count--)
-	{
-		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
-	}
-
-	/* Do leftover bytes. */
-	while (len--) *dptr++ = val;
-
-	return PRIMITIVES_SUCCESS;
-}
-
-/* ------------------------------------------------------------------------- */
-PRIM_STATIC pstatus_t sse2_set_32s(
-	INT32 val,
-	INT32 *pDst,
-	INT32 len)
-{
-	UINT32 uval = *((UINT32 *) &val);
-	return sse2_set_32u(uval, (UINT32 *) pDst, len);
-}
-# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
-#endif /* WITH_SSE2 */
-
-#ifdef WITH_IPP
-/* ------------------------------------------------------------------------- */
-PRIM_STATIC pstatus_t ipp_wrapper_set_32u(
-	UINT32 val,
-	UINT32 *pDst,
-	INT32 len)
-{
-	/* A little type conversion, then use the signed version. */
-	INT32 sval = *((INT32 *) &val);
-	return ippsSet_32s(sval, (INT32 *) pDst, len);
-}
-#endif
-
 /* ------------------------------------------------------------------------- */
 void primitives_init_set(
 	const primitives_hints_t *hints,
@@ -288,20 +120,7 @@ void primitives_init_set(
 	prims->set_32u = general_set_32u;
 	prims->zero = general_zero;
 
-	/* Pick tuned versions if possible. */
-#ifdef WITH_IPP
-	prims->set_8u  = (__set_8u_t)  ippsSet_8u;
-	prims->set_32s = (__set_32s_t) ippsSet_32s;
-	prims->set_32u = (__set_32u_t) ipp_wrapper_set_32u;
-	prims->zero = (__zero_t) ippsZero_8u;
-#elif defined(WITH_SSE2)
-	if (hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
-	{
-		prims->set_8u  = sse2_set_8u;
-		prims->set_32s = sse2_set_32s;
-		prims->set_32u = sse2_set_32u;
-	}
-#endif
+	primitives_init_set_opt(hints, prims);
 }
 
 /* ------------------------------------------------------------------------- */
@@ -310,3 +129,4 @@ void primitives_deinit_set(
 {
 	/* Nothing to do. */
 }
+
diff --git a/libfreerdp/primitives/prim_set.h b/libfreerdp/primitives/prim_set.h
new file mode 100644
index 000000000..e4504dc2c
--- /dev/null
+++ b/libfreerdp/primitives/prim_set.h
@@ -0,0 +1,34 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Routines to set a chunk of memory to a constant.
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.  Algorithms used by
+ * this code may be covered by patents by HP, Microsoft, or other parties.
+ *
+ */
+
+#ifdef __GNUC__
+# pragma once
+#endif
+
+#ifndef __PRIM_SET_H_INCLUDED__
+#define __PRIM_SET_H_INCLUDED__
+
+pstatus_t general_set_8u(BYTE val, BYTE *pDst, INT32 len);
+pstatus_t general_zero(void *pDst, size_t len);
+pstatus_t general_set_32s(INT32 val, INT32 *pDst, INT32 len);
+pstatus_t general_set_32u(UINT32 val, UINT32 *pDst, INT32 len);
+
+
+void primitives_init_set_opt(const primitives_hints_t *hints, primitives_t *prims);
+
+#endif /* !__PRIM_SET_H_INCLUDED__ */
+
diff --git a/libfreerdp/primitives/prim_set_opt.c b/libfreerdp/primitives/prim_set_opt.c
new file mode 100644
index 000000000..0523434ff
--- /dev/null
+++ b/libfreerdp/primitives/prim_set_opt.c
@@ -0,0 +1,218 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Optimized routines to set a chunk of memory to a constant.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <string.h>
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+
+#ifdef WITH_SSE2
+# include <emmintrin.h>
+#endif /* WITH_SSE2 */
+#ifdef WITH_IPP
+# include <ipps.h>
+#endif /* WITH_IPP */
+
+#include "prim_internal.h"
+#include "prim_set.h"
+
+/* ========================================================================= */
+#ifdef WITH_SSE2
+# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
+pstatus_t sse2_set_8u(
+	BYTE val,
+	BYTE *pDst,
+	INT32 len)
+{
+	BYTE byte, *dptr;
+	__m128i xmm0;
+	size_t count;
+
+	if (len < 16) return general_set_8u(val, pDst, len);
+
+	byte  = val;
+	dptr = (BYTE *) pDst;
+
+	/* Seek 16-byte alignment. */
+	while ((ULONG_PTR) dptr & 0x0f)
+	{
+		*dptr++ = byte;
+		if (--len == 0) return PRIMITIVES_SUCCESS;
+	}
+
+	xmm0 = _mm_set1_epi8(byte);
+
+	/* Cover 256-byte chunks via SSE register stores. */
+	count = len >> 8;
+	len -= count << 8;
+	/* Do 256-byte chunks using one XMM register. */
+	while (count--)
+	{
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+	}
+
+	/* Cover 16-byte chunks via SSE register stores. */
+	count = len >> 4;
+	len -= count << 4;
+	/* Do 16-byte chunks using one XMM register. */
+	while (count--)
+	{
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+	}
+
+	/* Do leftover bytes. */
+	while (len--) *dptr++ = byte;
+
+	return PRIMITIVES_SUCCESS;
+}
+# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
+#endif /* WITH_SSE2 */
+
+/* ------------------------------------------------------------------------- */
+#ifdef WITH_SSE2
+# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
+pstatus_t sse2_set_32u(
+	UINT32 val,
+	UINT32 *pDst,
+	INT32 len)
+{
+	UINT32 *dptr = (UINT32 *) pDst;
+	__m128i xmm0;
+	size_t count;
+
+	/* If really short, just do it here. */
+	if (len < 32)
+	{
+		while (len--) *dptr++ = val;
+		return PRIMITIVES_SUCCESS;
+	}
+
+	/* Assure we can reach 16-byte alignment. */
+	if (((ULONG_PTR) dptr & 0x03) != 0)
+	{
+		return general_set_32u(val, pDst, len);
+	}
+
+	/* Seek 16-byte alignment. */
+	while ((ULONG_PTR) dptr & 0x0f)
+	{
+		*dptr++ = val;
+		if (--len == 0) return PRIMITIVES_SUCCESS;
+	}
+
+	xmm0 = _mm_set1_epi32(val);
+
+	/* Cover 256-byte chunks via SSE register stores. */
+	count = len >> 6;
+	len -= count << 6;
+	/* Do 256-byte chunks using one XMM register. */
+	while (count--)
+	{
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+	}
+
+	/* Cover 16-byte chunks via SSE register stores. */
+	count = len >> 2;
+	len -= count << 2;
+	/* Do 16-byte chunks using one XMM register. */
+	while (count--)
+	{
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+	}
+
+	/* Do leftover bytes. */
+	while (len--) *dptr++ = val;
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+pstatus_t sse2_set_32s(
+	INT32 val,
+	INT32 *pDst,
+	INT32 len)
+{
+	UINT32 uval = *((UINT32 *) &val);
+	return sse2_set_32u(uval, (UINT32 *) pDst, len);
+}
+# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
+#endif /* WITH_SSE2 */
+
+#ifdef WITH_IPP
+/* ------------------------------------------------------------------------- */
+pstatus_t ipp_wrapper_set_32u(
+	UINT32 val,
+	UINT32 *pDst,
+	INT32 len)
+{
+	/* A little type conversion, then use the signed version. */
+	INT32 sval = *((INT32 *) &val);
+	return ippsSet_32s(sval, (INT32 *) pDst, len);
+}
+#endif
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_set_opt(const primitives_hints_t *hints, primitives_t *prims)
+{
+	/* Pick tuned versions if possible. */
+#ifdef WITH_IPP
+	prims->set_8u  = (__set_8u_t)  ippsSet_8u;
+	prims->set_32s = (__set_32s_t) ippsSet_32s;
+	prims->set_32u = (__set_32u_t) ipp_wrapper_set_32u;
+	prims->zero = (__zero_t) ippsZero_8u;
+#elif defined(WITH_SSE2)
+	if (hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
+	{
+		prims->set_8u  = sse2_set_8u;
+		prims->set_32s = sse2_set_32s;
+		prims->set_32u = sse2_set_32u;
+	}
+#endif
+}
+
diff --git a/libfreerdp/primitives/prim_shift.c b/libfreerdp/primitives/prim_shift.c
index 331c7216e..bd26dc0a0 100644
--- a/libfreerdp/primitives/prim_shift.c
+++ b/libfreerdp/primitives/prim_shift.c
@@ -17,25 +17,15 @@
 #include "config.h"
 #endif
 
-#include <string.h>
-
 #include <freerdp/types.h>
 #include <freerdp/primitives.h>
 
-#ifdef WITH_SSE2
-#include <emmintrin.h>
-#include <pmmintrin.h>
-#endif /* WITH_SSE2 */
-
-#ifdef WITH_IPP
-#include <ipps.h>
-#endif /* WITH_IPP */
-
 #include "prim_internal.h"
-#include "prim_templates.h"
+#include "prim_shift.h"
+
 
 /* ------------------------------------------------------------------------- */
-PRIM_STATIC pstatus_t general_lShiftC_16s(
+pstatus_t general_lShiftC_16s(
 	const INT16 *pSrc,
 	INT32 val,
 	INT16 *pDst,
@@ -47,7 +37,7 @@ PRIM_STATIC pstatus_t general_lShiftC_16s(
 }
 
 /* ------------------------------------------------------------------------- */
-PRIM_STATIC pstatus_t general_rShiftC_16s(
+pstatus_t general_rShiftC_16s(
 	const INT16 *pSrc,
 	INT32 val,
 	INT16 *pDst,
@@ -59,7 +49,7 @@ PRIM_STATIC pstatus_t general_rShiftC_16s(
 }
 
 /* ------------------------------------------------------------------------- */
-PRIM_STATIC pstatus_t general_lShiftC_16u(
+pstatus_t general_lShiftC_16u(
 	const UINT16 *pSrc,
 	INT32 val,
 	UINT16 *pDst,
@@ -71,7 +61,7 @@ PRIM_STATIC pstatus_t general_lShiftC_16u(
 }
 
 /* ------------------------------------------------------------------------- */
-PRIM_STATIC pstatus_t general_rShiftC_16u(
+pstatus_t general_rShiftC_16u(
 	const UINT16 *pSrc,
 	INT32 val,
 	UINT16 *pDst,
@@ -82,25 +72,8 @@ PRIM_STATIC pstatus_t general_rShiftC_16u(
 	return PRIMITIVES_SUCCESS;
 }
 
-#ifdef WITH_SSE2
-# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
 /* ------------------------------------------------------------------------- */
-SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, general_lShiftC_16s,
-	_mm_slli_epi16, *dptr++ = *sptr++ << val)
-/* ------------------------------------------------------------------------- */
-SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, general_rShiftC_16s,
-	_mm_srai_epi16, *dptr++ = *sptr++ >> val)
-/* ------------------------------------------------------------------------- */
-SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, general_lShiftC_16u,
-	_mm_slli_epi16, *dptr++ = *sptr++ << val)
-/* ------------------------------------------------------------------------- */
-SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, general_rShiftC_16u,
-	_mm_srli_epi16, *dptr++ = *sptr++ >> val)
-# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
-#endif
-
-/* ------------------------------------------------------------------------- */
-PRIM_STATIC pstatus_t general_shiftC_16s(
+pstatus_t general_shiftC_16s(
 	const INT16 *pSrc,
 	INT32 val,
 	INT16 *pDst,
@@ -115,7 +88,7 @@ PRIM_STATIC pstatus_t general_shiftC_16s(
 }
 
 /* ------------------------------------------------------------------------- */
-PRIM_STATIC pstatus_t general_shiftC_16u(
+pstatus_t general_shiftC_16u(
 	const UINT16 *pSrc,
 	INT32 val,
 	UINT16 *pDst,
@@ -129,11 +102,6 @@ PRIM_STATIC pstatus_t general_shiftC_16u(
 	else         return prims->lShiftC_16u(pSrc,  val, pDst, len);
 }
 
-/* Note: the IPP version will have to call ippLShiftC_16s or ippRShiftC_16s
- * depending on the sign of val.  To avoid using the deprecated inplace
- * routines, a wrapper can use the src for the dest.
- */
-
 /* ------------------------------------------------------------------------- */
 void primitives_init_shift(
 	const primitives_hints_t *hints,
@@ -144,24 +112,12 @@ void primitives_init_shift(
 	prims->rShiftC_16s = general_rShiftC_16s;
 	prims->lShiftC_16u = general_lShiftC_16u;
 	prims->rShiftC_16u = general_rShiftC_16u;
-#if defined(WITH_IPP)
-	prims->lShiftC_16s = (__lShiftC_16s_t) ippsLShiftC_16s;
-	prims->rShiftC_16s = (__rShiftC_16s_t) ippsRShiftC_16s;
-	prims->lShiftC_16u = (__lShiftC_16u_t) ippsLShiftC_16u;
-	prims->rShiftC_16u = (__rShiftC_16u_t) ippsRShiftC_16u;
-#elif defined(WITH_SSE2)
-	if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
-			&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE))
-	{
-		prims->lShiftC_16s = sse2_lShiftC_16s;
-		prims->rShiftC_16s = sse2_rShiftC_16s;
-		prims->lShiftC_16u = sse2_lShiftC_16u;
-		prims->rShiftC_16u = sse2_rShiftC_16u;
-	}
-#endif
+
 	/* Wrappers */
 	prims->shiftC_16s  = general_shiftC_16s;
 	prims->shiftC_16u  = general_shiftC_16u;
+
+	primitives_init_shift_opt(hints, prims);
 }
 
 /* ------------------------------------------------------------------------- */
diff --git a/libfreerdp/primitives/prim_shift.h b/libfreerdp/primitives/prim_shift.h
new file mode 100644
index 000000000..cad054013
--- /dev/null
+++ b/libfreerdp/primitives/prim_shift.h
@@ -0,0 +1,35 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Shift operations.
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.  Algorithms used by
+ * this code may be covered by patents by HP, Microsoft, or other parties.
+ *
+ */
+
+#ifdef __GNUC__
+# pragma once
+#endif
+
+#ifndef __PRIM_SHIFT_H_INCLUDED__
+#define __PRIM_SHIFT_H_INCLUDED__
+
+pstatus_t general_lShiftC_16s(const INT16 *pSrc, INT32 val, INT16 *pDst, INT32 len);
+pstatus_t general_rShiftC_16s(const INT16 *pSrc, INT32 val, INT16 *pDst, INT32 len);
+pstatus_t general_lShiftC_16u(const UINT16 *pSrc, INT32 val, UINT16 *pDst, INT32 len);
+pstatus_t general_rShiftC_16u(const UINT16 *pSrc, INT32 val, UINT16 *pDst, INT32 len);
+pstatus_t general_shiftC_16s(const INT16 *pSrc, INT32 val, INT16 *pDst, INT32 len);
+pstatus_t general_shiftC_16u(const UINT16 *pSrc, INT32 val, UINT16 *pDst, INT32 len);
+
+void primitives_init_shift_opt(const primitives_hints_t *hints,	primitives_t *prims);
+
+#endif /* !__PRIM_SHIFT_H_INCLUDED__ */
+
diff --git a/libfreerdp/primitives/prim_shift_opt.c b/libfreerdp/primitives/prim_shift_opt.c
new file mode 100644
index 000000000..0e57da269
--- /dev/null
+++ b/libfreerdp/primitives/prim_shift_opt.c
@@ -0,0 +1,79 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Shift operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+
+#ifdef WITH_SSE2
+#include <emmintrin.h>
+#include <pmmintrin.h>
+#endif /* WITH_SSE2 */
+
+#ifdef WITH_IPP
+#include <ipps.h>
+#endif /* WITH_IPP */
+
+#include "prim_internal.h"
+#include "prim_templates.h"
+#include "prim_shift.h"
+
+
+#ifdef WITH_SSE2
+# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
+/* ------------------------------------------------------------------------- */
+SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, general_lShiftC_16s,
+	_mm_slli_epi16, *dptr++ = *sptr++ << val)
+/* ------------------------------------------------------------------------- */
+SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, general_rShiftC_16s,
+	_mm_srai_epi16, *dptr++ = *sptr++ >> val)
+/* ------------------------------------------------------------------------- */
+SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, general_lShiftC_16u,
+	_mm_slli_epi16, *dptr++ = *sptr++ << val)
+/* ------------------------------------------------------------------------- */
+SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, general_rShiftC_16u,
+	_mm_srli_epi16, *dptr++ = *sptr++ >> val)
+# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
+#endif
+
+
+/* Note: the IPP version will have to call ippLShiftC_16s or ippRShiftC_16s
+ * depending on the sign of val.  To avoid using the deprecated inplace
+ * routines, a wrapper can use the src for the dest.
+ */
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_shift_opt(const primitives_hints_t *hints,	primitives_t *prims)
+{
+#if defined(WITH_IPP)
+	prims->lShiftC_16s = (__lShiftC_16s_t) ippsLShiftC_16s;
+	prims->rShiftC_16s = (__rShiftC_16s_t) ippsRShiftC_16s;
+	prims->lShiftC_16u = (__lShiftC_16u_t) ippsLShiftC_16u;
+	prims->rShiftC_16u = (__rShiftC_16u_t) ippsRShiftC_16u;
+#elif defined(WITH_SSE2)
+	if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
+			&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE))
+	{
+		prims->lShiftC_16s = sse2_lShiftC_16s;
+		prims->rShiftC_16s = sse2_rShiftC_16s;
+		prims->lShiftC_16u = sse2_lShiftC_16u;
+		prims->rShiftC_16u = sse2_rShiftC_16u;
+	}
+#endif
+}
+
diff --git a/libfreerdp/primitives/prim_sign.c b/libfreerdp/primitives/prim_sign.c
index a3b11ee14..d7d2eb018 100644
--- a/libfreerdp/primitives/prim_sign.c
+++ b/libfreerdp/primitives/prim_sign.c
@@ -17,22 +17,16 @@
 #include "config.h"
 #endif
 
-#include <string.h>
-
 #include <freerdp/types.h>
 #include <freerdp/primitives.h>
 
-#ifdef WITH_SSE2
-#include <emmintrin.h>
-#include <tmmintrin.h>
-#endif /* WITH_SSE2 */
-
 #include "prim_internal.h"
+#include "prim_sign.h"
 
 /* ----------------------------------------------------------------------------
  * Set pDst to the sign-value of the 16-bit values in pSrc (-1, 0, or 1).
  */
-PRIM_STATIC pstatus_t general_sign_16s(
+pstatus_t general_sign_16s(
 	const INT16 *pSrc,
 	INT16 *pDst,
 	INT32 len)
@@ -46,110 +40,6 @@ PRIM_STATIC pstatus_t general_sign_16s(
 	return PRIMITIVES_SUCCESS;
 }
 
-#ifdef WITH_SSE2
-/* ------------------------------------------------------------------------- */
-PRIM_STATIC pstatus_t ssse3_sign_16s(
-	const INT16 *pSrc,
-	INT16 *pDst,
-	INT32 len)
-{
-	const INT16 *sptr = (const INT16 *) pSrc;
-	INT16 *dptr = (INT16 *) pDst;
-	size_t count;
-
-	if (len < 16)
-	{
-		return general_sign_16s(pSrc, pDst, len);
-	}
-
-	/* Check for 16-byte alignment (eventually). */
-	if ((ULONG_PTR) pDst & 0x01)
-	{
-		return general_sign_16s(pSrc, pDst, len);
-	}
-
-	/* Seek 16-byte alignment. */
-	while ((ULONG_PTR) dptr & 0x0f)
-	{
-		INT16 src = *sptr++;
-		*dptr++ = (src < 0) ? (-1) : ((src > 0) ? 1 : 0);
-		if (--len == 0) return PRIMITIVES_SUCCESS;
-	}
-
-	/* Do 32-short chunks using 8 XMM registers. */
-	count = len >> 5;	/* / 32  */
-	len -= count << 5;	/* * 32 */
-	if ((ULONG_PTR) sptr & 0x0f)
-	{
-		/* Unaligned */
-		while (count--)
-		{
-			__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-			xmm0 = _mm_set1_epi16(0x0001U);
-			xmm1 = _mm_set1_epi16(0x0001U);
-			xmm2 = _mm_set1_epi16(0x0001U);
-			xmm3 = _mm_set1_epi16(0x0001U);
-			xmm4 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8;
-			xmm5 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8;
-			xmm6 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8;
-			xmm7 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8;
-			xmm0 = _mm_sign_epi16(xmm0, xmm4);
-			xmm1 = _mm_sign_epi16(xmm1, xmm5);
-			xmm2 = _mm_sign_epi16(xmm2, xmm6);
-			xmm3 = _mm_sign_epi16(xmm3, xmm7);
-			_mm_store_si128((__m128i *) dptr, xmm0);         dptr += 8;
-			_mm_store_si128((__m128i *) dptr, xmm1);         dptr += 8;
-			_mm_store_si128((__m128i *) dptr, xmm2);         dptr += 8;
-			_mm_store_si128((__m128i *) dptr, xmm3);         dptr += 8;
-		}
-	}
-	else
-	{
-		/* Aligned */
-		while (count--)
-		{
-			__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-			xmm0 = _mm_set1_epi16(0x0001U);
-			xmm1 = _mm_set1_epi16(0x0001U);
-			xmm2 = _mm_set1_epi16(0x0001U);
-			xmm3 = _mm_set1_epi16(0x0001U);
-			xmm4 = _mm_load_si128((__m128i *) sptr); sptr += 8;
-			xmm5 = _mm_load_si128((__m128i *) sptr); sptr += 8;
-			xmm6 = _mm_load_si128((__m128i *) sptr); sptr += 8;
-			xmm7 = _mm_load_si128((__m128i *) sptr); sptr += 8;
-			xmm0 = _mm_sign_epi16(xmm0, xmm4);
-			xmm1 = _mm_sign_epi16(xmm1, xmm5);
-			xmm2 = _mm_sign_epi16(xmm2, xmm6);
-			xmm3 = _mm_sign_epi16(xmm3, xmm7);
-			_mm_store_si128((__m128i *) dptr, xmm0);         dptr += 8;
-			_mm_store_si128((__m128i *) dptr, xmm1);         dptr += 8;
-			_mm_store_si128((__m128i *) dptr, xmm2);         dptr += 8;
-			_mm_store_si128((__m128i *) dptr, xmm3);         dptr += 8;
-		}
-	}
-
-	/* Do 8-short chunks using two XMM registers. */
-	count = len >> 3;
-	len -= count << 3;
-	while (count--)
-	{
-		__m128i xmm0 = _mm_set1_epi16(0x0001U);
-		__m128i xmm1 = LOAD_SI128(sptr);					sptr += 8;
-		xmm0 = _mm_sign_epi16(xmm0, xmm1);
-		_mm_store_si128((__m128i *) dptr, xmm0);			dptr += 8;
-	}
-
-	/* Do leftovers. */
-	while (len--)
-	{
-		INT16 src = *sptr++;
-		*dptr++ = (src < 0) ? -1 : ((src > 0) ? 1 : 0);
-	}
-
-	return PRIMITIVES_SUCCESS;
-}
-#endif /* WITH_SSE2 */
-
 /* ------------------------------------------------------------------------- */
 void primitives_init_sign(
 	const primitives_hints_t *hints,
@@ -157,15 +47,8 @@ void primitives_init_sign(
 {
 	/* Start with the default. */
 	prims->sign_16s = general_sign_16s;
-	/* Pick tuned versions if possible. */
-	/* I didn't spot an IPP version of this. */
-#if defined(WITH_SSE2)
-	if ((hints->x86_flags & PRIM_X86_SSSE3_AVAILABLE)
-			&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE))
-	{
-		prims->sign_16s  = ssse3_sign_16s;
-	}
-#endif
+
+	primitives_init_sign_opt(hints, prims);
 }
 
 /* ------------------------------------------------------------------------- */
@@ -174,3 +57,4 @@ void primitives_deinit_sign(
 {
 	/* Nothing to do. */
 }
+
diff --git a/libfreerdp/primitives/prim_sign.h b/libfreerdp/primitives/prim_sign.h
new file mode 100644
index 000000000..3592990ec
--- /dev/null
+++ b/libfreerdp/primitives/prim_sign.h
@@ -0,0 +1,30 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Sign operations.
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.  Algorithms used by
+ * this code may be covered by patents by HP, Microsoft, or other parties.
+ *
+ */
+
+#ifdef __GNUC__
+# pragma once
+#endif
+
+#ifndef __PRIM_SIGN_H_INCLUDED__
+#define __PRIM_SIGN_H_INCLUDED__
+
+pstatus_t general_sign_16s(const INT16 *pSrc, INT16 *pDst, INT32 len);
+
+void primitives_init_sign_opt(const primitives_hints_t *hints,	primitives_t *prims);
+
+#endif /* !__PRIM_SIGN_H_INCLUDED__ */
+
diff --git a/libfreerdp/primitives/prim_sign_opt.c b/libfreerdp/primitives/prim_sign_opt.c
new file mode 100644
index 000000000..81842b9bd
--- /dev/null
+++ b/libfreerdp/primitives/prim_sign_opt.c
@@ -0,0 +1,149 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Optimized sign operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+
+#ifdef WITH_SSE2
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#endif /* WITH_SSE2 */
+
+#include "prim_internal.h"
+#include "prim_sign.h"
+
+
+#ifdef WITH_SSE2
+/* ------------------------------------------------------------------------- */
+pstatus_t ssse3_sign_16s(
+	const INT16 *pSrc,
+	INT16 *pDst,
+	INT32 len)
+{
+	const INT16 *sptr = (const INT16 *) pSrc;
+	INT16 *dptr = (INT16 *) pDst;
+	size_t count;
+
+	if (len < 16)
+	{
+		return general_sign_16s(pSrc, pDst, len);
+	}
+
+	/* Check for 16-byte alignment (eventually). */
+	if ((ULONG_PTR) pDst & 0x01)
+	{
+		return general_sign_16s(pSrc, pDst, len);
+	}
+
+	/* Seek 16-byte alignment. */
+	while ((ULONG_PTR) dptr & 0x0f)
+	{
+		INT16 src = *sptr++;
+		*dptr++ = (src < 0) ? (-1) : ((src > 0) ? 1 : 0);
+		if (--len == 0) return PRIMITIVES_SUCCESS;
+	}
+
+	/* Do 32-short chunks using 8 XMM registers. */
+	count = len >> 5;	/* / 32  */
+	len -= count << 5;	/* * 32 */
+	if ((ULONG_PTR) sptr & 0x0f)
+	{
+		/* Unaligned */
+		while (count--)
+		{
+			__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+			xmm0 = _mm_set1_epi16(0x0001U);
+			xmm1 = _mm_set1_epi16(0x0001U);
+			xmm2 = _mm_set1_epi16(0x0001U);
+			xmm3 = _mm_set1_epi16(0x0001U);
+			xmm4 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8;
+			xmm5 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8;
+			xmm6 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8;
+			xmm7 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8;
+			xmm0 = _mm_sign_epi16(xmm0, xmm4);
+			xmm1 = _mm_sign_epi16(xmm1, xmm5);
+			xmm2 = _mm_sign_epi16(xmm2, xmm6);
+			xmm3 = _mm_sign_epi16(xmm3, xmm7);
+			_mm_store_si128((__m128i *) dptr, xmm0);         dptr += 8;
+			_mm_store_si128((__m128i *) dptr, xmm1);         dptr += 8;
+			_mm_store_si128((__m128i *) dptr, xmm2);         dptr += 8;
+			_mm_store_si128((__m128i *) dptr, xmm3);         dptr += 8;
+		}
+	}
+	else
+	{
+		/* Aligned */
+		while (count--)
+		{
+			__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+			xmm0 = _mm_set1_epi16(0x0001U);
+			xmm1 = _mm_set1_epi16(0x0001U);
+			xmm2 = _mm_set1_epi16(0x0001U);
+			xmm3 = _mm_set1_epi16(0x0001U);
+			xmm4 = _mm_load_si128((__m128i *) sptr); sptr += 8;
+			xmm5 = _mm_load_si128((__m128i *) sptr); sptr += 8;
+			xmm6 = _mm_load_si128((__m128i *) sptr); sptr += 8;
+			xmm7 = _mm_load_si128((__m128i *) sptr); sptr += 8;
+			xmm0 = _mm_sign_epi16(xmm0, xmm4);
+			xmm1 = _mm_sign_epi16(xmm1, xmm5);
+			xmm2 = _mm_sign_epi16(xmm2, xmm6);
+			xmm3 = _mm_sign_epi16(xmm3, xmm7);
+			_mm_store_si128((__m128i *) dptr, xmm0);         dptr += 8;
+			_mm_store_si128((__m128i *) dptr, xmm1);         dptr += 8;
+			_mm_store_si128((__m128i *) dptr, xmm2);         dptr += 8;
+			_mm_store_si128((__m128i *) dptr, xmm3);         dptr += 8;
+		}
+	}
+
+	/* Do 8-short chunks using two XMM registers. */
+	count = len >> 3;
+	len -= count << 3;
+	while (count--)
+	{
+		__m128i xmm0 = _mm_set1_epi16(0x0001U);
+		__m128i xmm1 = LOAD_SI128(sptr);					sptr += 8;
+		xmm0 = _mm_sign_epi16(xmm0, xmm1);
+		_mm_store_si128((__m128i *) dptr, xmm0);			dptr += 8;
+	}
+
+	/* Do leftovers. */
+	while (len--)
+	{
+		INT16 src = *sptr++;
+		*dptr++ = (src < 0) ? -1 : ((src > 0) ? 1 : 0);
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+#endif /* WITH_SSE2 */
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_sign_opt(const primitives_hints_t *hints, primitives_t *prims)
+{
+	/* Pick tuned versions if possible. */
+	/* I didn't spot an IPP version of this. */
+#if defined(WITH_SSE2)
+	if ((hints->x86_flags & PRIM_X86_SSSE3_AVAILABLE)
+			&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE))
+	{
+		prims->sign_16s  = ssse3_sign_16s;
+	}
+#endif
+}
+
diff --git a/libfreerdp/primitives/prim_templates.h b/libfreerdp/primitives/prim_templates.h
index c0b6ac10d..b530637b7 100644
--- a/libfreerdp/primitives/prim_templates.h
+++ b/libfreerdp/primitives/prim_templates.h
@@ -44,7 +44,7 @@
  * SCD = Source, Constant, Destination
  */
 #define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
-PRIM_STATIC pstatus_t _name_(const _type_ *pSrc, INT32 val, _type_ *pDst, INT32 len) \
+pstatus_t _name_(const _type_ *pSrc, INT32 val, _type_ *pDst, INT32 len) \
 { \
 	int shifts; \
 	UINT32 offBeatMask; \
@@ -188,7 +188,7 @@ PRIM_STATIC pstatus_t _name_(const _type_ *pSrc, INT32 val, _type_ *pDst, INT32
  * PRE = preload xmm0 with the constant.
  */
 #define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
-PRIM_STATIC pstatus_t _name_(const _type_ *pSrc, _type_ val, _type_ *pDst, INT32 len) \
+pstatus_t _name_(const _type_ *pSrc, _type_ val, _type_ *pDst, INT32 len) \
 { \
 	int shifts; \
 	UINT32 offBeatMask; \
@@ -293,7 +293,7 @@ PRIM_STATIC pstatus_t _name_(const _type_ *pSrc, _type_ val, _type_ *pDst, INT32
  * SSD = Source1, Source2, Destination
  */
 #define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
-PRIM_STATIC pstatus_t _name_(const _type_ *pSrc1, const _type_ *pSrc2, _type_ *pDst, INT32 len) \
+pstatus_t _name_(const _type_ *pSrc1, const _type_ *pSrc2, _type_ *pDst, INT32 len) \
 { \
 	int shifts; \
 	UINT32 offBeatMask; \