Merge branch 'master' of https://github.com/dpoe/FreeRDP

2013-01-18 19:28:55 -05:00 · 2013-01-18 19:28:55 -05:00 · c6e6956a43
parent 9ae698c1e5 b64408975d
commit c6e6956a43
49 changed files with 5696 additions and 407 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -231,6 +231,10 @@ set(GSTREAMER_FEATURE_TYPE "RECOMMENDED")
 set(GSTREAMER_FEATURE_PURPOSE "multimedia")
 set(GSTREAMER_FEATURE_DESCRIPTION "multimedia redirection, audio and video playback")

+set(IPP_FEATURE_TYPE "OPTIONAL")
+set(IPP_FEATURE_PURPOSE "performance")
+set(IPP_FEATURE_DESCRIPTION "Intel Integrated Performance Primitives library")
+
 if(WIN32)
 	set(X11_FEATURE_TYPE "DISABLED")
 	set(ZLIB_FEATURE_TYPE "DISABLED")
@ -285,6 +289,9 @@ find_feature(PCSC ${PCSC_FEATURE_TYPE} ${PCSC_FEATURE_PURPOSE} ${PCSC_FEATURE_DE
 find_feature(FFmpeg ${FFMPEG_FEATURE_TYPE} ${FFMPEG_FEATURE_PURPOSE} ${FFMPEG_FEATURE_DESCRIPTION})
 find_feature(Gstreamer ${GSTREAMER_FEATURE_TYPE} ${GSTREAMER_FEATURE_PURPOSE} ${GSTREAMER_FEATURE_DESCRIPTION})

+# Intel Performance Primitives
+find_feature(IPP ${IPP_FEATURE_TYPE} ${IPP_FEATURE_PURPOSE} ${IPP_FEATURE_DESCRIPTION})
+
 # Installation Paths
 if(WIN32)
 	set(CMAKE_INSTALL_BINDIR ".")
--- a/client/DirectFB/CMakeLists.txt
+++ b/client/DirectFB/CMakeLists.txt
@ -36,7 +36,7 @@ set(${MODULE_PREFIX}_LIBS ${${MODULE_PREFIX}_LIBS} freerdp-client)
 set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS
 	MONOLITHIC ${MONOLITHIC_BUILD}
 	MODULE freerdp
-	MODULES freerdp-core freerdp-gdi freerdp-locale freerdp-codec freerdp-utils)
+	MODULES freerdp-core freerdp-gdi freerdp-locale freerdp-codec freerdp-primitives freerdp-utils)

 target_link_libraries(${MODULE_NAME} ${${MODULE_PREFIX}_LIBS})
 install(TARGETS ${MODULE_NAME} DESTINATION ${CMAKE_INSTALL_BINDIR})
--- a/client/Mac/CMakeLists.txt
+++ b/client/Mac/CMakeLists.txt
@ -78,7 +78,7 @@ set(${MODULE_PREFIX}_LIBS ${${MODULE_PREFIX}_LIBS} freerdp-client)

 set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS MONOLITHIC ${MONOLITHIC_BUILD}
 	MODULE freerdp
-	MODULES freerdp-core freerdp-cache freerdp-gdi freerdp-codec freerdp-rail freerdp-utils)
+	MODULES freerdp-core freerdp-cache freerdp-gdi freerdp-codec freerdp-primitives freerdp-rail freerdp-utils)

 set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS MONOLITHIC ${MONOLITHIC_BUILD}
 	MODULE winpr
--- a/client/Windows/CMakeLists.txt
+++ b/client/Windows/CMakeLists.txt
@ -46,7 +46,7 @@ set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS
 set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS
 	MONOLITHIC ${MONOLITHIC_BUILD}
 	MODULE freerdp
-	MODULES freerdp-core freerdp-gdi freerdp-codec freerdp-utils)
+	MODULES freerdp-core freerdp-gdi freerdp-codec freerdp-primitives freerdp-utils)

 target_link_libraries(${MODULE_NAME} ${${MODULE_PREFIX}_LIBS})

--- a/client/X11/CMakeLists.txt
+++ b/client/X11/CMakeLists.txt
@ -120,10 +120,14 @@ set(${MODULE_PREFIX}_LIBS ${${MODULE_PREFIX}_LIBS} freerdp-client)

 set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS MONOLITHIC ${MONOLITHIC_BUILD}
 	MODULE freerdp
-	MODULES freerdp-core freerdp-gdi freerdp-locale freerdp-rail freerdp-utils)
+	MODULES freerdp-core freerdp-gdi freerdp-locale freerdp-primitives freerdp-rail freerdp-utils)

 target_link_libraries(${MODULE_NAME} ${${MODULE_PREFIX}_LIBS})

+if(WITH_IPP)
+    target_link_libraries(xfreerdp ${IPP_LIBRARY_LIST})
+endif()
+
 install(TARGETS ${MODULE_NAME} DESTINATION ${CMAKE_INSTALL_BINDIR})

 set_property(TARGET ${MODULE_NAME} PROPERTY FOLDER "Client/X11")
--- a/cmake/ConfigOptions.cmake
+++ b/cmake/ConfigOptions.cmake
@ -9,6 +9,7 @@ endif()

 option(WITH_MANPAGES "Generate manpages." ON)
 option(WITH_PROFILER "Compile profiler." OFF)
+option(WITH_IPP "Use Intel Performance Primitives." OFF)

 if((TARGET_ARCH MATCHES "x86|x64") AND (NOT DEFINED WITH_SSE2))
 	option(WITH_SSE2 "Enable SSE2 optimization." ON)
--- a/cmake/FindIPP.cmake
+++ b/cmake/FindIPP.cmake
@ -265,7 +265,7 @@ if(NOT IPP_FOUND)
    # Note, if several IPP installations found the newest version will be
    # selected
    # ------------------------------------------------------------------------
-    foreach(curdir ${CMAKE_SYSTEM_PREFIX_PATH})
+    foreach(curdir ${CMAKE_SYSTEM_PREFIX_PATH} /opt)
        set(curdir ${curdir}/intel)
        file(TO_CMAKE_PATH ${curdir} CURDIR)

@ -336,3 +336,33 @@ if(WIN32 AND MINGW AND NOT IPP_LATEST_VERSION_MAJOR LESS 7)
    set(MSV_NTDLL    "ntdll")
    set(IPP_LIBRARIES ${IPP_LIBRARIES} ${MSV_NTDLL}${IPP_LIB_SUFFIX})
 endif()
+
+# ------------------------------------------------------------------------
+# This section will look for the IPP "compiler" dependent library
+# libiomp5.
+# ------------------------------------------------------------------------
+foreach(curdir ${CMAKE_SYSTEM_PREFIX_PATH} /opt)
+    set(curdir ${curdir}/intel)
+
+    if(EXISTS ${curdir})
+       file(GLOB_RECURSE liblist FOLLOW_SYMLINKS ${curdir}/libiomp5.*)
+       foreach(lib ${liblist})
+           get_filename_component(libdir ${lib} REALPATH)
+           get_filename_component(libdir ${libdir} PATH)
+           set(IPP_COMPILER_LIBRARY_DIRS ${libdir})
+           set(IPP_COMPILER_LIBRARIES iomp5)
+       endforeach(lib)
+    endif()
+endforeach(curdir)
+
+# ------------------------------------------------------------------------
+# Build fullpath library list.
+# ------------------------------------------------------------------------
+find_library(LIB_IPPI ippi PATHS ${IPP_LIBRARY_DIRS})
+set(IPP_LIBRARY_LIST ${IPP_LIBRARY_LIST} ${LIB_IPPI})
+find_library(LIB_IPPS ipps PATHS ${IPP_LIBRARY_DIRS})
+set(IPP_LIBRARY_LIST ${IPP_LIBRARY_LIST} ${LIB_IPPS})
+find_library(LIB_IPPCORE ippcore PATHS ${IPP_LIBRARY_DIRS})
+set(IPP_LIBRARY_LIST ${IPP_LIBRARY_LIST} ${LIB_IPPCORE})
+find_library(LIB_IOMP5 iomp5 PATHS ${IPP_COMPILER_LIBRARY_DIRS})
+set(IPP_LIBRARY_LIST ${IPP_LIBRARY_LIST} ${LIB_IOMP5})
--- a/config.h.in
+++ b/config.h.in
@ -36,6 +36,7 @@
 #cmakedefine WITH_PROFILER
 #cmakedefine WITH_SSE2
 #cmakedefine WITH_NEON
+#cmakedefine WITH_IPP
 #cmakedefine WITH_NATIVE_SSPI
 #cmakedefine WITH_JPEG
 #cmakedefine WITH_WIN8
--- a/include/freerdp/codec/rfx.h
+++ b/include/freerdp/codec/rfx.h
@ -101,8 +101,6 @@ struct _RFX_CONTEXT
 	BYTE quant_idx_cr;

 	/* routines */
-	void (*decode_ycbcr_to_rgb)(INT16* y_r_buf, INT16* cb_g_buf, INT16* cr_b_buf);
-	void (*encode_rgb_to_ycbcr)(INT16* y_r_buf, INT16* cb_g_buf, INT16* cr_b_buf);
 	void (*quantization_decode)(INT16* buffer, const UINT32* quantization_values);
 	void (*quantization_encode)(INT16* buffer, const UINT32* quantization_values);
 	void (*dwt_2d_decode)(INT16* buffer, INT16* dwt_buffer);
--- a/include/freerdp/primitives.h
+++ b/include/freerdp/primitives.h
@ -0,0 +1,207 @@
+/* primitives.h
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.  Algorithms used by
+ * this code may be covered by patents by HP, Microsoft, or other parties.
+ */
+
+#ifdef __GNUC__
+# pragma once
+#endif
+
+#ifndef __PRIMITIVES_H_INCLUDED__
+#define __PRIMITIVES_H_INCLUDED__
+
+#include <freerdp/api.h>
+#include <freerdp/types.h>
+
+typedef INT32 pstatus_t;				/* match IppStatus. */
+#define PRIMITIVES_SUCCESS		(0)		/* match ippStsNoErr */
+
+/* Simple macro for address of an x,y location in 2d 4-byte memory block */
+#define PIXMAP4_ADDR(_dst_, _x_, _y_, _span_) \
+	((void *) (((BYTE *) (_dst_)) + (((_x_) + (_y_)*(_span_)) << 2)))
+
+#define PRIM_X86_MMX_AVAILABLE					(1U<<0)
+#define PRIM_X86_3DNOW_AVAILABLE				(1U<<1)
+#define PRIM_X86_3DNOW_PREFETCH_AVAILABLE		(1U<<2)
+#define PRIM_X86_SSE_AVAILABLE					(1U<<3)
+#define PRIM_X86_SSE2_AVAILABLE					(1U<<4)
+#define PRIM_X86_SSE3_AVAILABLE					(1U<<5)
+#define PRIM_X86_SSSE3_AVAILABLE				(1U<<6)
+#define PRIM_X86_SSE41_AVAILABLE				(1U<<7)
+#define PRIM_X86_SSE42_AVAILABLE				(1U<<8)
+#define PRIM_X86_AVX_AVAILABLE					(1U<<9)
+#define PRIM_X86_FMA_AVAILABLE					(1U<<10)
+#define PRIM_X86_AVX_AES_AVAILABLE				(1U<<11)
+#define PRIM_X86_AVX2_AVAILABLE					(1U<<12)
+
+#define PRIM_ARM_VFP1_AVAILABLE					(1U<<0)
+#define PRIM_ARM_VFP2_AVAILABLE					(1U<<1)
+#define PRIM_ARM_VFP3_AVAILABLE					(1U<<2)
+#define PRIM_ARM_VFP4_AVAILABLE					(1U<<3)
+#define PRIM_ARM_FPA_AVAILABLE					(1U<<4)
+#define PRIM_ARM_FPE_AVAILABLE					(1U<<5)
+#define PRIM_ARM_IWMMXT_AVAILABLE				(1U<<6)
+#define PRIM_ARM_NEON_AVAILABLE					(1U<<7)
+
+/* Structures compatible with IPP */
+typedef struct
+{
+	INT32 width;
+	INT32 height;
+} prim_size_t;		/* like IppiSize */
+
+/* Function prototypes for all of the supported primitives. */
+typedef pstatus_t (*__copy_t)(
+	const void *pSrc,
+	void *pDst,
+	INT32 bytes);
+typedef pstatus_t (*__copy_8u_t)(
+	const BYTE *pSrc,
+	BYTE *pDst,
+	INT32 len);
+typedef pstatus_t (*__copy_8u_AC4r_t)(
+	const BYTE *pSrc,
+	INT32 srcStep,	/* bytes */
+	BYTE *pDst,  
+	INT32 dstStep,	/* bytes */
+	INT32 width,  INT32 height);	/* pixels */
+typedef pstatus_t (*__set_8u_t)(
+	BYTE val,
+	BYTE *pDst,
+	INT32 len);
+typedef pstatus_t (*__set_32s_t)(
+	INT32 val,
+	INT32 *pDst,
+	INT32 len);
+typedef pstatus_t (*__set_32u_t)(
+	UINT32 val,
+	UINT32 *pDst,
+	INT32 len);
+typedef pstatus_t (*__zero_t)(
+	void *pDst,
+	size_t bytes);
+typedef pstatus_t (*__alphaComp_argb_t)(
+	const BYTE *pSrc1,  INT32 src1Step,
+	const BYTE *pSrc2,  INT32 src2Step,
+	BYTE *pDst,  INT32 dstStep,
+	INT32 width,  INT32 height);
+typedef pstatus_t (*__add_16s_t)(
+	const INT16 *pSrc1,
+	const INT16 *pSrc2,
+	INT16 *pDst,
+	INT32 len);
+typedef pstatus_t (*__lShiftC_16s_t)(
+	const INT16 *pSrc,
+	INT32 val,
+	INT16 *pSrcDst,
+	INT32 len);
+typedef pstatus_t (*__lShiftC_16u_t)(
+	const UINT16 *pSrc,
+	INT32 val,
+	UINT16 *pSrcDst,
+	INT32 len);
+typedef pstatus_t (*__rShiftC_16s_t)(
+	const INT16 *pSrc,
+	INT32 val,
+	INT16 *pSrcDst,
+	INT32 len);
+typedef pstatus_t (*__rShiftC_16u_t)(
+	const UINT16 *pSrc,
+	INT32 val,
+	UINT16 *pSrcDst,
+	INT32 len);
+typedef pstatus_t (*__shiftC_16s_t)(
+	const INT16 *pSrc,
+	INT32 val,
+	INT16 *pSrcDst,
+	INT32 len);
+typedef pstatus_t (*__shiftC_16u_t)(
+	const UINT16 *pSrc,
+	INT32 val,
+	UINT16 *pSrcDst,
+	INT32 len);
+typedef pstatus_t (*__sign_16s_t)(
+	const INT16 *pSrc,
+	INT16 *pDst,
+	INT32 len);
+typedef pstatus_t (*__yCbCrToRGB_16s16s_P3P3_t)(
+	const INT16 *pSrc[3],  INT32 srcStep,
+	INT16 *pDst[3],  INT32 dstStep,
+	const prim_size_t *roi);
+typedef pstatus_t (*__RGBToYCbCr_16s16s_P3P3_t)(
+	const INT16 *pSrc[3],  INT32 srcStep,
+	INT16 *pDst[3],  INT32 dstStep,
+	const prim_size_t *roi);
+typedef pstatus_t (*__RGBToRGB_16s8u_P3AC4R_t)(
+	const INT16 *pSrc[3],  INT32 srcStep,
+	BYTE *pDst,  INT32 dstStep,
+	const prim_size_t *roi);
+typedef pstatus_t (*__andC_32u_t)(
+	const UINT32 *pSrc,
+	UINT32 val,
+	UINT32 *pDst,
+	INT32 len);
+typedef pstatus_t (*__orC_32u_t)(
+	const UINT32 *pSrc,
+	UINT32 val,
+	UINT32 *pDst,
+	INT32 len);
+
+typedef struct
+{
+	/* Memory-to-memory copy routines */
+	__copy_t copy;						/* memcpy/memmove, basically */
+	__copy_8u_t copy_8u;				/* more strongly typed */
+	__copy_8u_AC4r_t copy_8u_AC4r;		/* pixel copy function */
+	/* Memory setting routines */
+	__set_8u_t set_8u;					/* memset, basically */
+	__set_32s_t set_32s;
+	__set_32u_t set_32u;
+	__zero_t zero;						/* bzero or faster */
+	/* Arithmetic functions */
+	__add_16s_t add_16s;
+	/* And/or */
+	__andC_32u_t andC_32u;
+	__orC_32u_t orC_32u;
+	/* Shifts */
+	__lShiftC_16s_t lShiftC_16s;
+	__lShiftC_16u_t lShiftC_16u;
+	__rShiftC_16s_t rShiftC_16s;
+	__rShiftC_16u_t rShiftC_16u;
+	__shiftC_16s_t   shiftC_16s;
+	__shiftC_16u_t   shiftC_16u;
+	/* Alpha Composition */
+	__alphaComp_argb_t alphaComp_argb;
+	/* Sign */
+	__sign_16s_t sign_16s;
+	/* Color conversions */
+	__yCbCrToRGB_16s16s_P3P3_t yCbCrToRGB_16s16s_P3P3;
+	__RGBToYCbCr_16s16s_P3P3_t RGBToYCbCr_16s16s_P3P3;
+	__RGBToRGB_16s8u_P3AC4R_t RGBToRGB_16s8u_P3AC4R;
+
+	/* internal use for CPU flags and such. */
+	void *hints;
+} primitives_t;
+
+/* Prototypes for the externally-visible entrypoints. */
+FREERDP_API void primitives_init(void);
+FREERDP_API primitives_t *primitives_get(void);
+FREERDP_API UINT32 primitives_get_flags(
+	const primitives_t *prims);
+FREERDP_API void primitives_flags_str(
+	const primitives_t *prims,
+	char *str,
+	size_t len);
+FREERDP_API void primitives_deinit(void);
+
+#endif /* !__PRIMITIVES_H_INCLUDED__ */
--- a/libfreerdp/CMakeLists.txt
+++ b/libfreerdp/CMakeLists.txt
@ -31,6 +31,7 @@ set(${MODULE_PREFIX}_SUBMODULES
 	codec
 	crypto
 	locale
+	primitives
 	core)

 foreach(${MODULE_PREFIX}_SUBMODULE ${${MODULE_PREFIX}_SUBMODULES})
--- a/libfreerdp/codec/rfx.c
+++ b/libfreerdp/codec/rfx.c
@ -79,7 +79,7 @@ static void rfx_profiler_create(RFX_CONTEXT* context)
 	PROFILER_CREATE(context->priv->prof_rfx_differential_decode, "rfx_differential_decode");
 	PROFILER_CREATE(context->priv->prof_rfx_quantization_decode, "rfx_quantization_decode");
 	PROFILER_CREATE(context->priv->prof_rfx_dwt_2d_decode, "rfx_dwt_2d_decode");
-	PROFILER_CREATE(context->priv->prof_rfx_decode_ycbcr_to_rgb, "rfx_decode_ycbcr_to_rgb");
+	PROFILER_CREATE(context->priv->prof_rfx_ycbcr_to_rgb, "prims->yCbCrToRGB");
 	PROFILER_CREATE(context->priv->prof_rfx_decode_format_rgb, "rfx_decode_format_rgb");

 	PROFILER_CREATE(context->priv->prof_rfx_encode_rgb, "rfx_encode_rgb");
@ -88,7 +88,7 @@ static void rfx_profiler_create(RFX_CONTEXT* context)
 	PROFILER_CREATE(context->priv->prof_rfx_differential_encode, "rfx_differential_encode");
 	PROFILER_CREATE(context->priv->prof_rfx_quantization_encode, "rfx_quantization_encode");
 	PROFILER_CREATE(context->priv->prof_rfx_dwt_2d_encode, "rfx_dwt_2d_encode");
-	PROFILER_CREATE(context->priv->prof_rfx_encode_rgb_to_ycbcr, "rfx_encode_rgb_to_ycbcr");
+	PROFILER_CREATE(context->priv->prof_rfx_rgb_to_ycbcr, "prims->RGBToYCbCr");
 	PROFILER_CREATE(context->priv->prof_rfx_encode_format_rgb, "rfx_encode_format_rgb");
 }

@ -100,7 +100,7 @@ static void rfx_profiler_free(RFX_CONTEXT* context)
 	PROFILER_FREE(context->priv->prof_rfx_differential_decode);
 	PROFILER_FREE(context->priv->prof_rfx_quantization_decode);
 	PROFILER_FREE(context->priv->prof_rfx_dwt_2d_decode);
-	PROFILER_FREE(context->priv->prof_rfx_decode_ycbcr_to_rgb);
+	PROFILER_FREE(context->priv->prof_rfx_ycbcr_to_rgb);
 	PROFILER_FREE(context->priv->prof_rfx_decode_format_rgb);

 	PROFILER_FREE(context->priv->prof_rfx_encode_rgb);
@ -109,7 +109,7 @@ static void rfx_profiler_free(RFX_CONTEXT* context)
 	PROFILER_FREE(context->priv->prof_rfx_differential_encode);
 	PROFILER_FREE(context->priv->prof_rfx_quantization_encode);
 	PROFILER_FREE(context->priv->prof_rfx_dwt_2d_encode);
-	PROFILER_FREE(context->priv->prof_rfx_encode_rgb_to_ycbcr);
+	PROFILER_FREE(context->priv->prof_rfx_rgb_to_ycbcr);
 	PROFILER_FREE(context->priv->prof_rfx_encode_format_rgb);
 }

@ -123,7 +123,7 @@ static void rfx_profiler_print(RFX_CONTEXT* context)
 	PROFILER_PRINT(context->priv->prof_rfx_differential_decode);
 	PROFILER_PRINT(context->priv->prof_rfx_quantization_decode);
 	PROFILER_PRINT(context->priv->prof_rfx_dwt_2d_decode);
-	PROFILER_PRINT(context->priv->prof_rfx_decode_ycbcr_to_rgb);
+	PROFILER_PRINT(context->priv->prof_rfx_ycbcr_to_rgb);
 	PROFILER_PRINT(context->priv->prof_rfx_decode_format_rgb);

 	PROFILER_PRINT(context->priv->prof_rfx_encode_rgb);
@ -132,7 +132,7 @@ static void rfx_profiler_print(RFX_CONTEXT* context)
 	PROFILER_PRINT(context->priv->prof_rfx_differential_encode);
 	PROFILER_PRINT(context->priv->prof_rfx_quantization_encode);
 	PROFILER_PRINT(context->priv->prof_rfx_dwt_2d_encode);
-	PROFILER_PRINT(context->priv->prof_rfx_encode_rgb_to_ycbcr);
+	PROFILER_PRINT(context->priv->prof_rfx_rgb_to_ycbcr);
 	PROFILER_PRINT(context->priv->prof_rfx_encode_format_rgb);

 	PROFILER_PRINT_FOOTER;
@ -164,8 +164,6 @@ RFX_CONTEXT* rfx_context_new(void)
 	rfx_profiler_create(context);
 	
 	/* set up default routines */
-	context->decode_ycbcr_to_rgb = rfx_decode_ycbcr_to_rgb;
-	context->encode_rgb_to_ycbcr = rfx_encode_rgb_to_ycbcr;
 	context->quantization_decode = rfx_quantization_decode;	
 	context->quantization_encode = rfx_quantization_encode;	
 	context->dwt_2d_decode = rfx_dwt_2d_decode;
@ -414,7 +412,7 @@ static void rfx_process_message_tile(RFX_CONTEXT* context, RFX_TILE* tile, STREA
 		YLen, context->quants + (quantIdxY * 10),
 		CbLen, context->quants + (quantIdxCb * 10),
 		CrLen, context->quants + (quantIdxCr * 10),
-		tile->data);
+		tile->data, 64*sizeof(UINT32));
 }

 static void rfx_process_message_tileset(RFX_CONTEXT* context, RFX_MESSAGE* message, STREAM* s)
--- a/libfreerdp/codec/rfx_decode.c
+++ b/libfreerdp/codec/rfx_decode.c
@ -27,6 +27,7 @@
 #include <string.h>

 #include <freerdp/utils/stream.h>
+#include <freerdp/primitives.h>

 #include "rfx_types.h"
 #include "rfx_rlgr.h"
@ -36,49 +37,55 @@

 #include "rfx_decode.h"

+/* stride is bytes between rows in the output buffer. */
 static void rfx_decode_format_rgb(INT16* r_buf, INT16* g_buf, INT16* b_buf,
-	RDP_PIXEL_FORMAT pixel_format, BYTE* dst_buf)
+	RDP_PIXEL_FORMAT pixel_format, BYTE* dst_buf, int stride)
 {
+	primitives_t *prims = primitives_get();
 	INT16* r = r_buf;
 	INT16* g = g_buf;
 	INT16* b = b_buf;
+	INT16* pSrc[3];
+	static const prim_size_t roi_64x64 = { 64, 64 };
 	BYTE* dst = dst_buf;
-	int i;
+	int x, y;
 	
 	switch (pixel_format)
 	{
 		case RDP_PIXEL_FORMAT_B8G8R8A8:
-			for (i = 0; i < 4096; i++)
-			{
-				*dst++ = (BYTE) (*b++);
-				*dst++ = (BYTE) (*g++);
-				*dst++ = (BYTE) (*r++);
-				*dst++ = 0xFF;
-			}
+			pSrc[0] = r;  pSrc[1] = g;  pSrc[2] = b;
+			prims->RGBToRGB_16s8u_P3AC4R(
+				(const INT16 **) pSrc, 64*sizeof(INT16),
+				dst, stride, &roi_64x64);
 			break;
 		case RDP_PIXEL_FORMAT_R8G8B8A8:
-			for (i = 0; i < 4096; i++)
-			{
-				*dst++ = (BYTE) (*r++);
-				*dst++ = (BYTE) (*g++);
-				*dst++ = (BYTE) (*b++);
-				*dst++ = 0xFF;
-			}
+			pSrc[0] = b;  pSrc[1] = g;  pSrc[2] = r;
+			prims->RGBToRGB_16s8u_P3AC4R(
+				(const INT16 **) pSrc, 64*sizeof(INT16),
+				dst, stride, &roi_64x64);
 			break;
 		case RDP_PIXEL_FORMAT_B8G8R8:
-			for (i = 0; i < 4096; i++)
+			for (y=0; y<64; y++)
 			{
-				*dst++ = (BYTE) (*b++);
-				*dst++ = (BYTE) (*g++);
-				*dst++ = (BYTE) (*r++);
+				for (x=0; x<64; x++)
+				{
+					*dst++ = (BYTE) (*b++);
+					*dst++ = (BYTE) (*g++);
+					*dst++ = (BYTE) (*r++);
+				}
+				dst += stride - (64*3);
 			}
 			break;
 		case RDP_PIXEL_FORMAT_R8G8B8:
-			for (i = 0; i < 4096; i++)
+			for (y=0; y<64; y++)
 			{
-				*dst++ = (BYTE) (*r++);
-				*dst++ = (BYTE) (*g++);
-				*dst++ = (BYTE) (*b++);
+				for (x=0; x<64; x++)
+				{
+					*dst++ = (BYTE) (*r++);
+					*dst++ = (BYTE) (*g++);
+					*dst++ = (BYTE) (*b++);
+				}
+				dst += stride - (64*3);
 			}
 			break;
 		default:
@ -86,69 +93,6 @@ static void rfx_decode_format_rgb(INT16* r_buf, INT16* g_buf, INT16* b_buf,
 	}
 }

-#define MINMAX(_v,_l,_h) ((_v) < (_l) ? (_l) : ((_v) > (_h) ? (_h) : (_v)))
-
-void rfx_decode_ycbcr_to_rgb(INT16* y_r_buf, INT16* cb_g_buf, INT16* cr_b_buf)
-{
-	/* INT32 is used intentionally because we calculate with shifted factors! */
-	INT32 y, cb, cr;
-	INT32 r, g, b;
-	int i;
-
-	/**
-	 * The decoded YCbCr coeffectients are represented as 11.5 fixed-point numbers:
-	 *
-	 * 1 sign bit + 10 integer bits + 5 fractional bits
-	 *
-	 * However only 7 integer bits will be actually used since the value range is [-128.0, 127.0].
-	 * In other words, the decoded coeffectients is scaled by << 5 when intepreted as INT16.
-	 * It was scaled in the quantization phase, so we must scale it back here.
-	 */
-	for (i = 0; i < 4096; i++)
-	{
-		y = y_r_buf[i];
-		cb = cb_g_buf[i];
-		cr = cr_b_buf[i];
-
-#if 0
-		/**
-		 * This is the slow floating point version kept here for reference
-		 */
-
-		y = y + 4096; /* 128<<5=4096 so that we can scale the sum by >> 5 */
-
-		r = y + cr*1.403f;
-		g = y - cb*0.344f - cr*0.714f;
-		b = y + cb*1.770f;
-
-		y_r_buf[i]  = MINMAX(r>>5, 0, 255);
-		cb_g_buf[i] = MINMAX(g>>5, 0, 255);
-		cr_b_buf[i] = MINMAX(b>>5, 0, 255);
-#else
-		/**
-		 * We scale the factors by << 16 into 32-bit integers in order to avoid slower
-		 * floating point multiplications. Since the final result needs to be scaled
-		 * by >> 5 we will extract only the upper 11 bits (>> 21) from the final sum.
-		 * Hence we also have to scale the other terms of the sum by << 16.
-		 *
-		 * R: 1.403 << 16 = 91947
-		 * G: 0.344 << 16 = 22544, 0.714 << 16 = 46792
-		 * B: 1.770 << 16 = 115998
-		 */
-
-		y = (y+4096)<<16;
-
-		r = y + cr*91947;
-		g = y - cb*22544 - cr*46792;
-		b = y + cb*115998;
-
-		y_r_buf[i]  = MINMAX(r>>21, 0, 255);
-		cb_g_buf[i] = MINMAX(g>>21, 0, 255);
-		cr_b_buf[i] = MINMAX(b>>21, 0, 255);
-#endif
-	}
-}
-
 static void rfx_decode_component(RFX_CONTEXT* context, const UINT32* quantization_values,
 	const BYTE* data, int size, INT16* buffer)
 {
@ -173,11 +117,18 @@ static void rfx_decode_component(RFX_CONTEXT* context, const UINT32* quantizatio
 	PROFILER_EXIT(context->priv->prof_rfx_decode_component);
 }

+/* rfx_decode_ycbcr_to_rgb code now resides in the primitives library. */
+
+/* stride is bytes between rows in the output buffer. */
 void rfx_decode_rgb(RFX_CONTEXT* context, STREAM* data_in,
 	int y_size, const UINT32 * y_quants,
 	int cb_size, const UINT32 * cb_quants,
-	int cr_size, const UINT32 * cr_quants, BYTE* rgb_buffer)
+	int cr_size, const UINT32 * cr_quants, BYTE* rgb_buffer, int stride)
 {
+	static const prim_size_t roi_64x64 = { 64, 64 };
+	const primitives_t *prims = primitives_get();
+	INT16 *pSrcDst[3];
+
 	PROFILER_ENTER(context->priv->prof_rfx_decode_rgb);

 	rfx_decode_component(context, y_quants, stream_get_tail(data_in), y_size, context->priv->y_r_buffer); /* YData */
@ -188,12 +139,16 @@ void rfx_decode_rgb(RFX_CONTEXT* context, STREAM* data_in,
 	stream_seek(data_in, cr_size);

 	PROFILER_ENTER(context->priv->prof_rfx_decode_ycbcr_to_rgb);
-		context->decode_ycbcr_to_rgb(context->priv->y_r_buffer, context->priv->cb_g_buffer, context->priv->cr_b_buffer);
+		pSrcDst[0] = context->priv->y_r_buffer;
+		pSrcDst[1] = context->priv->cb_g_buffer;
+		pSrcDst[2] = context->priv->cr_b_buffer;
+		prims->yCbCrToRGB_16s16s_P3P3((const INT16 **) pSrcDst, 64*sizeof(INT16),
+			pSrcDst, 64*sizeof(INT16), &roi_64x64);
 	PROFILER_EXIT(context->priv->prof_rfx_decode_ycbcr_to_rgb);

 	PROFILER_ENTER(context->priv->prof_rfx_decode_format_rgb);
 		rfx_decode_format_rgb(context->priv->y_r_buffer, context->priv->cb_g_buffer, context->priv->cr_b_buffer,
-			context->pixel_format, rgb_buffer);
+			context->pixel_format, rgb_buffer, stride);
 	PROFILER_EXIT(context->priv->prof_rfx_decode_format_rgb);
 	
 	PROFILER_EXIT(context->priv->prof_rfx_decode_rgb);
--- a/libfreerdp/codec/rfx_decode.h
+++ b/libfreerdp/codec/rfx_decode.h
@ -22,12 +22,12 @@

 #include <freerdp/codec/rfx.h>

-void rfx_decode_ycbcr_to_rgb(INT16* y_r_buf, INT16* cb_g_buf, INT16* cr_b_buf);
-
+/* stride is bytes between rows in the output buffer. */
 void rfx_decode_rgb(RFX_CONTEXT* context, STREAM* data_in,
 	int y_size, const UINT32 * y_quants,
 	int cb_size, const UINT32 * cb_quants,
-	int cr_size, const UINT32 * cr_quants, BYTE* rgb_buffer);
+	int cr_size, const UINT32 * cr_quants, BYTE* rgb_buffer,
+	int stride);

 #endif /* __RFX_DECODE_H */

--- a/libfreerdp/codec/rfx_encode.c
+++ b/libfreerdp/codec/rfx_encode.c
@ -26,6 +26,8 @@
 #include <stdlib.h>
 #include <string.h>

+#include <freerdp/primitives.h>
+
 #include "rfx_types.h"
 #include "rfx_rlgr.h"
 #include "rfx_differential.h"
@ -180,47 +182,7 @@ static void rfx_encode_format_rgb(const BYTE* rgb_data, int width, int height, i
 	}
 }

-void rfx_encode_rgb_to_ycbcr(INT16* y_r_buf, INT16* cb_g_buf, INT16* cr_b_buf)
-{
-	/* INT32 is used intentionally because we calculate with shifted factors! */
-	int i;
-	INT32 r, g, b;
-	INT32 y, cb, cr;
-
-	/**
-	 * The encoded YCbCr coefficients are represented as 11.5 fixed-point numbers:
-	 *
-	 * 1 sign bit + 10 integer bits + 5 fractional bits
-	 *
-	 * However only 7 integer bits will be actually used since the value range is [-128.0, 127.0].
-	 * In other words, the encoded coefficients is scaled by << 5 when interpreted as INT16.
-	 * It will be scaled down to original during the quantization phase.
-	 */
-	for (i = 0; i < 4096; i++)
-	{
-		r = y_r_buf[i];
-		g = cb_g_buf[i];
-		b = cr_b_buf[i];
-
-		/*
-		 * We scale the factors by << 15 into 32-bit integers in order to avoid slower
-		 * floating point multiplications. Since the terms need to be scaled by << 5 we
-		 * simply scale the final sum by >> 10
-		 *
-		 * Y:  0.299000 << 15 = 9798,  0.587000 << 15 = 19235, 0.114000 << 15 = 3735
-		 * Cb: 0.168935 << 15 = 5535,  0.331665 << 15 = 10868, 0.500590 << 15 = 16403
-		 * Cr: 0.499813 << 15 = 16377, 0.418531 << 15 = 13714, 0.081282 << 15 = 2663
-		 */
-
-		y  = (r *  9798 + g *  19235 + b *  3735) >> 10;
-		cb = (r * -5535 + g * -10868 + b * 16403) >> 10;
-		cr = (r * 16377 + g * -13714 + b * -2663) >> 10;
-
-		y_r_buf[i] = MINMAX(y - 4096, -4096, 4095);
-		cb_g_buf[i] = MINMAX(cb, -4096, 4095);
-		cr_b_buf[i] = MINMAX(cr, -4096, 4095);
-	}
-}
+/* rfx_encode_rgb_to_ycbcr code now resides in the primitives library. */

 static void rfx_encode_component(RFX_CONTEXT* context, const UINT32* quantization_values,
 	INT16* data, BYTE* buffer, int buffer_size, int* size)
@ -250,6 +212,9 @@ void rfx_encode_rgb(RFX_CONTEXT* context, const BYTE* rgb_data, int width, int h
 	const UINT32* y_quants, const UINT32* cb_quants, const UINT32* cr_quants,
 	STREAM* data_out, int* y_size, int* cb_size, int* cr_size)
 {
+	primitives_t *prims = primitives_get();
+	INT16* pSrcDst[3];
+	static const prim_size_t roi_64x64 = { 64, 64 };
 	INT16* y_r_buffer = context->priv->y_r_buffer;
 	INT16* cb_g_buffer = context->priv->cb_g_buffer;
 	INT16* cr_b_buffer = context->priv->cr_b_buffer;
@ -261,9 +226,13 @@ void rfx_encode_rgb(RFX_CONTEXT* context, const BYTE* rgb_data, int width, int h
 			context->pixel_format, context->palette, y_r_buffer, cb_g_buffer, cr_b_buffer);
 	PROFILER_EXIT(context->priv->prof_rfx_encode_format_rgb);

-	PROFILER_ENTER(context->priv->prof_rfx_encode_rgb_to_ycbcr);
-		context->encode_rgb_to_ycbcr(context->priv->y_r_buffer, context->priv->cb_g_buffer, context->priv->cr_b_buffer);
-	PROFILER_EXIT(context->priv->prof_rfx_encode_rgb_to_ycbcr);
+	PROFILER_ENTER(context->priv->prof_rfx_rgb_to_ycbcr);
+		pSrcDst[0] = context->priv->y_r_buffer;
+		pSrcDst[1] = context->priv->cb_g_buffer;
+		pSrcDst[2] = context->priv->cr_b_buffer;
+		prims->RGBToYCbCr_16s16s_P3P3((const INT16 **) pSrcDst, 64*sizeof(INT16),
+			pSrcDst, 64*sizeof(INT16), &roi_64x64);
+	PROFILER_EXIT(context->priv->prof_rfx_rgb_to_ycbcr);

 	/* Ensure the buffer is reasonably large enough */
 	stream_check_size(data_out, 4096);
--- a/libfreerdp/codec/rfx_encode.h
+++ b/libfreerdp/codec/rfx_encode.h
@ -22,8 +22,6 @@

 #include <freerdp/codec/rfx.h>

-void rfx_encode_rgb_to_ycbcr(INT16* y_r_buf, INT16* cb_g_buf, INT16* cr_b_buf);
-
 void rfx_encode_rgb(RFX_CONTEXT* context, const BYTE* rgb_data, int width, int height, int rowstride,
 	const UINT32* y_quants, const UINT32* cb_quants, const UINT32* cr_quants,
 	STREAM* data_out, int* y_size, int* cb_size, int* cr_size);
--- a/libfreerdp/codec/rfx_neon.c
+++ b/libfreerdp/codec/rfx_neon.c
@ -35,56 +35,7 @@
 #include "cpu-features.h"
 #endif

-void rfx_decode_YCbCr_to_RGB_NEON(INT16 * y_r_buffer, INT16 * cb_g_buffer, INT16 * cr_b_buffer)
-{
-	int16x8_t zero = vdupq_n_s16(0);
-	int16x8_t max = vdupq_n_s16(255);
-	int16x8_t y_add = vdupq_n_s16(128);
-
-	int16x8_t* y_r_buf = (int16x8_t*) y_r_buffer;
-	int16x8_t* cb_g_buf = (int16x8_t*) cb_g_buffer;
-	int16x8_t* cr_b_buf = (int16x8_t*) cr_b_buffer;
-
-	int i;
-	for (i = 0; i < 4096 / 8; i++)
-	{
-		int16x8_t y = vld1q_s16((INT16*) &y_r_buf[i]);
-		y = vaddq_s16(y, y_add);
-
-		int16x8_t cr = vld1q_s16((INT16*) &cr_b_buf[i]);
-
-		// r = between((y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5)), 0, 255);
-		int16x8_t r = vaddq_s16(y, cr);
-		r = vaddq_s16(r, vshrq_n_s16(cr, 2));
-		r = vaddq_s16(r, vshrq_n_s16(cr, 3));
-		r = vaddq_s16(r, vshrq_n_s16(cr, 5));
-		r = vminq_s16(vmaxq_s16(r, zero), max);
-		vst1q_s16((INT16*)&y_r_buf[i], r);
-
-		// cb = cb_g_buf[i];
-		int16x8_t cb = vld1q_s16((INT16*)&cb_g_buf[i]);
-
-		// g = between(y - (cb >> 2) - (cb >> 4) - (cb >> 5) - (cr >> 1) - (cr >> 3) - (cr >> 4) - (cr >> 5), 0, 255);
-		int16x8_t g = vsubq_s16(y, vshrq_n_s16(cb, 2));
-		g = vsubq_s16(g, vshrq_n_s16(cb, 4));
-		g = vsubq_s16(g, vshrq_n_s16(cb, 5));
-		g = vsubq_s16(g, vshrq_n_s16(cr, 1));
-		g = vsubq_s16(g, vshrq_n_s16(cr, 3));
-		g = vsubq_s16(g, vshrq_n_s16(cr, 4));
-		g = vsubq_s16(g, vshrq_n_s16(cr, 5));
-		g = vminq_s16(vmaxq_s16(g, zero), max);
-		vst1q_s16((INT16*)&cb_g_buf[i], g);
-
-		// b = between((y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6)), 0, 255);
-		int16x8_t b = vaddq_s16(y, cb);
-		b = vaddq_s16(b, vshrq_n_s16(cb, 1));
-		b = vaddq_s16(b, vshrq_n_s16(cb, 2));
-		b = vaddq_s16(b, vshrq_n_s16(cb, 6));
-		b = vminq_s16(vmaxq_s16(b, zero), max);
-		vst1q_s16((INT16*)&cr_b_buf[i], b);
-	}
-
-}
+/* rfx_decode_YCbCr_to_RGB_NEON code now resides in the primitives library. */

 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 rfx_quantization_decode_block_NEON(INT16 * buffer, const int buffer_size, const UINT32 factor)
@ -338,11 +289,10 @@ void rfx_init_neon(RFX_CONTEXT * context)
 	{
 		DEBUG_RFX("Using NEON optimizations");

-		IF_PROFILER(context->priv->prof_rfx_decode_ycbcr_to_rgb->name = "rfx_decode_YCbCr_to_RGB_NEON");
+		IF_PROFILER(context->priv->prof_rfx_ycbcr_to_rgb->name = "rfx_decode_YCbCr_to_RGB_NEON");
 		IF_PROFILER(context->priv->prof_rfx_quantization_decode->name = "rfx_quantization_decode_NEON");
 		IF_PROFILER(context->priv->prof_rfx_dwt_2d_decode->name = "rfx_dwt_2d_decode_NEON");

-		context->decode_ycbcr_to_rgb = rfx_decode_YCbCr_to_RGB_NEON;
 		context->quantization_decode = rfx_quantization_decode_NEON;
 		context->dwt_2d_decode = rfx_dwt_2d_decode_NEON;
 	}
--- a/libfreerdp/codec/rfx_quantization.c
+++ b/libfreerdp/codec/rfx_quantization.c
@ -21,36 +21,34 @@
 #include "config.h"
 #endif

+#include <freerdp/primitives.h>
 #include "rfx_quantization.h"

-static void rfx_quantization_decode_block(INT16* buffer, int buffer_size, UINT32 factor)
+static void rfx_quantization_decode_block(const primitives_t *prims, INT16* buffer, int buffer_size, UINT32 factor)
 {
-	INT16* dst;
-
 	if (factor == 0)
 		return;

-	for (dst = buffer; buffer_size > 0; dst++, buffer_size--)
-	{
-		*dst <<= factor;
-	}
+	prims->lShiftC_16s(buffer, factor, buffer, buffer_size);
 }

 void rfx_quantization_decode(INT16* buffer, const UINT32* quantization_values)
 {
-	/* Scale the values so that they are represented as 11.5 fixed-point number */
-	rfx_quantization_decode_block(buffer, 4096, 5);
+	const primitives_t *prims = primitives_get();

-	rfx_quantization_decode_block(buffer, 1024, quantization_values[8] - 6); /* HL1 */
-	rfx_quantization_decode_block(buffer + 1024, 1024, quantization_values[7] - 6); /* LH1 */
-	rfx_quantization_decode_block(buffer + 2048, 1024, quantization_values[9] - 6); /* HH1 */
-	rfx_quantization_decode_block(buffer + 3072, 256, quantization_values[5] - 6); /* HL2 */
-	rfx_quantization_decode_block(buffer + 3328, 256, quantization_values[4] - 6); /* LH2 */
-	rfx_quantization_decode_block(buffer + 3584, 256, quantization_values[6] - 6); /* HH2 */
-	rfx_quantization_decode_block(buffer + 3840, 64, quantization_values[2] - 6); /* HL3 */
-	rfx_quantization_decode_block(buffer + 3904, 64, quantization_values[1] - 6); /* LH3 */
-	rfx_quantization_decode_block(buffer + 3968, 64, quantization_values[3] - 6); /* HH3 */
-	rfx_quantization_decode_block(buffer + 4032, 64, quantization_values[0] - 6); /* LL3 */
+	/* Scale the values so that they are represented as 11.5 fixed-point number */
+	rfx_quantization_decode_block(prims, buffer, 4096, 5);
+
+	rfx_quantization_decode_block(prims, buffer, 1024, quantization_values[8] - 6); /* HL1 */
+	rfx_quantization_decode_block(prims, buffer + 1024, 1024, quantization_values[7] - 6); /* LH1 */
+	rfx_quantization_decode_block(prims, buffer + 2048, 1024, quantization_values[9] - 6); /* HH1 */
+	rfx_quantization_decode_block(prims, buffer + 3072, 256, quantization_values[5] - 6); /* HL2 */
+	rfx_quantization_decode_block(prims, buffer + 3328, 256, quantization_values[4] - 6); /* LH2 */
+	rfx_quantization_decode_block(prims, buffer + 3584, 256, quantization_values[6] - 6); /* HH2 */
+	rfx_quantization_decode_block(prims, buffer + 3840, 64, quantization_values[2] - 6); /* HL3 */
+	rfx_quantization_decode_block(prims, buffer + 3904, 64, quantization_values[1] - 6); /* LH3 */
+	rfx_quantization_decode_block(prims, buffer + 3968, 64, quantization_values[3] - 6); /* HH3 */
+	rfx_quantization_decode_block(prims, buffer + 4032, 64, quantization_values[0] - 6); /* LL3 */
 }

 static void rfx_quantization_encode_block(INT16* buffer, int buffer_size, UINT32 factor)
@ -62,6 +60,7 @@ static void rfx_quantization_encode_block(INT16* buffer, int buffer_size, UINT32
 		return;

 	half = (1 << (factor - 1));
+	/* Could probably use prims->rShiftC_16s(dst+half, factor, dst, buffer_size); */
 	for (dst = buffer; buffer_size > 0; dst++, buffer_size--)
 	{
 		*dst = (*dst + half) >> factor;
--- a/libfreerdp/codec/rfx_sse2.c
+++ b/libfreerdp/codec/rfx_sse2.c
@ -52,177 +52,8 @@ _mm_prefetch_buffer(char * buffer, int num_bytes)
 	}
 }

-static void rfx_decode_ycbcr_to_rgb_sse2(INT16* y_r_buffer, INT16* cb_g_buffer, INT16* cr_b_buffer)
-{	
-	__m128i zero = _mm_setzero_si128();
-	__m128i max = _mm_set1_epi16(255);
-
-	__m128i* y_r_buf = (__m128i*) y_r_buffer;
-	__m128i* cb_g_buf = (__m128i*) cb_g_buffer;
-	__m128i* cr_b_buf = (__m128i*) cr_b_buffer;
-
-	__m128i y;
-	__m128i cr;
-	__m128i cb;
-	__m128i r;
-	__m128i g;
-	__m128i b;
-
-	int i;
-
-	__m128i r_cr = _mm_set1_epi16(22986);	//  1.403 << 14
-	__m128i g_cb = _mm_set1_epi16(-5636);	// -0.344 << 14
-	__m128i g_cr = _mm_set1_epi16(-11698);	// -0.714 << 14
-	__m128i b_cb = _mm_set1_epi16(28999);	//  1.770 << 14
-	__m128i c4096 = _mm_set1_epi16(4096);
-
-	for (i = 0; i < (4096 * sizeof(INT16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i)))
-	{
-		_mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA);
-		_mm_prefetch((char*)(&cb_g_buf[i]), _MM_HINT_NTA);
-		_mm_prefetch((char*)(&cr_b_buf[i]), _MM_HINT_NTA);
-	}
-	for (i = 0; i < (4096 * sizeof(INT16) / sizeof(__m128i)); i++)
-	{
-		/*
-		In order to use SSE2 signed 16-bit integer multiplication we need to convert
-		the floating point factors to signed int without loosing information.
-		The result of this multiplication is 32 bit and we have two SSE instructions
-		that return either the hi or lo word.
-		Thus we will multiply the factors by the highest possible 2^n, take the 
-		upper 16 bits of the signed 32-bit result (_mm_mulhi_epi16) and correct	this
-		result by multiplying it by 2^(16-n).
-		For the given factors in the conversion matrix the best possible n is 14.
-
-		Example for calculating r:
-		r = (y>>5) + 128 + (cr*1.403)>>5                       // our base formula
-		r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5      // see above
-		r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5             // simplification
-		r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
-		*/
-
-		/* y = (y_r_buf[i] + 4096) >> 2 */
-		y = _mm_load_si128(&y_r_buf[i]);
-		y = _mm_add_epi16(y, c4096);
-		y = _mm_srai_epi16(y, 2);
-		/* cb = cb_g_buf[i]; */
-		cb = _mm_load_si128(&cb_g_buf[i]);
-		/* cr = cr_b_buf[i]; */
-		cr = _mm_load_si128(&cr_b_buf[i]);
-
-		/* (y + HIWORD(cr*22986)) >> 3 */
-		r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
-		r = _mm_srai_epi16(r, 3);
-		/* y_r_buf[i] = MINMAX(r, 0, 255); */
-		_mm_between_epi16(r, zero, max);
-		_mm_store_si128(&y_r_buf[i], r);
-
-		/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
-		g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
-		g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
-		g = _mm_srai_epi16(g, 3);
-		/* cb_g_buf[i] = MINMAX(g, 0, 255); */
-		_mm_between_epi16(g, zero, max);
-		_mm_store_si128(&cb_g_buf[i], g);
-
-		/* (y + HIWORD(cb*28999)) >> 3 */
-		b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
-		b = _mm_srai_epi16(b, 3);
-		/* cr_b_buf[i] = MINMAX(b, 0, 255); */
-		_mm_between_epi16(b, zero, max);
-		_mm_store_si128(&cr_b_buf[i], b);
-	}
-}
-
-/* The encodec YCbCr coeffectients are represented as 11.5 fixed-point numbers. See rfx_encode.c */
-static void rfx_encode_rgb_to_ycbcr_sse2(INT16* y_r_buffer, INT16* cb_g_buffer, INT16* cr_b_buffer)
-{
-	__m128i min = _mm_set1_epi16(-128 << 5);
-	__m128i max = _mm_set1_epi16(127 << 5);
-
-	__m128i* y_r_buf = (__m128i*) y_r_buffer;
-	__m128i* cb_g_buf = (__m128i*) cb_g_buffer;
-	__m128i* cr_b_buf = (__m128i*) cr_b_buffer;
-
-	__m128i y;
-	__m128i cr;
-	__m128i cb;
-	__m128i r;
-	__m128i g;
-	__m128i b;
-
-	__m128i y_r  = _mm_set1_epi16(9798);   //  0.299000 << 15
-	__m128i y_g  = _mm_set1_epi16(19235);  //  0.587000 << 15
-	__m128i y_b  = _mm_set1_epi16(3735);   //  0.114000 << 15
-	__m128i cb_r = _mm_set1_epi16(-5535);  // -0.168935 << 15
-	__m128i cb_g = _mm_set1_epi16(-10868); // -0.331665 << 15
-	__m128i cb_b = _mm_set1_epi16(16403);  //  0.500590 << 15
-	__m128i cr_r = _mm_set1_epi16(16377);  //  0.499813 << 15
-	__m128i cr_g = _mm_set1_epi16(-13714); // -0.418531 << 15
-	__m128i cr_b = _mm_set1_epi16(-2663);  // -0.081282 << 15
-
-	int i;
-
-	for (i = 0; i < (4096 * sizeof(INT16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i)))
-	{
-		_mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA);
-		_mm_prefetch((char*)(&cb_g_buf[i]), _MM_HINT_NTA);
-		_mm_prefetch((char*)(&cr_b_buf[i]), _MM_HINT_NTA);
-	}
-	for (i = 0; i < (4096 * sizeof(INT16) / sizeof(__m128i)); i++)
-	{
-		/*
-		In order to use SSE2 signed 16-bit integer multiplication we need to convert
-		the floating point factors to signed int without loosing information.
-		The result of this multiplication is 32 bit and using SSE2 we get either the
-		product's hi or lo word.
-		Thus we will multiply the factors by the highest possible 2^n and take the
-		upper 16 bits of the signed 32-bit result (_mm_mulhi_epi16).
-		Since the final result needs to be scaled by << 5 and also in in order to keep
-		the precision within the upper 16 bits we will also have to scale the RGB
-		values used in the multiplication by << 5+(16-n).
-		*/
-
-		/* r = y_r_buf[i]; */
-		r = _mm_load_si128(&y_r_buf[i]);
-
-		/* g = cb_g_buf[i]; */
-		g = _mm_load_si128(&cb_g_buf[i]);
-
-		/* b = cr_b_buf[i]; */
-		b = _mm_load_si128(&cr_b_buf[i]);
-
-		/* r<<6; g<<6; b<<6 */
-		r = _mm_slli_epi16(r, 6);
-		g = _mm_slli_epi16(g, 6);
-		b = _mm_slli_epi16(b, 6);
-
-		/* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
-		y = _mm_mulhi_epi16(r, y_r);
-		y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
-		y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
-		y = _mm_add_epi16(y, min);
-		/* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
-		_mm_between_epi16(y, min, max);
-		_mm_store_si128(&y_r_buf[i], y);
-
-		/* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
-		cb = _mm_mulhi_epi16(r, cb_r);
-		cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
-		cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
-		/* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
-		_mm_between_epi16(cb, min, max);
-		_mm_store_si128(&cb_g_buf[i], cb);
-
-		/* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
-		cr = _mm_mulhi_epi16(r, cr_r);
-		cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
-		cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
-		/* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
-		_mm_between_epi16(cr, min, max);
-		_mm_store_si128(&cr_b_buf[i], cr);
-	}
-}
+/* rfx_decode_ycbcr_to_rgb_sse2 code now resides in the primitives library. */
+/* rfx_encode_rgb_to_ycbcr_sse2 code now resides in the primitives library. */

 static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 rfx_quantization_decode_block_sse2(INT16* buffer, const int buffer_size, const UINT32 factor)
@ -658,15 +489,11 @@ void rfx_init_sse2(RFX_CONTEXT* context)
 {
 	DEBUG_RFX("Using SSE2 optimizations");

-	IF_PROFILER(context->priv->prof_rfx_decode_ycbcr_to_rgb->name = "rfx_decode_ycbcr_to_rgb_sse2");
-	IF_PROFILER(context->priv->prof_rfx_encode_rgb_to_ycbcr->name = "rfx_encode_rgb_to_ycbcr_sse2");
 	IF_PROFILER(context->priv->prof_rfx_quantization_decode->name = "rfx_quantization_decode_sse2");
 	IF_PROFILER(context->priv->prof_rfx_quantization_encode->name = "rfx_quantization_encode_sse2");
 	IF_PROFILER(context->priv->prof_rfx_dwt_2d_decode->name = "rfx_dwt_2d_decode_sse2");
 	IF_PROFILER(context->priv->prof_rfx_dwt_2d_encode->name = "rfx_dwt_2d_encode_sse2");

-	context->decode_ycbcr_to_rgb = rfx_decode_ycbcr_to_rgb_sse2;
-	context->encode_rgb_to_ycbcr = rfx_encode_rgb_to_ycbcr_sse2;
 	context->quantization_decode = rfx_quantization_decode_sse2;
 	context->quantization_encode = rfx_quantization_encode_sse2;
 	context->dwt_2d_decode = rfx_dwt_2d_decode_sse2;
--- a/libfreerdp/codec/rfx_types.h
+++ b/libfreerdp/codec/rfx_types.h
@ -60,7 +60,7 @@ struct _RFX_CONTEXT_PRIV
 	PROFILER_DEFINE(prof_rfx_differential_decode);
 	PROFILER_DEFINE(prof_rfx_quantization_decode);
 	PROFILER_DEFINE(prof_rfx_dwt_2d_decode);
-	PROFILER_DEFINE(prof_rfx_decode_ycbcr_to_rgb);
+	PROFILER_DEFINE(prof_rfx_ycbcr_to_rgb);
 	PROFILER_DEFINE(prof_rfx_decode_format_rgb);

 	PROFILER_DEFINE(prof_rfx_encode_rgb);
@ -69,7 +69,7 @@ struct _RFX_CONTEXT_PRIV
 	PROFILER_DEFINE(prof_rfx_differential_encode);
 	PROFILER_DEFINE(prof_rfx_quantization_encode);
 	PROFILER_DEFINE(prof_rfx_dwt_2d_encode);
-	PROFILER_DEFINE(prof_rfx_encode_rgb_to_ycbcr);
+	PROFILER_DEFINE(prof_rfx_rgb_to_ycbcr);
 	PROFILER_DEFINE(prof_rfx_encode_format_rgb);
 };

--- a/libfreerdp/primitives/CMakeLists.txt
+++ b/libfreerdp/primitives/CMakeLists.txt
@ -0,0 +1,81 @@
+# FreeRDP: A Remote Desktop Protocol Client
+# libfreerdp-primitives cmake build script
+# vi:ts=4 sw=4:
+#
+# (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied. See the License for the specific language governing permissions
+# and limitations under the License.
+#
+
+set(FREERDP_PRIMITIVE_CFILES
+    prim_add.c
+    prim_andor.c
+    prim_alphaComp.c
+    prim_colors.c
+    prim_copy.c
+    prim_set.c
+    prim_shift.c
+    prim_sign.c
+	primitives.c
+    )
+set(FREERDP_PRIMITIVE_HEADERS
+    prim_internal.h
+    )
+set(FREERDP_PRIMITIVE_SRCS
+    ${FREERDP_PRIMITIVE_CFILES}
+    ${FREERDP_PRIMITIVE_HEADERS}
+    )
+
+add_definitions(-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE})
+
+### IPP Variable debugging
+if(WITH_IPP)
+	if(CMAKE_COMPILER_IS_GNUCC)
+		foreach(INCLDIR ${IPP_INCLUDE_DIRS})
+			set(OPTIMIZATION "${OPTIMIZATION} -I${INCLDIR}")
+		endforeach(INCLDIR)
+	endif()
+endif()
+
+if(WITH_SSE2)
+	if(CMAKE_COMPILER_IS_GNUCC)
+		set(OPTIMIZATION "${OPTIMIZATION} -msse2 -mssse3 -Wdeclaration-after-statement")
+	endif()
+
+	if(MSVC)
+		set(OPTIMIZATION "${OPTIMIZATION} /arch:SSE2")
+	endif()
+elseif(WITH_NEON)
+	if(CMAKE_COMPILER_IS_GNUCC)
+	    set(OPTIMZATION "${OPTIMIZATION} -mfpu=neon -mfloat-abi=softfp")
+	endif()
+	# TODO: Add MSVC equivalent
+endif()
+set_property(SOURCE ${FREERDP_PRIMITIVE_CFILES} PROPERTY COMPILE_FLAGS ${OPTIMIZATION})
+
+if(WITH_MONOLITHIC_BUILD)
+	add_library(freerdp-primitives OBJECT ${FREERDP_PRIMITIVE_SRCS})
+else()
+	add_library(freerdp-primitives ${FREERDP_PRIMITIVE_SRCS})
+endif()
+
+set_target_properties(freerdp-primitives PROPERTIES VERSION ${FREERDP_VERSION_FULL} SOVERSION ${FREERDP_VERSION} PREFIX "lib")
+
+#set(FREERDP_PRIMITIVE_LIBS, rt)
+
+if(WITH_MONOLITHIC_BUILD)
+	set(FREERDP_LIBS ${FREERDP_LIBS} ${FREERDP_PRIMITIVE_LIBS} PARENT_SCOPE)
+else()
+	target_link_libraries(freerdp-primitives ${FREERDP_PRIMITIVE_LIBS})
+	install(TARGETS freerdp-primitives DESTINATION ${CMAKE_INSTALL_LIBDIR})
+endif()
+
+if(BUILD_TESTING)
+	add_subdirectory(test)
+endif(BUILD_TESTING)
--- a/libfreerdp/primitives/README.txt
+++ b/libfreerdp/primitives/README.txt
@ -0,0 +1,113 @@
+The Primitives Library
+
+Introduction
+------------
+The purpose of the primitives library is to give the freerdp code easy
+access to *run-time* optimization via SIMD operations.  When the library
+is initialized, dynamic checks of processor features are run (such as
+the support of SSE3 or Neon), and entrypoints are linked to through
+function pointers to provide the fastest possible operations.  All
+routines offer generic C alternatives as fallbacks.
+
+Run-time optimization has the advantage of allowing a single executable
+to run fast on multiple platforms with different SIMD capabilities.
+
+
+Use In Code
+-----------
+A singleton pointing to a structure containing the function pointers
+is accessed through primitives_get().   The function pointers can then
+be used from that structure, e.g.
+
+    primitives_t *prims = primitives_get();
+    prims->shiftC_16s(buffer, shifts, buffer, 256);
+
+Of course, there is some overhead in calling through the function pointer
+and setting up the SIMD operations, so it would be counterproductive to
+call the primitives library for very small operation, e.g. initializing an
+array of eight values to a constant.  The primitives library is intended
+for larger-scale operations, e.g. arrays of size 64 and larger.
+
+
+Initialization and Cleanup
+--------------------------
+Library initialization is done the first time primitives_init() is called
+or the first time primitives_get() is used.  Cleanup (if any) is done by
+primitives_deinit().
+
+
+Intel Integrated Performance Primitives (IPP)
+---------------------------------------------
+If freerdp is compiled with IPP support (-DWITH_IPP=ON), the IPP function
+calls will be used (where available) to fill the function pointers.
+Where possible, function names and parameter lists match IPP format so
+that the IPP functions can be plugged into the function pointers without
+a wrapper layer.  Use of IPP is completely optional, and in many cases
+the SSE operations in the primitives library itself are faster or similar
+in performance.
+
+
+Coverage
+--------
+The primitives library is not meant to be comprehensive, offering
+entrypoints for every operation and operand type.  Instead, the coverage
+is focused on operations known to be performance bottlenecks in the code.
+For instance, 16-bit signed operations are used widely in the RemoteFX
+software, so you'll find 16s versions of several operations, but there
+is no attempt to provide (unused) copies of the same code for 8u, 16u,
+32s, etc.
+
+
+New Optimizations
+-----------------
+As the need arises, new optimizations can be added to the library,
+including NEON, AVX, and perhaps OpenCL or other SIMD implementations.
+The initialization routine is free to do any quick run-time test to
+determine which features are available before hooking the operation's
+function pointer, or it can simply look at the processor features list
+from the hints passed to the initialization routine.
+
+
+Adding Entrypoints
+------------------
+As the need for new operations or operands arises, new entrypoints can
+be added.  
+  1) Function prototypes and pointers are added to 
+     include/freerdp/primitives.h
+  2) New module initialization and cleanup function prototypes are added
+     to prim_internal.h and called in primitives.c (primitives_init()
+     and primitives_deinit()).
+  3) Operation names and parameter lists should be compatible with the IPP.
+     IPP manuals are available online at software.intel.com.
+  4) A generic C entrypoint must be available as a fallback.
+  5) prim_templates.h contains macro-based templates for simple operations,
+     such as applying a single SSE operation to arrays of data.
+     The template functions can frequently be used to extend the
+     operations without writing a lot of new code.
+
+
+Flags
+-----
+The entrypoint primitives_get_flags() returns a bitfield of processor flags
+(as defined in primitives.h) and primitives_flag_str() returns a string
+related to those processor flags, for debugging and information.  The
+bitfield can be used elsewhere in the code as needed.
+
+
+Cache Management
+----------------
+I haven't found a lot of speed improvement by attempting prefetch, and
+in fact it seems to have a negative impact in some cases.  Done correctly
+perhaps the routines could be further accelerated by proper use of prefetch,
+fences, etc.
+
+
+Testing
+-------
+In the test subdirectory is an executable (prim_test) that tests both
+functionality and speed of primitives library operations.   Any new
+modules should be added to that test, following the conventions already
+established in that directory.  The program can be executed on various
+target hardware to compare generic C, optimized, and IPP performance
+with various array sizes.
+
--- a/libfreerdp/primitives/prim_add.c
+++ b/libfreerdp/primitives/prim_add.c
@ -0,0 +1,81 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Add operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ * 
+ */
+
+#include <config.h>
+#include <string.h>
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#ifdef WITH_SSE2
+# include <emmintrin.h>
+# include <pmmintrin.h>
+#endif /* WITH_SSE2 */
+#ifdef WITH_IPP
+# include <ipps.h>
+#endif /* WITH_IPP */
+#include "prim_internal.h"
+#include "prim_templates.h"
+
+/* ----------------------------------------------------------------------------
+ * 16-bit signed add with saturation (under and over).
+ */
+PRIM_STATIC pstatus_t general_add_16s(
+	const INT16 *pSrc1,
+	const INT16 *pSrc2,
+	INT16 *pDst,
+	INT32 len)
+{
+	while (len--)
+	{
+		INT32 k = (INT32) (*pSrc1++) + (INT32) (*pSrc2++);
+		if (k > 32767) *pDst++ = ((INT16) 32767);
+		else if (k < -32768) *pDst++ = ((INT16) -32768);
+		else *pDst++ = (INT16) k;
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+#ifdef WITH_SSE2
+# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
+/* ------------------------------------------------------------------------- */
+SSE3_SSD_ROUTINE(sse3_add_16s, INT16, general_add_16s,
+	_mm_adds_epi16, general_add_16s(sptr1++, sptr2++, dptr++, 1))
+# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
+#endif
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_add(
+	const primitives_hints_t *hints,
+	primitives_t *prims)
+{
+	prims->add_16s = general_add_16s;
+#ifdef WITH_IPP
+	prims->add_16s = (__add_16s_t) ippsAdd_16s;
+#elif defined(WITH_SSE2)
+	if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
+			&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE))	/* for LDDQU */
+	{
+		prims->add_16s = sse3_add_16s;
+	}
+#endif
+}
+
+/* ------------------------------------------------------------------------- */
+void primitives_deinit_add(
+	primitives_t *prims)
+{
+	/* Nothing to do. */
+}
--- a/libfreerdp/primitives/prim_alphaComp.c
+++ b/libfreerdp/primitives/prim_alphaComp.c
@ -0,0 +1,298 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Alpha blending routines.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ *
+ * Note: this code assumes the second operand is fully opaque,
+ * e.g.
+ *   newval = alpha1*val1 + (1-alpha1)*val2
+ * rather than
+ *   newval = alpha1*val1 + (1-alpha1)*alpha2*val2
+ * The IPP gives other options.
+ */
+
+#include <config.h>
+#include <string.h>
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include "prim_internal.h"
+#ifdef WITH_SSE2
+# include <emmintrin.h>
+# include <pmmintrin.h>
+#endif /* WITH_SSE2 */
+#ifdef WITH_IPP
+# include <ippi.h>
+#endif /* WITH_IPP */
+
+#define ALPHA(_k_)	(((_k_) & 0xFF000000U) >> 24)
+#define RED(_k_)	(((_k_) & 0x00FF0000U) >> 16)
+#define GRN(_k_)	(((_k_) & 0x0000FF00U) >> 8)
+#define BLU(_k_)	(((_k_) & 0x000000FFU))
+
+/* ------------------------------------------------------------------------- */
+PRIM_STATIC pstatus_t general_alphaComp_argb(
+	const BYTE *pSrc1,  INT32 src1Step,
+	const BYTE *pSrc2,  INT32 src2Step,
+	BYTE *pDst,  INT32 dstStep,
+	INT32 width,  INT32 height)
+{
+    const UINT32 *sptr1 = (const UINT32 *) pSrc1;
+    const UINT32 *sptr2 = (const UINT32 *) pSrc2;
+    UINT32 *dptr = (UINT32 *) pDst;
+	int linebytes = width * sizeof(UINT32);
+	int src1Jump = (src1Step - linebytes) / sizeof(UINT32);
+	int src2Jump = (src2Step - linebytes) / sizeof(UINT32);
+	int dstJump  = (dstStep  - linebytes) / sizeof(UINT32);
+
+	int y;
+	for (y=0; y<height; y++)
+	{
+		int x;
+		for (x=0; x<width; x++)
+		{
+			const UINT32 src1 = *sptr1++;
+			const UINT32 src2 = *sptr2++;
+			UINT32 alpha = ALPHA(src1) + 1;
+			if (alpha == 256)
+			{
+				/* If alpha is 255+1, just copy src1. */
+				*dptr++ = src1;
+			}
+			else if (alpha <= 1)
+			{
+				/* If alpha is 0+1, just copy src2. */
+				*dptr++ = src2;
+			}
+			else
+			{
+				/* A perfectly accurate blend would do (a*src + (255-a)*dst)/255
+				 * rather than adding one to alpha and dividing by 256, but this
+				 * is much faster and only differs by one 16% of the time.
+				 * I'm not sure who first designed the double-ops trick
+				 * (Red Blue and Alpha Green).
+				 */
+				UINT32 rb, ag;
+				UINT32 s2rb = src2 & 0x00FF00FFU;
+				UINT32 s2ag = (src2 >> 8) & 0x00FF00FFU;
+				UINT32 s1rb = src1 & 0x00FF00FFU;
+				UINT32 s1ag = (src1 >> 8) & 0x00FF00FFU;
+
+				UINT32 drb = s1rb - s2rb;
+				UINT32 dag = s1ag - s2ag;
+				drb *= alpha;
+				dag *= alpha;
+
+				rb  =  ((drb >> 8) + s2rb)       & 0x00FF00FFU;
+				ag  = (((dag >> 8) + s2ag) << 8) & 0xFF00FF00U;
+				*dptr++ = rb | ag;
+			}
+		}
+		sptr1 += src1Jump;
+		sptr2 += src2Jump;
+		dptr  += dstJump;
+    }
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+#ifdef WITH_SSE2
+# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
+
+PRIM_STATIC pstatus_t sse2_alphaComp_argb(
+	const BYTE *pSrc1,  INT32 src1Step,
+	const BYTE *pSrc2,  INT32 src2Step,
+	BYTE *pDst,  INT32 dstStep,
+	INT32 width,  INT32 height)
+{
+	const UINT32 *sptr1 = (const UINT32 *) pSrc1;
+	const UINT32 *sptr2 = (const UINT32 *) pSrc2;
+	UINT32 *dptr;
+    int linebytes, src1Jump, src2Jump, dstJump, y;
+	__m128i xmm0, xmm1;
+
+	if ((width <= 0) || (height <= 0)) return PRIMITIVES_SUCCESS;
+
+	if (width < 4)     /* pointless if too small */
+	{
+		return general_alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step,
+			pDst, dstStep, width, height);
+	}
+	dptr = (UINT32 *) pDst;
+    linebytes = width * sizeof(UINT32);
+	src1Jump = (src1Step - linebytes) / sizeof(UINT32);
+	src2Jump = (src2Step - linebytes) / sizeof(UINT32);
+	dstJump  = (dstStep  - linebytes) / sizeof(UINT32);
+
+	xmm0 = _mm_set1_epi32(0);
+	xmm1 = _mm_set1_epi16(1);
+
+	for (y=0; y<height; ++y)
+	{
+		int pixels = width;
+		int count;
+
+		/* Get to the 16-byte boundary now. */
+		int leadIn = 0;
+		switch ((ULONG_PTR) dptr & 0x0f)
+		{
+			case 0:
+				leadIn = 0;
+				break;
+			case 4:
+				leadIn = 3;
+				break;
+			case 8:
+				leadIn = 2;
+				break;
+			case 12:
+				leadIn = 1;
+				break;
+			default:
+				/* We'll never hit a 16-byte boundary, so do the whole
+				 * thing the slow way.
+				 */
+				leadIn = width;
+				break;
+		}
+		if (leadIn)
+		{
+			general_alphaComp_argb((const BYTE *) sptr1,
+				src1Step, (const BYTE *) sptr2, src2Step,
+				(BYTE *) dptr, dstStep, leadIn, 1);
+			sptr1 += leadIn;
+			sptr2 += leadIn;
+			dptr  += leadIn;
+			pixels -= leadIn;
+		}
+
+		/* Use SSE registers to do 4 pixels at a time. */
+		count = pixels >> 2;
+		pixels -= count << 2;
+		while (count--)
+		{
+			__m128i xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+			/* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */
+			xmm2 = LOAD_SI128(sptr1); sptr1 += 4;
+			/* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */
+			xmm3 = LOAD_SI128(sptr2); sptr2 += 4;
+			/* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */
+			xmm4 = _mm_unpackhi_epi8(xmm2, xmm0);
+			/* 00Bf00Gf00Bf00Af00Be00Ge00Re00Ae */
+			xmm5 = _mm_unpackhi_epi8(xmm3, xmm0);
+			/* subtract */
+			xmm6 = _mm_subs_epi16(xmm4, xmm5);
+			/* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */
+			xmm4 = _mm_shufflelo_epi16(xmm4, 0xff);
+			/* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */
+			xmm4 = _mm_shufflehi_epi16(xmm4, 0xff);
+			/* Add one to alphas */
+			xmm4 = _mm_adds_epi16(xmm4, xmm1);
+			/* Multiply and take low word */
+			xmm4 = _mm_mullo_epi16(xmm4, xmm6);
+			/* Shift 8 right */
+			xmm4 = _mm_srai_epi16(xmm4, 8);
+			/* Add xmm5 */
+			xmm4 = _mm_adds_epi16(xmm4, xmm5);
+			/* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */
+
+			/* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */
+			xmm5 = _mm_unpacklo_epi8(xmm2, xmm0);
+			/* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */
+			xmm6 = _mm_unpacklo_epi8(xmm3, xmm0);
+			/* subtract */
+			xmm7 = _mm_subs_epi16(xmm5, xmm6);
+			/* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */
+			xmm5 = _mm_shufflelo_epi16(xmm5, 0xff);
+			/* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */
+			xmm5 = _mm_shufflehi_epi16(xmm5, 0xff);
+			/* Add one to alphas */
+			xmm5 = _mm_adds_epi16(xmm5, xmm1);
+			/* Multiply and take low word */
+			xmm5 = _mm_mullo_epi16(xmm5, xmm7);
+			/* Shift 8 right */
+			xmm5 = _mm_srai_epi16(xmm5, 8);
+			/* Add xmm6 */
+			xmm5 = _mm_adds_epi16(xmm5, xmm6);
+			/* 00Bl00Gl00Rl00Al00Bk00Gk00Rk0ABk */
+
+			/* Must mask off remainders or pack gets confused */
+			xmm3 = _mm_set1_epi16(0x00ffU);
+			xmm4 = _mm_and_si128(xmm4, xmm3);
+			xmm5 = _mm_and_si128(xmm5, xmm3);
+
+			/* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */
+			xmm5 = _mm_packus_epi16(xmm5, xmm4);
+			_mm_store_si128((__m128i *) dptr, xmm5);  dptr += 4;
+		}
+
+		/* Finish off the remainder. */
+		if (pixels)
+		{
+			general_alphaComp_argb((const BYTE *) sptr1, src1Step,
+				(const BYTE *) sptr2, src2Step,
+				(BYTE *) dptr, dstStep, pixels, 1);
+			sptr1 += pixels;
+			sptr2 += pixels;
+			dptr  += pixels;
+		}
+
+		/* Jump to next row. */
+		sptr1 += src1Jump;
+		sptr2 += src2Jump;
+		dptr  += dstJump;
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
+#endif
+
+#ifdef WITH_IPP
+/* ------------------------------------------------------------------------- */
+PRIM_STATIC pstatus_t ipp_alphaComp_argb(
+	const BYTE *pSrc1,  INT32 src1Step,
+	const BYTE *pSrc2,  INT32 src2Step,
+	BYTE *pDst,  INT32 dstStep,
+	INT32 width,  INT32 height)
+{
+	IppiSize sz;
+	sz.width  = width;
+	sz.height = height;
+	return ippiAlphaComp_8u_AC4R(pSrc1, src1Step, pSrc2, src2Step,
+		pDst, dstStep, sz, ippAlphaOver);
+}
+#endif
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_alphaComp(
+	const primitives_hints_t *hints,
+	primitives_t *prims)
+{
+	prims->alphaComp_argb = general_alphaComp_argb;
+#ifdef WITH_IPP
+	prims->alphaComp_argb = ipp_alphaComp_argb;
+#elif defined(WITH_SSE2)
+	if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
+			&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE))	/* for LDDQU */
+	{
+		prims->alphaComp_argb = sse2_alphaComp_argb;
+	}
+#endif
+}
+
+/* ------------------------------------------------------------------------- */
+void primitives_deinit_alphaComp(
+	primitives_t *prims)
+{
+	/* Nothing to do. */
+}
--- a/libfreerdp/primitives/prim_andor.c
+++ b/libfreerdp/primitives/prim_andor.c
@ -0,0 +1,94 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Logical operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <config.h>
+#include <string.h>
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#ifdef WITH_SSE2
+# include <emmintrin.h>
+# include <pmmintrin.h>
+#endif /* WITH_SSE2 */
+#ifdef WITH_IPP
+# include <ipps.h>
+#endif /* WITH_IPP */
+#include "prim_internal.h"
+#include "prim_templates.h"
+
+/* ----------------------------------------------------------------------------
+ * 32-bit AND with a constant.
+ */
+PRIM_STATIC pstatus_t general_andC_32u(
+	const UINT32 *pSrc,
+	UINT32 val,
+	UINT32 *pDst,
+	INT32 len)
+{
+	if (val == 0) return PRIMITIVES_SUCCESS;
+	while (len--) *pDst++ = *pSrc++ & val;
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ----------------------------------------------------------------------------
+ * 32-bit OR with a constant.
+ */
+PRIM_STATIC pstatus_t general_orC_32u(
+	const UINT32 *pSrc,
+	UINT32 val,
+	UINT32 *pDst,
+	INT32 len)
+{
+	if (val == 0) return PRIMITIVES_SUCCESS;
+	while (len--) *pDst++ = *pSrc++ | val;
+	return PRIMITIVES_SUCCESS;
+}
+
+#ifdef WITH_SSE2
+# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
+/* ------------------------------------------------------------------------- */
+SSE3_SCD_PRE_ROUTINE(sse3_andC_32u, UINT32, general_andC_32u,
+	_mm_and_si128, *dptr++ = *sptr++ & val)
+SSE3_SCD_PRE_ROUTINE(sse3_orC_32u, UINT32, general_orC_32u,
+	_mm_or_si128, *dptr++ = *sptr++ | val)
+# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
+#endif
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_andor(
+	const primitives_hints_t *hints,
+	primitives_t *prims)
+{
+	/* Start with the default. */
+	prims->andC_32u = general_andC_32u;
+	prims->orC_32u  = general_orC_32u;
+#if defined(WITH_IPP)
+	prims->andC_32u = (__andC_32u_t) ippsAndC_32u;
+	prims->orC_32u  = (__orC_32u_t) ippsOrC_32u;
+#elif defined(WITH_SSE2)
+	if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
+			&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE))
+	{
+		prims->andC_32u = sse3_andC_32u;
+		prims->orC_32u  = sse3_orC_32u;
+	}
+#endif
+}
+
+/* ------------------------------------------------------------------------- */
+void primitives_deinit_andor(
+	primitives_t *prims)
+{
+	/* Nothing to do. */
+}
--- a/libfreerdp/primitives/prim_colors.c
+++ b/libfreerdp/primitives/prim_colors.c
@ -0,0 +1,742 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Color conversion operations.
+ * vi:ts=4 sw=4:
+ *
+ * Copyright 2011 Stephen Erisman
+ * Copyright 2011 Norbert Federa <nfedera@thinstuff.com>
+ * Copyright 2011 Martin Fleisz <mfleisz@thinstuff.com>
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <string.h>
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#ifdef WITH_SSE2
+#include <emmintrin.h>
+#elif WITH_NEON
+#include <arm_neon.h>
+#endif /* WITH_SSE2 else WITH_NEON */
+#include "prim_internal.h"
+#include "prim_templates.h"
+
+#ifndef MINMAX
+#define MINMAX(_v_, _l_, _h_) \
+	((_v_) < (_l_) ? (_l_) : ((_v_) > (_h_) ? (_h_) : (_v_)))
+#endif /* !MINMAX */
+
+/* ------------------------------------------------------------------------- */
+PRIM_STATIC pstatus_t general_yCbCrToRGB_16s16s_P3P3(
+	const INT16 *pSrc[3],  INT32 srcStep,
+	INT16 *pDst[3],  INT32 dstStep,
+	const prim_size_t *roi)	/* region of interest */
+{
+	/**
+	 * The decoded YCbCr coeffectients are represented as 11.5 fixed-point
+	 * numbers:
+	 *
+	 * 1 sign bit + 10 integer bits + 5 fractional bits
+	 *
+	 * However only 7 integer bits will be actually used since the value range
+	 * is [-128.0, 127.0].  In other words, the decoded coefficients are scaled
+	 * by << 5 when interpreted as INT16.
+	 * It was scaled in the quantization phase, so we must scale it back here.
+	 */
+	const INT16 *yptr  = pSrc[0];
+	const INT16 *cbptr = pSrc[1];
+	const INT16 *crptr = pSrc[2];
+	INT16 *rptr = pDst[0];
+	INT16 *gptr = pDst[1];
+	INT16 *bptr = pDst[2];
+	int srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
+	int dstbump = (dstStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
+	int y;
+
+	for (y=0; y<roi->height; y++)
+	{
+		int x;
+		for (x=0; x<roi->width; ++x)
+		{
+			/* INT32 is used intentionally because we calculate
+			 * with shifted factors!
+			 */
+			INT32 y  = (INT32) (*yptr++);
+			INT32 cb = (INT32) (*cbptr++);
+			INT32 cr = (INT32) (*crptr++);
+			INT32 r,g,b;
+
+			/*
+			 * This is the slow floating point version kept here for reference.
+			 * y = y + 4096; // 128<<5=4096 so that we can scale the sum by>>5
+			 * r = y + cr*1.403f;
+			 * g = y - cb*0.344f - cr*0.714f;
+			 * b = y + cb*1.770f;
+			 * y_r_buf[i]  = MINMAX(r>>5, 0, 255);
+			 * cb_g_buf[i] = MINMAX(g>>5, 0, 255);
+			 * cr_b_buf[i] = MINMAX(b>>5, 0, 255);
+			 */
+
+			/*
+			 * We scale the factors by << 16 into 32-bit integers in order to
+			 * avoid slower floating point multiplications.  Since the final
+			 * result needs to be scaled by >> 5 we will extract only the
+			 * upper 11 bits (>> 21) from the final sum.
+			 * Hence we also have to scale the other terms of the sum by << 16.
+			 * R: 1.403 << 16 = 91947
+			 * G: 0.344 << 16 = 22544, 0.714 << 16 = 46792
+			 * B: 1.770 << 16 = 115998
+			 */
+			y = (y+4096)<<16;
+
+			r = y + cr*91947;
+			g = y - cb*22544 - cr*46792;
+			b = y + cb*115998;
+
+			*rptr++ = MINMAX(r>>21, 0, 255);
+			*gptr++ = MINMAX(g>>21, 0, 255);
+			*bptr++ = MINMAX(b>>21, 0, 255);
+		}
+		yptr  += srcbump;
+		cbptr += srcbump;
+		crptr += srcbump;
+		rptr += dstbump;
+		gptr += dstbump;
+		bptr += dstbump;
+	}
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+PRIM_STATIC pstatus_t general_RGBToYCbCr_16s16s_P3P3(
+	const INT16 *pSrc[3],  INT32 srcStep,
+	INT16 *pDst[3],  INT32 dstStep,
+	const prim_size_t *roi)	/* region of interest */
+{
+	/* The encoded YCbCr coefficients are represented as 11.5 fixed-point
+	 * numbers:
+	 *
+	 * 1 sign bit + 10 integer bits + 5 fractional bits
+	 *
+	 * However only 7 integer bits will be actually used since the value
+	 * range is [-128.0, 127.0].  In other words, the encoded coefficients
+	 * is scaled by << 5 when interpreted as INT16.
+	 * It will be scaled down to original during the quantization phase.
+	 */
+	const INT16 *rptr = pSrc[0];
+	const INT16 *gptr = pSrc[1];
+	const INT16 *bptr = pSrc[2];
+	INT16 *yptr  = pDst[0];
+	INT16 *cbptr = pDst[1];
+	INT16 *crptr = pDst[2];
+	int srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
+	int dstbump = (dstStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
+	int y;
+
+	for (y=0; y<roi->height; y++)
+	{
+		int x;
+		for (x=0; x<roi->width; ++x)
+		{
+			/* INT32 is used intentionally because we calculate with
+			 * shifted factors!
+			 */
+			INT32 r = (INT32) (*rptr++);
+			INT32 g = (INT32) (*gptr++);
+			INT32 b = (INT32) (*bptr++);
+
+			/* We scale the factors by << 15 into 32-bit integers in order
+			 * to avoid slower floating point multiplications.  Since the
+			 * terms need to be scaled by << 5 we simply scale the final
+			 * sum by >> 10
+			 *
+			 * Y:  0.299000 << 15 = 9798,  0.587000 << 15 = 19235, 
+			 *     0.114000 << 15 = 3735
+			 * Cb: 0.168935 << 15 = 5535,  0.331665 << 15 = 10868, 
+			 *     0.500590 << 15 = 16403
+			 * Cr: 0.499813 << 15 = 16377, 0.418531 << 15 = 13714,
+			 *     0.081282 << 15 = 2663
+			 */
+			INT32 y  = (r *  9798 + g *  19235 + b *  3735) >> 10;
+			INT32 cb = (r * -5535 + g * -10868 + b * 16403) >> 10;
+			INT32 cr = (r * 16377 + g * -13714 + b * -2663) >> 10;
+
+			*yptr++  = (INT16) MINMAX(y - 4096, -4096, 4095);
+			*cbptr++ = (INT16) MINMAX(cb, -4096, 4095);
+			*crptr++ = (INT16) MINMAX(cr, -4096, 4095);
+		}
+		yptr  += srcbump;
+		cbptr += srcbump;
+		crptr += srcbump;
+		rptr += dstbump;
+		gptr += dstbump;
+		bptr += dstbump;
+	}
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+PRIM_STATIC pstatus_t general_RGBToRGB_16s8u_P3AC4R(
+	const INT16 *pSrc[3],	/* 16-bit R,G, and B arrays */
+	int srcStep,			/* bytes between rows in source data */
+	BYTE *pDst,			/* 32-bit interleaved ARGB (ABGR?) data */
+	int dstStep,			/* bytes between rows in dest data */
+	const prim_size_t *roi)	/* region of interest */
+{
+	const INT16 *r  = pSrc[0];
+	const INT16 *g  = pSrc[1];
+	const INT16 *b  = pSrc[2];
+	BYTE *dst = pDst;
+	int x,y;
+	int srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
+	int dstbump = (dstStep - (roi->width * sizeof(UINT32)));
+
+	for (y=0; y<roi->height; ++y)
+	{
+		for (x=0; x<roi->width; ++x)
+		{
+			*dst++ = (BYTE) (*b++);
+			*dst++ = (BYTE) (*g++);
+			*dst++ = (BYTE) (*r++);
+			*dst++ = ((BYTE) (0xFFU));
+		}
+		dst += dstbump;
+		r += srcbump;
+		g += srcbump;
+		b += srcbump;
+	}
+	return PRIMITIVES_SUCCESS;
+}
+
+
+#ifdef WITH_SSE2
+
+#ifdef __GNUC__
+# define GNU_INLINE \
+	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+#else
+# define GNU_INLINE
+#endif
+
+#define CACHE_LINE_BYTES	64
+
+#define _mm_between_epi16(_val, _min, _max) \
+	do { _val = _mm_min_epi16(_max, _mm_max_epi16(_val, _min)); } while (0)
+
+#ifdef DO_PREFETCH
+/*---------------------------------------------------------------------------*/
+static inline void GNU_INLINE _mm_prefetch_buffer(
+	char * buffer, 
+	int num_bytes)
+{
+	__m128i * buf = (__m128i*) buffer;
+	unsigned int i;
+	for (i = 0; i < (num_bytes / sizeof(__m128i)); 
+		i+=(CACHE_LINE_BYTES / sizeof(__m128i)))
+	{
+		_mm_prefetch((char*)(&buf[i]), _MM_HINT_NTA);
+	}
+}
+#endif /* DO_PREFETCH */
+
+/*---------------------------------------------------------------------------*/
+PRIM_STATIC pstatus_t sse2_yCbCrToRGB_16s16s_P3P3(
+	const INT16 *pSrc[3],
+	int srcStep,
+	INT16 *pDst[3],
+	int dstStep,
+	const prim_size_t *roi)	/* region of interest */
+{
+	__m128i zero, max, r_cr, g_cb, g_cr, b_cb, c4096;
+	__m128i *y_buf, *cb_buf, *cr_buf, *r_buf, *g_buf, *b_buf;
+	int srcbump, dstbump, yp, imax;
+
+	if (((ULONG_PTR) (pSrc[0]) & 0x0f)
+			|| ((ULONG_PTR) (pSrc[1]) & 0x0f)
+			|| ((ULONG_PTR) (pSrc[2]) & 0x0f)
+			|| ((ULONG_PTR) (pDst[0]) & 0x0f)
+			|| ((ULONG_PTR) (pDst[1]) & 0x0f)
+			|| ((ULONG_PTR) (pDst[2]) & 0x0f)
+			|| (roi->width & 0x07)
+			|| (srcStep & 127)
+			|| (dstStep & 127))
+	{
+		/* We can't maintain 16-byte alignment. */
+		return general_yCbCrToRGB_16s16s_P3P3(pSrc, srcStep,
+			pDst, dstStep, roi);
+	}
+
+	zero = _mm_setzero_si128();
+	max = _mm_set1_epi16(255);
+
+	y_buf  = (__m128i*) (pSrc[0]);
+	cb_buf = (__m128i*) (pSrc[1]);
+	cr_buf = (__m128i*) (pSrc[2]);
+	r_buf  = (__m128i*) (pDst[0]);
+	g_buf  = (__m128i*) (pDst[1]);
+	b_buf  = (__m128i*) (pDst[2]);
+
+	r_cr = _mm_set1_epi16(22986);	/*  1.403 << 14 */
+	g_cb = _mm_set1_epi16(-5636);	/* -0.344 << 14 */
+	g_cr = _mm_set1_epi16(-11698);	/* -0.714 << 14 */
+	b_cb = _mm_set1_epi16(28999);	/*  1.770 << 14 */
+	c4096 = _mm_set1_epi16(4096);
+	srcbump = srcStep / sizeof(__m128i);
+	dstbump = dstStep / sizeof(__m128i);
+
+#ifdef DO_PREFETCH
+	/* Prefetch Y's, Cb's, and Cr's. */
+	for (yp=0; yp<roi->height; yp++)
+	{
+		int i;
+		for (i=0; i<roi->width * sizeof(INT16) / sizeof(__m128i);
+			i += (CACHE_LINE_BYTES / sizeof(__m128i)))
+		{
+			_mm_prefetch((char*)(&y_buf[i]),  _MM_HINT_NTA);
+			_mm_prefetch((char*)(&cb_buf[i]), _MM_HINT_NTA);
+			_mm_prefetch((char*)(&cr_buf[i]), _MM_HINT_NTA);
+		}
+		y_buf  += srcbump;
+		cb_buf += srcbump;
+		cr_buf += srcbump;
+	}
+	y_buf  = (__m128i*) (pSrc[0]);
+	cb_buf = (__m128i*) (pSrc[1]);
+	cr_buf = (__m128i*) (pSrc[2]);
+#endif /* DO_PREFETCH */
+
+	imax = roi->width * sizeof(INT16) / sizeof(__m128i);
+	for (yp=0; yp<roi->height; ++yp)
+	{
+		int i;
+		for (i=0; i<imax; i++)
+		{
+			/* In order to use SSE2 signed 16-bit integer multiplication
+			 * we need to convert the floating point factors to signed int
+			 * without losing information.
+			 * The result of this multiplication is 32 bit and we have two
+			 * SSE instructions that return either the hi or lo word.
+			 * Thus we will multiply the factors by the highest possible 2^n,
+			 * take the upper 16 bits of the signed 32-bit result
+			 * (_mm_mulhi_epi16) and correct this result by multiplying
+			 * it by 2^(16-n).
+			 *
+			 * For the given factors in the conversion matrix the best
+			 * possible n is 14.
+			 *
+			 * Example for calculating r:
+			 * r = (y>>5) + 128 + (cr*1.403)>>5             // our base formula
+			 * r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5   // see above
+			 * r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5     // simplification
+			 * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
+			 */
+
+			/* y = (y_r_buf[i] + 4096) >> 2 */
+			__m128i y, cb, cr, r, g, b;
+			y = _mm_load_si128(y_buf + i);
+			y = _mm_add_epi16(y, c4096);
+			y = _mm_srai_epi16(y, 2);
+			/* cb = cb_g_buf[i]; */
+			cb = _mm_load_si128(cb_buf + i);
+			/* cr = cr_b_buf[i]; */
+			cr = _mm_load_si128(cr_buf + i);
+
+			/* (y + HIWORD(cr*22986)) >> 3 */
+			r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
+			r = _mm_srai_epi16(r, 3);
+
+			/* r_buf[i] = MINMAX(r, 0, 255); */
+			_mm_between_epi16(r, zero, max);
+			_mm_store_si128(r_buf + i, r);
+
+			/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
+			g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
+			g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
+			g = _mm_srai_epi16(g, 3);
+
+			/* g_buf[i] = MINMAX(g, 0, 255); */
+			_mm_between_epi16(g, zero, max);
+			_mm_store_si128(g_buf + i, g);
+
+			/* (y + HIWORD(cb*28999)) >> 3 */
+			b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
+			b = _mm_srai_epi16(b, 3);
+			/* b_buf[i] = MINMAX(b, 0, 255); */
+			_mm_between_epi16(b, zero, max);
+			_mm_store_si128(b_buf + i, b);
+		}
+		y_buf  += srcbump;
+		cb_buf += srcbump;
+		cr_buf += srcbump;
+		r_buf += dstbump;
+		g_buf += dstbump;
+		b_buf += dstbump;
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/*---------------------------------------------------------------------------*/
+/* The encodec YCbCr coeffectients are represented as 11.5 fixed-point
+ * numbers. See the general code above.
+ */
+PRIM_STATIC pstatus_t sse2_RGBToYCbCr_16s16s_P3P3(
+	const INT16 *pSrc[3],
+	int srcStep,
+	INT16 *pDst[3],
+	int dstStep,
+	const prim_size_t *roi)	/* region of interest */
+{
+	__m128i min, max, y_r, y_g, y_b, cb_r, cb_g, cb_b, cr_r, cr_g, cr_b;
+	__m128i *r_buf, *g_buf, *b_buf, *y_buf, *cb_buf, *cr_buf;
+	int srcbump, dstbump, yp, imax;
+
+	if (((ULONG_PTR) (pSrc[0]) & 0x0f)
+			|| ((ULONG_PTR) (pSrc[1]) & 0x0f)
+			|| ((ULONG_PTR) (pSrc[2]) & 0x0f)
+			|| ((ULONG_PTR) (pDst[0]) & 0x0f)
+			|| ((ULONG_PTR) (pDst[1]) & 0x0f)
+			|| ((ULONG_PTR) (pDst[2]) & 0x0f)
+			|| (roi->width & 0x07)
+			|| (srcStep & 127)
+			|| (dstStep & 127))
+	{
+		/* We can't maintain 16-byte alignment. */
+		return general_RGBToYCbCr_16s16s_P3P3(pSrc, srcStep,
+			pDst, dstStep, roi);
+	}
+
+	min = _mm_set1_epi16(-128 << 5);
+	max = _mm_set1_epi16(127 << 5);
+
+	r_buf  = (__m128i*) (pSrc[0]);
+	g_buf  = (__m128i*) (pSrc[1]);
+	b_buf  = (__m128i*) (pSrc[2]);
+	y_buf  = (__m128i*) (pDst[0]);
+	cb_buf = (__m128i*) (pDst[1]);
+	cr_buf = (__m128i*) (pDst[2]);
+
+	y_r  = _mm_set1_epi16(9798);   /*  0.299000 << 15 */
+	y_g  = _mm_set1_epi16(19235);  /*  0.587000 << 15 */
+	y_b  = _mm_set1_epi16(3735);   /*  0.114000 << 15 */
+	cb_r = _mm_set1_epi16(-5535);  /* -0.168935 << 15 */
+	cb_g = _mm_set1_epi16(-10868); /* -0.331665 << 15 */
+	cb_b = _mm_set1_epi16(16403);  /*  0.500590 << 15 */
+	cr_r = _mm_set1_epi16(16377);  /*  0.499813 << 15 */
+	cr_g = _mm_set1_epi16(-13714); /* -0.418531 << 15 */
+	cr_b = _mm_set1_epi16(-2663);  /* -0.081282 << 15 */
+
+	srcbump = srcStep / sizeof(__m128i);
+	dstbump = dstStep / sizeof(__m128i);
+
+#ifdef DO_PREFETCH
+	/* Prefetch RGB's. */
+	for (yp=0; yp<roi->height; yp++)
+	{
+		int i;
+		for (i=0; i<roi->width * sizeof(INT16) / sizeof(__m128i);
+			i += (CACHE_LINE_BYTES / sizeof(__m128i)))
+		{
+			_mm_prefetch((char*)(&r_buf[i]), _MM_HINT_NTA);
+			_mm_prefetch((char*)(&g_buf[i]), _MM_HINT_NTA);
+			_mm_prefetch((char*)(&b_buf[i]), _MM_HINT_NTA);
+		}
+		r_buf += srcbump;
+		g_buf += srcbump;
+		b_buf += srcbump;
+	}
+	r_buf = (__m128i*) (pSrc[0]);
+	g_buf = (__m128i*) (pSrc[1]);
+	b_buf = (__m128i*) (pSrc[2]);
+#endif /* DO_PREFETCH */
+
+	imax = roi->width * sizeof(INT16) / sizeof(__m128i);
+	for (yp=0; yp<roi->height; ++yp)
+	{
+		int i;
+		for (i=0; i<imax; i++)
+		{
+			/* In order to use SSE2 signed 16-bit integer multiplication we
+			 * need to convert the floating point factors to signed int
+			 * without loosing information.  The result of this multiplication
+			 * is 32 bit and using SSE2 we get either the product's hi or lo
+			 * word.  Thus we will multiply the factors by the highest
+			 * possible 2^n and take the upper 16 bits of the signed 32-bit
+			 * result (_mm_mulhi_epi16).  Since the final result needs to
+			 * be scaled by << 5 and also in in order to keep the precision
+			 * within the upper 16 bits we will also have to scale the RGB
+			 * values used in the multiplication by << 5+(16-n).
+			 */
+			__m128i r, g, b, y, cb, cr;
+			r = _mm_load_si128(y_buf+i);
+			g = _mm_load_si128(g_buf+i);
+			b = _mm_load_si128(b_buf+i);
+
+			/* r<<6; g<<6; b<<6 */
+			r = _mm_slli_epi16(r, 6);
+			g = _mm_slli_epi16(g, 6);
+			b = _mm_slli_epi16(b, 6);
+
+			/* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
+			y = _mm_mulhi_epi16(r, y_r);
+			y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
+			y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
+			y = _mm_add_epi16(y, min);
+			/* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
+			_mm_between_epi16(y, min, max);
+			_mm_store_si128(y_buf+i, y);
+
+			/* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
+			cb = _mm_mulhi_epi16(r, cb_r);
+			cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
+			cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
+			/* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
+			_mm_between_epi16(cb, min, max);
+			_mm_store_si128(cb_buf+i, cb);
+
+			/* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
+			cr = _mm_mulhi_epi16(r, cr_r);
+			cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
+			cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
+			/* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
+			_mm_between_epi16(cr, min, max);
+			_mm_store_si128(cr_buf+i, cr);
+		}
+		y_buf  += srcbump;
+		cb_buf += srcbump;
+		cr_buf += srcbump;
+		r_buf += dstbump;
+		g_buf += dstbump;
+		b_buf += dstbump;
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/*---------------------------------------------------------------------------*/
+#define LOAD128(_src_) \
+	_mm_load_si128((__m128i *) _src_)
+#define STORE128(_dst_, _src_) \
+	_mm_store_si128((__m128i *) _dst_, _src_)
+#define PUNPCKLBW(_dst_, _src_) \
+	_dst_ = _mm_unpacklo_epi8(_src_, _dst_)
+#define PUNPCKHBW(_dst_, _src_) \
+	_dst_ = _mm_unpackhi_epi8(_src_, _dst_)
+#define PUNPCKLWD(_dst_, _src_) \
+	_dst_ = _mm_unpacklo_epi16(_src_, _dst_)
+#define PUNPCKHWD(_dst_, _src_) \
+	_dst_ = _mm_unpackhi_epi16(_src_, _dst_)
+#define PACKUSWB(_dst_, _src_) \
+	_dst_ = _mm_packus_epi16(_dst_, _src_)
+#define PREFETCH(_ptr_) \
+	_mm_prefetch((const void *) _ptr_, _MM_HINT_T0)
+#define XMM_ALL_ONES \
+	_mm_set1_epi32(0xFFFFFFFFU)
+
+PRIM_STATIC pstatus_t sse2_RGBToRGB_16s8u_P3AC4R(
+	const INT16 *pSrc[3],	/* 16-bit R,G, and B arrays */
+	INT32 srcStep,			/* bytes between rows in source data */
+	BYTE *pDst,				/* 32-bit interleaved ARGB (ABGR?) data */
+	INT32 dstStep,			/* bytes between rows in dest data */
+	const prim_size_t *roi)	/* region of interest */
+{
+	const UINT16 *r = (const UINT16 *) (pSrc[0]);
+	const UINT16 *g = (const UINT16 *) (pSrc[1]);
+	const UINT16 *b = (const UINT16 *) (pSrc[2]);
+	BYTE *out;
+	int srcbump, dstbump, y;
+
+	/* Ensure 16-byte alignment on all pointers,
+	 * that width is a multiple of 8,
+	 * and that the next row will also remain aligned.
+	 * Since this is usually used for 64x64 aligned arrays,
+	 * these checks should presumably pass.
+	 */
+	if ((((ULONG_PTR) (pSrc[0]) & 0x0f) != 0)
+			|| (((ULONG_PTR) (pSrc[1]) & 0x0f) != 0)
+			|| (((ULONG_PTR) (pSrc[2]) & 0x0f) != 0)
+			|| (((ULONG_PTR) pDst & 0x0f) != 0)
+			|| (roi->width & 0x0f)
+			|| (srcStep & 0x0f)
+			|| (dstStep & 0x0f))
+	{
+		return general_RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, roi);
+	}
+
+	out = (BYTE *) pDst;
+	srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
+	dstbump = (dstStep - (roi->width * sizeof(UINT32)));
+
+	for (y=0; y<roi->height; ++y)
+	{
+		int width = roi->width;
+		do {
+			__m128i R0, R1, R2, R3, R4;
+			/* The comments below pretend these are 8-byte registers
+			 * rather than 16-byte, for readability.
+			 */
+			R0 = LOAD128(b);  b += 8;		/* R0 = 00B300B200B100B0 */
+			R1 = LOAD128(b);  b += 8;		/* R1 = 00B700B600B500B4 */
+			PACKUSWB(R0,R1);				/* R0 = B7B6B5B4B3B2B1B0 */
+			R1 = LOAD128(g);  g += 8;		/* R1 = 00G300G200G100G0 */
+			R2 = LOAD128(g);  g += 8;		/* R2 = 00G700G600G500G4 */
+			PACKUSWB(R1,R2);				/* R1 = G7G6G5G4G3G2G1G0 */
+			R2 = R1;						/* R2 = G7G6G5G4G3G2G1G0 */
+			PUNPCKLBW(R2,R0);				/* R2 = G3B3G2B2G1B1G0B0 */
+			PUNPCKHBW(R1,R0);				/* R1 = G7B7G6B7G5B5G4B4 */
+			R0 = LOAD128(r);  r += 8;		/* R0 = 00R300R200R100R0 */
+			R3 = LOAD128(r);  r += 8;		/* R3 = 00R700R600R500R4 */
+			PACKUSWB(R0,R3);				/* R0 = R7R6R5R4R3R2R1R0 */
+			R3 = XMM_ALL_ONES;				/* R3 = FFFFFFFFFFFFFFFF */
+			R4 = R3;						/* R4 = FFFFFFFFFFFFFFFF */
+			PUNPCKLBW(R4,R0);				/* R4 = FFR3FFR2FFR1FFR0 */
+			PUNPCKHBW(R3,R0);				/* R3 = FFR7FFR6FFR5FFR4 */
+			R0 = R4;						/* R0 = R4               */
+			PUNPCKLWD(R0,R2);				/* R0 = FFR1G1B1FFR0G0B0 */
+			PUNPCKHWD(R4,R2);				/* R4 = FFR3G3B3FFR2G2B2 */
+			R2 = R3;						/* R2 = R3               */
+			PUNPCKLWD(R2,R1);				/* R2 = FFR5G5B5FFR4G4B4 */
+			PUNPCKHWD(R3,R1);				/* R3 = FFR7G7B7FFR6G6B6 */
+			STORE128(out, R0);  out += 16;	/* FFR1G1B1FFR0G0B0      */
+			STORE128(out, R4);  out += 16;	/* FFR3G3B3FFR2G2B2      */
+			STORE128(out, R2);  out += 16;	/* FFR5G5B5FFR4G4B4      */
+			STORE128(out, R3);  out += 16;	/* FFR7G7B7FFR6G6B6      */
+		} while (width -= 16);
+		/* Jump to next row. */
+		r += srcbump;
+		g += srcbump;
+		b += srcbump;
+		out += dstbump;
+	}
+	return PRIMITIVES_SUCCESS;
+}
+#endif /* WITH_SSE2 */
+
+/*---------------------------------------------------------------------------*/
+#ifdef WITH_NEON
+PRIM_STATIC pstatus_t neon_yCbCrToRGB_16s16s_P3P3(
+	const INT16 *pSrc[3],
+	int srcStep,
+	INT16 *pDst[3],
+	int dstStep,
+	const prim_size_t *roi)	/* region of interest */
+{
+	/* TODO: If necessary, check alignments and call the general version. */
+
+	int16x8_t zero = vdupq_n_s16(0);
+	int16x8_t max = vdupq_n_s16(255);
+	int16x8_t y_add = vdupq_n_s16(128);
+
+	int16x8_t* y_buf  = (int16x8_t*) pSrc[0];
+	int16x8_t* cb_buf = (int16x8_t*) pSrc[1];
+	int16x8_t* cr_buf = (int16x8_t*) pSrc[2];
+	int16x8_t* r_buf  = (int16x8_t*) pDst[0];
+	int16x8_t* g_buf  = (int16x8_t*) pDst[1];
+	int16x8_t* b_buf  = (int16x8_t*) pDst[2];
+
+	int srcbump = srcStep / sizeof(int16x8_t);
+	int dstbump = dstStep / sizeof(int16x8_t);
+	int yp;
+
+	int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t);
+	for (yp=0; yp<roi->height; ++yp)
+	{
+		int i;
+		for (i=0; i<imax; i++)
+		{
+			int16x8_t y = vld1q_s16((INT16*) (y_buf+i));
+			y = vaddq_s16(y, y_add);
+
+			int16x8_t cr = vld1q_s16((INT16*) (cr_buf+i));
+
+			/* r = between((y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5)),
+			 *    0, 255);
+			 */
+			int16x8_t r = vaddq_s16(y, cr);
+			r = vaddq_s16(r, vshrq_n_s16(cr, 2));
+			r = vaddq_s16(r, vshrq_n_s16(cr, 3));
+			r = vaddq_s16(r, vshrq_n_s16(cr, 5));
+			r = vminq_s16(vmaxq_s16(r, zero), max);
+			vst1q_s16((INT16*) (r_buf+i), r);
+
+			/* cb = cb_g_buf[i]; */
+			int16x8_t cb = vld1q_s16((INT16*) (cb_buf+i));
+
+			/* g = between(y - (cb >> 2) - (cb >> 4) - (cb >> 5) - (cr >> 1)
+			 * - (cr >> 3) - (cr >> 4) - (cr >> 5), 0, 255);
+			 */
+			int16x8_t g = vsubq_s16(y, vshrq_n_s16(cb, 2));
+			g = vsubq_s16(g, vshrq_n_s16(cb, 4));
+			g = vsubq_s16(g, vshrq_n_s16(cb, 5));
+			g = vsubq_s16(g, vshrq_n_s16(cr, 1));
+			g = vsubq_s16(g, vshrq_n_s16(cr, 3));
+			g = vsubq_s16(g, vshrq_n_s16(cr, 4));
+			g = vsubq_s16(g, vshrq_n_s16(cr, 5));
+			g = vminq_s16(vmaxq_s16(g, zero), max);
+			vst1q_s16((INT16*) (g_buf+i), g);
+
+			/* b = between((y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6)),
+			 * 0, 255);
+			 */
+			int16x8_t b = vaddq_s16(y, cb);
+			b = vaddq_s16(b, vshrq_n_s16(cb, 1));
+			b = vaddq_s16(b, vshrq_n_s16(cb, 2));
+			b = vaddq_s16(b, vshrq_n_s16(cb, 6));
+			b = vminq_s16(vmaxq_s16(b, zero), max);
+			vst1q_s16((INT16*) (b_buf+i), b);
+		}
+		y_buf  += srcbump;
+		cb_buf += srcbump;
+		cr_buf += srcbump;
+		r_buf += dstbump;
+		g_buf += dstbump;
+		b_buf += dstbump;
+	}
+}
+#endif /* WITH_NEON */
+
+
+/* I don't see a direct IPP version of this, since the input is INT16
+ * YCbCr.  It may be possible via  Deinterleave and then YCbCrToRGB_<mod>.
+ * But that would likely be slower.
+ */
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_colors(
+	const primitives_hints_t *hints,
+	primitives_t *prims)
+{
+	prims->RGBToRGB_16s8u_P3AC4R  = general_RGBToRGB_16s8u_P3AC4R;
+	prims->yCbCrToRGB_16s16s_P3P3 = general_yCbCrToRGB_16s16s_P3P3;
+	prims->RGBToYCbCr_16s16s_P3P3 = general_RGBToYCbCr_16s16s_P3P3;
+#if defined(WITH_SSE2)
+	if (hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
+	{
+		prims->RGBToRGB_16s8u_P3AC4R  = sse2_RGBToRGB_16s8u_P3AC4R;
+		prims->yCbCrToRGB_16s16s_P3P3 = sse2_yCbCrToRGB_16s16s_P3P3;
+		prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3;
+	}
+#elif defined(WITH_NEON)
+	if (hints->arm_flags & PRIM_ARM_NEON_AVAILABLE)
+	{
+		prims->yCbCrToRGB_16s16s_P3P3 = neon_yCbCrToRGB_16s16s_P3P3;
+	}
+#endif /* WITH_SSE2 */
+}
+
+/* ------------------------------------------------------------------------- */
+void primitives_deinit_colors(
+	primitives_t *prims)
+{
+	/* Nothing to do. */
+}
--- a/libfreerdp/primitives/prim_copy.c
+++ b/libfreerdp/primitives/prim_copy.c
@ -0,0 +1,177 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Copy operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <config.h>
+#include <string.h>
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#ifdef WITH_IPP
+# include <ipps.h>
+# include <ippi.h>
+#endif /* WITH_IPP */
+#include "prim_internal.h"
+
+/* ------------------------------------------------------------------------- */
+/*static inline BOOL memory_regions_overlap_1d(*/
+static BOOL memory_regions_overlap_1d(
+	const BYTE *p1,
+	const BYTE *p2,
+	size_t bytes)
+{
+	const ULONG_PTR p1m = (const ULONG_PTR) p1;
+	const ULONG_PTR p2m = (const ULONG_PTR) p2;
+	if (p1m <= p2m)
+	{
+		if (p1m + bytes > p2m) return TRUE;
+	}
+	else
+	{
+		if (p2m + bytes > p1m) return TRUE;
+	}
+	/* else */
+	return FALSE;
+}
+
+/* ------------------------------------------------------------------------- */
+/*static inline BOOL memory_regions_overlap_2d( */
+static BOOL memory_regions_overlap_2d(
+	const BYTE *p1,  int p1Step,  int p1Size,
+	const BYTE *p2,  int p2Step,  int p2Size,
+	int width,  int height)
+{
+	ULONG_PTR p1m = (ULONG_PTR) p1;
+	ULONG_PTR p2m = (ULONG_PTR) p2;
+
+	if (p1m <= p2m)
+	{
+		ULONG_PTR p1mEnd = p1m + (height-1)*p1Step + width*p1Size;
+		if (p1mEnd > p2m) return TRUE;
+	}
+	else
+	{
+		ULONG_PTR p2mEnd = p2m + (height-1)*p2Step + width*p2Size;
+		if (p2mEnd > p1m) return TRUE;
+	}
+	/* else */
+	return FALSE;
+}
+
+/* ------------------------------------------------------------------------- */
+PRIM_STATIC pstatus_t general_copy_8u(
+	const BYTE *pSrc,
+	BYTE *pDst,
+	INT32 len)
+{
+	if (memory_regions_overlap_1d(pSrc, pDst, (size_t) len))
+	{
+		memmove((void *) pDst, (const void *) pSrc, (size_t) len);
+	}
+	else
+	{
+		memcpy((void *) pDst, (const void *) pSrc, (size_t) len);
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+/* Copy a block of pixels from one buffer to another.
+ * The addresses are assumed to have been already offset to the upper-left
+ * corners of the source and destination region of interest.
+ */
+PRIM_STATIC pstatus_t general_copy_8u_AC4r(
+	const BYTE *pSrc,  INT32 srcStep,
+	BYTE *pDst,  INT32 dstStep,
+	INT32 width,  INT32 height)
+{
+	primitives_t *prims = primitives_get();
+	const BYTE *src = (const BYTE *) pSrc;
+	BYTE *dst = (BYTE *) pDst;
+	int rowbytes = width * sizeof(UINT32);
+
+	if ((width == 0) || (height == 0)) return PRIMITIVES_SUCCESS;
+
+	if (memory_regions_overlap_2d(pSrc, srcStep, sizeof(UINT32),
+			pDst, dstStep, sizeof(UINT32), width, height))
+	{
+		do {
+			prims->copy(src, dst, rowbytes);
+			src += srcStep;
+			dst += dstStep;
+		} while (--height);
+	}
+	else
+	{
+		/* TODO: do it in one operation when the rowdata is adjacent. */
+		do {
+			/* If we find a replacement for memcpy that is consistently
+			 * faster, this could be replaced with that.
+			 */
+			memcpy(dst, src, rowbytes);
+			src += srcStep;
+			dst += dstStep;
+		} while (--height);
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+#ifdef WITH_IPP
+/* ------------------------------------------------------------------------- */
+/* This is just ippiCopy_8u_AC4R without the IppiSize structure parameter.   */
+static pstatus_t ippiCopy_8u_AC4r(
+	const BYTE *pSrc,  INT32 srcStep,
+	BYTE *pDst,  INT32 dstStep,
+	INT32 width,  INT32 height)
+{
+	IppiSize roi;
+	roi.width  = width;
+	roi.height = height;
+	return (pstatus_t) ippiCopy_8u_AC4R(pSrc, srcStep, pDst, dstStep, roi);
+}
+#endif /* WITH_IPP */
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_copy(
+	const primitives_hints_t *hints,
+	primitives_t *prims)
+{
+	/* Start with the default. */
+	prims->copy_8u = general_copy_8u;
+	prims->copy_8u_AC4r = general_copy_8u_AC4r;
+
+	/* Pick tuned versions if possible. */
+#ifdef WITH_IPP
+	prims->copy_8u = (__copy_8u_t) ippsCopy_8u;
+	prims->copy_8u_AC4r = (__copy_8u_AC4r_t) ippiCopy_8u_AC4r;
+#endif
+	/* Performance with an SSE2 version with no prefetch seemed to be
+	 * all over the map vs. memcpy.  
+	 * Sometimes it was significantly faster, sometimes dreadfully slower,
+	 * and it seemed to vary a lot depending on block size and processor.
+	 * Hence, no SSE version is used here unless once can be written that
+	 * is consistently faster than memcpy.
+	 */
+
+	/* This is just an alias with void* parameters */
+	prims->copy    = (__copy_t) (prims->copy_8u);
+}
+
+/* ------------------------------------------------------------------------- */
+void primitives_deinit_copy(
+	primitives_t *prims)
+{
+	/* Nothing to do. */
+}
--- a/libfreerdp/primitives/prim_internal.h
+++ b/libfreerdp/primitives/prim_internal.h
@ -0,0 +1,105 @@
+/* prim_internal.h
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.  Algorithms used by
+ * this code may be covered by patents by HP, Microsoft, or other parties.
+ *
+ */
+
+#ifdef __GNUC__
+# pragma once
+#endif
+
+#ifndef __PRIM_INTERNAL_H_INCLUDED__
+#define __PRIM_INTERNAL_H_INCLUDED__
+
+#ifndef CMAKE_BUILD_TYPE
+#define CMAKE_BUILD_TYPE Release
+#endif
+
+#include <freerdp/primitives.h>
+
+/* Normally the internal entrypoints should be static, but a benchmark
+ * program may want to access them directly and turn this off.
+ */
+#ifndef PRIM_STATIC
+# define PRIM_STATIC static
+#else
+# undef PRIM_STATIC
+# define PRIM_STATIC
+#endif /* !PRIM_STATIC */
+
+/* Use lddqu for unaligned; load for 16-byte aligned. */
+#define LOAD_SI128(_ptr_) \
+	(((ULONG_PTR) (_ptr_) & 0x0f) \
+		? _mm_lddqu_si128((__m128i *) (_ptr_)) \
+		: _mm_load_si128((__m128i *) (_ptr_)))
+
+/* This structure can (eventually) be used to provide hints to the
+ * initialization routines, e.g. whether SSE2 or NEON or IPP instructions
+ * or calls are available.
+ */
+typedef struct
+{
+	UINT32 x86_flags;
+	UINT32 arm_flags;
+} primitives_hints_t;
+
+/* Function prototypes for all the init/deinit routines. */
+extern void primitives_init_copy(
+	const primitives_hints_t *hints,
+	primitives_t *prims);
+extern void primitives_deinit_copy(
+	primitives_t *prims);
+
+extern void primitives_init_set(
+	const primitives_hints_t *hints,
+	primitives_t *prims);
+extern void primitives_deinit_set(
+	primitives_t *prims);
+
+extern void primitives_init_add(
+	const primitives_hints_t *hints,
+	primitives_t *prims);
+extern void primitives_deinit_add(
+	primitives_t *prims);
+
+extern void primitives_init_andor(
+	const primitives_hints_t *hints,
+	primitives_t *prims);
+extern void primitives_deinit_andor(
+	primitives_t *prims);
+
+extern void primitives_init_shift(
+	const primitives_hints_t *hints,
+	primitives_t *prims);
+extern void primitives_deinit_shift(
+	primitives_t *prims);
+
+extern void primitives_init_sign(
+	const primitives_hints_t *hints,
+	primitives_t *prims);
+extern void primitives_deinit_sign(
+	primitives_t *prims);
+
+extern void primitives_init_alphaComp(
+	const primitives_hints_t *hints,
+	primitives_t *prims);
+extern void primitives_deinit_alphaComp(
+	primitives_t *prims);
+
+extern void primitives_init_colors(
+	const primitives_hints_t *hints,
+	primitives_t *prims);
+extern void primitives_deinit_colors(
+	primitives_t *prims);
+
+#endif /* !__PRIM_INTERNAL_H_INCLUDED__ */
--- a/libfreerdp/primitives/prim_set.c
+++ b/libfreerdp/primitives/prim_set.c
@ -0,0 +1,309 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Routines to set a chunk of memory to a constant.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ *
+ */
+
+#include <config.h>
+#include <string.h>
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#ifdef WITH_SSE2
+# include <emmintrin.h>
+#endif /* WITH_SSE2 */
+#ifdef WITH_IPP
+# include <ipps.h>
+#endif /* WITH_IPP */
+#include "prim_internal.h"
+
+/* ========================================================================= */
+PRIM_STATIC pstatus_t general_set_8u(
+	BYTE val,
+	BYTE *pDst,
+	INT32 len)
+{
+	memset((void *) pDst, (int) val, (size_t) len);
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+PRIM_STATIC pstatus_t general_zero(
+	void *pDst,
+	size_t len)
+{
+	memset(pDst, 0, len);
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+#ifdef WITH_SSE2
+# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
+PRIM_STATIC pstatus_t sse2_set_8u(
+	BYTE val,
+	BYTE *pDst,
+	INT32 len)
+{
+	BYTE byte, *dptr;
+	__m128i xmm0;
+	size_t count;
+
+	if (len < 16) return general_set_8u(val, pDst, len);
+
+	byte  = val;
+	dptr = (BYTE *) pDst;
+
+	/* Seek 16-byte alignment. */
+	while ((ULONG_PTR) dptr & 0x0f)
+	{
+		*dptr++ = byte;
+		if (--len == 0) return PRIMITIVES_SUCCESS;
+	}
+
+	xmm0 = _mm_set1_epi8(byte);
+
+	/* Cover 256-byte chunks via SSE register stores. */
+	count = len >> 8;
+	len -= count << 8;
+	/* Do 256-byte chunks using one XMM register. */
+	while (count--)
+	{
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+	}
+
+	/* Cover 16-byte chunks via SSE register stores. */
+	count = len >> 4;
+	len -= count << 4;
+	/* Do 16-byte chunks using one XMM register. */
+	while (count--)
+	{
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 16;
+	}
+
+	/* Do leftover bytes. */
+	while (len--) *dptr++ = byte;
+
+	return PRIMITIVES_SUCCESS;
+}
+# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
+#endif /* WITH_SSE2 */
+
+/* ========================================================================= */
+PRIM_STATIC pstatus_t general_set_32s(
+	INT32 val,
+	INT32 *pDst,
+	INT32 len)
+{
+	INT32 *dptr = (INT32 *) pDst;
+	size_t span, remaining;
+	primitives_t *prims;
+
+	if (len < 256)
+	{
+		while (len--) *dptr++ = val;
+		return PRIMITIVES_SUCCESS;
+	}
+
+	/* else quadratic growth memcpy algorithm */
+	span = 1;
+	*dptr = val;
+	remaining = len - 1;
+	prims = primitives_get();
+	while (remaining)
+	{
+		size_t thiswidth = span;
+		if (thiswidth > remaining) thiswidth = remaining;
+		prims->copy_8u((BYTE *) dptr, (BYTE *) (dptr + span), thiswidth<<2);
+		remaining -= thiswidth;
+		span <<= 1;
+	}
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+PRIM_STATIC pstatus_t general_set_32u(
+	UINT32 val,
+	UINT32 *pDst,
+	INT32 len)
+{
+	UINT32 *dptr = (UINT32 *) pDst;
+	size_t span, remaining;
+	primitives_t *prims;
+
+	if (len < 256)
+	{
+		while (len--) *dptr++ = val;
+		return PRIMITIVES_SUCCESS;
+	}
+
+	/* else quadratic growth memcpy algorithm */
+	span = 1;
+	*dptr = val;
+	remaining = len - 1;
+	prims = primitives_get();
+	while (remaining)
+	{
+		size_t thiswidth = span;
+		if (thiswidth > remaining) thiswidth = remaining;
+		prims->copy_8u((BYTE *) dptr, (BYTE *) (dptr + span), thiswidth<<2);
+		remaining -= thiswidth;
+		span <<= 1;
+	}
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+#ifdef WITH_SSE2
+# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
+PRIM_STATIC pstatus_t sse2_set_32u(
+	UINT32 val,
+	UINT32 *pDst,
+	INT32 len)
+{
+	UINT32 *dptr = (UINT32 *) pDst;
+	__m128i xmm0;
+	size_t count;
+
+	/* If really short, just do it here. */
+	if (len < 32)
+	{
+		while (len--) *dptr++ = val;
+		return PRIMITIVES_SUCCESS;
+	}
+
+	/* Assure we can reach 16-byte alignment. */
+	if (((ULONG_PTR) dptr & 0x03) != 0)
+	{
+		return general_set_32u(val, pDst, len);
+	}
+
+	/* Seek 16-byte alignment. */
+	while ((ULONG_PTR) dptr & 0x0f)
+	{
+		*dptr++ = val;
+		if (--len == 0) return PRIMITIVES_SUCCESS;
+	}
+
+	xmm0 = _mm_set1_epi32(val);
+
+	/* Cover 256-byte chunks via SSE register stores. */
+	count = len >> 6;
+	len -= count << 6;
+	/* Do 256-byte chunks using one XMM register. */
+	while (count--)
+	{
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+	}
+
+	/* Cover 16-byte chunks via SSE register stores. */
+	count = len >> 2;
+	len -= count << 2;
+	/* Do 16-byte chunks using one XMM register. */
+	while (count--)
+	{
+		_mm_store_si128((__m128i *) dptr, xmm0);  dptr += 4;
+	}
+
+	/* Do leftover bytes. */
+	while (len--) *dptr++ = val;
+
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+PRIM_STATIC pstatus_t sse2_set_32s(
+	INT32 val,
+	INT32 *pDst,
+	INT32 len)
+{
+	UINT32 uval = *((UINT32 *) &val);
+	return sse2_set_32u(uval, (UINT32 *) pDst, len);
+}
+# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
+#endif /* WITH_SSE2 */
+
+#ifdef WITH_IPP
+/* ------------------------------------------------------------------------- */
+PRIM_STATIC pstatus_t ipp_wrapper_set_32u(
+	UINT32 val,
+	UINT32 *pDst,
+	INT32 len)
+{
+	/* A little type conversion, then use the signed version. */
+	INT32 sval = *((INT32 *) &val);
+	return ippsSet_32s(sval, (INT32 *) pDst, len);
+}
+#endif
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_set(
+	const primitives_hints_t *hints,
+	primitives_t *prims)
+{
+	/* Start with the default. */
+	prims->set_8u  = general_set_8u;
+	prims->set_32s = general_set_32s;
+	prims->set_32u = general_set_32u;
+	prims->zero = general_zero;
+
+	/* Pick tuned versions if possible. */
+#ifdef WITH_IPP
+	prims->set_8u  = (__set_8u_t)  ippsSet_8u;
+	prims->set_32s = (__set_32s_t) ippsSet_32s;
+	prims->set_32u = (__set_32u_t) ipp_wrapper_set_32u;
+	prims->zero = (__zero_t) ippsZero_8u;
+#elif defined(WITH_SSE2)
+	if (hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
+	{
+		prims->set_8u  = sse2_set_8u;
+		prims->set_32s = sse2_set_32s;
+		prims->set_32u = sse2_set_32u;
+	}
+#endif
+}
+
+/* ------------------------------------------------------------------------- */
+void primitives_deinit_set(
+	primitives_t *prims)
+{
+	/* Nothing to do. */
+}
--- a/libfreerdp/primitives/prim_shift.c
+++ b/libfreerdp/primitives/prim_shift.c
@ -0,0 +1,165 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Shift operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <config.h>
+#include <string.h>
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#ifdef WITH_SSE2
+# include <emmintrin.h>
+# include <pmmintrin.h>
+#endif /* WITH_SSE2 */
+#ifdef WITH_IPP
+# include <ipps.h>
+#endif /* WITH_IPP */
+#include "prim_internal.h"
+#include "prim_templates.h"
+
+/* ------------------------------------------------------------------------- */
+PRIM_STATIC pstatus_t general_lShiftC_16s(
+	const INT16 *pSrc,
+	INT32 val,
+	INT16 *pDst,
+	INT32 len)
+{
+	if (val == 0) return PRIMITIVES_SUCCESS;
+	while (len--) *pDst++ = *pSrc++ << val;
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+PRIM_STATIC pstatus_t general_rShiftC_16s(
+	const INT16 *pSrc,
+	INT32 val,
+	INT16 *pDst,
+	INT32 len)
+{
+	if (val == 0) return PRIMITIVES_SUCCESS;
+	while (len--) *pDst++ = *pSrc++ >> val;
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+PRIM_STATIC pstatus_t general_lShiftC_16u(
+	const UINT16 *pSrc,
+	INT32 val,
+	UINT16 *pDst,
+	INT32 len)
+{
+	if (val == 0) return PRIMITIVES_SUCCESS;
+	while (len--) *pDst++ = *pSrc++ << val;
+	return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+PRIM_STATIC pstatus_t general_rShiftC_16u(
+	const UINT16 *pSrc,
+	INT32 val,
+	UINT16 *pDst,
+	INT32 len)
+{
+	if (val == 0) return PRIMITIVES_SUCCESS;
+	while (len--) *pDst++ = *pSrc++ >> val;
+	return PRIMITIVES_SUCCESS;
+}
+
+#ifdef WITH_SSE2
+# if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
+/* ------------------------------------------------------------------------- */
+SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, general_lShiftC_16s,
+	_mm_slli_epi16, *dptr++ = *sptr++ << val)
+/* ------------------------------------------------------------------------- */
+SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, general_rShiftC_16s,
+	_mm_srai_epi16, *dptr++ = *sptr++ >> val)
+/* ------------------------------------------------------------------------- */
+SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, general_lShiftC_16u,
+	_mm_slli_epi16, *dptr++ = *sptr++ << val)
+/* ------------------------------------------------------------------------- */
+SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, general_rShiftC_16u,
+	_mm_srli_epi16, *dptr++ = *sptr++ >> val)
+# endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
+#endif
+
+/* ------------------------------------------------------------------------- */
+PRIM_STATIC pstatus_t general_shiftC_16s(
+	const INT16 *pSrc,
+	INT32 val,
+	INT16 *pDst,
+	INT32 len)
+{
+	primitives_t *prims;
+
+	if (val == 0) return PRIMITIVES_SUCCESS;
+	prims = primitives_get();
+	if (val < 0) return prims->rShiftC_16s(pSrc, -val, pDst, len);
+	else         return prims->lShiftC_16s(pSrc,  val, pDst, len);
+}
+
+/* ------------------------------------------------------------------------- */
+PRIM_STATIC pstatus_t general_shiftC_16u(
+	const UINT16 *pSrc,
+	INT32 val,
+	UINT16 *pDst,
+	INT32 len)
+{
+	primitives_t *prims;
+
+	if (val == 0) return PRIMITIVES_SUCCESS;
+	prims = primitives_get();
+	if (val < 0) return prims->rShiftC_16u(pSrc, -val, pDst, len);
+	else         return prims->lShiftC_16u(pSrc,  val, pDst, len);
+}
+
+/* Note: the IPP version will have to call ippLShiftC_16s or ippRShiftC_16s
+ * depending on the sign of val.  To avoid using the deprecated inplace
+ * routines, a wrapper can use the src for the dest.
+ */
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_shift(
+	const primitives_hints_t *hints,
+	primitives_t *prims)
+{
+	/* Start with the default. */
+	prims->lShiftC_16s = general_lShiftC_16s;
+	prims->rShiftC_16s = general_rShiftC_16s;
+	prims->lShiftC_16u = general_lShiftC_16u;
+	prims->rShiftC_16u = general_rShiftC_16u;
+#if defined(WITH_IPP)
+	prims->lShiftC_16s = (__lShiftC_16s_t) ippsLShiftC_16s;
+	prims->rShiftC_16s = (__rShiftC_16s_t) ippsRShiftC_16s;
+	prims->lShiftC_16u = (__lShiftC_16u_t) ippsLShiftC_16u;
+	prims->rShiftC_16u = (__rShiftC_16u_t) ippsRShiftC_16u;
+#elif defined(WITH_SSE2)
+	if ((hints->x86_flags & PRIM_X86_SSE2_AVAILABLE)
+			&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE))
+	{
+		prims->lShiftC_16s = sse2_lShiftC_16s;
+		prims->rShiftC_16s = sse2_rShiftC_16s;
+		prims->lShiftC_16u = sse2_lShiftC_16u;
+		prims->rShiftC_16u = sse2_rShiftC_16u;
+	}
+#endif
+	/* Wrappers */
+	prims->shiftC_16s  = general_shiftC_16s;
+	prims->shiftC_16u  = general_shiftC_16u;
+}
+
+/* ------------------------------------------------------------------------- */
+void primitives_deinit_shift(
+	primitives_t *prims)
+{
+	/* Nothing to do. */
+}
--- a/libfreerdp/primitives/prim_sign.c
+++ b/libfreerdp/primitives/prim_sign.c
@ -0,0 +1,170 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Sign operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <config.h>
+#include <string.h>
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#ifdef WITH_SSE2
+# include <emmintrin.h>
+# include <tmmintrin.h>
+#endif /* WITH_SSE2 */
+#include "prim_internal.h"
+
+/* ----------------------------------------------------------------------------
+ * Set pDst to the sign-value of the 16-bit values in pSrc (-1, 0, or 1).
+ */
+PRIM_STATIC pstatus_t general_sign_16s(
+	const INT16 *pSrc,
+	INT16 *pDst,
+	INT32 len)
+{
+	while (len--)
+	{
+		INT16 src = *pSrc++;
+		*pDst++ = (src < 0) ? (-1) : ((src > 0) ? 1 : 0);
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+
+#ifdef WITH_SSE2
+/* ------------------------------------------------------------------------- */
+PRIM_STATIC pstatus_t ssse3_sign_16s(
+	const INT16 *pSrc,
+	INT16 *pDst,
+	INT32 len)
+{
+	const INT16 *sptr = (const INT16 *) pSrc;
+	INT16 *dptr = (INT16 *) pDst;
+	size_t count;
+
+	if (len < 16)
+	{
+		return general_sign_16s(pSrc, pDst, len);
+	}
+
+	/* Check for 16-byte alignment (eventually). */
+	if ((ULONG_PTR) pDst & 0x01)
+	{
+		return general_sign_16s(pSrc, pDst, len);
+	}
+
+	/* Seek 16-byte alignment. */
+	while ((ULONG_PTR) dptr & 0x0f)
+	{
+		INT16 src = *sptr++;
+		*dptr++ = (src < 0) ? (-1) : ((src > 0) ? 1 : 0);
+		if (--len == 0) return PRIMITIVES_SUCCESS;
+	}
+
+	/* Do 32-short chunks using 8 XMM registers. */
+	count = len >> 5;	/* / 32  */
+	len -= count << 5;	/* * 32 */
+	if ((ULONG_PTR) sptr & 0x0f)
+	{
+		/* Unaligned */
+		while (count--)
+		{
+			__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+			xmm0 = _mm_set1_epi16(0x0001U);
+			xmm1 = _mm_set1_epi16(0x0001U);
+			xmm2 = _mm_set1_epi16(0x0001U);
+			xmm3 = _mm_set1_epi16(0x0001U);
+			xmm4 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8;
+			xmm5 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8;
+			xmm6 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8;
+			xmm7 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8;
+			xmm0 = _mm_sign_epi16(xmm0, xmm4);
+			xmm1 = _mm_sign_epi16(xmm1, xmm5);
+			xmm2 = _mm_sign_epi16(xmm2, xmm6);
+			xmm3 = _mm_sign_epi16(xmm3, xmm7);
+			_mm_store_si128((__m128i *) dptr, xmm0);         dptr += 8;
+			_mm_store_si128((__m128i *) dptr, xmm1);         dptr += 8;
+			_mm_store_si128((__m128i *) dptr, xmm2);         dptr += 8;
+			_mm_store_si128((__m128i *) dptr, xmm3);         dptr += 8;
+		}
+	}
+	else
+	{
+		/* Aligned */
+		while (count--)
+		{
+			__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+			xmm0 = _mm_set1_epi16(0x0001U);
+			xmm1 = _mm_set1_epi16(0x0001U);
+			xmm2 = _mm_set1_epi16(0x0001U);
+			xmm3 = _mm_set1_epi16(0x0001U);
+			xmm4 = _mm_load_si128((__m128i *) sptr); sptr += 8;
+			xmm5 = _mm_load_si128((__m128i *) sptr); sptr += 8;
+			xmm6 = _mm_load_si128((__m128i *) sptr); sptr += 8;
+			xmm7 = _mm_load_si128((__m128i *) sptr); sptr += 8;
+			xmm0 = _mm_sign_epi16(xmm0, xmm4);
+			xmm1 = _mm_sign_epi16(xmm1, xmm5);
+			xmm2 = _mm_sign_epi16(xmm2, xmm6);
+			xmm3 = _mm_sign_epi16(xmm3, xmm7);
+			_mm_store_si128((__m128i *) dptr, xmm0);         dptr += 8;
+			_mm_store_si128((__m128i *) dptr, xmm1);         dptr += 8;
+			_mm_store_si128((__m128i *) dptr, xmm2);         dptr += 8;
+			_mm_store_si128((__m128i *) dptr, xmm3);         dptr += 8;
+		}
+	}
+
+	/* Do 8-short chunks using two XMM registers. */
+	count = len >> 3;
+	len -= count << 3;
+	while (count--)
+	{
+		__m128i xmm0 = _mm_set1_epi16(0x0001U);
+		__m128i xmm1 = LOAD_SI128(sptr);					sptr += 8;
+		xmm0 = _mm_sign_epi16(xmm0, xmm1);
+		_mm_store_si128((__m128i *) dptr, xmm0);			dptr += 8;
+	}
+
+	/* Do leftovers. */
+	while (len--)
+	{
+		INT16 src = *sptr++;
+		*dptr++ = (src < 0) ? -1 : ((src > 0) ? 1 : 0);
+	}
+
+	return PRIMITIVES_SUCCESS;
+}
+#endif /* WITH_SSE2 */
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_sign(
+	const primitives_hints_t *hints,
+	primitives_t *prims)
+{
+	/* Start with the default. */
+	prims->sign_16s = general_sign_16s;
+	/* Pick tuned versions if possible. */
+	/* I didn't spot an IPP version of this. */
+#if defined(WITH_SSE2)
+	if ((hints->x86_flags & PRIM_X86_SSSE3_AVAILABLE)
+			&& (hints->x86_flags & PRIM_X86_SSE3_AVAILABLE))
+	{
+		prims->sign_16s  = ssse3_sign_16s;
+	}
+#endif
+}
+
+/* ------------------------------------------------------------------------- */
+void primitives_deinit_sign(
+	primitives_t *prims)
+{
+	/* Nothing to do. */
+}
--- a/libfreerdp/primitives/prim_templates.h
+++ b/libfreerdp/primitives/prim_templates.h
@ -0,0 +1,416 @@
+/* prim_templates.h
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.  Algorithms used by
+ * this code may be covered by patents by HP, Microsoft, or other parties.
+ */
+
+#ifdef __GNUC__
+# pragma once
+#endif
+
+#ifndef __PRIM_TEMPLATES_H_INCLUDED__
+#define __PRIM_TEMPLATES_H_INCLUDED__
+
+/* These are prototypes for SSE (potentially NEON) routines that do a 
+ * simple SSE operation over an array of data.  Since so much of this
+ * code is shared except for the operation itself, these prototypes are
+ * used rather than duplicating code.  The naming convention depends on
+ * the parameters:  S=Source param; C=Constant; D=Destination.
+ * All the macros have parameters for a fallback procedure if the data
+ * is too small and an operation "the slow way" for use at 16-byte edges.
+ */
+
+/* SSE3 note:  If someone needs to support an SSE2 version of these without
+ * SSE3 support, an alternative version could be added that merely checks
+ * that 16-byte alignment on both destination and source(s) can be 
+ * achieved, rather than use LDDQU for unaligned reads.
+ */
+
+/* Note: the compiler is good at turning (16/sizeof(_type_)) into a constant.
+ * It easily can't do that if the value is stored in a variable.
+ * So don't save it as an intermediate value.
+ */
+
+/* ----------------------------------------------------------------------------
+ * SCD = Source, Constant, Destination
+ */
+#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
+PRIM_STATIC pstatus_t _name_(const _type_ *pSrc, INT32 val, _type_ *pDst, INT32 len) \
+{ \
+	int shifts; \
+	UINT32 offBeatMask; \
+	const _type_ *sptr = pSrc; \
+	_type_ *dptr = pDst; \
+	size_t count; \
+	if (len < 16)   /* pointless if too small */ \
+	{ \
+		return _fallback_(pSrc, val, pDst, len); \
+	} \
+	if      (sizeof(_type_) == 1) shifts = 1; \
+	else if (sizeof(_type_) == 2) shifts = 2; \
+	else if (sizeof(_type_) == 4) shifts = 3; \
+	else if (sizeof(_type_) == 8) shifts = 4; \
+	offBeatMask = (1 << (shifts - 1)) - 1; \
+	if ((ULONG_PTR) pDst & offBeatMask) \
+	{ \
+		/* Incrementing the pointer skips over 16-byte boundary. */ \
+		return _fallback_(pSrc, val, pDst, len); \
+	} \
+	/* Get to the 16-byte boundary now. */ \
+	while ((ULONG_PTR) dptr & 0x0f) \
+	{ \
+		_slowWay_; \
+		if (--len == 0) return PRIMITIVES_SUCCESS; \
+	} \
+	/* Use 8 128-bit SSE registers. */ \
+	count = len >> (8-shifts); \
+	len -= count << (8-shifts); \
+	if ((ULONG_PTR) sptr & 0x0f) \
+	{ \
+		while (count--) \
+		{ \
+			__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
+			xmm0 = _mm_lddqu_si128((__m128i *) sptr); \
+			sptr += (16/sizeof(_type_)); \
+			xmm1 = _mm_lddqu_si128((__m128i *) sptr); \
+			sptr += (16/sizeof(_type_)); \
+			xmm2 = _mm_lddqu_si128((__m128i *) sptr); \
+			sptr += (16/sizeof(_type_)); \
+			xmm3 = _mm_lddqu_si128((__m128i *) sptr); \
+			sptr += (16/sizeof(_type_)); \
+			xmm4 = _mm_lddqu_si128((__m128i *) sptr); \
+			sptr += (16/sizeof(_type_)); \
+			xmm5 = _mm_lddqu_si128((__m128i *) sptr); \
+			sptr += (16/sizeof(_type_)); \
+			xmm6 = _mm_lddqu_si128((__m128i *) sptr); \
+			sptr += (16/sizeof(_type_)); \
+			xmm7 = _mm_lddqu_si128((__m128i *) sptr); \
+			sptr += (16/sizeof(_type_)); \
+			xmm0 = _op_(xmm0, val); \
+			xmm1 = _op_(xmm1, val); \
+			xmm2 = _op_(xmm2, val); \
+			xmm3 = _op_(xmm3, val); \
+			xmm4 = _op_(xmm4, val); \
+			xmm5 = _op_(xmm5, val); \
+			xmm6 = _op_(xmm6, val); \
+			xmm7 = _op_(xmm7, val); \
+			_mm_store_si128((__m128i *) dptr, xmm0); \
+			dptr += (16/sizeof(_type_)); \
+			_mm_store_si128((__m128i *) dptr, xmm1); \
+			dptr += (16/sizeof(_type_)); \
+			_mm_store_si128((__m128i *) dptr, xmm2); \
+			dptr += (16/sizeof(_type_)); \
+			_mm_store_si128((__m128i *) dptr, xmm3); \
+			dptr += (16/sizeof(_type_)); \
+			_mm_store_si128((__m128i *) dptr, xmm4); \
+			dptr += (16/sizeof(_type_)); \
+			_mm_store_si128((__m128i *) dptr, xmm5); \
+			dptr += (16/sizeof(_type_)); \
+			_mm_store_si128((__m128i *) dptr, xmm6); \
+			dptr += (16/sizeof(_type_)); \
+			_mm_store_si128((__m128i *) dptr, xmm7); \
+			dptr += (16/sizeof(_type_)); \
+		} \
+	} \
+	else \
+	{ \
+		while (count--) \
+		{ \
+			__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
+			xmm0 = _mm_load_si128((__m128i *) sptr); \
+			sptr += (16/sizeof(_type_)); \
+			xmm1 = _mm_load_si128((__m128i *) sptr); \
+			sptr += (16/sizeof(_type_)); \
+			xmm2 = _mm_load_si128((__m128i *) sptr); \
+			sptr += (16/sizeof(_type_)); \
+			xmm3 = _mm_load_si128((__m128i *) sptr); \
+			sptr += (16/sizeof(_type_)); \
+			xmm4 = _mm_load_si128((__m128i *) sptr); \
+			sptr += (16/sizeof(_type_)); \
+			xmm5 = _mm_load_si128((__m128i *) sptr); \
+			sptr += (16/sizeof(_type_)); \
+			xmm6 = _mm_load_si128((__m128i *) sptr); \
+			sptr += (16/sizeof(_type_)); \
+			xmm7 = _mm_load_si128((__m128i *) sptr); \
+			sptr += (16/sizeof(_type_)); \
+			xmm0 = _op_(xmm0, val); \
+			xmm1 = _op_(xmm1, val); \
+			xmm2 = _op_(xmm2, val); \
+			xmm3 = _op_(xmm3, val); \
+			xmm4 = _op_(xmm4, val); \
+			xmm5 = _op_(xmm5, val); \
+			xmm6 = _op_(xmm6, val); \
+			xmm7 = _op_(xmm7, val); \
+			_mm_store_si128((__m128i *) dptr, xmm0); \
+			dptr += (16/sizeof(_type_)); \
+			_mm_store_si128((__m128i *) dptr, xmm1); \
+			dptr += (16/sizeof(_type_)); \
+			_mm_store_si128((__m128i *) dptr, xmm2); \
+			dptr += (16/sizeof(_type_)); \
+			_mm_store_si128((__m128i *) dptr, xmm3); \
+			dptr += (16/sizeof(_type_)); \
+			_mm_store_si128((__m128i *) dptr, xmm4); \
+			dptr += (16/sizeof(_type_)); \
+			_mm_store_si128((__m128i *) dptr, xmm5); \
+			dptr += (16/sizeof(_type_)); \
+			_mm_store_si128((__m128i *) dptr, xmm6); \
+			dptr += (16/sizeof(_type_)); \
+			_mm_store_si128((__m128i *) dptr, xmm7); \
+			dptr += (16/sizeof(_type_)); \
+		} \
+	} \
+	/* Use a single 128-bit SSE register. */ \
+	count = len >> (5-shifts); \
+	len -= count << (5-shifts); \
+	while (count--) \
+	{ \
+		__m128i xmm0 = LOAD_SI128(sptr); sptr += (16/sizeof(_type_)); \
+		xmm0 = _op_(xmm0, val); \
+		_mm_store_si128((__m128i *) dptr, xmm0); \
+		dptr += (16/sizeof(_type_)); \
+	} \
+	/* Finish off the remainder. */ \
+	while (len--) { _slowWay_; } \
+	return PRIMITIVES_SUCCESS; \
+}
+
+/* ----------------------------------------------------------------------------
+ * SCD = Source, Constant, Destination
+ * PRE = preload xmm0 with the constant.
+ */
+#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
+PRIM_STATIC pstatus_t _name_(const _type_ *pSrc, _type_ val, _type_ *pDst, INT32 len) \
+{ \
+	int shifts; \
+	UINT32 offBeatMask; \
+	const _type_ *sptr = pSrc; \
+	_type_ *dptr = pDst; \
+	size_t count; \
+	__m128i xmm0; \
+	if (len < 16) /* pointless if too small */ \
+	{ \
+		return _fallback_(pSrc, val, pDst, len); \
+	} \
+	if      (sizeof(_type_) == 1) shifts = 1; \
+	else if (sizeof(_type_) == 2) shifts = 2; \
+	else if (sizeof(_type_) == 4) shifts = 3; \
+	else if (sizeof(_type_) == 8) shifts = 4; \
+	offBeatMask = (1 << (shifts - 1)) - 1; \
+	if ((ULONG_PTR) pDst & offBeatMask) \
+	{ \
+		/* Incrementing the pointer skips over 16-byte boundary. */ \
+		return _fallback_(pSrc, val, pDst, len); \
+	} \
+	/* Get to the 16-byte boundary now. */ \
+	while ((ULONG_PTR) dptr & 0x0f) \
+	{ \
+		_slowWay_; \
+		if (--len == 0) return PRIMITIVES_SUCCESS; \
+	} \
+	/* Use 4 128-bit SSE registers. */ \
+	count = len >> (7-shifts); \
+	len -= count << (7-shifts); \
+	xmm0 = _mm_set1_epi32(val); \
+	if ((ULONG_PTR) sptr & 0x0f) \
+	{ \
+		while (count--) \
+		{ \
+			__m128i xmm1, xmm2, xmm3, xmm4; \
+			xmm1 = _mm_lddqu_si128((__m128i *) sptr); \
+			sptr += (16/sizeof(_type_)); \
+			xmm2 = _mm_lddqu_si128((__m128i *) sptr); \
+			sptr += (16/sizeof(_type_)); \
+			xmm3 = _mm_lddqu_si128((__m128i *) sptr); \
+			sptr += (16/sizeof(_type_)); \
+			xmm4 = _mm_lddqu_si128((__m128i *) sptr); \
+			sptr += (16/sizeof(_type_)); \
+			xmm1 = _op_(xmm1, xmm0); \
+			xmm2 = _op_(xmm2, xmm0); \
+			xmm3 = _op_(xmm3, xmm0); \
+			xmm4 = _op_(xmm4, xmm0); \
+			_mm_store_si128((__m128i *) dptr, xmm1); \
+			dptr += (16/sizeof(_type_)); \
+			_mm_store_si128((__m128i *) dptr, xmm2); \
+			dptr += (16/sizeof(_type_)); \
+			_mm_store_si128((__m128i *) dptr, xmm3); \
+			dptr += (16/sizeof(_type_)); \
+			_mm_store_si128((__m128i *) dptr, xmm4); \
+			dptr += (16/sizeof(_type_)); \
+		} \
+	} \
+	else \
+	{ \
+		while (count--) \
+		{ \
+			__m128i xmm1, xmm2, xmm3, xmm4; \
+			xmm1 = _mm_load_si128((__m128i *) sptr); \
+			sptr += (16/sizeof(_type_)); \
+			xmm2 = _mm_load_si128((__m128i *) sptr); \
+			sptr += (16/sizeof(_type_)); \
+			xmm3 = _mm_load_si128((__m128i *) sptr); \
+			sptr += (16/sizeof(_type_)); \
+			xmm4 = _mm_load_si128((__m128i *) sptr); \
+			sptr += (16/sizeof(_type_)); \
+			xmm1 = _op_(xmm1, xmm0); \
+			xmm2 = _op_(xmm2, xmm0); \
+			xmm3 = _op_(xmm3, xmm0); \
+			xmm4 = _op_(xmm4, xmm0); \
+			_mm_store_si128((__m128i *) dptr, xmm1); \
+			dptr += (16/sizeof(_type_)); \
+			_mm_store_si128((__m128i *) dptr, xmm2); \
+			dptr += (16/sizeof(_type_)); \
+			_mm_store_si128((__m128i *) dptr, xmm3); \
+			dptr += (16/sizeof(_type_)); \
+			_mm_store_si128((__m128i *) dptr, xmm4); \
+			dptr += (16/sizeof(_type_)); \
+		} \
+	} \
+	/* Use a single 128-bit SSE register. */ \
+	count = len >> (5-shifts); \
+	len -= count << (5-shifts); \
+	while (count--) \
+	{ \
+		__m128i xmm1 = LOAD_SI128(sptr); sptr += (16/sizeof(_type_)); \
+		xmm1 = _op_(xmm1, xmm0); \
+		_mm_store_si128((__m128i *) dptr, xmm1); \
+		dptr += (16/sizeof(_type_)); \
+	} \
+	/* Finish off the remainder. */ \
+	while (len--) { _slowWay_; } \
+	return PRIMITIVES_SUCCESS; \
+}
+
+/* ----------------------------------------------------------------------------
+ * SSD = Source1, Source2, Destination
+ */
+#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
+PRIM_STATIC pstatus_t _name_(const _type_ *pSrc1, const _type_ *pSrc2, _type_ *pDst, INT32 len) \
+{ \
+	int shifts; \
+	UINT32 offBeatMask; \
+	const _type_ *sptr1 = pSrc1; \
+	const _type_ *sptr2 = pSrc2; \
+	_type_ *dptr = pDst; \
+	size_t count; \
+	if (len < 16) /* pointless if too small */ \
+	{ \
+		return _fallback_(pSrc1, pSrc2, pDst, len); \
+	} \
+	if      (sizeof(_type_) == 1) shifts = 1; \
+	else if (sizeof(_type_) == 2) shifts = 2; \
+	else if (sizeof(_type_) == 4) shifts = 3; \
+	else if (sizeof(_type_) == 8) shifts = 4; \
+	offBeatMask = (1 << (shifts - 1)) - 1; \
+	if ((ULONG_PTR) pDst & offBeatMask) \
+	{ \
+		/* Incrementing the pointer skips over 16-byte boundary. */ \
+		return _fallback_(pSrc1, pSrc2, pDst, len); \
+	} \
+	/* Get to the 16-byte boundary now. */ \
+	while ((ULONG_PTR) dptr & 0x0f) \
+	{ \
+		_slowWay_; \
+		if (--len == 0) return PRIMITIVES_SUCCESS; \
+	} \
+	/* Use 4 128-bit SSE registers. */ \
+	count = len >> (7-shifts); \
+	len -= count << (7-shifts); \
+	if (((ULONG_PTR) sptr1 & 0x0f) || ((ULONG_PTR) sptr2 & 0x0f)) \
+	{ \
+		/* Unaligned loads */ \
+		while (count--) \
+		{ \
+			__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
+			xmm0 = _mm_lddqu_si128((__m128i *) sptr1); \
+			sptr1 += (16/sizeof(_type_)); \
+			xmm1 = _mm_lddqu_si128((__m128i *) sptr1); \
+			sptr1 += (16/sizeof(_type_)); \
+			xmm2 = _mm_lddqu_si128((__m128i *) sptr1); \
+			sptr1 += (16/sizeof(_type_)); \
+			xmm3 = _mm_lddqu_si128((__m128i *) sptr1); \
+			sptr1 += (16/sizeof(_type_)); \
+			xmm4 = _mm_lddqu_si128((__m128i *) sptr2); \
+			sptr2 += (16/sizeof(_type_)); \
+			xmm5 = _mm_lddqu_si128((__m128i *) sptr2); \
+			sptr2 += (16/sizeof(_type_)); \
+			xmm6 = _mm_lddqu_si128((__m128i *) sptr2); \
+			sptr2 += (16/sizeof(_type_)); \
+			xmm7 = _mm_lddqu_si128((__m128i *) sptr2); \
+			sptr2 += (16/sizeof(_type_)); \
+			xmm0 = _op_(xmm0, xmm4); \
+			xmm1 = _op_(xmm1, xmm5); \
+			xmm2 = _op_(xmm2, xmm6); \
+			xmm3 = _op_(xmm3, xmm7); \
+			_mm_store_si128((__m128i *) dptr, xmm0); \
+			dptr += (16/sizeof(_type_)); \
+			_mm_store_si128((__m128i *) dptr, xmm1); \
+			dptr += (16/sizeof(_type_)); \
+			_mm_store_si128((__m128i *) dptr, xmm2); \
+			dptr += (16/sizeof(_type_)); \
+			_mm_store_si128((__m128i *) dptr, xmm3); \
+			dptr += (16/sizeof(_type_)); \
+		} \
+	} \
+	else \
+	{ \
+		/* Aligned loads */ \
+		while (count--) \
+		{ \
+			__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
+			xmm0 = _mm_load_si128((__m128i *) sptr1); \
+			sptr1 += (16/sizeof(_type_)); \
+			xmm1 = _mm_load_si128((__m128i *) sptr1); \
+			sptr1 += (16/sizeof(_type_)); \
+			xmm2 = _mm_load_si128((__m128i *) sptr1); \
+			sptr1 += (16/sizeof(_type_)); \
+			xmm3 = _mm_load_si128((__m128i *) sptr1); \
+			sptr1 += (16/sizeof(_type_)); \
+			xmm4 = _mm_load_si128((__m128i *) sptr2); \
+			sptr2 += (16/sizeof(_type_)); \
+			xmm5 = _mm_load_si128((__m128i *) sptr2); \
+			sptr2 += (16/sizeof(_type_)); \
+			xmm6 = _mm_load_si128((__m128i *) sptr2); \
+			sptr2 += (16/sizeof(_type_)); \
+			xmm7 = _mm_load_si128((__m128i *) sptr2); \
+			sptr2 += (16/sizeof(_type_)); \
+			xmm0 = _op_(xmm0, xmm4); \
+			xmm1 = _op_(xmm1, xmm5); \
+			xmm2 = _op_(xmm2, xmm6); \
+			xmm3 = _op_(xmm3, xmm7); \
+			_mm_store_si128((__m128i *) dptr, xmm0); \
+			dptr += (16/sizeof(_type_)); \
+			_mm_store_si128((__m128i *) dptr, xmm1); \
+			dptr += (16/sizeof(_type_)); \
+			_mm_store_si128((__m128i *) dptr, xmm2); \
+			dptr += (16/sizeof(_type_)); \
+			_mm_store_si128((__m128i *) dptr, xmm3); \
+			dptr += (16/sizeof(_type_)); \
+		} \
+	} \
+	/* Use a single 128-bit SSE register. */ \
+	count = len >> (5-shifts); \
+	len -= count << (5-shifts); \
+	while (count--) \
+	{ \
+		__m128i xmm0, xmm1; \
+		xmm0 = LOAD_SI128(sptr1);  sptr1 += (16/sizeof(_type_)); \
+		xmm1 = LOAD_SI128(sptr2);  sptr2 += (16/sizeof(_type_)); \
+		xmm0 = _op_(xmm0, xmm1); \
+		_mm_store_si128((__m128i *) dptr, xmm0); \
+		dptr += (16/sizeof(_type_)); \
+	} \
+	/* Finish off the remainder. */ \
+	while (len--) { _slowWay_; } \
+	return PRIMITIVES_SUCCESS; \
+}
+
+#endif /* !__PRIM_TEMPLATES_H_INCLUDED__ */
--- a/libfreerdp/primitives/primitives.c
+++ b/libfreerdp/primitives/primitives.c
@ -0,0 +1,297 @@
+/* primitives.c
+ * This code queries processor features and calls the init/deinit routines.
+ * vi:ts=4 sw=4
+ *
+ * Copyright 2011 Martin Fleisz <mfleisz@thinstuff.com>
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include <string.h>
+#include <stdlib.h>
+#include <freerdp/primitives.h>
+#include "prim_internal.h"
+#ifdef ANDROID
+# include "cpu-features.h"
+#endif
+
+/* Singleton pointer used throughout the program when requested. */
+static primitives_t *pPrimitives = NULL;
+
+#define D_BIT_MMX		(1<<23)
+#define D_BIT_SSE		(1<<25)
+#define D_BIT_SSE2		(1<<26)
+#define D_BIT_3DN		(1<<30)
+#define C_BIT_SSE3		(1<<0)
+#define C_BIT_3DNP		(1<<8)
+#define C_BIT_SSSE3		(1<<9)
+#define C_BIT_SSE41		(1<<19)
+#define C_BIT_SSE42		(1<<20)
+#define C_BIT_XGETBV	(1<<27)
+#define C_BIT_AVX		(1<<28)
+#define C_BITS_AVX		(C_BIT_XGETBV|C_BIT_AVX)
+#define E_BIT_XMM		(1<<1)
+#define E_BIT_YMM		(1<<2)
+#define E_BITS_AVX		(E_BIT_XMM|E_BIT_YMM)
+#define C_BIT_FMA		(1<<11)
+#define C_BIT_AVX_AES	(1<<24)
+
+/* If x86 */
+#if defined(__x86_64) || defined(__x86_64__) || defined(__amd64) \
+	|| defined(__amd64__) || defined(_M_AMD64) || defined(_M_X64) \
+	|| defined(i386) || defined(__i386) || defined(__i386__) \
+	|| defined(_M_IX86) || defined(_X86_)
+#ifndef i386
+#define i386
+#endif
+
+/* If GCC */
+# ifdef __GNUC__
+# define xgetbv(_func_, _lo_, _hi_) \
+	__asm__ __volatile__ ("xgetbv" : "=a" (_lo_), "=d" (_hi_) : "c" (_func_))
+
+static void cpuid(
+	unsigned info, 
+	unsigned *eax, 
+	unsigned *ebx, 
+	unsigned *ecx, 
+	unsigned *edx)
+{
+	*eax = *ebx = *ecx = *edx = 0;
+	__asm volatile
+	(
+		/* The EBX (or RBX register on x86_64) is used for the PIC base address
+		 * and must not be corrupted by our inline assembly.
+		 */
+#  if defined(__i386__)
+		"mov %%ebx, %%esi;"
+		"cpuid;"
+		"xchg %%ebx, %%esi;"
+#else
+		"mov %%rbx, %%rsi;"
+		"cpuid;"
+		"xchg %%rbx, %%rsi;"
+#endif
+		: "=a" (*eax), "=S" (*ebx), "=c" (*ecx), "=d" (*edx)
+		: "0" (info)
+	);
+}
+
+static void set_hints(
+	primitives_hints_t *hints)
+{
+	unsigned a, b, c, d;
+	cpuid(1, &a, &b, &c, &d);
+	if (d & D_BIT_MMX)	 hints->x86_flags |= PRIM_X86_MMX_AVAILABLE;
+	if (d & D_BIT_SSE)	 hints->x86_flags |= PRIM_X86_SSE_AVAILABLE;
+	if (d & D_BIT_SSE2)	 hints->x86_flags |= PRIM_X86_SSE2_AVAILABLE;
+	if (d & D_BIT_3DN)	 hints->x86_flags |= PRIM_X86_3DNOW_AVAILABLE;
+	if (c & C_BIT_3DNP)  hints->x86_flags |= PRIM_X86_3DNOW_PREFETCH_AVAILABLE;
+	if (c & C_BIT_SSE3)	 hints->x86_flags |= PRIM_X86_SSE3_AVAILABLE;
+	if (c & C_BIT_SSSE3) hints->x86_flags |= PRIM_X86_SSSE3_AVAILABLE;
+	if (c & C_BIT_SSE41) hints->x86_flags |= PRIM_X86_SSE41_AVAILABLE;
+	if (c & C_BIT_SSE42) hints->x86_flags |= PRIM_X86_SSE42_AVAILABLE;
+	if ((c & C_BITS_AVX) == C_BITS_AVX)
+	{
+		int e, f;
+		xgetbv(0, e, f);
+		if ((e & E_BITS_AVX) == E_BITS_AVX)
+		{
+			hints->x86_flags |= PRIM_X86_AVX_AVAILABLE;
+			if (c & C_BIT_FMA) hints->x86_flags |= PRIM_X86_FMA_AVAILABLE;
+			if (c & C_BIT_AVX_AES) hints->x86_flags |= PRIM_X86_AVX_AES_AVAILABLE;
+		}
+	}
+	/* TODO: AVX2: set eax=7, ecx=0, cpuid, check ebx-bit5 */
+}
+# else
+static void set_hints(
+	primitives_hints_t *hints)
+{
+	/* x86 non-GCC:  TODO */
+}
+# endif /* __GNUC__ */
+/* ------------------------------------------------------------------------- */
+#elif defined(__arm__) || defined(__ARM_ARCH_7A__) \
+	|| defined(__ARM_EABI__) || defined(__ARMEL__) || defined(ANDROID)
+#ifndef __arm__
+#define __arm__
+#endif
+
+static UINT32 androidNeon(void)
+{
+#if ANDROID
+	if (android_getCpuFamily() != ANDROID_CPU_FAMILY_ARM) return 0;
+
+	UINT64 features = android_getCpuFeatures();
+
+	if ((features & ANDROID_CPU_ARM_FEATURE_ARMv7))
+	{
+		if (features & ANDROID_CPU_ARM_FEATURE_NEON)
+		{
+			return PRIM_ARM_NEON_AVAILABLE;
+		}
+	}
+	/* else */
+#endif
+	return 0;
+}
+
+static void set_hints(
+	primitives_hints_t *hints)
+{
+	/* ARM:  TODO */
+	hints->arm_flags |= androidNeon();
+}
+
+#else
+static void set_hints(
+	primitives_hints_t *hints)
+{
+}
+#endif /* x86 else ARM else */
+
+/* ------------------------------------------------------------------------- */
+void primitives_init(void)
+{
+	primitives_hints_t *hints;
+
+	if (pPrimitives == NULL)
+	{
+		pPrimitives = calloc(1, sizeof(primitives_t));
+		if (pPrimitives == NULL) return;
+	}
+
+	hints = calloc(1, sizeof(primitives_hints_t));
+	set_hints(hints);
+	pPrimitives->hints = (void *) hints;
+
+	/* Now call each section's initialization routine. */
+	primitives_init_add(hints, pPrimitives);
+	primitives_init_andor(hints, pPrimitives);
+	primitives_init_alphaComp(hints, pPrimitives);
+	primitives_init_copy(hints, pPrimitives);
+	primitives_init_set(hints, pPrimitives);
+	primitives_init_shift(hints, pPrimitives);
+	primitives_init_sign(hints, pPrimitives);
+	primitives_init_colors(hints, pPrimitives);
+}
+
+/* ------------------------------------------------------------------------- */
+primitives_t *primitives_get(void)
+{
+	if (pPrimitives == NULL) primitives_init();
+	return pPrimitives;
+}
+
+/* ------------------------------------------------------------------------- */
+UINT32 primitives_get_flags(
+	const primitives_t *prims)
+{
+	primitives_hints_t *hints = (primitives_hints_t *) (prims->hints);
+#ifdef i386
+	return hints->x86_flags;
+#elif defined(__arm__)
+	return hints->arm_flags;
+#else
+	return 0;
+#endif
+}
+
+/* ------------------------------------------------------------------------- */
+void primitives_flags_str(
+	const primitives_t *prims,
+	char *str,
+	size_t len)
+{
+	typedef struct
+	{
+		UINT32	flag;
+		const char *str;
+	} flagpair_t;
+	static const flagpair_t x86_flags[] =
+	{
+		{ PRIM_X86_MMX_AVAILABLE,				"MMX" },
+		{ PRIM_X86_3DNOW_AVAILABLE,				"3DNow" },
+		{ PRIM_X86_3DNOW_PREFETCH_AVAILABLE,	"3DNow-PF" },
+		{ PRIM_X86_SSE_AVAILABLE,				"SSE" },
+		{ PRIM_X86_SSE2_AVAILABLE,				"SSE2" },
+		{ PRIM_X86_SSE3_AVAILABLE,				"SSE3" },
+		{ PRIM_X86_SSSE3_AVAILABLE,				"SSSE3" },
+		{ PRIM_X86_SSE41_AVAILABLE,				"SSE4.1" },
+		{ PRIM_X86_SSE42_AVAILABLE,				"SSE4.2" },
+		{ PRIM_X86_AVX_AVAILABLE,				"AVX" },
+		{ PRIM_X86_FMA_AVAILABLE,				"FMA" },
+		{ PRIM_X86_AVX_AES_AVAILABLE,			"AVX-AES" },
+		{ PRIM_X86_AVX2_AVAILABLE,				"AVX2" },
+	};
+	static const flagpair_t arm_flags[] =
+	{
+		{ PRIM_ARM_VFP1_AVAILABLE,				"VFP1" },
+		{ PRIM_ARM_VFP2_AVAILABLE,				"VFP2" },
+		{ PRIM_ARM_VFP3_AVAILABLE,				"VFP3" },
+		{ PRIM_ARM_VFP4_AVAILABLE,				"VFP4" },
+		{ PRIM_ARM_FPA_AVAILABLE,				"FPA" },
+		{ PRIM_ARM_FPE_AVAILABLE,				"FPE" },
+		{ PRIM_ARM_IWMMXT_AVAILABLE,			"IWMMXT" },
+		{ PRIM_ARM_NEON_AVAILABLE,				"NEON" },
+	};
+	int i;
+	primitives_hints_t *hints;
+
+	*str = '\0';
+	--len;	/* for the '/0' */
+
+	hints = (primitives_hints_t *) (prims->hints);
+	for (i=0; i<sizeof(x86_flags)/sizeof(flagpair_t); ++i)
+	{
+		if (hints->x86_flags & x86_flags[i].flag)
+		{
+			int slen = strlen(x86_flags[i].str) + 1;
+			if (len < slen) break;
+			if (*str != '\0') strcat(str, " ");
+			strcat(str, x86_flags[i].str);
+			len -= slen;
+		}
+	}
+
+	for (i=0; i<sizeof(arm_flags)/sizeof(flagpair_t); ++i)
+	{
+		if (hints->arm_flags & arm_flags[i].flag)
+		{
+			int slen = strlen(arm_flags[i].str) + 1;
+			if (len < slen) break;
+			if (*str != '\0') strcat(str, " ");
+			strcat(str, arm_flags[i].str);
+			len -= slen;
+		}
+	}
+}
+
+/* ------------------------------------------------------------------------- */
+void primitives_deinit(void)
+{
+	if (pPrimitives == NULL) return;
+
+	/* Call each section's de-initialization routine. */
+	primitives_deinit_add(pPrimitives);
+	primitives_deinit_andor(pPrimitives);
+	primitives_deinit_alphaComp(pPrimitives);
+	primitives_deinit_copy(pPrimitives);
+	primitives_deinit_set(pPrimitives);
+	primitives_deinit_shift(pPrimitives);
+	primitives_deinit_sign(pPrimitives);
+	primitives_deinit_colors(pPrimitives);
+
+	if (pPrimitives->hints != NULL) free((void *) (pPrimitives->hints));
+	free((void *) pPrimitives);
+	pPrimitives = NULL;
+}
--- a/libfreerdp/primitives/test/CMakeLists.txt
+++ b/libfreerdp/primitives/test/CMakeLists.txt
@ -0,0 +1,139 @@
+# FreeRDP: A Remote Desktop Protocol Client
+# primitives test makefile builder
+# vi:ts=4 sw=4:
+#
+# (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied. See the License for the specific language governing permissions
+# and limitations under the License.
+#
+
+# TODO: Integrate this into the testing framework, in some form.
+# Right now this produces a standalone test that covers both functionality
+# and performance of the primitives library entrypoints.
+
+cmake_minimum_required(VERSION 2.8)
+set(MODULE_NAME "prim_test")
+set(MODULE_PREFIX "PRIMITIVES_LIBRARY_TEST")
+
+set(PRIMITIVE_TEST_CFILES
+	prim_test.c
+	test_add.c
+	test_alphaComp.c
+	test_andor.c
+	test_colors.c
+	test_copy.c
+	test_set.c
+	test_shift.c
+	test_sign.c
+    ../prim_add.c
+    ../prim_andor.c
+    ../prim_alphaComp.c
+    ../prim_colors.c
+    ../prim_copy.c
+    ../prim_set.c
+    ../prim_shift.c
+    ../prim_sign.c
+	../primitives.c
+    )
+
+set(PRIMITIVE_TEST_HEADERS
+	measure.h
+	prim_test.h
+    ../prim_internal.h
+    )
+
+set(PRIMITIVE_TEST_SRCS
+    ${PRIMITIVE_TEST_CFILES}
+    ${PRIMITIVE_TEST_HEADERS}
+    )
+
+include_directories(. ../../.. ../../../include ../../../winpr/include)
+add_definitions(-DPRIM_STATIC=auto -DALL_PRIMITIVES_VERSIONS -DHAVE_CONFIG_H)
+
+# If these haven't been set by the caller, set them now to defaults.
+if(NOT DEFINED WITH_IPP)
+	set(WITH_IPP FALSE) 
+endif()
+if(NOT DEFINED WITH_SSE2)
+	if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm*")
+		set(WITH_SSE2 FALSE)
+	else()
+		set(WITH_SSE2 TRUE)
+	endif()
+endif()
+if(NOT DEFINED WITH_NEON)
+	if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm*")
+		set(WITH_NEON TRUE)
+	else()
+		set(WITH_NEON FALSE)
+	endif()
+endif()
+
+if(WITH_SSE2)
+	if(CMAKE_COMPILER_IS_GNUCC)
+		set(OPTFLAGS "${OPTFLAGS} -msse2 -mssse3 -O2 -Wdeclaration-after-statement")
+	endif()
+
+	if(MSVC)
+		set(OPTFLAGS "${OPTFLAGS} /arch:SSE2")
+	endif()
+elseif(WITH_NEON)
+	if(CMAKE_COMPILER_IS_GNUCC)
+	    set(OPTIMZATION "${OPTFLAGS} -mfpu=neon -mfloat-abi=softfp -O2")
+	endif()
+	# TODO: Add MSVC equivalent
+endif()
+
+add_executable(prim_test ${PRIMITIVE_TEST_SRCS})
+
+if(WITH_IPP)
+	if(NOT DEFINED IPP_FOUND)
+		include(../../../cmake/FindIPP.cmake)
+	endif()
+	add_definitions(-DWITH_IPP)
+
+	# IPP PATH debugging messages
+    message(IPP_FOUND=${IPP_FOUND})
+    message(IPP_VERSION_STR=${IPP_VERSION_STR})
+    message(IPP_VERSION_MAJOR=${IPP_VERSION_MAJOR})
+    message(IPP_VERSION_MINOR=${IPP_VERSION_MINOR})
+    message(IPP_VERSION_BUILD=${IPP_VERSION_BUILD})
+    message(IPP_ROOT_DIR=${IPP_ROOT_DIR})
+    message(IPP_INCLUDE_DIRS=${IPP_INCLUDE_DIRS})
+    message(IPP_LIBRARY_DIRS=${IPP_LIBRARY_DIRS})
+    message(IPP_LIBRARIES=${IPP_LIBRARIES})
+    message(IPP_COMPILER_LIBRARY_DIRS=${IPP_COMPILER_LIBRARY_DIRS})
+    message(IPP_COMPILER_LIBRARIES=${IPP_COMPILER_LIBRARIES})
+    message(IPP_LIBRARY_LIST=${IPP_LIBRARY_LIST})
+    message(IPP_LIB_PREFIX=${IPP_LIB_PREFIX})
+    message(IPP_LIB_SUFFIX=${IPP_LIB_SUFFIX})
+    message(IPP_PREFIX=${IPP_PREFIX})
+    message(IPP_SUFFIX=${IPP_SUFFIX})
+    message(IPPCORE=${IPPCORE})
+    message(IPPS=${IPPS})
+    message(IPPI=${IPPI})
+    message(IPPCC=${IPPCC})
+    message(IPPCV=${IPPCV})
+    message(IPPVM=${IPPVM})
+
+	if(CMAKE_COMPILER_IS_GNUCC)
+		foreach(INCLDIR ${IPP_INCLUDE_DIRS})
+			set(OPTFLAGS "${OPTFLAGS} -I${INCLDIR}")
+		endforeach(INCLDIR)
+	endif()
+	target_link_libraries(prim_test ${IPP_LIBRARY_LIST})
+endif()
+
+set_property(SOURCE ${PRIMITIVE_TEST_CFILES} PROPERTY COMPILE_FLAGS ${OPTFLAGS})
+
+target_link_libraries(prim_test rt)
+if(NOT TESTING_OUTPUT_DIRECTORY)
+	set(TESTING_OUTPUT_DIRECTORY .)
+endif()
+add_test(prim_test ${TESTING_OUTPUT_DIRECTORY}/prim_test functionality)
--- a/libfreerdp/primitives/test/measure.h
+++ b/libfreerdp/primitives/test/measure.h
@ -0,0 +1,121 @@
+/* measure.h
+ * Macros to help with performance measurement.
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.  Algorithms used by
+ * this code may be covered by patents by HP, Microsoft, or other parties.
+ *
+ * MEASURE_LOOP_START("measurement", 2000)
+ *   code to be measured
+ * MEASURE_LOOP_STOP
+ *   buffer flush and such
+ * MEASURE_SHOW_RESULTS
+ *
+ * Define GOOGLE_PROFILER if you want gperftools included.
+ */
+
+#ifdef _GNUC_
+# pragma once
+#endif
+
+#ifndef __MEASURE_H_INCLUDED__
+#define __MEASURE_H_INCLUDED__
+
+#include <time.h>
+#include <sys/param.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifdef GOOGLE_PROFILER
+#include <gperftools/profiler.h>
+#define PROFILER_START(_prefix_)  \
+    do {  \
+		char _path[PATH_MAX];  \
+		sprintf(_path, "./%s.prof", (_prefix_));  \
+		ProfilerStart(_path);  \
+    } while (0);
+# define PROFILER_STOP  \
+    do {  \
+		ProfilerStop(); \
+    } while (0);
+#else
+#define PROFILER_START(_prefix_)
+#define PROFILER_STOP
+#endif // GOOGLE_PROFILER
+
+extern float _delta_time(const struct timespec *t0, const struct timespec *t1);
+extern void _floatprint(float t, char *output);
+
+#ifndef CLOCK_MONOTONIC_RAW
+#define CLOCK_MONOTONIC_RAW 4
+#endif // !CLOCK_MONOTONIC_RAW
+
+#define MEASURE_LOOP_START(_prefix_, _count_)  \
+{   struct timespec _start, _stop;  \
+    char *_prefix;  \
+    int _count = (_count_);  \
+	int _loop; \
+	float _delta; \
+	char _str1[32], _str2[32]; \
+    _prefix = strdup(_prefix_);  \
+	_str1[0] = '\0'; _str2[0] = '\0'; \
+    clock_gettime(CLOCK_MONOTONIC_RAW, &_start);  \
+    PROFILER_START(_prefix);  \
+    _loop = (_count);  \
+    do {
+
+#define MEASURE_LOOP_STOP  \
+    } while (--_loop);
+
+#define MEASURE_GET_RESULTS(_result_)  \
+    PROFILER_STOP;  \
+    clock_gettime(CLOCK_MONOTONIC_RAW, &_stop);  \
+    _delta = _delta_time(&_start, &_stop);  \
+    (_result_) = (float) _count / _delta;  \
+    free(_prefix);  \
+}
+
+#define MEASURE_SHOW_RESULTS(_result_)  \
+    PROFILER_STOP; \
+    clock_gettime(CLOCK_MONOTONIC_RAW, &_stop); \
+    _delta = _delta_time(&_start, &_stop); \
+    (_result_) = (float) _count / _delta; \
+    _floatprint((float) _count / _delta, _str1); \
+    printf("%s: %9d iterations in %5.1f seconds = %s/s \n",	\
+	_prefix, _count, _delta, _str1); \
+    free(_prefix); \
+}
+
+#define MEASURE_SHOW_RESULTS_SCALED(_scale_, _label_) \
+    PROFILER_STOP; \
+    clock_gettime(CLOCK_MONOTONIC_RAW, &_stop); \
+    _delta = _delta_time(&_start, &_stop); \
+    _floatprint((float) _count / _delta, _str1); \
+    _floatprint((float) _count / _delta * (_scale_), _str2); \
+    printf("%s: %9d iterations in %5.1f seconds = %s/s = %s%s \n",	\
+	_prefix, _count, _delta, _str1, _str2, _label_); \
+    free(_prefix); \
+}
+
+#define MEASURE_TIMED(_label_, _init_iter_, _test_time_, _result_, _call_)  \
+{   float _r;  \
+    MEASURE_LOOP_START(_label_, _init_iter_);  \
+    _call_;  \
+    MEASURE_LOOP_STOP;  \
+    MEASURE_GET_RESULTS(_r);  \
+    MEASURE_LOOP_START(_label_, _r * _test_time_);  \
+    _call_;  \
+    MEASURE_LOOP_STOP;  \
+    MEASURE_SHOW_RESULTS(_result_);  \
+}
+
+#endif // __MEASURE_H_INCLUDED__
--- a/libfreerdp/primitives/test/prim_test.c
+++ b/libfreerdp/primitives/test/prim_test.c
@ -0,0 +1,414 @@
+/* prim_test.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include "prim_test.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <time.h>
+
+int test_sizes[] = { 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096 };
+int Quiet = 0;
+
+/* ------------------------------------------------------------------------- */
+static void get_random_data_lrand(
+    void *buffer,
+    size_t size)
+{
+	static int seeded = 0;
+	long int *ptr = (long int *) buffer;
+	unsigned char *cptr;
+
+	if (!seeded)
+	{
+		seeded = 1;
+		srand48(time(NULL));
+	}
+	/* This isn't the perfect random number generator, but that's okay. */
+	while (size >= sizeof(long int))
+	{
+		*ptr++ = lrand48();
+		size -= sizeof(long int);
+	}
+	cptr = (unsigned char *) ptr;
+	while (size > 0)
+	{
+		*cptr++ = lrand48() & 0xff;
+		--size;
+	}
+}
+
+/* ------------------------------------------------------------------------- */
+void get_random_data(
+    void *buffer,
+    size_t size)
+{
+#ifdef linux
+	size_t offset = 0;
+	int fd = open("/dev/urandom", O_RDONLY);
+	if (fd < 0)
+	{
+		get_random_data_lrand(buffer, size);
+		return;
+	}
+
+	while (size > 0)
+	{
+		ssize_t count = read(fd, buffer+offset, size);
+		size -= count;
+		offset += count;
+	}
+	close(fd);
+#else
+	get_random_data_lrand(buffer, size);
+#endif
+}
+
+/* ------------------------------------------------------------------------- */
+float _delta_time(
+    const struct timespec *t0,
+    const struct timespec *t1)
+{
+    INT64 secs = (INT64) (t1->tv_sec) - (INT64) (t0->tv_sec);
+    long   nsecs = t1->tv_nsec - t0->tv_nsec;
+	double retval;
+
+    if (nsecs < 0)
+	{
+        --secs;
+        nsecs += 1000000000;
+    }
+    retval = (double) secs + (double) nsecs / (double) 1000000000.0;
+    return (retval < 0.0) ? 0.0 : (float) retval;
+}
+
+/* ------------------------------------------------------------------------- */
+void _floatprint(
+    float t,
+    char *output)
+{
+    /* I don't want to link against -lm, so avoid log,exp,... */
+    float f = 10.0;
+	int i;
+    while (t > f) f *= 10.0;
+    f /= 1000.0;
+    i = ((int) (t/f+0.5)) * (int) f;
+    if (t < 0.0) sprintf(output, "%f", t);
+    else if (i == 0) sprintf(output, "%d", (int) (t+0.5));
+    else if (t < 1e+3) sprintf(output, "%3d", i);
+    else if (t < 1e+6) sprintf(output, "%3d,%03d",     
+		i/1000, i % 1000);
+    else if (t < 1e+9) sprintf(output, "%3d,%03d,000", 
+		i/1000000, (i % 1000000) / 1000);
+    else if (t < 1e+12) sprintf(output, "%3d,%03d,000,000", 
+		i/1000000000, (i % 1000000000) / 1000000);
+    else sprintf(output, "%f", t);
+}
+
+/* ------------------------------------------------------------------------- */
+/* Specific areas to test: */
+#define TEST_COPY8			(1<<0)
+#define TEST_SET8			(1<<1)
+#define TEST_SET32S			(1<<2)
+#define TEST_SET32U			(1<<3)
+#define TEST_SIGN16S		(1<<4)
+#define TEST_ADD16S			(1<<5)
+#define TEST_LSHIFT16S		(1<<6)
+#define TEST_LSHIFT16U		(1<<7)
+#define TEST_RSHIFT16S		(1<<8)
+#define TEST_RSHIFT16U		(1<<9)
+#define TEST_RGB			(1<<10)
+#define TEST_ALPHA			(1<<11)
+#define TEST_AND			(1<<12)
+#define TEST_OR				(1<<13)
+
+/* Specific types of testing: */
+#define TEST_FUNCTIONALITY	(1<<0)
+#define TEST_PERFORMANCE	(1<<1)
+
+/* ------------------------------------------------------------------------- */
+int main(
+    int argc,
+    char **argv)
+{
+	typedef struct
+	{
+		const char *testStr;
+		UINT32 bits;
+	} test_t;
+	static const test_t testList[] =
+	{
+		{ "all",		0xFFFFFFFFU },
+		{ "copy",		TEST_COPY8 },
+		{ "copy8",		TEST_COPY8 },
+		{ "set",		TEST_SET8|TEST_SET32S|TEST_SET32U },
+		{ "set8",		TEST_SET8 },
+		{ "set32",		TEST_SET32S|TEST_SET32U },
+		{ "set32s",		TEST_SET32S },
+		{ "set32u",		TEST_SET32U },
+		{ "sign",		TEST_SIGN16S },
+		{ "sign16s",	TEST_SIGN16S },
+		{ "add",		TEST_ADD16S },
+		{ "add16s",		TEST_ADD16S },
+		{ "lshift",		TEST_LSHIFT16S|TEST_LSHIFT16U },
+		{ "rshift",		TEST_RSHIFT16S|TEST_RSHIFT16U },
+		{ "shift",		TEST_LSHIFT16S|TEST_LSHIFT16U|TEST_RSHIFT16S|TEST_RSHIFT16U },
+		{ "lshift16s",	TEST_LSHIFT16S },
+		{ "lshift16u",	TEST_LSHIFT16U },
+		{ "rshift16s",	TEST_RSHIFT16S },
+		{ "rshift16u",	TEST_RSHIFT16U },
+		{ "rgb",		TEST_RGB },
+		{ "color",		TEST_RGB },
+		{ "colors",		TEST_RGB },
+		{ "alpha",		TEST_ALPHA },
+		{ "and",		TEST_AND },
+		{ "or",			TEST_OR },
+	};
+#define NUMTESTS (sizeof(testList)/sizeof(test_t))
+	static const test_t testTypeList[] =
+	{
+		{ "functionality",	TEST_FUNCTIONALITY },
+		{ "performance",	TEST_PERFORMANCE },
+	};
+#define NUMTESTTYPES (sizeof(testTypeList)/sizeof(test_t))
+	char hints[256];
+	UINT32 testSet = 0;
+	UINT32 testTypes = 0;
+	int i;
+	int results = SUCCESS;
+
+	/* Parse command line for the test set. */
+	for (i=1; i<argc; ++i)
+	{
+		BOOL found = 0;
+		int j;
+		for (j=0; j<NUMTESTS; ++j)
+		{
+			if (strcasecmp(argv[i], testList[j].testStr) == 0)
+			{
+				testSet |= testList[j].bits;
+				found = 1;
+				break;
+			}
+		}
+		for (j=0; j<NUMTESTTYPES; ++j)
+		{
+			if (strcasecmp(argv[i], testTypeList[j].testStr) == 0)
+			{
+				testTypes |= testTypeList[j].bits;
+				found = 1;
+				break;
+			}
+		}
+		if (!found)
+		{
+			if (strstr(argv[i], "help") != NULL)
+			{
+				printf("Available tests:\n");
+				for (j=0; j<NUMTESTS; ++j)
+				{
+					printf("  %s\n", testList[j].testStr);
+				}
+				for (j=0; j<NUMTESTTYPES; ++j)
+				{
+					printf("  %s\n", testTypeList[j].testStr);
+				}
+			}
+			else fprintf(stderr, "Unknown parameter '%s'!\n", argv[i]);
+		}
+	}
+	if (testSet == 0) testSet = 0xffffffff;
+	if (testTypes == 0) testTypes = 0xffffffff;
+
+	primitives_init();
+
+	primitives_flags_str(primitives_get(), hints, sizeof(hints));
+	printf("Hints: %s\n", hints);
+
+	/* COPY */
+	if (testSet & TEST_COPY8)
+	{
+		if (testTypes & TEST_FUNCTIONALITY)
+		{
+			results |= test_copy8u_func();
+		}
+		if (testTypes & TEST_PERFORMANCE)
+		{
+			results |= test_copy8u_speed();
+		}
+	}
+	/* SET */
+	if (testSet & TEST_SET8)
+	{
+		if (testTypes & TEST_FUNCTIONALITY)
+		{
+			results |= test_set8u_func();
+		}
+		if (testTypes & TEST_PERFORMANCE)
+		{
+			results |= test_set8u_speed();
+		}
+	}
+	if (testSet & TEST_SET32S)
+	{
+		if (testTypes & TEST_FUNCTIONALITY)
+		{
+			results |= test_set32s_func();
+		}
+		if (testTypes & TEST_PERFORMANCE)
+		{
+			results |= test_set32s_speed();
+		}
+	}
+	if (testSet & TEST_SET32U)
+	{
+		if (testTypes & TEST_FUNCTIONALITY)
+		{
+			results |= test_set32u_func();
+		}
+		if (testTypes & TEST_PERFORMANCE)
+		{
+			results |= test_set32u_speed();
+		}
+	}
+	/* SIGN */
+	if (testSet & TEST_SIGN16S)
+	{
+		if (testTypes & TEST_FUNCTIONALITY)
+		{
+			results |= test_sign16s_func();
+		}
+		if (testTypes & TEST_PERFORMANCE)
+		{
+			results |= test_sign16s_speed();
+		}
+	}
+	/* ADD */
+	if (testSet & TEST_ADD16S)
+	{
+		if (testTypes & TEST_FUNCTIONALITY)
+		{
+			results |= test_add16s_func();
+		}
+		if (testTypes & TEST_PERFORMANCE)
+		{
+			results |= test_add16s_speed();
+		}
+	}
+	/* SHIFTS */
+	if (testSet & TEST_LSHIFT16S)
+	{
+		if (testTypes & TEST_FUNCTIONALITY)
+		{
+			results |= test_lShift_16s_func();
+		}
+		if (testTypes & TEST_PERFORMANCE)
+		{
+			results |= test_lShift_16s_speed();
+		}
+	}
+	if (testSet & TEST_LSHIFT16U)
+	{
+		if (testTypes & TEST_FUNCTIONALITY)
+		{
+			results |= test_lShift_16u_func();
+		}
+		if (testTypes & TEST_PERFORMANCE)
+		{
+			results |= test_lShift_16u_speed();
+		}
+	}
+	if (testSet & TEST_RSHIFT16S)
+	{
+		if (testTypes & TEST_FUNCTIONALITY)
+		{
+			results |= test_rShift_16s_func();
+		}
+		if (testTypes & TEST_PERFORMANCE)
+		{
+			results |= test_rShift_16s_speed();
+		}
+	}
+	if (testSet & TEST_RSHIFT16U)
+	{
+		if (testTypes & TEST_FUNCTIONALITY)
+		{
+			results |= test_rShift_16u_func();
+		}
+		if (testTypes & TEST_PERFORMANCE)
+		{
+			results |= test_rShift_16u_speed();
+		}
+	}
+	/* COLORS */
+	if (testSet & TEST_RGB)
+	{
+		if (testTypes & TEST_FUNCTIONALITY)
+		{
+			results |= test_RGBToRGB_16s8u_P3AC4R_func();
+		}
+		if (testTypes & TEST_PERFORMANCE)
+		{
+			results |= test_RGBToRGB_16s8u_P3AC4R_speed();
+		}
+		if (testTypes & TEST_FUNCTIONALITY)
+		{
+			results |= test_yCbCrToRGB_16s16s_P3P3_func();
+		}
+		if (testTypes & TEST_PERFORMANCE)
+		{
+			results |= test_yCbCrToRGB_16s16s_P3P3_speed();
+		}
+	}
+	/* ALPHA COMPOSITION */
+	if (testSet & TEST_ALPHA)
+	{
+		if (testTypes & TEST_FUNCTIONALITY)
+		{
+			results |= test_alphaComp_func();
+		}
+		if (testTypes & TEST_PERFORMANCE)
+		{
+			results |= test_alphaComp_speed();
+		}
+	}
+	/* AND & OR */
+	if (testSet & TEST_AND)
+	{
+		if (testTypes & TEST_FUNCTIONALITY)
+		{
+			results |= test_and_32u_func();
+		}
+		if (testTypes & TEST_PERFORMANCE)
+		{
+			results |= test_and_32u_speed();
+		}
+	}
+	if (testSet & TEST_OR)
+	{
+		if (testTypes & TEST_FUNCTIONALITY)
+		{
+			results |= test_or_32u_func();
+		}
+		if (testTypes & TEST_PERFORMANCE)
+		{
+			results |= test_or_32u_speed();
+		}
+	}
+
+	primitives_deinit();
+	return results;
+}
--- a/libfreerdp/primitives/test/prim_test.h
+++ b/libfreerdp/primitives/test/prim_test.h
@ -0,0 +1,247 @@
+/* primtest.h
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.  Algorithms used by
+ * this code may be covered by patents by HP, Microsoft, or other parties.
+ */
+
+#ifdef __GNUC__
+# pragma once
+#endif
+
+#ifndef __PRIMTEST_H_INCLUDED__
+#define __PRIMTEST_H_INCLUDED__
+
+#include <config.h>
+#include <stdint.h>
+#include <winpr/wtypes.h>
+
+#include <measure.h>
+#include <string.h>
+#include <stdio.h>
+#include <freerdp/primitives.h>
+#ifdef WITH_IPP
+#include <ipps.h>
+#include <ippi.h>
+#endif
+
+#define BLOCK_ALIGNMENT 16
+#ifdef __GNUC__
+#define ALIGN(x) x __attribute((aligned(BLOCK_ALIGNMENT)))
+#define POSSIBLY_UNUSED(x)	x __attribute((unused))
+#else
+/* TODO: Someone needs to finish this for non-GNU C */
+#define ALIGN(x) x
+#define POSSIBLY_UNUSED(x)	x
+#endif
+#define ABS(_x_) ((_x_) < 0 ? (-(_x_)) : (_x_))
+#define MAX_TEST_SIZE 4096
+
+extern int test_sizes[];
+#define NUM_TEST_SIZES 10
+
+extern void get_random_data(void *buffer, size_t size);
+
+#ifndef SUCCESS
+#define SUCCESS 0
+#endif
+#ifndef FAILURE
+#define FAILURE 1
+#endif
+
+extern int test_copy8u_func(void);
+extern int test_copy8u_speed(void);
+
+extern int test_set8u_func(void);
+extern int test_set32s_func(void);
+extern int test_set8u_speed(void);
+extern int test_set32s_speed(void);
+extern int test_set32u_speed(void);
+
+extern int test_sign16s_func(void);
+extern int test_sign16s_speed(void);
+
+extern int test_add16s_func(void);
+extern int test_add16s_speed(void);
+
+extern int test_lShift_16s_func(void);
+extern int test_lShift_16u_func(void);
+extern int test_rShift_16s_func(void);
+extern int test_rShift_16u_func(void);
+extern int test_lShift_16s_speed(void);
+extern int test_lShift_16u_speed(void);
+extern int test_rShift_16s_speed(void);
+extern int test_rShift_16u_speed(void);
+
+extern int test_RGBToRGB_16s8u_P3AC4R_func(void);
+extern int test_RGBToRGB_16s8u_P3AC4R_speed(void);
+extern int test_yCbCrToRGB_16s16s_P3P3_func(void);
+extern int test_yCbCrToRGB_16s16s_P3P3_speed(void);
+
+extern int test_alphaComp_func(void);
+extern int test_alphaComp_speed(void);
+
+extern int test_and_32u_func(void);
+extern int test_and_32u_speed(void);
+extern int test_or_32u_func(void);
+extern int test_or_32u_speed(void);
+
+/* Since so much of this code is repeated, define a macro to build 
+ * functions to do speed tests.
+ */
+#ifdef armel
+#define SIMD_TYPE "Neon"
+#else
+#define SIMD_TYPE "SSE"
+#endif
+
+#define DO_NORMAL_MEASUREMENTS(_funcNormal_, _prework_) \
+	do { \
+		for (s=0; s<num_sizes; ++s) \
+		{ \
+			int iter; \
+			char label[256]; \
+			int size = size_array[s]; \
+			_prework_; \
+			iter = iterations/size; \
+			sprintf(label, "%s-%-4d", oplabel, size); \
+			MEASURE_TIMED(label, iter, test_time, resultNormal[s],  \
+				_funcNormal_); \
+		} \
+	} while (0)
+
+#if defined(i386) && defined(WITH_SSE2)
+#define DO_SSE_MEASUREMENTS(_funcSSE_, _prework_) \
+	do { \
+		for (s=0; s<num_sizes; ++s) \
+		{ \
+			int iter; \
+			char label[256]; \
+			int size = size_array[s]; \
+			_prework_; \
+			iter = iterations/size; \
+			sprintf(label, "%s-%s-%-4d", SIMD_TYPE, oplabel, size); \
+			MEASURE_TIMED(label, iter, test_time, resultSSENeon[s],  \
+				_funcSSE_); \
+		} \
+	} while (0)
+#else
+#define DO_SSE_MEASUREMENTS(_funcSSE_, _prework_)
+#endif
+
+#if defined(armel) && defined(INCLUDE_NEON_MEASUREMENTS)
+#define DO_NEON_MEASUREMENTS(_funcNeon_, _prework_) \
+	do { \
+		for (s=0; s<num_sizes; ++s) \
+		{ \
+			int iter; \
+			char label[256]; \
+			int size = size_array[s]; \
+			_prework_; \
+			iter = iterations/size; \
+			sprintf(label, "%s-%s-%-4d", SIMD_TYPE, oplabel, size); \
+			MEASURE_TIMED(label, iter, test_time, resultSSENeon[s],  \
+				_funcNeon_); \
+		} \
+	} while (0)
+#else
+#define DO_NEON_MEASUREMENTS(_funcNeon_, _prework_)
+#endif
+
+#if defined(i386) && defined(WITH_IPP)
+#define DO_IPP_MEASUREMENTS(_funcIPP_, _prework_) \
+	do { \
+		for (s=0; s<num_sizes; ++s) \
+		{ \
+			int iter; \
+			char label[256]; \
+			int size = size_array[s]; \
+			_prework_; \
+			iter = iterations/size; \
+			sprintf(label, "IPP-%s-%-4d", oplabel, size); \
+			MEASURE_TIMED(label, iter, test_time, resultIPP[s],  \
+				_funcIPP_); \
+		} \
+	} while (0)
+#else
+#define DO_IPP_MEASUREMENTS(_funcIPP_, _prework_)
+#endif
+
+/* ------------------------------------------------------------------------- */
+#define STD_SPEED_TEST( \
+	_name_, _srctype_, _dsttype_, _prework_, \
+	_doNormal_, _funcNormal_, \
+	_doSSE_,    _funcSSE_,  _flagsSSE_, \
+	_doNeon_,   _funcNeon_, _flagsNeon_, \
+	_doIPP_,    _funcIPP_) \
+static void _name_( \
+	const char *oplabel, const char *type, \
+	const _srctype_ *src1, const _srctype_ *src2, _srctype_ constant, \
+	_dsttype_ *dst, \
+	const int *size_array,  int num_sizes, \
+	int iterations, float test_time) \
+{ \
+	int s; \
+	float *resultNormal, *resultSSENeon, *resultIPP; \
+	UINT32 pflags = primitives_get_flags(primitives_get()); \
+	resultNormal = (float *) calloc(num_sizes, sizeof(float)); \
+	resultSSENeon = (float *) calloc(num_sizes, sizeof(float)); \
+	resultIPP = (float *) calloc(num_sizes, sizeof(float)); \
+	printf("******************** %s %s ******************\n",  \
+		oplabel, type); \
+	if (_doNormal_) { DO_NORMAL_MEASUREMENTS(_funcNormal_, _prework_); } \
+	if (_doSSE_) { \
+		if ((pflags & (_flagsSSE_)) == (_flagsSSE_)) \
+		{ \
+			DO_SSE_MEASUREMENTS(_funcSSE_, _prework_); \
+		} \
+	} \
+	if (_doNeon_) { \
+		if ((pflags & (_flagsNeon_)) == (_flagsNeon_)) \
+		{ \
+			DO_NEON_MEASUREMENTS(_funcNeon_, _prework_); \
+		} \
+	} \
+	if (_doIPP_)    { DO_IPP_MEASUREMENTS(_funcIPP_, _prework_); } \
+	printf("----------------------- SUMMARY ----------------------------\n"); \
+	printf("%8s: %15s %15s %5s %15s %5s\n", \
+		"size", "general", SIMD_TYPE, "%", "IPP", "%"); \
+	for (s=0; s<num_sizes; ++s) \
+	{ \
+		char sN[32], sSN[32], sSNp[8], sIPP[32], sIPPp[8]; \
+		strcpy(sN, "N/A"); strcpy(sSN, "N/A"); strcpy(sSNp, "N/A"); \
+		strcpy(sIPP, "N/A"); strcpy(sIPPp, "N/A"); \
+		if (resultNormal[s] > 0.0) _floatprint(resultNormal[s], sN); \
+		if (resultSSENeon[s] > 0.0) \
+		{ \
+			_floatprint(resultSSENeon[s], sSN); \
+			if (resultNormal[s] > 0.0) \
+			{ \
+				sprintf(sSNp, "%d%%", \
+					(int) (resultSSENeon[s] / resultNormal[s] * 100.0 + 0.5)); \
+			} \
+		} \
+		if (resultIPP[s] > 0.0) \
+		{ \
+			_floatprint(resultIPP[s], sIPP); \
+			if (resultNormal[s] > 0.0) \
+			{ \
+				sprintf(sIPPp, "%d%%", \
+					(int) (resultIPP[s] / resultNormal[s] * 100.0 + 0.5)); \
+			} \
+		} \
+		printf("%8d: %15s %15s %5s %15s %5s\n",  \
+			size_array[s], sN, sSN, sSNp, sIPP, sIPPp); \
+	} \
+	free(resultNormal); free(resultSSENeon);  free(resultIPP); \
+}
+
+#endif // !__PRIMTEST_H_INCLUDED__
--- a/libfreerdp/primitives/test/test_add.c
+++ b/libfreerdp/primitives/test/test_add.c
@ -0,0 +1,105 @@
+/* test_add.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include "prim_test.h"
+
+#define FUNC_TEST_SIZE	65536
+static const int ADD16S_PRETEST_ITERATIONS = 300000*64;
+static const int TEST_TIME = 2.0;  // seconds
+
+extern pstatus_t general_add_16s(
+	const INT16 *pSrc1, const INT16 *pSrc2, INT16 *pDst, int len);
+extern pstatus_t sse3_add_16s(
+	const INT16 *pSrc1, const INT16 *pSrc2, INT16 *pDst, int len);
+
+/* ========================================================================= */
+int test_add16s_func(void)
+{
+	INT16 ALIGN(src1[FUNC_TEST_SIZE+3]), ALIGN(src2[FUNC_TEST_SIZE+3]), 
+		ALIGN(d1[FUNC_TEST_SIZE+3]), ALIGN(d2[FUNC_TEST_SIZE+3]);
+	int failed = 0;
+	int i;
+	char testStr[256];
+	UINT32 pflags = primitives_get_flags(primitives_get());
+
+	testStr[0] = '\0';
+	get_random_data(src1, sizeof(src1));
+	get_random_data(src2, sizeof(src2));
+	memset(d1, 0, sizeof(d1));
+	memset(d2, 0, sizeof(d2));
+	general_add_16s(src1+1, src2+1, d1+1, FUNC_TEST_SIZE);
+#ifdef i386
+	if (pflags & PRIM_X86_SSE3_AVAILABLE)
+	{
+		strcat(testStr, " SSE3");
+		/* Aligned */
+		sse3_add_16s(src1+1, src2+1, d2+1, FUNC_TEST_SIZE);
+		for (i=1; i<FUNC_TEST_SIZE; ++i)
+		{
+			if (d1[i] != d2[i])
+			{ 
+				printf("ADD16S-SSE-aligned FAIL[%d] %d+%d=%d, got %d\n", 
+					i, src1[i], src2[i], d1[i], d2[i]); 
+				++failed;
+			}
+		}
+		/* Unaligned */
+		sse3_add_16s(src1+1, src2+1, d2+2, FUNC_TEST_SIZE);
+		for (i=1; i<FUNC_TEST_SIZE; ++i)
+		{
+			if (d1[i] != d2[i+1])
+			{ 
+				printf("ADD16S-SSE-unaligned FAIL[%d] %d+%d=%d, got %d\n", 
+					i, src1[i], src2[i], d1[i], d2[i+1]); 
+				++failed;
+			}
+		}
+	}
+#endif /* i386 */
+#ifdef WITH_IPP
+	strcat(testStr, " IPP");
+	ippsAdd_16s(src1+1, src2+1, d2+1, FUNC_TEST_SIZE);
+	for (i=1; i<FUNC_TEST_SIZE; ++i)
+	{
+		if (d1[i] != d2[i])
+		{ 
+			printf("ADD16S-IPP FAIL[%d] %d+%d=%d, got %d\n", 
+				i, src1[i], src2[i], d1[i], d2[i]); 
+			++failed;
+		}
+	}
+#endif /* WITH_IPP */
+	if (!failed) printf("All add16s tests passed (%s).\n", testStr);
+	return (failed > 0) ? FAILURE : SUCCESS;
+}
+ 
+/* ------------------------------------------------------------------------- */
+STD_SPEED_TEST(add16s_speed_test, INT16, INT16, dst=dst,
+	TRUE, general_add_16s(src1, src2, dst, size),
+	TRUE, sse3_add_16s(src1, src2, dst, size), PRIM_X86_SSE3_AVAILABLE,
+	FALSE, dst=dst, 0,
+	TRUE, ippsAdd_16s(src1, src2, dst, size));
+
+int test_add16s_speed(void)
+{
+	INT16 ALIGN(src1[MAX_TEST_SIZE+3]), ALIGN(src2[MAX_TEST_SIZE+3]),
+		ALIGN(dst[MAX_TEST_SIZE+3]);
+	get_random_data(src1, sizeof(src1));
+	get_random_data(src2, sizeof(src2));
+	add16s_speed_test("add16s", "aligned", src1, src2, 0, dst,
+		test_sizes, NUM_TEST_SIZES, ADD16S_PRETEST_ITERATIONS, TEST_TIME);
+	add16s_speed_test("add16s", "unaligned", src1+1, src2+2, 0, dst,
+		test_sizes, NUM_TEST_SIZES, ADD16S_PRETEST_ITERATIONS, TEST_TIME);
+	return SUCCESS;
+}
--- a/libfreerdp/primitives/test/test_alphaComp.c
+++ b/libfreerdp/primitives/test/test_alphaComp.c
@ -0,0 +1,225 @@
+/* test_alphaComp.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include "prim_test.h"
+
+static const int ALPHA_PRETEST_ITERATIONS = 5000000;
+static const float TEST_TIME = 5.0;
+
+static const int block_size[] = { 4, 64, 256 };
+#define NUM_BLOCK_SIZES (sizeof(block_size)/sizeof(int))
+#define MAX_BLOCK_SIZE 256
+#define SIZE_SQUARED (MAX_BLOCK_SIZE*MAX_BLOCK_SIZE)
+
+extern pstatus_t general_alphaComp_argb(
+	const BYTE *pSrc1,  int src1Step,
+	const BYTE *pSrc2,  int src2Step,
+	BYTE *pDst,  int dstStep,
+	int width,  int height);
+extern pstatus_t sse2_alphaComp_argb(
+	const BYTE *pSrc1,  int src1Step,
+	const BYTE *pSrc2,  int src2Step,
+	BYTE *pDst,  int dstStep,
+	int width,  int height);
+extern pstatus_t ipp_alphaComp_argb(
+	const BYTE *pSrc1,  int src1Step,
+	const BYTE *pSrc2,  int src2Step,
+	BYTE *pDst,  int dstStep,
+	int width,  int height);
+
+/* ========================================================================= */
+#define ALF(_c_) (((_c_) & 0xFF000000U) >> 24)
+#define RED(_c_) (((_c_) & 0x00FF0000U) >> 16)
+#define GRN(_c_) (((_c_) & 0x0000FF00U) >> 8)
+#define BLU(_c_) ((_c_) & 0x000000FFU)
+#define TOLERANCE 1
+#define PIXEL(_addr_, _bytes_, _x_, _y_) \
+    ((UINT32 *) (((BYTE *) (_addr_)) + (_x_)*4 + (_y_)*(_bytes_)))
+#define SRC1_WIDTH 6
+#define SRC1_HEIGHT 6
+#define SRC2_WIDTH 7
+#define SRC2_HEIGHT 7
+#define DST_WIDTH 9
+#define DST_HEIGHT 9
+#define TEST_WIDTH 4
+#define TEST_HEIGHT 5
+
+/* ------------------------------------------------------------------------- */
+static UINT32 alpha_add(
+	UINT32 c1,
+	UINT32 c2)
+{
+	UINT32 a1 = ALF(c1);
+	UINT32 r1 = RED(c1);
+	UINT32 g1 = GRN(c1);
+	UINT32 b1 = BLU(c1);
+
+	UINT32 a2 = ALF(c2);
+	UINT32 r2 = RED(c2);
+	UINT32 g2 = GRN(c2);
+	UINT32 b2 = BLU(c2);
+
+	UINT32 a3 = ((a1 * a1 + (255-a1) * a2) / 255) & 0xff;
+	UINT32 r3 = ((a1 * r1 + (255-a1) * r2) / 255) & 0xff;
+	UINT32 g3 = ((a1 * g1 + (255-a1) * g2) / 255) & 0xff;
+	UINT32 b3 = ((a1 * b1 + (255-a1) * b2) / 255) & 0xff;
+
+	return (a3 << 24) | (r3 << 16) | (g3 << 8) | b3;
+}
+
+/* ------------------------------------------------------------------------- */
+static UINT32 colordist(
+	UINT32 c1,
+	UINT32 c2)
+{
+	int d, maxd = 0;
+
+	d = ABS(ALF(c1) - ALF(c2));
+	if (d > maxd) maxd = d;
+	d = ABS(RED(c1) - RED(c2));
+	if (d > maxd) maxd = d;
+	d = ABS(GRN(c1) - GRN(c2));
+	if (d > maxd) maxd = d;
+	d = ABS(BLU(c1) - BLU(c2));
+	if (d > maxd) maxd = d;
+	return maxd;
+}
+
+/* ------------------------------------------------------------------------- */
+int test_alphaComp_func(void)
+{
+	UINT32 ALIGN(src1[SRC1_WIDTH*SRC1_HEIGHT]);
+	UINT32 ALIGN(src2[SRC2_WIDTH*SRC2_HEIGHT]);
+	UINT32 ALIGN(dst1[DST_WIDTH*DST_HEIGHT]);
+	UINT32 ALIGN(dst2a[DST_WIDTH*DST_HEIGHT]);
+	UINT32 ALIGN(dst2u[DST_WIDTH*DST_HEIGHT+1]);
+	UINT32 ALIGN(dst3[DST_WIDTH*DST_HEIGHT]);
+	int error = 0;
+	UINT32 pflags = primitives_get_flags(primitives_get());
+	char testStr[256];
+	UINT32 *ptr;
+	int i, x, y;
+
+	testStr[0] = '\0';
+	get_random_data(src1, sizeof(src1));
+	/* Special-case the first two values */
+	src1[0] &= 0x00FFFFFFU;
+	src1[1] |= 0xFF000000U;
+	get_random_data(src2, sizeof(src2));
+	/* Set the second operand to fully-opaque. */
+	ptr = src2;
+	for (i=0; i<sizeof(src2)/4; ++i) *ptr++ |= 0xFF000000U;
+	memset(dst1, 0, sizeof(dst1));
+	memset(dst2a, 0, sizeof(dst2a));
+	memset(dst2u, 0, sizeof(dst2u));
+	memset(dst3, 0, sizeof(dst3));
+
+	general_alphaComp_argb((const BYTE *) src1, 4*SRC1_WIDTH, 
+		(const BYTE *) src2, 4*SRC2_WIDTH,
+		(BYTE *) dst1, 4*DST_WIDTH, TEST_WIDTH, TEST_HEIGHT);
+#ifdef i386
+	if (pflags & PRIM_X86_SSE2_AVAILABLE)
+	{
+		strcat(testStr, " SSE2");
+		sse2_alphaComp_argb((const BYTE *) src1, 4*SRC1_WIDTH, 
+			(const BYTE *) src2, 4*SRC2_WIDTH,
+			(BYTE *) dst2a, 4*DST_WIDTH, TEST_WIDTH, TEST_HEIGHT);
+		sse2_alphaComp_argb((const BYTE *) src1, 4*SRC1_WIDTH, 
+			(const BYTE *) src2, 4*SRC2_WIDTH,
+			(BYTE *) (dst2u+1), 4*DST_WIDTH, TEST_WIDTH, TEST_HEIGHT);
+	}
+#endif /* i386 */
+#ifdef WITH_IPP
+	strcat(testStr, " IPP");
+	ipp_alphaComp_argb((const BYTE *) src1, 4*SRC1_WIDTH, 
+		(const BYTE *) src2, 4*SRC2_WIDTH,
+		(BYTE *) dst3, 4*DST_WIDTH, TEST_WIDTH, TEST_HEIGHT);
+#endif
+
+	for (y=0; y<TEST_HEIGHT; ++y)
+	{
+		for (x=0; x<TEST_WIDTH; ++x)
+		{
+			UINT32 s1 = *PIXEL(src1, 4*SRC1_WIDTH, x, y);
+			UINT32 s2 = *PIXEL(src2, 4*SRC2_WIDTH, x, y);
+			UINT32 c0 = alpha_add(s1, s2);
+			UINT32 c1 = *PIXEL(dst1, 4*DST_WIDTH, x, y);
+			if (colordist(c0, c1) > TOLERANCE)
+			{
+				printf("alphaComp-general: [%d,%d] 0x%08x+0x%08x=0x%08x, got 0x%08x\n", 
+					x, y, s1, s2, c0, c1);
+				error = 1;
+			}
+#ifdef i386
+			if (pflags & PRIM_X86_SSE2_AVAILABLE)
+			{
+				UINT32 c2 = *PIXEL(dst2a, 4*DST_WIDTH, x, y);
+				if (colordist(c0, c2) > TOLERANCE)
+				{
+					printf("alphaComp-SSE-aligned: [%d,%d] 0x%08x+0x%08x=0x%08x, got 0x%08x\n", 
+						x, y, s1, s2, c0, c2);
+					error = 1;
+				}
+				c2 = *PIXEL(dst2u+1, 4*DST_WIDTH, x, y);
+				if (colordist(c0, c2) > TOLERANCE)
+				{
+					printf("alphaComp-SSE-unaligned: [%d,%d] 0x%08x+0x%08x=0x%08x, got 0x%08x\n", 
+						x, y, s1, s2, c0, c2);
+					error = 1;
+				}
+			}
+#endif /* i386 */
+#ifdef WITH_IPP
+			UINT32 c3 = *PIXEL(dst3, 4*DST_WIDTH, x, y);
+			if (colordist(c0, c3) > TOLERANCE)
+			{
+				printf("alphaComp-IPP: [%d,%d] 0x%08x+0x%08x=0x%08x, got 0x%08x\n", 
+					x, y, s1, s2, c0, c3);
+				error = 1;
+			}
+#endif
+		}
+	}
+	if (!error) printf("All alphaComp tests passed (%s).\n", testStr);
+	return (error > 0) ? FAILURE : SUCCESS;
+}
+
+
+/* ------------------------------------------------------------------------- */
+STD_SPEED_TEST(alphaComp_speed, BYTE, BYTE, int bytes = size*4,
+	TRUE, general_alphaComp_argb(src1, bytes, src2, bytes, dst, bytes,
+		size, size),
+	TRUE, sse2_alphaComp_argb(src1, bytes, src2, bytes, dst, bytes,
+		size, size), PRIM_X86_SSE2_AVAILABLE,
+	FALSE, dst=dst, 0,
+	TRUE, ipp_alphaComp_argb(src1, bytes, src2, bytes, dst, bytes,
+		size, size));
+
+int test_alphaComp_speed(void)
+{
+	INT32 ALIGN(src1[MAX_BLOCK_SIZE*(MAX_BLOCK_SIZE+1)]), 
+	ALIGN(src2[SIZE_SQUARED]),
+	ALIGN(dst[SIZE_SQUARED]);
+
+	get_random_data(src1, sizeof(src1));
+	get_random_data(src2, sizeof(src2));
+
+	alphaComp_speed("alphaComp", "aligned",
+		(BYTE *) src1, (BYTE *) src2, 0, (BYTE *) dst,
+		block_size, NUM_BLOCK_SIZES, ALPHA_PRETEST_ITERATIONS, TEST_TIME);
+	alphaComp_speed("alphaComp", "unaligned",
+		(BYTE *) src1+1, (BYTE *) src2, 0, (BYTE *) dst,
+		block_size, NUM_BLOCK_SIZES, ALPHA_PRETEST_ITERATIONS, TEST_TIME);
+	return SUCCESS;
+}
--- a/libfreerdp/primitives/test/test_andor.c
+++ b/libfreerdp/primitives/test/test_andor.c
@ -0,0 +1,178 @@
+/* test_andor.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include "prim_test.h"
+
+#define FUNC_TEST_SIZE	65536
+static const int ANDOR_PRETEST_ITERATIONS = 100000;
+static const int TEST_TIME = 2.0;  // seconds
+
+extern pstatus_t general_andC_32u(const UINT32 *pSrc, UINT32 val,
+	UINT32 *pDst, int len);
+extern pstatus_t sse3_andC_32u(const UINT32 *pSrc, UINT32 val,
+	UINT32 *pDst, int len);
+extern pstatus_t general_orC_32u(const UINT32 *pSrc, UINT32 val,
+	UINT32 *pDst, int len);
+extern pstatus_t sse3_orC_32u(const UINT32 *pSrc, UINT32 val,
+	UINT32 *pDst, int len);
+
+#define VALUE (0xA5A5A5A5U)
+
+/* ========================================================================= */
+int test_and_32u_func(void)
+{
+	UINT32 ALIGN(src[FUNC_TEST_SIZE+3]), ALIGN(dst[FUNC_TEST_SIZE+3]);
+	int failed = 0;
+	int i;
+	UINT32 pflags = primitives_get_flags(primitives_get());
+	char testStr[256];
+
+	testStr[0] = '\0';
+	get_random_data(src, sizeof(src));
+	general_andC_32u(src+1, VALUE, dst+1, FUNC_TEST_SIZE);
+	strcat(testStr, " general");
+	for (i=1; i<=FUNC_TEST_SIZE; ++i)
+	{
+		if (dst[i] != (src[i] & VALUE))
+		{ 
+			printf("AND-general FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n", 
+				i, src[i], VALUE, src[i] & VALUE, dst[i]); 
+			++failed;
+		}
+	}
+#ifdef i386
+	if (pflags & PRIM_X86_SSE3_AVAILABLE)
+	{
+		strcat(testStr, " SSE3");
+		/* Aligned */
+		memset(dst, 0, sizeof(dst));
+		sse3_andC_32u(src+1, VALUE, dst+1, FUNC_TEST_SIZE);
+		for (i=1; i<=FUNC_TEST_SIZE; ++i)
+		{
+			if (dst[i] != (src[i] & VALUE))
+			{ 
+				printf("AND-SSE-aligned FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n", 
+					i, src[i], VALUE, src[i] & VALUE, dst[i]); 
+				++failed;
+			}
+		}
+		/* Unaligned */
+		memset(dst, 0, sizeof(dst));
+		sse3_andC_32u(src+1, VALUE, dst+2, FUNC_TEST_SIZE);
+		for (i=1; i<=FUNC_TEST_SIZE; ++i)
+		{
+			if (dst[i+1] != (src[i] & VALUE))
+			{ 
+				printf("AND-SSE-unaligned FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n", 
+					i, src[i], VALUE, src[i] & VALUE, dst[i+1]); 
+				++failed;
+			}
+		}
+	}
+#endif /* i386 */
+	if (!failed) printf("All and_32u tests passed (%s).\n", testStr);
+	return (failed > 0) ? FAILURE : SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+STD_SPEED_TEST(andC_32u_speed_test, UINT32, UINT32, dst=dst,
+	TRUE, general_andC_32u(src1, constant, dst, size),
+	TRUE, sse3_andC_32u(src1, constant, dst, size), PRIM_X86_SSE3_AVAILABLE,
+	FALSE, dst=dst, 0,
+	TRUE, ippsAndC_32u(src1, constant, dst, size))
+
+int test_and_32u_speed(void)
+{
+	UINT32 ALIGN(src[MAX_TEST_SIZE+3]), ALIGN(dst[MAX_TEST_SIZE+3]);
+	get_random_data(src, sizeof(src));
+	andC_32u_speed_test("and32u", "aligned", src, NULL, VALUE, dst,
+		test_sizes, NUM_TEST_SIZES, ANDOR_PRETEST_ITERATIONS, TEST_TIME);
+	andC_32u_speed_test("and32u", "unaligned", src+1, NULL, VALUE, dst,
+		test_sizes, NUM_TEST_SIZES, ANDOR_PRETEST_ITERATIONS, TEST_TIME);
+	return SUCCESS;
+}
+
+/* ========================================================================= */
+int test_or_32u_func(void)
+{
+	UINT32 ALIGN(src[FUNC_TEST_SIZE+3]), ALIGN(dst[FUNC_TEST_SIZE+3]);
+	int failed = 0;
+	int i;
+	UINT32 pflags = primitives_get_flags(primitives_get());
+	char testStr[256];
+
+	testStr[0] = '\0';
+	get_random_data(src, sizeof(src));
+	general_orC_32u(src+1, VALUE, dst+1, FUNC_TEST_SIZE);
+	strcat(testStr, " general");
+	for (i=1; i<=FUNC_TEST_SIZE; ++i)
+	{
+		if (dst[i] != (src[i] | VALUE))
+		{ 
+			printf("OR-general general FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n", 
+				i, src[i], VALUE, src[i] | VALUE, dst[i]); 
+		++failed;
+		}
+	}
+#ifdef i386
+	if (pflags & PRIM_X86_SSE3_AVAILABLE)
+	{
+		strcat(testStr, " SSE3");
+		/* Aligned */
+		memset(dst, 0, sizeof(dst));
+		sse3_orC_32u(src+1, VALUE, dst+1, FUNC_TEST_SIZE);
+		for (i=1; i<FUNC_TEST_SIZE; ++i)
+		{
+			if (dst[i] != (src[i] | VALUE))
+			{ 
+				printf("OR-SSE-aligned FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n", 
+					i, src[i], VALUE, src[i] | VALUE, dst[i]); 
+				++failed;
+			}
+		}
+		/* Unaligned */
+		memset(dst, 0, sizeof(dst));
+		sse3_orC_32u(src+1, VALUE, dst+2, FUNC_TEST_SIZE);
+		for (i=1; i<FUNC_TEST_SIZE; ++i)
+		{
+			if (dst[i+1] != (src[i] | VALUE))
+			{ 
+				printf("OR-SSE-unaligned FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n", 
+					i, src[i], VALUE, src[i] | VALUE, dst[i+1]); 
+				++failed;
+			}
+		}
+	}
+#endif /* i386 */
+	if (!failed) printf("All or_32u tests passed (%s).\n", testStr);
+	return (failed > 0) ? FAILURE : SUCCESS;
+}
+    	
+/* ------------------------------------------------------------------------- */
+STD_SPEED_TEST(orC_32u_speed_test, UINT32, UINT32, dst=dst,
+	TRUE, general_orC_32u(src1, constant, dst, size),
+	TRUE, sse3_orC_32u(src1, constant, dst, size), PRIM_X86_SSE3_AVAILABLE,
+	FALSE, dst=dst, 0,
+	TRUE, ippsOrC_32u(src1, constant, dst, size))
+
+int test_or_32u_speed(void)
+{
+	UINT32 ALIGN(src[MAX_TEST_SIZE+3]), ALIGN(dst[MAX_TEST_SIZE+3]);
+	get_random_data(src, sizeof(src));
+	orC_32u_speed_test("or32u", "aligned", src, NULL, VALUE, dst,
+		test_sizes, NUM_TEST_SIZES, ANDOR_PRETEST_ITERATIONS, TEST_TIME);
+	orC_32u_speed_test("or32u", "unaligned", src+1, NULL, VALUE, dst,
+		test_sizes, NUM_TEST_SIZES, ANDOR_PRETEST_ITERATIONS, TEST_TIME);
+	return SUCCESS;
+}
--- a/libfreerdp/primitives/test/test_colors.c
+++ b/libfreerdp/primitives/test/test_colors.c
@ -0,0 +1,226 @@
+/* test_colors.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include "prim_test.h"
+
+static const int RGB_TRIAL_ITERATIONS = 1000;
+static const int YCBCR_TRIAL_ITERATIONS = 1000;
+static const float TEST_TIME = 4.0;
+
+extern pstatus_t general_RGBToRGB_16s8u_P3AC4R(const INT16 *pSrc[3], 
+	int srcStep, BYTE *pDst, int dstStep, const prim_size_t *roi);
+extern pstatus_t sse2_RGBToRGB_16s8u_P3AC4R(const INT16 *pSrc[3], 
+	int srcStep, BYTE *pDst, int dstStep, const prim_size_t *roi);
+extern pstatus_t general_yCbCrToRGB_16s16s_P3P3(const INT16 *pSrc[3],
+	int srcStep, INT16 *pDst[3], int dstStep, const prim_size_t *roi);
+extern pstatus_t sse2_yCbCrToRGB_16s16s_P3P3(const INT16 *pSrc[3],
+	int srcStep, INT16 *pDst[3], int dstStep, const prim_size_t *roi);
+
+/* ------------------------------------------------------------------------- */
+int test_RGBToRGB_16s8u_P3AC4R_func(void)
+{
+	INT16 ALIGN(r[4096]), ALIGN(g[4096]), ALIGN(b[4096]);
+	UINT32 ALIGN(out1[4096]), ALIGN(out2[4096]);
+	int i;
+	int failed = 0;
+	UINT32 pflags = primitives_get_flags(primitives_get());
+	char testStr[256];
+	INT16 *ptrs[3];
+	prim_size_t roi = { 64, 64 };
+
+	testStr[0] = '\0';
+	get_random_data(r, sizeof(r));
+	get_random_data(g, sizeof(g));
+	get_random_data(b, sizeof(b));
+	/* clear upper bytes */
+	for (i=0; i<4096; ++i)
+	{
+		r[i] &= 0x00FFU;
+		g[i] &= 0x00FFU;
+		b[i] &= 0x00FFU;
+	}
+
+	ptrs[0] = r;
+	ptrs[1] = g;
+	ptrs[2] = b;
+
+	general_RGBToRGB_16s8u_P3AC4R((const INT16 **) ptrs, 64*2,
+		(BYTE *) out1, 64*4, &roi);
+#ifdef i386
+	if (pflags & PRIM_X86_SSE2_AVAILABLE)
+	{
+		strcat(testStr, " SSE2");
+		sse2_RGBToRGB_16s8u_P3AC4R((const INT16 **) ptrs, 64*2,
+			(BYTE *) out2, 64*4, &roi);
+		for (i=0; i<4096; ++i)
+		{
+			if (out1[i] != out2[i])
+			{
+				printf("RGBToRGB-SSE FAIL: out1[%d]=0x%08x out2[%d]=0x%08x\n",
+					i, out1[i], i, out2[i]);
+				failed = 1;
+			}
+		}
+	}
+#endif /* i386 */
+	if (!failed) printf("All RGBToRGB_16s8u_P3AC4R tests passed (%s).\n", testStr);
+	return (failed > 0) ? FAILURE : SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+static const prim_size_t roi64x64 = { 64, 64 };
+STD_SPEED_TEST(
+	rgb_to_argb_speed, INT16*, UINT32, dst=dst,
+	TRUE, general_RGBToRGB_16s8u_P3AC4R(
+		(const INT16 **) src1, 64*2, (BYTE *) dst, 64*4, &roi64x64),
+	TRUE, sse2_RGBToRGB_16s8u_P3AC4R(
+		(const INT16 **) src1, 64*2, (BYTE *) dst, 64*4, &roi64x64),
+		PRIM_X86_SSE2_AVAILABLE,
+	FALSE, dst=dst, 0,
+	FALSE, dst=dst);
+
+int test_RGBToRGB_16s8u_P3AC4R_speed(void)
+{
+	INT16 ALIGN(r[4096]), ALIGN(g[4096]), ALIGN(b[4096]);
+	UINT32 ALIGN(dst[4096]);
+	int i;
+	INT16 *ptrs[3];
+	int size_array[] = { 64 };
+
+	get_random_data(r, sizeof(r));
+	get_random_data(g, sizeof(g));
+	get_random_data(b, sizeof(b));
+	/* clear upper bytes */
+	for (i=0; i<4096; ++i)
+	{
+		r[i] &= 0x00FFU;
+		g[i] &= 0x00FFU;
+		b[i] &= 0x00FFU;
+	}
+
+	ptrs[0] = r;
+	ptrs[1] = g;
+	ptrs[2] = b;
+
+	rgb_to_argb_speed("RGBToARGB", "aligned", 
+		(const INT16 **) ptrs, NULL, 0, dst,
+		size_array, 1, RGB_TRIAL_ITERATIONS, TEST_TIME);
+	return SUCCESS;
+}
+
+/* ========================================================================= */
+int test_yCbCrToRGB_16s16s_P3P3_func(void)
+{
+	INT16 ALIGN(y[4096]), ALIGN(cb[4096]), ALIGN(cr[4096]);
+	INT16 ALIGN(r1[4096]), ALIGN(g1[4096]), ALIGN(b1[4096]);
+	INT16 ALIGN(r2[4096]), ALIGN(g2[4096]), ALIGN(b2[4096]);
+	int i;
+	int failed = 0;
+	UINT32 pflags = primitives_get_flags(primitives_get());
+	char testStr[256];
+	const INT16 *in[3];
+	INT16 *out1[3];
+	INT16 *out2[3];
+	prim_size_t roi = { 64, 64 };
+
+	testStr[0] = '\0';
+	get_random_data(y, sizeof(y));
+	get_random_data(cb, sizeof(cb));
+	get_random_data(cr, sizeof(cr));
+	/* Normalize to 11.5 fixed radix */
+	for (i=0; i<4096; ++i)
+	{
+		y[i]  &= 0x1FE0U;
+		cb[i] &= 0x1FE0U;
+		cr[i] &= 0x1FE0U;
+	}
+	memset(r1, 0, sizeof(r1));
+	memset(g1, 0, sizeof(g1));
+	memset(b1, 0, sizeof(b1));
+	memset(r2, 0, sizeof(r2));
+	memset(g2, 0, sizeof(g2));
+	memset(b2, 0, sizeof(b2));
+
+	in[0] = y;
+	in[1] = cb;
+	in[2] = cr;
+	out1[0] = r1;
+	out1[1] = g1;
+	out1[2] = b1;
+	out2[0] = r2;
+	out2[1] = g2;
+	out2[2] = b2;
+
+	general_yCbCrToRGB_16s16s_P3P3(in, 64*2, out1, 64*2, &roi);
+#ifdef i386
+	if (pflags & PRIM_X86_SSE2_AVAILABLE)
+	{
+		strcat(testStr, " SSE2");
+		sse2_yCbCrToRGB_16s16s_P3P3(in, 64*2, out2, 64*2, &roi);
+		for (i=0; i<4096; ++i)
+		{
+			if ((ABS(r1[i]-r2[i]) > 1)
+					|| (ABS(g1[i]-g2[i]) > 1)
+					|| (ABS(b1[i]-b2[i]) > 1)) {
+				printf("YCbCrToRGB-SSE FAIL[%d]: %d,%d,%d vs %d,%d,%d\n", i,
+					r1[i],g1[i],b1[i], r2[i],g2[i],b2[i]);
+				failed = 1;
+			}
+		}
+	}
+#endif /* i386 */
+	if (!failed) printf("All yCbCrToRGB_16s16s_P3P3 tests passed (%s).\n", testStr);
+	return (failed > 0) ? FAILURE : SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+STD_SPEED_TEST(
+	ycbcr_to_rgb_speed, INT16*, INT16*, dst=dst,
+	TRUE, general_yCbCrToRGB_16s16s_P3P3(src1, 64*2, dst, 64*2, &roi64x64),
+	TRUE, sse2_yCbCrToRGB_16s16s_P3P3(src1, 64*2, dst, 64*2, &roi64x64),
+		PRIM_X86_SSE2_AVAILABLE,
+	FALSE, dst=dst, 0,
+	FALSE, dst=dst);
+
+int test_yCbCrToRGB_16s16s_P3P3_speed(void)
+{
+	INT16 ALIGN(y[4096]), ALIGN(cb[4096]), ALIGN(cr[4096]);
+	INT16 ALIGN(r[4096]), ALIGN(g[4096]), ALIGN(b[4096]);
+	int i;
+	const INT16 *input[3];
+	INT16 *output[3];
+	int size_array[] = { 64 };
+
+	get_random_data(y, sizeof(y));
+	get_random_data(cb, sizeof(cb));
+	get_random_data(cr, sizeof(cr));
+	/* Normalize to 11.5 fixed radix */
+	for (i=0; i<4096; ++i)
+	{
+		y[i]  &= 0x1FE0U;
+		cb[i] &= 0x1FE0U;
+		cr[i] &= 0x1FE0U;
+	}
+
+	input[0] = y;
+	input[1] = cb;
+	input[2] = cr;
+	output[0] = r;
+	output[1] = g;
+	output[2] = b;
+
+	ycbcr_to_rgb_speed("yCbCrToRGB", "aligned", input, NULL, NULL, output,
+		size_array, 1, YCBCR_TRIAL_ITERATIONS, TEST_TIME);
+	return SUCCESS;
+}
--- a/libfreerdp/primitives/test/test_copy.c
+++ b/libfreerdp/primitives/test/test_copy.c
@ -0,0 +1,83 @@
+/* test_copy.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include "prim_test.h"
+
+static const int MEMCPY_PRETEST_ITERATIONS = 1000000;
+static const int TEST_TIME = 1.0;  // seconds
+#define COPY_TESTSIZE (256*2+16*2+15+15)
+#if 0
+extern pstatus_t sse3_copy_8u(const BYTE *pSrc, BYTE *pDst, int len);
+#endif
+
+/* ------------------------------------------------------------------------- */
+int test_copy8u_func(void)
+{
+	primitives_t *prims = primitives_get();
+	BYTE ALIGN(data[COPY_TESTSIZE+15]);
+	int i, soff;
+	int failed = 0;
+	char testStr[256];
+	BYTE ALIGN(dest[COPY_TESTSIZE+15]);
+
+	testStr[0] = '\0';
+	get_random_data(data, sizeof(data));
+
+	strcat(testStr, " ptr");
+	for (soff=0; soff<16; ++soff)
+	{
+		int doff;
+		for (doff=0; doff<16; ++doff)
+		{
+			int length;
+			for (length=1; length<=COPY_TESTSIZE-doff; ++length)
+			{
+				memset(dest, 0, sizeof(dest));
+				prims->copy_8u(data+soff, dest+doff, length);
+				for (i=0; i<length; ++i)
+				{
+					if (dest[i+doff] != data[i+soff])
+					{
+						printf("COPY8U FAIL: off=%d len=%d, dest[%d]=0x%02x"
+							"data[%d]=0x%02x\n",
+							doff, length, i+doff, dest[i+doff],
+							i+soff, data[i+soff]);
+						failed = 1;
+					}
+				}
+			}
+		}
+	}
+	if (!failed) printf("All copy8 tests passed (%s).\n", testStr);
+	return (failed > 0) ? FAILURE : SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+STD_SPEED_TEST(copy8u_speed_test, BYTE, BYTE, dst=dst,
+	TRUE, memcpy(dst, src1, size),
+	FALSE, NULL, 0,
+	FALSE, NULL, 0,
+	TRUE, ippsCopy_8u(src1, dst, size));
+
+int test_copy8u_speed(void)
+{
+	BYTE ALIGN(src[MAX_TEST_SIZE+4]);
+	BYTE ALIGN(intervening[MAX_TEST_SIZE*7]);
+	BYTE ALIGN(dst[MAX_TEST_SIZE+4]);
+	copy8u_speed_test("copy8u", "aligned", src, NULL, 0, dst,
+		test_sizes, NUM_TEST_SIZES, MEMCPY_PRETEST_ITERATIONS, TEST_TIME);
+	copy8u_speed_test("copy8u", "unaligned", src+1, NULL, 0, dst,
+		test_sizes, NUM_TEST_SIZES, MEMCPY_PRETEST_ITERATIONS, TEST_TIME);
+	return SUCCESS;
+}
--- a/libfreerdp/primitives/test/test_set.c
+++ b/libfreerdp/primitives/test/test_set.c
@ -0,0 +1,294 @@
+/* test_set.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include "prim_test.h"
+
+static const int MEMSET8_PRETEST_ITERATIONS = 100000000;
+static const int MEMSET32_PRETEST_ITERATIONS = 40000000;
+static const float TEST_TIME = 1.0;
+
+extern pstatus_t general_set_8u(BYTE val, BYTE *pDst, int len);
+extern pstatus_t sse2_set_8u(BYTE val, BYTE *pDst, int len);
+extern pstatus_t general_set_32s(INT32 val, INT32 *pDst, int len);
+extern pstatus_t sse2_set_32s(INT32 val, INT32 *pDst, int len);
+extern pstatus_t general_set_32u(UINT32 val, UINT32 *pDst, int len);
+extern pstatus_t sse2_set_32u(UINT32 val, UINT32 *pDst, int len);
+extern pstatus_t ipp_wrapper_set_32u(UINT32 val, UINT32 *pDst, int len);
+
+static const int set_sizes[] = { 1, 4, 16, 32, 64, 256, 1024, 4096 };
+#define NUM_SET_SIZES (sizeof(set_sizes)/sizeof(int))
+
+/* ------------------------------------------------------------------------- */
+int test_set8u_func(void)
+{
+	BYTE ALIGN(dest[48]);
+	int failed = 0;
+	int off;
+	char testStr[256];
+	UINT32 pflags = primitives_get_flags(primitives_get());
+	testStr[0] = '\0';
+
+#ifdef i386
+	/* Test SSE under various alignments */
+	if (pflags & PRIM_X86_SSE2_AVAILABLE)
+	{
+		strcat(testStr, " SSE2");
+		for (off=0; off<16; ++off)
+		{
+			int len;
+			for (len=1; len<48-off; ++len)
+			{
+				int i;
+				memset(dest, 0, sizeof(dest));
+				sse2_set_8u(0xa5, dest+off, len);
+				for (i=0; i<len; ++i)
+				{
+					if (dest[off+i] != 0xa5)
+					{
+						printf("SET8U-SSE FAILED: off=%d len=%d dest[%d]=0x%02x\n",
+							off, len, i+off, dest[i+off]);
+						failed=1;
+					}
+				}
+			}
+		}
+	}
+#endif /* i386 */
+
+#ifdef WITH_IPP
+	/* Test IPP under various alignments */
+	strcat(testStr, " IPP");
+	for (off=0; off<16; ++off)
+	{
+		int len;
+		for (len=1; len<48-off; ++len)
+		{
+			int i;
+			memset(dest, 0, sizeof(dest));
+			ippsSet_8u(0xa5, dest+off, len);
+			for (i=0; i<len; ++i)
+			{
+				if (dest[off+i] != 0xa5)
+				{
+					printf("SET8U-IPP FAILED: off=%d len=%d dest[%d]=0x%02x\n",
+						off, len, i+off, dest[i+off]);
+					failed=1;
+				}
+			}
+		}
+	}
+#endif /* WITH_IPP */
+
+	if (!failed) printf("All set8u tests passed (%s).\n", testStr);
+	return (failed > 0) ? FAILURE : SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+STD_SPEED_TEST(set8u_speed_test, BYTE, BYTE, dst=dst,
+	TRUE, memset(dst, constant, size),
+	FALSE, NULL, 0,
+	FALSE, NULL, 0,
+	TRUE, ippsSet_8u(constant, dst, size));
+
+int test_set8u_speed(void)
+{
+	BYTE ALIGN(dst[MAX_TEST_SIZE]);
+	set8u_speed_test("set8u", "aligned", NULL, NULL, 0xA5, dst,
+		set_sizes, NUM_SET_SIZES, MEMSET8_PRETEST_ITERATIONS, TEST_TIME);
+	return SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+int test_set32s_func(void)
+{
+	primitives_t *prims = primitives_get();
+	INT32 ALIGN(dest[512]);
+	int failed = 0;
+	int off;
+	char testStr[256];
+	UINT32 pflags = primitives_get_flags(primitives_get());
+	testStr[0] = '\0';
+
+#ifdef i386
+	/* Test SSE under various alignments */
+	if (pflags & PRIM_X86_SSE2_AVAILABLE)
+	{
+		strcat(testStr, " SSE2");
+		for (off=0; off<16; ++off) {
+			int len;
+			for (len=1; len<512-off; ++len)
+			{
+				int i;
+				memset(dest, 0, sizeof(dest));
+				sse2_set_32s(0xdeadbeef, dest+off, len);
+				for (i=0; i<len; ++i)
+				{
+					if (dest[off+i] != 0xdeadbeef)
+					{
+						printf("set32s-SSE FAIL: off=%d len=%d dest[%d]=0x%08x\n",
+							off, len, i+off, dest[i+off]);
+						failed=1;
+					}
+				}
+			}
+		}
+	}
+#endif /* i386 */
+
+#ifdef WITH_IPP
+	strcat(testStr, " IPP");
+	for (off=0; off<16; ++off) {
+		int len;
+		for (len=1; len<512-off; ++len)
+		{
+			int i;
+			memset(dest, 0, sizeof(dest));
+			ippsSet_32s(0xdeadbeef, dest+off, len);
+			for (i=0; i<len; ++i)
+			{
+				if (dest[off+i] != 0xdeadbeef)
+				{
+					printf("set32s-IPP FAIL: off=%d len=%d dest[%d]=0x%08x\n",
+						off, len, i+off, dest[i+off]);
+					failed=1;
+				}
+			}
+		}
+	}
+#endif /* WITH_IPP */
+
+	if (!failed) printf("All set32s tests passed (%s).\n", testStr);
+	return (failed > 0) ? FAILURE : SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+int test_set32u_func(void)
+{
+	primitives_t *prims = primitives_get();
+	UINT32 ALIGN(dest[512]);
+	int failed = 0;
+	int off;
+	char testStr[256];
+	UINT32 pflags = primitives_get_flags(primitives_get());
+	testStr[0] = '\0';
+
+#ifdef i386
+	/* Test SSE under various alignments */
+	if (pflags & PRIM_X86_SSE2_AVAILABLE)
+	{
+		strcat(testStr, " SSE2");
+		for (off=0; off<16; ++off) {
+			int len;
+			for (len=1; len<512-off; ++len)
+			{
+				int i;
+				memset(dest, 0, sizeof(dest));
+				sse2_set_32u(0xdeadbeefU, dest+off, len);
+				for (i=0; i<len; ++i)
+				{
+					if (dest[off+i] != 0xdeadbeefU)
+					{
+						printf("set32u-SSE FAIL: off=%d len=%d dest[%d]=0x%08x\n",
+							off, len, i+off, dest[i+off]);
+						failed=1;
+					}
+				}
+			}
+		}
+	}
+#endif /* i386 */
+
+#ifdef WITH_IPP
+	strcat(testStr, " IPP");
+	for (off=0; off<16; ++off) {
+		int len;
+		for (len=1; len<512-off; ++len)
+		{
+			memset(dest, 0, sizeof(dest));
+			ipp_wrapper_set_32u(0xdeadbeefU, dest+off, len);
+			int i;
+			for (i=0; i<len; ++i)
+			{
+				if (dest[off+i] != 0xdeadbeefU)
+				{
+					printf("set32u-IPP FAIL: off=%d len=%d dest[%d]=0x%08x\n",
+						off, len, i+off, dest[i+off]);
+					failed=1;
+				}
+			}
+		}
+	}
+#endif /* WITH_IPP */
+
+	if (!failed) printf("All set32u tests passed (%s).\n", testStr);
+	return (failed > 0) ? FAILURE : SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+static inline void memset32u_naive(
+	UINT32 val,
+	UINT32 *dst,
+	size_t count)
+{
+	while (count--) *dst++ = val;
+}
+
+/* ------------------------------------------------------------------------- */
+STD_SPEED_TEST(set32u_speed_test, UINT32, UINT32, dst=dst,
+	TRUE, memset32u_naive(constant, dst, size),
+	TRUE, sse2_set_32u(constant, dst, size), PRIM_X86_SSE2_AVAILABLE,
+	FALSE, dst=dst, 0,
+	TRUE, ipp_wrapper_set_32u(constant, dst, size));
+
+int test_set32u_speed(void)
+{
+	UINT32 ALIGN(dst[MAX_TEST_SIZE+1]);
+	set32u_speed_test("set32u", "aligned", NULL, NULL, 0xdeadbeef, dst,
+		set_sizes, NUM_SET_SIZES, MEMSET32_PRETEST_ITERATIONS, TEST_TIME);
+#if 0
+	/* Not really necessary; should be almost as fast. */
+	set32u_speed_test("set32u", "unaligned", NULL, NULL, dst+1,
+		set_sizes, NUM_SET_SIZES, MEMSET32_PRETEST_ITERATIONS, TEST_TIME);
+#endif
+	return SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+static inline void memset32s_naive(
+	INT32 val,
+	INT32 *dst,
+	size_t count)
+{
+	while (count--) *dst++ = val;
+}
+
+/* ------------------------------------------------------------------------- */
+STD_SPEED_TEST(set32s_speed_test, INT32, INT32, dst=dst,
+	TRUE, memset32s_naive(constant, dst, size),
+	TRUE, sse2_set_32s(constant, dst, size), PRIM_X86_SSE2_AVAILABLE,
+	FALSE, dst=dst, 0,
+	TRUE, ippsSet_32s(constant, dst, size));
+
+int test_set32s_speed(void)
+{
+	INT32 ALIGN(dst[MAX_TEST_SIZE+1]);
+	set32s_speed_test("set32s", "aligned", NULL, NULL, 0xdeadbeef, dst,
+		set_sizes, NUM_SET_SIZES, MEMSET32_PRETEST_ITERATIONS, TEST_TIME);
+#if 0
+	/* Not really necessary; should be almost as fast. */
+	set32s_speed_test("set32s", "unaligned", NULL, NULL, dst+1,
+		set_sizes, NUM_SET_SIZES, MEMSET32_PRETEST_ITERATIONS, TEST_TIME);
+#endif
+	return SUCCESS;
+}
--- a/libfreerdp/primitives/test/test_shift.c
+++ b/libfreerdp/primitives/test/test_shift.c
@ -0,0 +1,173 @@
+/* test_shift.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include "prim_test.h"
+
+#define FUNC_TEST_SIZE 65536
+static const int SHIFT_PRETEST_ITERATIONS = 50000;
+static const float TEST_TIME = 1.0;
+
+extern pstatus_t general_lShiftC_16s(
+	const INT16 *pSrc, int val, INT16 *pDst, int len);
+extern pstatus_t general_rShiftC_16s(
+	const INT16 *pSrc, int val, INT16 *pDst, int len);
+extern pstatus_t general_shiftC_16s(
+	const INT16 *pSrc, int val, INT16 *pDst, int len);
+extern pstatus_t general_lShiftC_16u(
+	const UINT16 *pSrc, int val, UINT16 *pDst, int len);
+extern pstatus_t general_rShiftC_16u(
+	const UINT16 *pSrc, int val, UINT16 *pDst, int len);
+extern pstatus_t general_shiftC_16u(
+	const UINT16 *pSrc, int val, UINT16 *pDst, int len);
+extern pstatus_t sse2_lShiftC_16s(
+	const INT16 *pSrc, int val, INT16 *pDst, int len);
+extern pstatus_t sse2_rShiftC_16s(
+	const INT16 *pSrc, int val, INT16 *pDst, int len);
+extern pstatus_t sse2_shiftC_16s(
+	const INT16 *pSrc, int val, INT16 *pDst, int len);
+extern pstatus_t sse2_lShiftC_16u(
+	const UINT16 *pSrc, int val, UINT16 *pDst, int len);
+extern pstatus_t sse2_rShiftC_16u(
+	const UINT16 *pSrc, int val, UINT16 *pDst, int len);
+extern pstatus_t sse2_shiftC_16u(
+	const UINT16 *pSrc, int val, UINT16 *pDst, int len);
+
+#ifdef i386
+#define SHIFT_TEST_FUNC(_name_, _type_, _str_, _f1_, _f2_) \
+int _name_(void) \
+{ \
+	_type_ ALIGN(src[FUNC_TEST_SIZE+3]), \
+		ALIGN(d1[FUNC_TEST_SIZE+3]), ALIGN(d2[FUNC_TEST_SIZE+3]); \
+	int failed = 0; \
+	int i; \
+	UINT32 pflags = primitives_get_flags(primitives_get()); \
+	char testStr[256]; \
+	testStr[0] = '\0'; \
+	get_random_data(src, sizeof(src)); \
+	_f1_(src+1, 3, d1+1, FUNC_TEST_SIZE); \
+	if (pflags & PRIM_X86_SSE3_AVAILABLE) \
+	{ \
+		strcat(testStr, " SSE3"); \
+		/* Aligned */ \
+		_f2_(src+1, 3, d2+1, FUNC_TEST_SIZE); \
+		for (i=1; i<=FUNC_TEST_SIZE; ++i) \
+		{ \
+			if (d1[i] != d2[i]) \
+			{  \
+				printf("%s-SSE-aligned FAIL[%d]: 0x%x>>3=0x%x, got 0x%x\n", \
+					_str_, i, src[i], d1[i], d2[i]);  \
+				++failed; \
+			} \
+		} \
+		/* Unaligned */ \
+		_f2_(src+1, 3, d2+2, FUNC_TEST_SIZE); \
+		for (i=1; i<=FUNC_TEST_SIZE; ++i) \
+		{ \
+			if (d1[i] != d2[i+1]) \
+			{  \
+				printf("%s-SSE-unaligned FAIL[%d]: 0x%x>>3=0x%x, got 0x%x\n", \
+					_str_, i, src[i], d1[i], d2[i+1]);  \
+				++failed; \
+			} \
+		} \
+	} \
+	if (!failed) printf("All %s tests passed (%s).\n", _str_, testStr); \
+	return (failed > 0) ? FAILURE : SUCCESS; \
+}
+#else
+#define SHIFT_TEST_FUNC(_name_, _type_, _str_, _f1_, _f2_) \
+int _name_(void) \
+{ \
+	return SUCCESS; \
+}
+#endif /* i386 */
+
+SHIFT_TEST_FUNC(test_lShift_16s_func, INT16, "lshift_16s", general_lShiftC_16s,
+    sse2_lShiftC_16s)
+SHIFT_TEST_FUNC(test_lShift_16u_func, UINT16, "lshift_16u", general_lShiftC_16u,
+    sse2_lShiftC_16u)
+SHIFT_TEST_FUNC(test_rShift_16s_func, INT16, "rshift_16s", general_rShiftC_16s,
+    sse2_rShiftC_16s)
+SHIFT_TEST_FUNC(test_rShift_16u_func, UINT16, "rshift_16u", general_rShiftC_16u,
+    sse2_rShiftC_16u)
+
+/* ========================================================================= */
+STD_SPEED_TEST(speed_lShift_16s, INT16, INT16, dst=dst,
+    TRUE, general_lShiftC_16s(src1, constant, dst, size),
+	TRUE, sse2_lShiftC_16s(src1, constant, dst, size), PRIM_X86_SSE2_AVAILABLE,
+	FALSE, dst=dst, 0,
+	TRUE, ippsLShiftC_16s(src1, constant, dst, size));
+STD_SPEED_TEST(speed_lShift_16u, UINT16, UINT16, dst=dst,
+    TRUE, general_lShiftC_16u(src1, constant, dst, size),
+	TRUE, sse2_lShiftC_16u(src1, constant, dst, size), PRIM_X86_SSE2_AVAILABLE,
+	FALSE, dst=dst, 0,
+	TRUE, ippsLShiftC_16u(src1, constant, dst, size));
+STD_SPEED_TEST(speed_rShift_16s, INT16, INT16, dst=dst,
+    TRUE, general_rShiftC_16s(src1, constant, dst, size),
+	TRUE, sse2_rShiftC_16s(src1, constant, dst, size), PRIM_X86_SSE2_AVAILABLE,
+	FALSE, dst=dst, 0,
+	TRUE, ippsRShiftC_16s(src1, constant, dst, size));
+STD_SPEED_TEST(speed_rShift_16u, UINT16, UINT16, dst=dst,
+    TRUE, general_rShiftC_16u(src1, constant, dst, size),
+	TRUE, sse2_rShiftC_16u(src1, constant, dst, size), PRIM_X86_SSE2_AVAILABLE,
+	FALSE, dst=dst, 0,
+	TRUE, ippsRShiftC_16u(src1, constant, dst, size));
+
+/* ------------------------------------------------------------------------- */
+int test_lShift_16s_speed(void)
+{
+	INT16 ALIGN(src[MAX_TEST_SIZE+1]), ALIGN(dst[MAX_TEST_SIZE+1]);
+	get_random_data(src, sizeof(src));
+	speed_lShift_16s("lShift_16s", "aligned", src, NULL, 3, dst,
+		test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
+	speed_lShift_16s("lShift_16s", "unaligned", src+1, NULL, 3, dst,
+		test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
+	return SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+int test_lShift_16u_speed(void)
+{
+	UINT16 ALIGN(src[MAX_TEST_SIZE+1]), ALIGN(dst[MAX_TEST_SIZE+1]);
+	get_random_data(src, sizeof(src));
+	speed_lShift_16u("lShift_16u", "aligned", src, NULL, 3, dst,
+		test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
+	speed_lShift_16u("lShift_16u", "unaligned", src+1, NULL, 3, dst,
+		test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
+	return SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+int test_rShift_16s_speed(void)
+{
+	INT16 ALIGN(src[MAX_TEST_SIZE+1]), ALIGN(dst[MAX_TEST_SIZE+1]);
+	get_random_data(src, sizeof(src));
+	speed_rShift_16s("rShift_16s", "aligned", src, NULL, 3, dst,
+		test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
+	speed_rShift_16s("rShift_16s", "unaligned", src+1, NULL, 3, dst,
+		test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
+	return SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+int test_rShift_16u_speed(void)
+{
+	UINT16 ALIGN(src[MAX_TEST_SIZE+1]), ALIGN(dst[MAX_TEST_SIZE+1]);
+	get_random_data(src, sizeof(src));
+	speed_rShift_16u("rShift_16u", "aligned", src, NULL, 3, dst,
+		test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
+	speed_rShift_16u("rShift_16u", "unaligned", src+1, NULL, 3, dst,
+		test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
+	return SUCCESS;
+}
--- a/libfreerdp/primitives/test/test_sign.c
+++ b/libfreerdp/primitives/test/test_sign.c
@ -0,0 +1,91 @@
+/* test_sign.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#include "prim_test.h"
+
+static const int SIGN_PRETEST_ITERATIONS = 100000;
+static const float TEST_TIME = 1.0;
+
+extern pstatus_t general_sign_16s(const INT16 *pSrc, INT16 *pDst, int len);
+extern pstatus_t ssse3_sign_16s(const INT16 *pSrc, INT16 *pDst, int len);
+
+/* ------------------------------------------------------------------------- */
+int test_sign16s_func(void)
+{
+	INT16 ALIGN(src[65535]), ALIGN(d1[65535]), ALIGN(d2[65535]);
+	int failed = 0;
+	int i;
+	UINT32 pflags = primitives_get_flags(primitives_get());
+	char testStr[256];
+
+	/* Test when we can reach 16-byte alignment */
+	testStr[0] = '\0';
+	get_random_data(src, sizeof(src));
+	general_sign_16s(src+1, d1+1, 65535);
+#ifdef i386
+	if (pflags & PRIM_X86_SSSE3_AVAILABLE)
+	{
+		strcat(testStr, " SSSE3");
+		ssse3_sign_16s(src+1, d2+1, 65535);
+		for (i=1; i<65535; ++i)
+		{
+			if (d1[i] != d2[i])
+			{ 
+				printf("SIGN16s-SSE-aligned FAIL[%d] of %d: want %d, got %d\n", 
+					i, src[i], d1[i], d2[i]); 
+				++failed;
+			}
+		}
+	}
+#endif /* i386 */
+
+	/* Test when we cannot reach 16-byte alignment */
+	get_random_data(src, sizeof(src));
+	general_sign_16s(src+1, d1+2, 65535);
+#ifdef i386
+	if (pflags & PRIM_X86_SSSE3_AVAILABLE)
+	{
+		ssse3_sign_16s(src+1, d2+2, 65535);
+		for (i=2; i<65535; ++i)
+		{
+			if (d1[i] != d2[i])
+			{ 
+				printf("SIGN16s-SSE-unaligned FAIL[%d] of %d: want %d, got %d\n", 
+					i, src[i-1], d1[i], d2[i]); 
+				++failed;
+			}
+		}
+	}
+#endif /* i386 */
+	if (!failed) printf("All sign16s tests passed (%s).\n", testStr);
+	return (failed > 0) ? FAILURE : SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+STD_SPEED_TEST(sign16s_speed_test, INT16, INT16, dst=dst,
+	TRUE, general_sign_16s(src1, dst, size),
+	TRUE, ssse3_sign_16s(src1, dst, size), PRIM_X86_SSSE3_AVAILABLE,
+	FALSE, dst=dst, 0,
+	FALSE, dst=dst);
+
+int test_sign16s_speed(void)
+{
+	INT16 ALIGN(src[MAX_TEST_SIZE+3]), ALIGN(dst[MAX_TEST_SIZE+3]);
+	get_random_data(src, sizeof(src));
+	sign16s_speed_test("sign16s", "aligned", src, NULL, 0, dst,
+		test_sizes, NUM_TEST_SIZES, SIGN_PRETEST_ITERATIONS, TEST_TIME);
+	sign16s_speed_test("sign16s", "unaligned", src+1, NULL, 0, dst,
+		test_sizes, NUM_TEST_SIZES, SIGN_PRETEST_ITERATIONS, TEST_TIME);
+	return SUCCESS;
+}
--- a/server/Mac/CMakeLists.txt
+++ b/server/Mac/CMakeLists.txt
@ -64,7 +64,7 @@ set(${MODULE_PREFIX}_LIBS ${${MODULE_PREFIX}_LIBS}
 set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS
 	MONOLITHIC ${MONOLITHIC_BUILD}
 	MODULE freerdp
-	MODULES freerdp-core freerdp-utils freerdp-codec)
+	MODULES freerdp-core freerdp-utils freerdp-codec freerdp-primitives)
 	
 set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS
 	MONOLITHIC ${MONOLITHIC_BUILD}
--- a/server/Sample/CMakeLists.txt
+++ b/server/Sample/CMakeLists.txt
@ -33,7 +33,7 @@ set(${MODULE_PREFIX}_LIBS ${${MODULE_PREFIX}_LIBS} freerdp-server)
 set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS
 	MONOLITHIC ${MONOLITHIC_BUILD}
 	MODULE freerdp
-	MODULES freerdp-core freerdp-utils freerdp-codec)
+	MODULES freerdp-core freerdp-utils freerdp-codec freerdp-primitives)
 	
 set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS
 	MONOLITHIC ${MONOLITHIC_BUILD}
--- a/server/Windows/CMakeLists.txt
+++ b/server/Windows/CMakeLists.txt
@ -59,7 +59,7 @@ set(${MODULE_PREFIX}_LIBS ${${MODULE_PREFIX}_LIBS} freerdp-server)
 set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS
 	MONOLITHIC ${MONOLITHIC_BUILD}
 	MODULE freerdp
-	MODULES freerdp-core freerdp-utils freerdp-codec)
+	MODULES freerdp-core freerdp-utils freerdp-codec freerdp-primitives)

 target_link_libraries(${MODULE_NAME} ${${MODULE_PREFIX}_LIBS})

--- a/server/X11/CMakeLists.txt
+++ b/server/X11/CMakeLists.txt
@ -90,7 +90,7 @@ set(${MODULE_PREFIX}_LIBS ${${MODULE_PREFIX}_LIBS} ${X11_LIBRARIES})
 set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS
 	MONOLITHIC ${MONOLITHIC_BUILD}
 	MODULE freerdp
-	MODULES freerdp-core freerdp-common freerdp-codec freerdp-utils freerdp-gdi freerdp-crypto freerdp-locale)
+	MODULES freerdp-core freerdp-common freerdp-codec freerdp-primitives freerdp-utils freerdp-gdi freerdp-crypto freerdp-locale)

 set_complex_link_libraries(VARIABLE ${MODULE_PREFIX}_LIBS
 	MONOLITHIC ${MONOLITHIC_BUILD}