YUV data conversion with SSSE3 using intrinsics

2014-09-08 16:29:01 +02:00 · 2014-09-08 16:29:01 +02:00 · fee370e4b2
commit fee370e4b2
parent 25593c7250
9 changed files with 335 additions and 1668 deletions
--- a/libfreerdp/codec/CMakeLists.txt
+++ b/libfreerdp/codec/CMakeLists.txt
@ -102,50 +102,21 @@ if(WITH_LIBAVCODEC)
 endif()

 if(WITH_LIBAVCODEC OR WITH_OPENH264)
-	if(CMAKE_SIZEOF_VOID_P EQUAL 8)
-		set(arch64 TRUE)
-	else()
-		set(arch64 FALSE)
-	endif()
-
-	if(WITH_H264_ASM)
-		set(H264_ASM  H264_ASM_o)
-		add_definitions(-DWITH_H264_ASM)
-		add_custom_target(${H264_ASM})
-
-		if(arch64)
-			set(SRC ${CMAKE_CURRENT_SOURCE_DIR}/h264_x64.asm)
-			set(OBJ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${H264_ASM}.dir/h264_x64.asm.o)
-			add_custom_command(TARGET ${H264_ASM}
-				COMMAND nasm ARGS -f elf64 -o ${OBJ} ${SRC})
-		else()
-			set(SRC ${CMAKE_CURRENT_SOURCE_DIR}/h264_x32.asm)
-			set(OBJ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${H264_ASM}.dir/h264_x32.asm.o)
-			add_custom_command(TARGET ${H264_ASM}
-				COMMAND nasm ARGS -f elf32 -o ${OBJ} ${SRC})
-		endif()
-
-		set(FREERDP_OPENH264_LIBS ${OPENH264_LIBRARIES} ${OBJ})
-	endif()
-
 	if(WITH_H264_SSSE3)
-		set(H264_ASM  H264_ASM_o)
 		add_definitions(-DWITH_H264_SSSE3)
-		add_custom_target(${H264_ASM})
+		set(${MODULE_PREFIX}_SRCS
+			${${MODULE_PREFIX}_SRCS}
+			h264_ssse3.c)
 		
-		if(arch64)
-			set(SRC ${CMAKE_CURRENT_SOURCE_DIR}/h264_ssse3_x64.asm)
-			set(OBJ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${H264_ASM}.dir/h264_ssse3_x64.asm.o)
-			add_custom_command(TARGET ${H264_ASM}
-				COMMAND nasm ARGS -f elf64 -o ${OBJ} ${SRC})
-		else()
-			set(SRC ${CMAKE_CURRENT_SOURCE_DIR}/h264_ssse3_x32.asm)
-			set(OBJ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${H264_ASM}.dir/h264_ssse3_x32.asm.o)
-			add_custom_command(TARGET ${H264_ASM}
-				COMMAND nasm ARGS -f elf32 -o ${OBJ} ${SRC})
+		if(CMAKE_COMPILER_IS_GNUCC)
+			set(OPTIMIZATION "${OPTIMIZATION} -msse2 -mssse3")
 		endif()
 		
-		set(FREERDP_OPENH264_LIBS ${OPENH264_LIBRARIES} ${OBJ})
+		if(MSVC)
+			set(OPTIMIZATION "${OPTIMIZATION} /arch:SSE2")
+		endif()
+		
+		set_property(SOURCE h264_ssse3.c PROPERTY COMPILE_FLAGS ${OPTIMIZATION})
 	endif()
 endif()

@ -179,10 +150,6 @@ else()
 	install(TARGETS ${MODULE_NAME} DESTINATION ${CMAKE_INSTALL_LIBDIR} EXPORT FreeRDPTargets)
 endif()

-if(WITH_H264_ASM OR WITH_H264_SSSE3)
-	add_dependencies(${MODULE_NAME} ${H264_ASM})
-endif()
-
 set_property(TARGET ${MODULE_NAME} PROPERTY FOLDER "FreeRDP/libfreerdp")

 if(BUILD_TESTING)
--- a/libfreerdp/codec/h264.c
+++ b/libfreerdp/codec/h264.c
@ -31,12 +31,8 @@
 #include <sys/time.h>

 #ifdef WITH_H264_SSSE3
-extern int check_ssse3();
-extern int freerdp_image_yuv420p_to_xrgb(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int *iStride,int scanline);
-#else
-#ifdef WITH_H264_ASM
-extern int freerdp_image_yuv_to_xrgb_asm(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int iStride0,int iStride1);
-#endif
+extern int freerdp_check_ssse3();
+extern int freerdp_image_yuv420p_to_xrgb_ssse3(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int *iStride,int scanline);
 #endif

 #define USE_GRAY_SCALE	0
@ -408,7 +404,6 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz
 	if (pSystemBuffer->iFormat != videoFormatI420)
 		return -1;

-	/* Convert I420 (same as IYUV) to XRGB. */

 	if (g_H264DumpFrames)
 	{
@ -423,25 +418,6 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz
 	h264->height = pSystemBuffer->iHeight;


-#if 0
-	if (h264_prepare_rgb_buffer(h264, pSystemBuffer->iWidth, pSystemBuffer->iHeight) < 0)
-		return -1;
-
-	gettimeofday(&T1,NULL);
-#ifdef WITH_H264_SSSE3
-	freerdp_image_yuv420p_to_xrgb(h264->data,pYUVData,h264->width,h264->height,pSystemBuffer->iStride[0],pSystemBuffer->iStride[1]);
-#else
-#ifdef WITH_H264_ASM
-	freerdp_image_yuv_to_xrgb_asm(h264->data,pYUVData,h264->width,h264->height,pSystemBuffer->iStride[0],pSystemBuffer->iStride[1]);
-#else
-	freerdp_image_copy_yuv420p_to_xrgb(h264->data, h264->scanline, 0, 0,
-			h264->width, h264->height, pYUVData, pSystemBuffer->iStride, 0, 0);
-#endif
-#endif
-		gettimeofday(&T2,NULL);
-		printf("\tconverting took %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec));
-#endif
-
 	return 1;
 }

@ -677,7 +653,7 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize,
 	BYTE* pDstPoint;

 	BYTE** pYUVData;
-	BYTE* pYUVPoint[2];
+	BYTE* pYUVPoint[3];

 	RDPGFX_RECT16* rect;
 	int* iStride;
@ -743,13 +719,16 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize,
 		pYUVPoint[1] = pYUVData[1] + ret;
 		pYUVPoint[2] = pYUVData[2] + ret;

-#if 1
+#if 0
 		printf("regionRect: x: %d, y: %d, cx: %d, cy: %d\n",
 		       rect->left, rect->top, cx, cy);
 #endif

 #ifdef WITH_H264_SSSE3
-		freerdp_image_yuv420p_to_xrgb(pDstPoint, pYUVPoint, cx, cy, iStride, nDstStep);
+		freerdp_image_yuv420p_to_xrgb_ssse3(pDstPoint, pYUVPoint, cx, cy, iStride, nDstStep);
+#else
+		freerdp_image_copy_yuv420p_to_xrgb(pDstPoint, nDstStep, 0, 0,
+			cx, cy, pYUVPoint, iStride, 0, 0);
 #endif
 	}
 	gettimeofday(&T2,NULL);
@ -774,9 +753,9 @@ H264_CONTEXT* h264_context_new(BOOL Compressor)
 	h264 = (H264_CONTEXT*) calloc(1, sizeof(H264_CONTEXT));

 #ifdef WITH_H264_SSSE3
-	if(check_ssse3()){
+	if(freerdp_check_ssse3()){
 		printf("SSSE3 seems to be not supported on this system, try without WITH_H264_SSSE3 ...");
-		return FALSE;
+		return NULL;
 	}
 #endif

--- a/libfreerdp/codec/h264_ssse3.c
+++ b/libfreerdp/codec/h264_ssse3.c
@ -0,0 +1,298 @@
+/** function for converting YUV420p data to the RGB format (but without any special upconverting)
+ * It's completely written in nasm-x86-assembly for intel processors supporting SSSE3 and higher.
+ * The target scanline (6th parameter) must be a multiple of 16.
+ * iStride[0] must be (target scanline) / 4 or bigger and iStride[1] the next multiple of four
+ * of the half of iStride[0] or bigger
+ */
+
+#include <stdio.h>
+
+#include <emmintrin.h>
+//#include <immintrin.h>
+#include <tmmintrin.h>
+
+#include <winpr/sysinfo.h>
+#include <winpr/crt.h>
+
+int freerdp_check_ssse3()
+{
+	if(IsProcessorFeaturePresentEx(PF_EX_SSSE3)&&IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
+		return 0;
+	
+	return 1;
+}
+
+
+int freerdp_image_yuv420p_to_xrgb_ssse3(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int *iStride,int scanline)
+{
+	char last_line,last_column;
+	int i,VaddDst,VaddY,VaddUV;
+	
+	BYTE *UData,*VData,*YData;
+	
+	__m128i r0,r1,r2,r3,r4,r5,r6,r7;
+	__m128i *buffer;
+	
+	
+	buffer=_aligned_malloc(4*16,16);
+	
+	
+	YData=pSrcData[0];
+	UData=pSrcData[1];
+	VData=pSrcData[2];
+	
+	
+	if((last_column=nWidth&3)){
+		switch(last_column){
+			case 1: r7=_mm_set_epi32(0,0,0,0xFFFFFFFF); break;
+			case 2: r7=_mm_set_epi32(0,0,0xFFFFFFFF,0xFFFFFFFF); break;
+			case 3: r7=_mm_set_epi32(0,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF); break;
+		}
+		_mm_store_si128(buffer+48,r7);
+		last_column=1;
+	}
+	
+	nWidth+=3;
+	nWidth=nWidth>>2;
+	
+	
+	last_line=nHeight&1;
+	nHeight++;
+	nHeight=nHeight>>1;
+	
+	
+	VaddDst=(scanline<<1)-(nWidth<<4);
+	VaddY=(iStride[0]<<1)-(nWidth<<2);
+	VaddUV=iStride[1]-(((nWidth<<1)+2)&0xFFFC);
+	
+	
+	
+	while(nHeight-- >0){
+		if(nHeight==0){
+			last_line=last_line<<1;
+		}
+		
+		i=0;
+		do{
+/*
+ * Well, in the end it should look like this:
+ *	C = Y;
+ *	D = U - 128;
+ *	E = V - 128;
+ *
+ *	R = clip(( 256 * C           + 403 * E + 128) >> 8);
+ *	G = clip(( 256 * C -  48 * D - 120 * E + 128) >> 8);
+ *	B = clip(( 256 * C + 475 * D           + 128) >> 8);
+ */
+			if(!(i&0x01)){
+/* Y-, U- and V-data is stored in different arrays.
+ * We start with processing U-data.
+ *
+ * at first we fetch four U-values from its array and shuffle them like this:
+ *	0d0d 0c0c 0b0b 0a0a
+ * we've done two things: converting the values to signed words and duplicating
+ * each value, because always two pixel "share" the same U- (and V-) data
+ */
+				r0=_mm_cvtsi32_si128(*(UINT32 *)UData);
+				r5=_mm_set_epi32(0x80038003,0x80028002,0x80018001,0x80008000);
+				r0=_mm_shuffle_epi8(r0,r5);
+				
+				UData+=4;
+				
+				r3=_mm_set_epi16(128,128,128,128,128,128,128,128);
+				r0=_mm_subs_epi16(r0,r3);
+				
+				r2=r0;
+				
+				r4=r0;
+				r7=_mm_set_epi16(48,48,48,48,48,48,48,48);
+				r0=_mm_mullo_epi16(r0,r7);
+				r4=_mm_mulhi_epi16(r4,r7);
+				r7=r0;
+				r0=_mm_unpacklo_epi16(r0,r4);
+				r4=_mm_unpackhi_epi16(r7,r4);
+				
+				
+				r6=_mm_set_epi32(128,128,128,128);
+				r0=_mm_sub_epi32(r0,r6);
+				r4=_mm_sub_epi32(r4,r6);
+				
+				
+				r1=r2;
+				r7=_mm_set_epi16(475,475,475,475,475,475,475,475);
+				r1=_mm_mullo_epi16(r1,r7);
+				r2=_mm_mulhi_epi16(r2,r7);
+				r7=r1;
+				r1=_mm_unpacklo_epi16(r1,r2);
+				r7=_mm_unpackhi_epi16(r7,r2);
+				
+				r1=_mm_add_epi32(r1,r6);
+				r7=_mm_add_epi32(r7,r6);
+				
+				_mm_store_si128(buffer+16,r7);
+				
+/* Now we've prepared U-data. Preparing V-data is actually the same, just with other coefficients */
+				r2=_mm_cvtsi32_si128(*(UINT32 *)VData);
+				r2=_mm_shuffle_epi8(r2,r5);
+				
+				VData+=4;
+				
+				r2=_mm_subs_epi16(r2,r3);
+				
+				r5=r2;
+				
+				
+				r3=r2;
+				r7=_mm_set_epi16(403,403,403,403,403,403,403,403);
+				r2=_mm_mullo_epi16(r2,r7);
+				r3=_mm_mulhi_epi16(r3,r7);
+				r7=r2;
+				r2=_mm_unpacklo_epi16(r2,r3);
+				r7=_mm_unpackhi_epi16(r7,r3);
+				
+				r2=_mm_add_epi32(r2,r6);
+				r7=_mm_add_epi32(r7,r6);
+				
+				_mm_store_si128(buffer+32,r7);
+				
+				
+				
+				r3=r5;
+				r7=_mm_set_epi16(120,120,120,120,120,120,120,120);
+				r3=_mm_mullo_epi16(r3,r7);
+				r5=_mm_mulhi_epi16(r5,r7);
+				r7=r3;
+				r3=_mm_unpacklo_epi16(r3,r5);
+				r7=_mm_unpackhi_epi16(r7,r5);
+				
+				r0=_mm_add_epi32(r0,r3);
+				r4=_mm_add_epi32(r4,r7);
+				
+				_mm_store_si128(buffer,r4);
+			}else{
+				r1=_mm_load_si128(buffer+16);
+				r2=_mm_load_si128(buffer+32);
+				r0=_mm_load_si128(buffer);
+			}
+			
+			if(++i==nWidth)
+				last_column=last_column<<1;
+			
+			//processing Y data
+			r4=_mm_cvtsi32_si128(*(UINT32 *)YData);
+			r7=_mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080);
+			r4=_mm_shuffle_epi8(r4,r7);
+			
+			r5=r4;
+			r6=r4;
+			
+			r4=_mm_add_epi32(r4,r2);
+			r5=_mm_sub_epi32(r5,r0);
+			r6=_mm_add_epi32(r6,r1);
+			
+			
+			r4=_mm_slli_epi32(r4,8);
+			r5=_mm_slli_epi32(r5,8);
+			r6=_mm_slli_epi32(r6,8);
+			
+			r7=_mm_set_epi32(0,0,0,0);
+			r4=_mm_max_epi16(r4,r7);
+			r5=_mm_max_epi16(r5,r7);
+			r6=_mm_max_epi16(r6,r7);
+			
+			r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
+			r4=_mm_min_epi16(r4,r7);
+			r5=_mm_min_epi16(r5,r7);
+			r6=_mm_min_epi16(r6,r7);
+			
+			//r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
+			r4=_mm_and_si128(r4,r7);
+			
+			r7=_mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280);
+			r5=_mm_shuffle_epi8(r5,r7);
+			
+			r7=_mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002);
+			r6=_mm_shuffle_epi8(r6,r7);
+			
+			
+			r4=_mm_or_si128(r4,r5);
+			r4=_mm_or_si128(r4,r6);
+			
+			
+			if(last_column&0x02){
+				r6=_mm_load_si128(buffer+48);
+				r4=_mm_and_si128(r4,r6);
+				r5=_mm_lddqu_si128((__m128i *)pDstData);
+				r6=_mm_andnot_si128(r6,r5);
+				r4=_mm_or_si128(r4,r6);
+			}
+			_mm_storeu_si128((__m128i *)pDstData,r4);
+			
+			//Y data processing in secound line
+			if(!(last_line&0x02)){
+				r4=_mm_cvtsi32_si128(*(UINT32 *)(YData+iStride[0]));
+				r7=_mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080);
+				r4=_mm_shuffle_epi8(r4,r7);
+				
+				r5=r4;
+				r6=r4;
+				
+				r4=_mm_add_epi32(r4,r2);
+				r5=_mm_sub_epi32(r5,r0);
+				r6=_mm_add_epi32(r6,r1);
+				
+				
+				r4=_mm_slli_epi32(r4,8);
+				r5=_mm_slli_epi32(r5,8);
+				r6=_mm_slli_epi32(r6,8);
+				
+				r7=_mm_set_epi32(0,0,0,0);
+				r4=_mm_max_epi16(r4,r7);
+				r5=_mm_max_epi16(r5,r7);
+				r6=_mm_max_epi16(r6,r7);
+				
+				r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
+				r4=_mm_min_epi16(r4,r7);
+				r5=_mm_min_epi16(r5,r7);
+				r6=_mm_min_epi16(r6,r7);
+				
+				r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
+				r4=_mm_and_si128(r4,r7);
+				
+				r7=_mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280);
+				r5=_mm_shuffle_epi8(r5,r7);
+				
+				r7=_mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002);
+				r6=_mm_shuffle_epi8(r6,r7);
+				
+				
+				r4=_mm_or_si128(r4,r5);
+				r4=_mm_or_si128(r4,r6);
+				
+				
+				if(last_column&0x02){
+					r6=_mm_load_si128(buffer+48);
+					r4=_mm_and_si128(r4,r6);
+					r5=_mm_lddqu_si128((__m128i *)(pDstData+scanline));
+					r6=_mm_andnot_si128(r6,r5);
+					r4=_mm_or_si128(r4,r6);
+					
+					last_column=last_column>>1;
+				}
+				_mm_storeu_si128((__m128i *)(pDstData+scanline),r4);
+			}
+			
+			pDstData+=16;
+			YData+=4;
+			
+		}while(i<nWidth);
+		
+		pDstData+=VaddDst;
+		YData+=VaddY;
+		UData+=VaddUV;
+		VData+=VaddUV;
+	}
+		
+	_aligned_free(buffer);
+	return 0;
+}
--- a/libfreerdp/codec/h264_ssse3_x32.asm
+++ b/libfreerdp/codec/h264_ssse3_x32.asm
@ -1,454 +0,0 @@
-; a entire function for converting YUV420p data to the RGB format (without any special upconverting)
-; It's completely written in nasm-x86-assembly for intel processors supporting SSSE3 and higher.
-; Restrictions are that output data has to be aligned to 16 byte (a question of REAL performance!)
-; and the width of resolution must be divisable by four.
-;
-section .text
-	global check_ssse3
-
-check_ssse3:
-	push ebx
-	
-	pushf
-	pop eax
-	or eax,1<<21
-	push eax
-	popf
-	pushf
-	pop eax
-	test eax,1<<21
-	jz check_ssse3_end
-	
-	and eax,~(1<<21)
-	push eax
-	popf
-	
-	
-	mov eax,1
-	mov ebx,0
-	cpuid
-	test edx,1<<25	;sse
-	jz check_ssse3_end
-	test edx,1<<26	;sse2
-	jz check_ssse3_end
-	test ecx,1<<0	;sse3
-	jz check_ssse3_end
-	test ecx,1<<9	;ssse3
-	jz check_ssse3_end
-	
-	
-	pop ebx
-	mov eax,0
-	ret
-	
-	
-check_ssse3_end:
-	pop ebx
-	mov eax,1
-	ret
-	
-	
-;extern int freerdp_image_yuv420p_to_xrgb(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int istride0,int istride1)
-	global freerdp_image_yuv420p_to_xrgb
-freerdp_image_yuv420p_to_xrgb:
-	push ebx
-	push ebp
-	
-;check wether stack is aligned to 16 byte boundary
-;
-;	---current stack value---|-----x-----|----42 byte---|---16 byte aligned stack---
-;	lets say 508		 2	     506	    464
-;		 1FCH		 2H	     1FAH	    1D0H
-;						    1F0H    1D0H
-;				 |------1FCH&FH----|1FCH&^FH
-;				 |1FCH&FH-AH |--AH-|---16 byte aligned stack------------
-;				We've got only one problem: what if 1FCH&FH was smaller than AH?
-;				We could either add something to sp (impossible) or subtract 10H-(AH-1FCH&FH) [%10H]
-;				That's the same like (1FCH&FH-AH+10H)&FH and (1FCH+6H)&FH
-	mov eax,esp
-	add eax,6H
-	and eax,1111B
-	sub esp,eax
-	
-	mov ebp,esp
-	
-;"local variables"
-	sub esp,318	;res 8 -8,res 8 -16,res 8 -24,U 8 -32,nWidth 2 -34,nHeight 2 -36,iStride0 2 -38,iStride1 2 -40,last_line 1 -41,res 1 -42,G 16 -58,B 16 -74,
-	;R 16 -90,add:128 16 -106,sub:128 16 -122,mul:48 16 -138,mul:475 16 -154,mul:403 16 -170,mul:120 16 -186,VaddY 4 -190,VaddUV 4 -194,stack offset 8 -202,
-	;cmp:255 16 -218,cmp:0 16 -234,shuflleR 16 -250,andG 16 -266,shuffleB 16 -280,shuffleY 16 -296,shuffleUV 16 -314,scanline 4 -318
-	
-	;pDstData:edi,
-	
-	mov [ebp-202],eax
-	
-;last_line: if the last (U,V doubled) line should be skipped, set to 1B
-
-	mov edi,[ebp+eax+12]
-
-	mov ecx,[ebp+eax+16]
-	mov esi,[ecx]
-	mov ebx,[ecx+4]
-	mov [ebp-32],ebx
-	mov ebx,[ecx+8]
-	
-	
-	mov edx,[ebp+eax+20]
-	mov [ebp-34],dx
-	
-	shr word [ebp-34],2
-	
-	mov [ebp-318],edx
-	shl dword [ebp-318],2
-	
-	
-	mov ecx,[ebp+eax+24]
-	
-	mov [ebp-41],cl
-	and byte [ebp-41],1B
-	
-	inc cx
-	shr cx,1
-	mov [ebp-36],cx
-	
-	
-	mov ecx,[ebp+eax+28]
-	mov [ebp-38],cx
-	
-	shl cx,1
-	sub cx,dx
-	mov [ebp-190],ecx
-	
-	
-	mov ecx,[ebp+eax+32]
-	mov [ebp-40],cx
-	
-	
-	shr dx,1
-	sub cx,dx
-	mov [ebp-194],ecx
-
-	
-	mov eax,[ebp-32]
-	
-	
-;init masks
-	mov ecx,00000080H
-	mov [ebp-106],ecx
-	mov [ebp-102],ecx
-	mov [ebp-98],ecx
-	mov [ebp-94],ecx
-
-	mov ecx,00800080H
-	mov [ebp-122],ecx
-	mov [ebp-118],ecx
-	mov [ebp-114],ecx
-	mov [ebp-110],ecx
-	
-	mov ecx,00300030H
-	mov [ebp-138],ecx
-	mov [ebp-134],ecx
-	mov [ebp-130],ecx
-	mov [ebp-126],ecx
-	
-	mov ecx,01DB01DBH
-	mov [ebp-154],ecx
-	mov [ebp-150],ecx
-	mov [ebp-146],ecx
-	mov [ebp-142],ecx
-	
-	mov ecx,01930193H
-	mov [ebp-170],ecx
-	mov [ebp-166],ecx
-	mov [ebp-162],ecx
-	mov [ebp-158],ecx
-	
-	mov ecx,00780078H
-	mov [ebp-186],ecx
-	mov [ebp-182],ecx
-	mov [ebp-178],ecx
-	mov [ebp-174],ecx
-	
-	mov ecx,000FF0000H
-	mov [ebp-218],ecx
-	mov [ebp-214],ecx
-	mov [ebp-210],ecx
-	mov [ebp-206],ecx
-	
-	mov ecx,00000000H
-	mov [ebp-234],ecx
-	mov [ebp-230],ecx
-	mov [ebp-226],ecx
-	mov [ebp-222],ecx
-	
-;shuffle masks
-	;00 xx 00 00  00 xx 00 00  00 xx 00 00  00 xx 00 00
-	;00 rr gg bb  00 rr gg bb  00 rr gg bb  00 rr gg bb
-	mov ecx,00FF0000H
-	mov [ebp-250],ecx
-	mov [ebp-246],ecx
-	mov [ebp-242],ecx
-	mov [ebp-238],ecx
-	
-	mov ecx,80800280H
-	mov [ebp-266],ecx
-	mov ecx,80800680H
-	mov [ebp-262],ecx
-	mov ecx,80800A80H
-	mov [ebp-258],ecx
-	mov ecx,80800E80H
-	mov [ebp-254],ecx
-	
-	mov ecx,80808002H
-	mov [ebp-282],ecx
-	mov ecx,80808006H
-	mov [ebp-278],ecx
-	mov ecx,8080800AH
-	mov [ebp-274],ecx
-	mov ecx,8080800EH
-	mov [ebp-270],ecx
-	
-	;dd cc bb aa
-	;00 00 dd 00  00 00 cc 00  00 00 bb 00  00 00 aa 00
-	mov ecx,80800080H
-	mov [ebp-298],ecx
-	mov ecx,80800180H
-	mov [ebp-294],ecx
-	mov ecx,80800280H
-	mov [ebp-290],ecx
-	mov ecx,80800380H
-	mov [ebp-286],ecx
-	
-	;dd cc bb aa
-	;00 dd 00 dd  00 cc 00 cc  00 bb 00 bb  00 aa 00 aa
-	mov ecx,80008000H
-	mov [ebp-314],ecx
-	mov ecx,80018001H
-	mov [ebp-310],ecx
-	mov ecx,80028002H
-	mov [ebp-306],ecx
-	mov ecx,80038003H
-	mov [ebp-302],ecx
-	
-	
-	
-freerdp_image_yuv420p_to_xrgb_hloop:
-	dec word [ebp-36]
-	js freerdp_image_yuv420p_to_xrgb_hloop_end
-	jnz not_last_line
-	
-	shl byte [ebp-41],1
-not_last_line:
-	
-	mov cx,[ebp-34]
-freerdp_image_yuv420p_to_xrgb_wloop:
-;main loop
-;	C = Y;
-;	D = U - 128;
-;	E = V - 128;
-;
-;	R = clip(( 256 * C           + 403 * E + 128) >> 8);
-;	G = clip(( 256 * C -  48 * D - 120 * E + 128) >> 8);
-;	B = clip(( 256 * C + 475 * D           + 128) >> 8);
-
-	test cx,1B
-	jnz load_yuv_data
-	
-	
-	;prepare U data
-	movd xmm0,[eax]
-	movdqa xmm5,[ebp-314]
-	pshufb xmm0,xmm5	;but this is the omest instruction of all!!
-	
-	add eax,4
-	
-	movdqa xmm3,[ebp-122]
-	psubsw xmm0,xmm3
-	
-	movdqa xmm2,xmm0
-	
-	movdqa xmm4,xmm0
-	movdqa xmm7,[ebp-138]
-	pmullw xmm0,xmm7
-	pmulhw xmm4,xmm7
-	
-	movdqa xmm7,xmm0
-	punpcklwd xmm0,xmm4	;what an awesome instruction!
-	punpckhwd xmm7,xmm4
-	movdqa xmm4,xmm7
-	
-	movdqa xmm6,[ebp-106]
-	psubd xmm0,xmm6
-	psubd xmm4,xmm6
-	
-	
-	movdqa xmm1,xmm2
-	movdqa xmm7,[ebp-154]
-	pmullw xmm1,xmm7
-	pmulhw xmm2,xmm7
-	
-	movdqa xmm7,xmm1
-	punpcklwd xmm1,xmm2
-	punpckhwd xmm7,xmm2
-	
-	paddd xmm1,xmm6
-	paddd xmm7,xmm6
-	
-	movdqa [ebp-74],xmm7
-	
-	
-	;prepare V data
-	movd xmm2,[ebx]
-	pshufb xmm2,xmm5
-	
-	add ebx,4
-	
-	psubsw xmm2,xmm3
-	
-	movdqa xmm5,xmm2
-	
-	movdqa xmm3,xmm2
-	movdqa xmm7,[ebp-170]
-	pmullw xmm2,xmm7
-	pmulhw xmm3,xmm7
-	
-	movdqa xmm7,xmm2
-	punpcklwd xmm2,xmm3
-	punpckhwd xmm7,xmm3
-	
-	paddd xmm2,xmm6
-	paddd xmm7,xmm6
-	
-	movdqa [ebp-90],xmm7
-	
-	
-	movdqa xmm3,xmm5
-	movdqa xmm7,[ebp-186]
-	pmullw xmm3,xmm7
-	pmulhw xmm5,xmm7
-	
-	movdqa xmm7,xmm3
-	punpcklwd xmm3,xmm5
-	punpckhwd xmm7,xmm5
-	
-	paddd xmm0,xmm3
-	paddd xmm4,xmm7
-	
-	movdqa [ebp-58],xmm4
-	
-	jmp valid_yuv_data
-		
-load_yuv_data:
-	movdqa xmm1,[ebp-74]
-	movdqa xmm2,[ebp-90]
-	movdqa xmm0,[ebp-58]
-	
-valid_yuv_data:
-	
-	
-	;Y data processing
-	movd xmm4,[esi]
-	pshufb xmm4,[ebp-298]
-	
-	movdqa xmm5,xmm4
-	movdqa xmm6,xmm4
-	
-	paddd xmm4,xmm2
-	psubd xmm5,xmm0
-	paddd xmm6,xmm1
-	
-	pslld xmm4,8
-	pslld xmm5,8
-	pslld xmm6,8
-	
-	movdqa xmm7,[ebp-234]
-	pmaxsw xmm4,xmm7	;what an awesome instruction!
-	pmaxsw xmm5,xmm7
-	pmaxsw xmm6,xmm7
-	
-	movdqa xmm7,[ebp-218]
-	pminsw xmm4,xmm7
-	pminsw xmm5,xmm7
-	pminsw xmm6,xmm7
-	
-	pand xmm4,[ebp-250]
-	pshufb xmm5,[ebp-266]
-	pshufb xmm6,[ebp-282]
-	
-	por xmm4,xmm5
-	por xmm4,xmm6
-	
-	movdqa [edi],xmm4
-	
-	
-	;Y data processing in secound line
-	test byte [ebp-41],2
-	jnz skip_last_line1
-	
-	mov dx,[ebp-38]
-	and edx,0FFFFH
-	movd xmm4,[esi+edx]
-	pshufb xmm4,[ebp-298]
-	
-	
-	movdqa xmm5,xmm4
-	movdqa xmm6,xmm4
-	
-	paddd xmm4,xmm2
-	psubd xmm5,xmm0
-	paddd xmm6,xmm1
-	
-	pslld xmm4,8
-	pslld xmm5,8
-	pslld xmm6,8
-	
-	movdqa xmm7,[ebp-234]
-	pmaxsw xmm4,xmm7	;what an awesome instruction!
-	pmaxsw xmm5,xmm7
-	pmaxsw xmm6,xmm7
-	
-	movdqa xmm7,[ebp-218]
-	pminsw xmm4,xmm7
-	pminsw xmm5,xmm7
-	pminsw xmm6,xmm7
-	
-	pand xmm4,[ebp-250]
-	pshufb xmm5,[ebp-266]
-	pshufb xmm6,[ebp-282]
-	
-	por xmm4,xmm5
-	por xmm4,xmm6
-	
-	mov edx,[ebp-318]
-	movdqa [edi+edx],xmm4
-	
-skip_last_line1:
-	add edi,16
-	add esi,4
-	
-	dec cx
-	jne freerdp_image_yuv420p_to_xrgb_wloop
-
-freerdp_image_yuv420p_to_xrgb_wloop_end:
-	mov edx,[ebp-318]
-	add edi,edx
-	
-	mov edx,[ebp-190]
-	add esi,edx
-	
-	mov edx,[ebp-194]
-	add eax,edx
-	add ebx,edx
-	
-	jmp freerdp_image_yuv420p_to_xrgb_hloop
-	
-freerdp_image_yuv420p_to_xrgb_hloop_end:
-
-	mov eax,0
-freerdp_image_yuv420p_to_xrgb_end:
-	mov edx,[ebp-202]
-	
-	mov esp,ebp
-	add esp,edx
-	pop ebp
-	pop ebx
-	ret
--- a/libfreerdp/codec/h264_ssse3_x64.asm
+++ b/libfreerdp/codec/h264_ssse3_x64.asm
@ -1,628 +0,0 @@
-; function for converting YUV420p data to the RGB format (but without any special upconverting)
-; It's completely written in nasm-x86-assembly for intel processors supporting SSSE3 and higher.
-; The target scanline (6th parameter) must be a multiple of 16.
-; iStride[0] must be (target scanline) / 4 or bigger and iStride[1] the next multiple of four
-; of the half of iStride[0] or bigger
-;
-section .text
-	global check_ssse3
-
-check_ssse3:
-	push rbx
-	
-	pushf
-	pop rax
-	or rax,1<<21
-	push rax
-	popf
-	pushf
-	pop rax
-	test rax,1<<21
-	jz check_ssse3_end
-	
-	and rax,~(1<<21)
-	push rax
-	popf
-	
-	
-	mov eax,1
-	mov ebx,0
-	cpuid
-	test edx,1<<25	;sse
-	jz check_ssse3_end
-	test edx,1<<26	;sse2
-	jz check_ssse3_end
-	test ecx,1<<0	;sse3
-	jz check_ssse3_end
-	test ecx,1<<9	;ssse3
-	jz check_ssse3_end
-	
-	
-	pop rbx
-	mov eax,0
-	ret
-	
-	
-check_ssse3_end:
-	pop rbx
-	mov eax,1
-	ret
-	
-	
-;extern int freerdp_image_yuv420p_to_xrgb(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int *istride,int scanline)
-	global freerdp_image_yuv420p_to_xrgb
-freerdp_image_yuv420p_to_xrgb:
-	push rbx
-	push rbp
-	
-;check wether stack is aligned to 16 byte boundary
-;
-;	---current stack value---|-----x-----|----42 byte---|---16 byte aligned stack---
-;	lets say 508		 2	     506	    464
-;		 1FCH		 2H	     1FAH	    1D0H
-;						    1F0H    1D0H
-;				 |------1FCH&FH----|1FCH&^FH
-;				 |1FCH&FH-AH |--AH-|---16 byte aligned stack------------
-;				We've got only one problem: what if 1FCH&FH was smaller than AH?
-;				We could either add something to sp (impossible) or subtract 10H-(AH-1FCH&FH) [%10H]
-;				That's the same like (1FCH&FH-AH+10H)&FH and (1FCH+6H)&FH
-	mov r15,rsp
-	add r15,6H
-	and r15,1111B
-	sub rsp,r15
-	
-	mov rbp,rsp
-	
-	xor r10,r10
-	xor r11,r11
-	xor r12,r12
-	xor r13,r13
-	xor r14,r14
-	
-;"local variables"
-	sub rsp,338	;pDstData 8 -8,Y 8 -16,U 8 -24,V 8 -32,nWidth 2 -34,nHeight 2 -36,iStride0 2 -38,iStride1 2 -40,last_line 1 -41,last_column 1 -42,
-	;G 16 -58,B 16 -74,R 16 -90,add:128 16 -106,sub:128 16 -122,mul:48 16 -138,mul:475 16 -154,mul:403 16 -170,mul:120 16 -186,VaddY 2 -188,VaddUV 2 -190,
-	;res 12 -202,cmp:255 16 -218,cmp:0 16 -234,shuflleR 16 -250,andG 16 -266,shuffleB 16 -280,shuffleY 16 -296,shuffleUV 16 -314,andRemainingColumns 16 -330,
-	;VddDst 8 -338
-	
-;last_line: if the last (U,V doubled) line should be skipped, set to 10B
-;last_column: if it's the last column in a line, set to 10B (for handling line-endings not multiple by four)
-
-	mov [rbp-8],rdi
-
-	mov rax,[rsi]
-	mov [rbp-16],rax
-	mov rax,[rsi+8]
-	mov [rbp-24],rax
-	mov rax,[rsi+16]
-	mov [rbp-32],rax
-	
-	mov [rbp-34],dx
-	mov r13w,cx
-	
-	mov r10w,r9w
-	and r10,0FFFFH
-	
-	
-	mov ecx,[r8]
-	mov [rbp-38],ecx
-	mov r12d,[r8+4]
-	mov [rbp-40],r12w
-	
-	
-	mov [rbp-42],dl
-	and byte [rbp-42],11B
-
-	
-	mov [rbp-338],r10
-	shr word [rbp-338],1
-	shl cx,1
-	
-	mov r8w,[rbp-34]
-	add r8w,3
-	and r8w, 0FFFCH
-	
-	sub [rbp-338],r8w
-	sub cx,r8w
-	
-	shr r8w,1
-	
-	mov dx,r8w
-	add dx,2
-	and dx,0FFFCH
-	sub r12w,dx
-	
-	shl dword [rbp-338],2
-	mov r11w,cx
-	
-	shr r8w,1
-	
-	mov r9w,[rbp-38]
-	
-	
-	;and al,11B
-	;jz no_column_rest
-	
-	;inc word [rbp-34]
-	
-;no_column_rest:
-	;mov [rbp-41],al
-	
-	
-	
-	mov r14b,r13b
-	and r14b,1B
-	;jz no_line_rest
-	
-	inc r13w
-
-;no_line_rest:
-	shr r13w,1
-	
-	
-	
-;init masks
-	mov eax,00000080H
-	mov [rbp-106],eax
-	mov [rbp-102],eax
-	mov [rbp-98],eax
-	mov [rbp-94],eax
-
-	mov eax,00800080H
-	mov [rbp-122],eax
-	mov [rbp-118],eax
-	mov [rbp-114],eax
-	mov [rbp-110],eax
-	
-	mov eax,00300030H
-	mov [rbp-138],eax
-	mov [rbp-134],eax
-	mov [rbp-130],eax
-	mov [rbp-126],eax
-	
-	mov eax,01DB01DBH
-	mov [rbp-154],eax
-	mov [rbp-150],eax
-	mov [rbp-146],eax
-	mov [rbp-142],eax
-	
-	mov eax,01930193H
-	mov [rbp-170],eax
-	mov [rbp-166],eax
-	mov [rbp-162],eax
-	mov [rbp-158],eax
-	
-	mov eax,00780078H
-	mov [rbp-186],eax
-	mov [rbp-182],eax
-	mov [rbp-178],eax
-	mov [rbp-174],eax
-	
-	mov eax,000FF0000H
-	mov [rbp-218],eax
-	mov [rbp-214],eax
-	mov [rbp-210],eax
-	mov [rbp-206],eax
-	
-	mov eax,00000000H
-	mov [rbp-234],eax
-	mov [rbp-230],eax
-	mov [rbp-226],eax
-	mov [rbp-222],eax
-	
-;shuffle masks
-	;00 xx 00 00  00 xx 00 00  00 xx 00 00  00 xx 00 00
-	;00 rr gg bb  00 rr gg bb  00 rr gg bb  00 rr gg bb
-	mov eax,00FF0000H
-	mov [rbp-250],eax
-	mov [rbp-246],eax
-	mov [rbp-242],eax
-	mov [rbp-238],eax
-	
-	mov eax,80800280H
-	mov [rbp-266],eax
-	mov eax,80800680H
-	mov [rbp-262],eax
-	mov eax,80800A80H
-	mov [rbp-258],eax
-	mov eax,80800E80H
-	mov [rbp-254],eax
-	
-	mov eax,80808002H
-	mov [rbp-282],eax
-	mov eax,80808006H
-	mov [rbp-278],eax
-	mov eax,8080800AH
-	mov [rbp-274],eax
-	mov eax,8080800EH
-	mov [rbp-270],eax
-	
-	;dd cc bb aa
-	;00 00 dd 00  00 00 cc 00  00 00 bb 00  00 00 aa 00
-	mov eax,80800080H
-	mov [rbp-298],eax
-	mov eax,80800180H
-	mov [rbp-294],eax
-	mov eax,80800280H
-	mov [rbp-290],eax
-	mov eax,80800380H
-	mov [rbp-286],eax
-	
-	;dd cc bb aa
-	;00 dd 00 dd  00 cc 00 cc  00 bb 00 bb  00 aa 00 aa
-	mov eax,80008000H
-	mov [rbp-314],eax
-	mov eax,80018001H
-	mov [rbp-310],eax
-	mov eax,80028002H
-	mov [rbp-306],eax
-	mov eax,80038003H
-	mov [rbp-302],eax
-	
-;remaining columns and mask
-	cmp byte [rbp-42],0
-	je freerdp_image_yuv420p_to_xrgb_no_columns_remain
-
-	mov dl,[rbp-42]
-	xor ebx,ebx
-	xor ecx,ecx
-	xor esi,esi
-
-	mov eax,0FFFFFFFFH
-	cmp dl,1H
-	je freerdp_image_yuv420p_to_xrgb_write_columns_remain
-	
-	mov ebx,0FFFFFFFFH
-	cmp dl,2H
-	je freerdp_image_yuv420p_to_xrgb_write_columns_remain
-	
-	mov ecx,0FFFFFFFFH
-	
-freerdp_image_yuv420p_to_xrgb_write_columns_remain:
-	mov [rbp-330],eax
-	mov [rbp-326],ebx
-	mov [rbp-322],ecx
-	mov [rbp-318],esi
-	mov byte [rbp-42],1
-	
-freerdp_image_yuv420p_to_xrgb_no_columns_remain:
-	
-	
-	mov rsi,[rbp-16]
-	mov rax,[rbp-24]
-	mov rbx,[rbp-32]
-	
-	;jmp freerdp_image_yuv420p_to_xrgb_end
-	
-freerdp_image_yuv420p_to_xrgb_hloop:
-	dec r13w
-	js freerdp_image_yuv420p_to_xrgb_hloop_end
-	jnz not_last_line
-	
-	shl r14b,1
-not_last_line:
-	
-	xor cx,cx
-freerdp_image_yuv420p_to_xrgb_wloop:
-; Well, in the end it should look like this:
-;	C = Y;
-;	D = U - 128;
-;	E = V - 128;
-;
-;	R = clip(( 256 * C           + 403 * E + 128) >> 8);
-;	G = clip(( 256 * C -  48 * D - 120 * E + 128) >> 8);
-;	B = clip(( 256 * C + 475 * D           + 128) >> 8);
-
-	test cx,1B
-	jnz freerdp_image_yuv420p_to_xrgb_load_yuv_data
-	
-	
-; Y-, U- and V-data is stored in different arrays.
-; We start with processing U-data.
-
-; at first we fetch four U-values from its array and shuffle them like this:
-;	0d0d 0c0c 0b0b 0a0a
-; we've done two things: converting the values to signed words and duplicating
-; each value, because always two pixel "share" the same U- (and V-) data
-	movd xmm0,[rax]
-	movdqa xmm5,[rbp-314]
-	pshufb xmm0,xmm5	;but this is the awesomest instruction of all!!
-	
-	add rax,4
-	
-; then we subtract 128 from each value, so we get D
-	movdqa xmm3,[rbp-122]
-	psubsw xmm0,xmm3
-	
-; we need to do two things with our D, so let's store it for later use
-	movdqa xmm2,xmm0
-	
-; now we can multiply our D with 48 and unpack it to xmm4:xmm0
-; this is what we need to get G data later on
-	movdqa xmm4,xmm0
-	movdqa xmm7,[rbp-138]
-	pmullw xmm0,xmm7
-	pmulhw xmm4,xmm7
-	
-	movdqa xmm7,xmm0
-	punpcklwd xmm0,xmm4	;what an awesome instruction!
-	punpckhwd xmm7,xmm4
-	movdqa xmm4,xmm7
-	
-; to complete this step, add (?) 128 to each value (rounding ?!)
-; yeah, add. in the end this will be subtracted from something,
-; because it's part of G: 256*C - (48*D + 120*E - 128), 48*D-128 !
-; by the way, our values have become signed dwords during multiplication!
-	movdqa xmm6,[rbp-106]
-	psubd xmm0,xmm6
-	psubd xmm4,xmm6
-	
-	
-; to get B data, we need to prepare a secound value, D*475+128
-	movdqa xmm1,xmm2
-	movdqa xmm7,[rbp-154]
-	pmullw xmm1,xmm7
-	pmulhw xmm2,xmm7
-	
-	movdqa xmm7,xmm1
-	punpcklwd xmm1,xmm2
-	punpckhwd xmm7,xmm2
-	
-	paddd xmm1,xmm6
-	paddd xmm7,xmm6
-	
-; so we got something like this: xmm7:xmm1
-; this pair contains values for 16 pixel:
-; aabbccdd
-; aabbccdd, but we can only work on four pixel at once, so we need to save upper values
-	movdqa [rbp-74],xmm7
-	
-	
-; Now we've prepared U-data. Preparing V-data is actually the same, just with other coefficients.
-	movd xmm2,[rbx]
-	pshufb xmm2,xmm5
-	
-	add rbx,4
-	
-	psubsw xmm2,xmm3
-	
-	movdqa xmm5,xmm2
-	
-; this is also known as E*403+128, we need it to convert R data
-	movdqa xmm3,xmm2
-	movdqa xmm7,[rbp-170]
-	pmullw xmm2,xmm7
-	pmulhw xmm3,xmm7
-	
-	movdqa xmm7,xmm2
-	punpcklwd xmm2,xmm3
-	punpckhwd xmm7,xmm3
-	
-	paddd xmm2,xmm6
-	paddd xmm7,xmm6
-	
-; and preserve upper four values for future ...
-	movdqa [rbp-90],xmm7
-	
-	
-; doing this step: E*120
-	movdqa xmm3,xmm5
-	movdqa xmm7,[rbp-186]
-	pmullw xmm3,xmm7
-	pmulhw xmm5,xmm7
-	
-	movdqa xmm7,xmm3
-	punpcklwd xmm3,xmm5
-	punpckhwd xmm7,xmm5
-	
-; now we complete what we've begun above:
-; (48*D-128) + (120*E) = (48*D +120*E -128)
-	paddd xmm0,xmm3
-	paddd xmm4,xmm7
-	
-; and store to memory !
-	movdqa [rbp-58],xmm4
-	
-; real assembly programmers do not only produce best results between 0 and 5 o'clock,
-; but are also kangaroos!
-	jmp freerdp_image_yuv420p_to_xrgb_valid_yuv_data
-	
-freerdp_image_yuv420p_to_xrgb_load_yuv_data:
-; maybe you've wondered about the conditional jump to this label above ?
-; Well, we prepared UV data for eight pixel in each line, but can only process four
-; per loop. So we need to load the upper four pixel data from memory each secound loop!
-	movdqa xmm1,[rbp-74]
-	movdqa xmm2,[rbp-90]
-	movdqa xmm0,[rbp-58]
-	
-freerdp_image_yuv420p_to_xrgb_valid_yuv_data:
-
-	inc cx
-	cmp cx,r8w
-	jne freerdp_image_yuv420p_to_xrgb_not_last_columns
-	
-	shl byte [rbp-42],1
-	
-	
-freerdp_image_yuv420p_to_xrgb_not_last_columns:
-	
-; We didn't produce any output yet, so let's do so!
-; Ok, fetch four pixel from the Y-data array and shuffle them like this:
-; 00d0 00c0 00b0 00a0, to get signed dwords and multiply by 256
-	movd xmm4,[rsi]
-	pshufb xmm4,[rbp-298]
-	
-	movdqa xmm5,xmm4
-	movdqa xmm6,xmm4
-	
-; no we can perform the "real" conversion itself and produce output!
-	paddd xmm4,xmm2
-	psubd xmm5,xmm0
-	paddd xmm6,xmm1
-	
-; in the end, we only need bytes for RGB values.
-; So, what do we do? right! shifting left makes values bigger and thats always good.
-; before we had dwords of data, and by shifting left and treating the result
-; as packed words, we get not only signed words, but do also divide by 256
-; imagine, data is now ordered this way: ddx0 ccx0 bbx0 aax0, and x is the least
-; significant byte, that we don't need anymore, because we've done some rounding
-	pslld xmm4,8
-	pslld xmm5,8
-	pslld xmm6,8
-	
-; one thing we still have to face is the clip() function ...
-; we have still signed words, and there are those min/max instructions in SSE2 ...
-; the max instruction takes always the bigger of the two operands and stores it in the first one,
-; and it operates with signs !
-; if we feed it with our values and zeros, it takes the zeros if our values are smaller than
-; zero and otherwise our values
-	movdqa xmm7,[rbp-234]
-	pmaxsw xmm4,xmm7	;what an awesome instruction!
-	pmaxsw xmm5,xmm7
-	pmaxsw xmm6,xmm7
-	
-; the same thing just completely different can be used to limit our values to 255,
-; but now using the min instruction and 255s
-	movdqa xmm7,[rbp-218]
-	pminsw xmm4,xmm7
-	pminsw xmm5,xmm7
-	pminsw xmm6,xmm7
-	
-; Now we got our bytes.
-; the moment has come to assemble the three channels R,G and B to the xrgb dwords
-; on Red channel we just have to and each futural dword with 00FF0000H
-	pand xmm4,[rbp-250]
-; on Green channel we have to shuffle somehow, so we get something like this:
-; 00d0 00c0 00b0 00a0
-	pshufb xmm5,[rbp-266]
-; and on Blue channel that one:
-; 000d 000c 000b 000a
-	pshufb xmm6,[rbp-282]
-	
-; and at last we or it together and get this one:
-; xrgb xrgb xrgb xrgb
-	por xmm4,xmm5
-	por xmm4,xmm6
-	
-; Only thing to do know is writing data to memory, but this gets a bit more
-; complicated if the width is not a multiple of four and it is the last column in line.
-; but otherwise just play the kangaroo
-	test byte [rbp-42],2
-	je freerdp_image_yuv420p_to_xrgb_column_process_complete
-	
-; let's say, we need to only convert six pixel in width
-; Ok, the first 4 pixel will be converted just like every 4 pixel else, but
-; if it's the last loop in line, [rbp-42] is shifted left by one (curious? have a look above),
-; and we land here. Through initialisation a mask was prepared. In this case it looks like
-; 0000FFFFH 0000FFFFH 0000FFFFH 0000FFFFH
-	movdqa xmm6,[rbp-330]
-; we and our output data with this mask to get only the valid pixel
-	pand xmm4,xmm6
-; then we fetch memory from the destination array ...
-	movdqu xmm5,[rdi]
-; ... and and it with the inverse mask. We get only those pixel, which should not be updated
-	pandn xmm6,xmm5
-; we only have to or the two values together and write it back to the destination array,
-; and only the pixel that should be updated really get changed.
-	por xmm4,xmm6
-	
-freerdp_image_yuv420p_to_xrgb_column_process_complete:
-	movdqu [rdi],xmm4
-	
-	
-; Because UV data is the same for two lines, we can process the secound line just here,
-; in the same loop. Only thing we need to do is to add some offsets to the Y- and destination
-; pointer. These offsets are iStride[0] and the target scanline.
-; But if we don't need to process the secound line, like if we are in the last line of processing nine lines,
-; we just skip all this.
-	test r14b,2
-	jnz freerdp_yuv420p_to_xrgb_skip_last_line
-	
-	movd xmm4,[rsi+r9]
-	pshufb xmm4,[rbp-298]
-	
-	
-	movdqa xmm5,xmm4
-	movdqa xmm6,xmm4
-	
-	paddd xmm4,xmm2
-	psubd xmm5,xmm0
-	paddd xmm6,xmm1
-	
-	pslld xmm4,8
-	pslld xmm5,8
-	pslld xmm6,8
-	
-	movdqa xmm7,[rbp-234]
-	pmaxsw xmm4,xmm7	;what an awesome instruction!
-	pmaxsw xmm5,xmm7
-	pmaxsw xmm6,xmm7
-	
-	movdqa xmm7,[rbp-218]
-	pminsw xmm4,xmm7
-	pminsw xmm5,xmm7
-	pminsw xmm6,xmm7
-	
-	pand xmm4,[rbp-250]
-	pshufb xmm5,[rbp-266]
-	pshufb xmm6,[rbp-282]
-	
-	por xmm4,xmm5
-	por xmm4,xmm6
-	
-	test byte [rbp-42],2
-	je freerdp_image_yuv420p_to_xrgb_column_process_complete2
-	
-	movdqa xmm6,[rbp-330]
-	pand xmm4,xmm6
-	movdqu xmm5,[rdi+r10]
-	pandn xmm6,xmm5
-	por xmm4,xmm6
-	
-; only thing is, we should shift [rbp-42] back here, because we have processed the last column,
-; and this "special condition" can be released
-	shr byte [rbp-42],1
-	
-freerdp_image_yuv420p_to_xrgb_column_process_complete2:
-	movdqu [rdi+r10],xmm4
-	
-	
-freerdp_yuv420p_to_xrgb_skip_last_line:
-; after all we have to increase the destination- and Y-data pointer by four pixel
-	add rdi,16
-	add rsi,4
-	
-	cmp cx,r8w
-	jne freerdp_image_yuv420p_to_xrgb_wloop
-
-freerdp_image_yuv420p_to_xrgb_wloop_end:
-; after each line we have to add the scanline to the destination pointer, because
-; we are processing two lines at once, but only increasing the destination pointer
-; in the first line. Well, we only have one pointer, so it's the easiest way to access
-; the secound line with the one pointer and an offset (scanline)
-; if we're not converting the full width of the scanline, like only 64 pixel, but the
-; output buffer was "designed" for 1920p HD, we have to add the remaining length for each line,
-; to get into the next line.
-	add rdi,[rbp-338]
-	
-; same thing has to be done for Y-data, but with iStride[0] instead of the target scanline
-	add rsi,r11
-	
-; and again for UV data, but here it's enough to add the remaining length, because
-; UV data is the same for two lines and there exists only one "UV line" on two "real lines"
-	add rax,r12
-	add rbx,r12
-	;mov eax,r12d
-	;jmp freerdp_image_yuv420p_to_xrgb_end
-
-	jmp freerdp_image_yuv420p_to_xrgb_hloop
-	
-freerdp_image_yuv420p_to_xrgb_hloop_end:
-
-	mov eax,0
-freerdp_image_yuv420p_to_xrgb_end:
-	mov rsp,rbp
-	add rsp,r15
-	pop rbp
-	pop rbx
-	ret
--- a/libfreerdp/codec/h264_x32.asm
+++ b/libfreerdp/codec/h264_x32.asm
@ -1,240 +0,0 @@
-;R=(256*Y+403*(V-128)+128)/265			=(256*Y+403*V-51456)/256
-;G=(256*Y-48*(U-128)-120*(V-128)+128)/256	=(256*Y-48*U-120*V+21632)/256
-;B=(256*Y+475*(U-128)+128)/256			=(256*Y+475*U-60672)/256
-
-section .text
-	;global YUV_to_RGB_asm
-YUV_to_RGB_asm:
-	shl edi,8
-	
-	mov eax,edx
-	imul eax,403
-	add eax,edi
-	sub eax,51456
-	
-	jae YUV_to_RGB_asm1
-	mov eax,0
-	jmp YUV_to_RGB_asm11
-
-YUV_to_RGB_asm1:
-	cmp eax, 0xFFFF
-	jbe YUV_to_RGB_asm11
-	mov eax,0xFF00
-	
-YUV_to_RGB_asm11:
-	and eax,0xFF00
-	shl eax,8
-	
-	mov ebx,esi
-	imul ebx,475
-	add ebx,edi
-	sub ebx,60672
-	
-	jae YUV_to_RGB_asm2
-	mov ebx, 0
-	jmp YUV_to_RGB_asm21
-
-YUV_to_RGB_asm2:
-	cmp ebx,0xFFFF
-	jbe YUV_to_RGB_asm21
-	mov ebx,0xFF00
-	
-YUV_to_RGB_asm21:
-	and ebx,0xFF00
-	shr ebx,8
-	
-	imul edx,120
-	sub edi,edx
-	imul esi,48
-	sub edi,esi
-	add edi,21632
-	
-	bt edi,31
-	jae YUV_to_RGB_asm3
-	mov edi, 0
-	jmp YUV_to_RGB_asm31
-	
-YUV_to_RGB_asm3:
-	cmp edi,0xFFFF
-	jbe YUV_to_RGB_asm31
-	mov edi, 0xFF00
-	
-YUV_to_RGB_asm31:
-	and edi,0xFF00
-	
-	or eax,edi
-	or eax,ebx
-	
-	ret
-
-;extern int freerdp_image_yuv_to_xrgb_asm(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight);
-	global freerdp_image_yuv_to_xrgb_asm
-freerdp_image_yuv_to_xrgb_asm:
-	push ebp
-	mov ebp, esp
-			;cWidth: cx
-	sub esp,36	;pDstData,pSrcData[3],nWidth,nHeight,cHeight,scanline,iStride[0] addition
-	push ebx
-	
-	
-	mov edi,[ebp+8]
-	mov [ebp-4],edi
-	
-	mov esi,[ebp+12]
-	mov eax,[esi]
-	mov [ebp-8],eax
-	mov eax,[esi+4]
-	mov [ebp-12],eax
-	mov eax,[esi+8]
-	mov [ebp-16],eax
-	
-	mov edx,[ebp+16]
-	mov [ebp-20],edx
-	
-	
-	mov ecx,[ebp+20]
-	shr ecx,1	;/2
-	mov [ebp-24],ecx
-	
-	
-	shl edx,2
-	mov [ebp-32],edx
-	
-	
-	mov eax,[ebp-24]
-	mov [ebp-28],eax
-	
-	
-	mov ebx,[ebp+24]
-	mov [ebp-36],ebx
-	mov eax,[ebp-20]
-	shl dword [ebp-36],1
-	sub [ebp-36],eax
-
-	shr eax,1
-	sub [ebp+28],eax
-	
-freerdp_image_yuv_to_xrgb_asm_loopH:
-	mov ecx,[ebp-20]
-	shr ecx,1
-	
-	
-freerdp_image_yuv_to_xrgb_asm_loopW:
-	mov eax,[ebp-8]
-	mov edi,[eax]
-	and edi,0xFF
-	
-	mov eax,[ebp-12]
-	mov esi,[eax]
-	and esi,0xFF
-	
-	mov eax,[ebp-16]
-	mov edx,[eax]
-	and edx,0xFF
-	
-	call YUV_to_RGB_asm
-	
-	mov ebx,[ebp-4]
-	mov [ebx],eax
-	
-	
-	mov eax,[ebp-8]
-	mov ebx,[ebp+24]
-	mov edi,[eax+ebx]
-	inc eax
-	mov [ebp-8],eax
-	and edi,0xFF
-	
-	mov eax,[ebp-12]
-	mov esi,[eax]
-	and esi,0xFF
-	
-	mov eax,[ebp-16]
-	mov edx,[eax]
-	and edx,0xFF
-	
-	call YUV_to_RGB_asm
-	
-	mov ebx,[ebp-4]
-	mov edx,[ebp-32]
-	mov [ebx+edx],eax
-	add ebx,4
-	mov [ebp-4],ebx
-	
-	
-	mov eax,[ebp-8]
-	mov edi,[eax]
-	and edi,0xFF
-	
-	mov eax,[ebp-12]
-	mov esi,[eax]
-	and esi,0xFF
-	
-	mov eax,[ebp-16]
-	mov edx,[eax]
-	and edx,0xFF
-	
-	call YUV_to_RGB_asm
-	
-	mov ebx,[ebp-4]
-	mov [ebx],eax
-	
-	
-	mov eax,[ebp-8]
-	mov ebx,[ebp+24]
-	mov edi,[eax+ebx]
-	inc eax
-	mov [ebp-8],eax
-	and edi,0xFF
-	
-	mov eax,[ebp-12]
-	mov esi,[eax]
-	inc eax
-	mov [ebp-12],eax
-	and esi,0xFF
-	
-	mov eax,[ebp-16]
-	mov edx,[eax]
-	inc eax
-	mov [ebp-16],eax
-	and edx,0xFF
-	
-	call YUV_to_RGB_asm
-
-	mov ebx,[ebp-4]
-	mov edx,[ebp-32]
-	mov [ebx+edx],eax
-	add ebx,4
-	mov [ebp-4],ebx
-
-	dec cx
-	jne freerdp_image_yuv_to_xrgb_asm_loopW
-	
-	
-	mov eax,[ebp-4]
-	add eax,[ebp-32]
-	mov [ebp-4],eax
-	
-	mov eax,[ebp-8]
-	add eax,[ebp-36]
-	mov [ebp-8],eax
-	
-	mov ebx,[ebp+28]
-	mov eax,[ebp-12]
-	add eax,ebx
-	mov [ebp-12],eax
-	
-	mov eax,[ebp-16]
-	add eax,ebx
-	mov [ebp-16],eax
-	
-	dec dword [ebp-28]
-	jne freerdp_image_yuv_to_xrgb_asm_loopH
-	
-;END
-	mov eax,0
-END:
-	pop ebx
-	mov esp,ebp
-	pop ebp
-	ret
--- a/libfreerdp/codec/h264_x64.asm
+++ b/libfreerdp/codec/h264_x64.asm
@ -1,269 +0,0 @@
-;R=(256*Y+403*(V-128)+128)/265			=(256*Y+403*V-51456)/256
-;G=(256*Y-48*(U-128)-120*(V-128)+128)/256	=(256*Y-48*U-120*V+21632)/256
-;B=(256*Y+475*(U-128)+128)/256			=(256*Y+475*U-60672)/256
-
-section .text
-	;global YUV_to_RGB_asm
-YUV_to_RGB_asm:
-	shl rdi,8
-	
-	mov eax,edx
-	imul eax,403
-	add eax,edi
-	sub eax,51456
-	
-	jae YUV_to_RGB_asm1
-	mov eax,0
-	jmp YUV_to_RGB_asm11
-
-YUV_to_RGB_asm1:
-	cmp eax, 0xFFFF
-	jbe YUV_to_RGB_asm11
-	mov eax,0xFF00
-	
-YUV_to_RGB_asm11:
-	and eax,0xFF00
-	shl eax,8
-	
-	mov ebx,esi
-	imul ebx,475
-	add ebx,edi
-	sub ebx,60672
-	
-	jae YUV_to_RGB_asm2
-	mov ebx, 0
-	jmp YUV_to_RGB_asm21
-
-YUV_to_RGB_asm2:
-	cmp ebx,0xFFFF
-	jbe YUV_to_RGB_asm21
-	mov ebx,0xFF00
-	
-YUV_to_RGB_asm21:
-	and ebx,0xFF00
-	shr ebx,8
-	
-	imul edx,120
-	sub edi,edx
-	imul esi,48
-	sub edi,esi
-	add edi,21632
-	
-	bt edi,31
-	jae YUV_to_RGB_asm3
-	mov edi, 0
-	jmp YUV_to_RGB_asm31
-	
-YUV_to_RGB_asm3:
-	cmp edi,0xFFFF
-	jbe YUV_to_RGB_asm31
-	mov edi, 0xFF00
-	
-YUV_to_RGB_asm31:
-	and edi,0xFF00
-	
-	or eax,edi
-	or eax,ebx
-	
-	ret
-
-;extern int freerdp_image_yuv_to_xrgb_asm(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int *istride,int scanline);
-	global freerdp_image_yuv_to_xrgb_asm
-freerdp_image_yuv_to_xrgb_asm:
-	push rbx
-	push rbp
-	mov rbp, rsp
-			;cWidth: cx
-	sub rsp,82	;pDstData -8,pSrcData[3] -32,nWidth -40,nHeight -48,cHeight -56,scanline -64,iStride[0] -72,VaddDst -80,last_column 1 -81,last_line 1 -82
-	
-;last_column: set to 10B, if last column should be skipped ('cause UV data is the same for two columns and two columns are processed at once)
-;last_line: set to 10B, if last line should be skipped ('cause UV data is the same for two lines and two lines are processed at once)
-	
-	
-	mov [rbp-8],rdi
-	
-	mov rax,[rsi]
-	mov [rbp-16],rax
-	mov rax,[rsi+8]
-	mov [rbp-24],rax
-	mov rax,[rsi+16]
-	mov [rbp-32],rax
-	
-	and rdx,0FFFFH
-	;mov [rbp-40],rdx
-	
-	
-	shr rcx,1	;/2
-	mov [rbp-48],rcx
-	
-	
-	and r9,0FFFFH
-	mov [rbp-64],r9
-	
-	shr r9d,1
-	sub r9d,edx
-	shl r9d,2
-	mov [rbp-80],r9
-	
-	
-	mov rax,[rbp-48]
-	mov [rbp-56],rax
-	
-	
-	mov rcx,[r8]
-	and rcx,0FFFFH
-	mov [rbp-72],rcx
-	shl dword [rbp-72],1
-	sub [rbp-72],rdx
-
-	mov r9,[r8+4]
-	mov r8,rcx
-	
-	and r9,0FFFFH
-	shr rax,1
-	sub r9,rax
-	
-	
-	mov al,dl
-	and al,1B
-	mov [rbp-81],al
-	inc dx
-	shr edx,1
-	mov [rbp-40],rdx
-	
-freerdp_image_yuv_to_xrgb_asm_loopH:
-	mov cx,[rbp-40]
-	
-	
-freerdp_image_yuv_to_xrgb_asm_loopW:
-	dec cx
-	jne freerdp_image_yuv_to_xrgb_asm_not_last_column
-	
-	shl byte [rbp-81],1
-	
-freerdp_image_yuv_to_xrgb_asm_not_last_column:
-
-
-	mov rax,[rbp-16]
-	mov edi,[rax]
-	and edi,0xFF
-	
-	mov rax,[rbp-24]
-	mov esi,[rax]
-	and esi,0xFF
-	
-	mov rax,[rbp-32]
-	mov edx,[rax]
-	and edx,0xFF
-	
-	call YUV_to_RGB_asm
-	
-	mov rbx,[rbp-8]
-	mov [rbx],eax
-	
-	
-	test byte [rbp-81],2
-	jne freerdp_image_yuv_to_xrgb_asm_skip_last_column
-	
-	mov rax,[rbp-16]
-	mov edi,[rax+r8]
-	and edi,0xFF
-	
-	mov rax,[rbp-24]
-	mov esi,[rax]
-	and esi,0xFF
-	
-	mov rax,[rbp-32]
-	mov edx,[rax]
-	and edx,0xFF
-	
-	call YUV_to_RGB_asm
-	
-	mov rbx,[rbp-8]
-	mov rdx,[rbp-64]
-	mov [rbx+rdx],eax
-	
-freerdp_image_yuv_to_xrgb_asm_skip_last_column:
-	add qword [rbp-8],4
-	inc qword [rbp-16]
-	
-	
-	mov rax,[rbp-16]
-	mov edi,[rax]
-	and edi,0xFF
-	
-	mov rax,[rbp-24]
-	mov esi,[rax]
-	and esi,0xFF
-	
-	mov rax,[rbp-32]
-	mov edx,[rax]
-	and edx,0xFF
-	
-	call YUV_to_RGB_asm
-	
-	mov rbx,[rbp-8]
-	mov [rbx],eax
-	
-	
-	test byte [rbp-81],2
-	jne freerdp_image_yuv_to_xrgb_asm_skip_last_column2
-	
-	mov rax,[rbp-16]
-	mov edi,[rax+r8]
-	and edi,0xFF
-	
-	mov rax,[rbp-24]
-	mov esi,[rax]
-	and esi,0xFF
-	
-	mov rax,[rbp-32]
-	mov edx,[rax]
-	and edx,0xFF
-	
-	call YUV_to_RGB_asm
-	
-	;shr [rbp-81],1
-	
-	mov rbx,[rbp-8]
-	mov rdx,[rbp-64]
-	mov [rbx+rdx],eax
-	
-freerdp_image_yuv_to_xrgb_asm_skip_last_column2:
-	add qword [rbp-8],4
-	inc qword [rbp-16]
-	inc qword [rbp-24]
-	inc qword [rbp-32]
-
-
-	test cx,0FFFFH
-	jne freerdp_image_yuv_to_xrgb_asm_loopW
-	jmp END
-	
-	
-	mov rax,[rbp-8]
-	add rax,[rbp-80]
-	mov [rbp-8],rax
-	
-	mov rax,[rbp-16]
-	add rax,[rbp-72]
-	mov [rbp-16],rax
-	
-	mov rax,[rbp-24]
-	add rax,r9
-	mov [rbp-24],rax
-	
-	mov rax,[rbp-32]
-	add rax,r9
-	mov [rbp-32],rax
-	
-	dec qword [rbp-56]
-	jne freerdp_image_yuv_to_xrgb_asm_loopH
-	
-;END
-	mov rax,0
-END:
-	mov rsp,rbp
-	pop rbp
-	pop rbx
-	ret
--- a/libfreerdp/codec/test/Makefile.TestOpenH264SSSE3
+++ b/libfreerdp/codec/test/Makefile.TestOpenH264SSSE3
@ -0,0 +1,14 @@
+TestOpenH264: TestOpenH264ASM.c.o h264.c.o h264_ssse3.c.o
+	gcc -o TestOpenH264 TestOpenH264ASM.c.o h264.c.o h264_ssse3.c.o -lwinpr
+
+h264_ssse3.c.o: ../h264_ssse3.c
+	gcc -c -O3 -o h264_ssse3.c.o ../h264_ssse3.c -mssse3
+
+TestOpenH264ASM.c.o: TestOpenH264ASM.c
+	gcc -c -o TestOpenH264ASM.c.o TestOpenH264ASM.c
+
+h264.c.o: ../h264.c
+	gcc -c -o h264.c.o ../h264.c
+
+clean:
+	rm -f TestOpenH264 TestOpenH264ASM.c.o h264_ssse3.asm.o h264.c.o h264.asm.o
--- a/libfreerdp/codec/test/TestOpenH264
+++ b/libfreerdp/codec/test/TestOpenH264