From bcf1266f517f07212e737fd24bba548a93157a37 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc-Andr=C3=A9=20Moreau?= <marcandre.moreau@gmail.com>
Date: Tue, 9 Sep 2014 19:15:07 -0400
Subject: [PATCH] libfreerdp-primitives: integrate H264 SSE3 color converter

---
 include/freerdp/codec/h264.h         |  19 --
 libfreerdp/codec/h264.c              |  55 ++--
 libfreerdp/primitives/prim_YUV.c     |  39 +--
 libfreerdp/primitives/prim_YUV_opt.c | 380 +++++++++++++--------------
 4 files changed, 225 insertions(+), 268 deletions(-)

diff --git a/include/freerdp/codec/h264.h b/include/freerdp/codec/h264.h
index 969914709..e539cb0b3 100644
--- a/include/freerdp/codec/h264.h
+++ b/include/freerdp/codec/h264.h
@@ -44,31 +44,12 @@ struct _H264_CONTEXT
 {
 	BOOL Compressor;
 
-	//BYTE* data;
-	//UINT32 size;
 	UINT32 width;
 	UINT32 height;
-	//int scanline;
 	
-	BYTE* pYUVData[3];
 	int iStride[3];
-
-/*
-<<<<<<< HEAD
-#ifdef WITH_OPENH264
-	ISVCDecoder* pDecoder;
 	BYTE* pYUVData[3];
-	int iStride[2];
-#endif
 
-#ifdef WITH_LIBAVCODEC
-	AVCodec* codec;
-	AVCodecContext* codecContext;
-	AVCodecParserContext* codecParser;
-	AVFrame* videoFrame;
-#endif
-=======
-*/
 	void* pSystemData;
 	H264_CONTEXT_SUBSYSTEM* subsystem;
 };
diff --git a/libfreerdp/codec/h264.c b/libfreerdp/codec/h264.c
index 5f8f688ab..cf5d2be58 100644
--- a/libfreerdp/codec/h264.c
+++ b/libfreerdp/codec/h264.c
@@ -28,9 +28,6 @@
 #include <freerdp/primitives.h>
 #include <freerdp/codec/h264.h>
 
-#include <sys/time.h>
-
-
 /**
  * Dummy subsystem
  */
@@ -87,8 +84,6 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz
 	SSysMEMBuffer* pSystemBuffer;
 	H264_CONTEXT_OPENH264* sys = (H264_CONTEXT_OPENH264*) h264->pSystemData;
 
-	struct timeval T1,T2;
-
 	if (!sys->pDecoder)
 		return -1;
 
@@ -102,7 +97,6 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz
 
 	ZeroMemory(&sBufferInfo, sizeof(sBufferInfo));
 
-	gettimeofday(&T1,NULL);
 	state = (*sys->pDecoder)->DecodeFrame2(
 		sys->pDecoder,
 		pSrcData,
@@ -119,9 +113,6 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz
 
 	if (sBufferInfo.iBufferStatus != 1)
 		state = (*sys->pDecoder)->DecodeFrame2(sys->pDecoder, NULL, 0, h264->pYUVData, &sBufferInfo);
-	
-	gettimeofday(&T2,NULL);
-	printf("OpenH264: decoding took: %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec));
 
 	pSystemBuffer = &sBufferInfo.UsrData.sSystemBuffer;
 
@@ -285,18 +276,12 @@ static int libavcodec_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcS
 	AVPacket packet;
 	H264_CONTEXT_LIBAVCODEC* sys = (H264_CONTEXT_LIBAVCODEC*) h264->pSystemData;
 
-	struct timeval T1,T2;
-
 	av_init_packet(&packet);
 
 	packet.data = pSrcData;
 	packet.size = SrcSize;
 
-	gettimeofday(&T1,NULL);
 	status = avcodec_decode_video2(sys->codecContext, sys->videoFrame, &gotFrame, &packet);
-	gettimeofday(&T2,NULL);
-
-	printf("libavcodec: decoding took: %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec));
 
 	if (status < 0)
 	{
@@ -437,20 +422,18 @@ static H264_CONTEXT_SUBSYSTEM g_Subsystem_libavcodec =
 int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize,
 		BYTE** ppDstData, DWORD DstFormat, int nDstStep, int nDstHeight, RDPGFX_RECT16* regionRects, int numRegionRects)
 {
+	int index;
+	int status;
+	int* iStride;
 	BYTE* pDstData;
 	BYTE* pDstPoint;
-
+	prim_size_t roi;
 	BYTE** pYUVData;
+	int width, height;
 	BYTE* pYUVPoint[3];
-
 	RDPGFX_RECT16* rect;
-	int* iStride;
-	int ret, i, cx, cy;
 	int UncompressedSize;
 	primitives_t *prims = primitives_get();
-	prim_size_t roi;
-	
-	struct timeval T1,T2;
 
 	if (!h264)
 		return -1;
@@ -463,23 +446,23 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize,
 	if (!(pDstData = *ppDstData))
 		return -1;
 
-
-	if ((ret = h264->subsystem->Decompress(h264, pSrcData, SrcSize)) < 0)
-		return ret;
-
+	if ((status = h264->subsystem->Decompress(h264, pSrcData, SrcSize)) < 0)
+		return status;
 
 	UncompressedSize = h264->width * h264->height * 4;
+
 	if (UncompressedSize > (nDstStep * nDstHeight))
 		return -1;
 
 	pYUVData = h264->pYUVData;
 	iStride = h264->iStride;
 
-	gettimeofday(&T1,NULL);
-	for (i = 0; i < numRegionRects; i++){
-		rect = &(regionRects[i]);
-		cx = rect->right - rect->left;
-		cy = rect->bottom - rect->top;
+	for (index = 0; index < numRegionRects; index++)
+	{
+		rect = &(regionRects[index]);
+
+		width = rect->right - rect->left;
+		height = rect->bottom - rect->top;
 		
 		pDstPoint = pDstData + rect->top * nDstStep + rect->left * 4;
 		pYUVPoint[0] = pYUVData[0] + rect->top * iStride[0] + rect->left;
@@ -488,17 +471,15 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize,
 		pYUVPoint[2] = pYUVData[2] + rect->top/2 * iStride[2] + rect->left/2;
 
 #if 0
-		printf("regionRect: x: %d, y: %d, cx: %d, cy: %d\n",
-		       rect->left, rect->top, cx, cy);
+		printf("regionRect: x: %d y: %d width: %d height: %d\n",
+		       rect->left, rect->top, width, height);
 #endif
 
-		roi.width = cx;
-		roi.height = cy;
+		roi.width = width;
+		roi.height = height;
 
 		prims->YUV420ToRGB_8u_P3AC4R((const BYTE**) pYUVPoint, iStride, pDstPoint, nDstStep, &roi);
 	}
-	gettimeofday(&T2,NULL);
-	printf("converting took %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec));
 
 	return 1;
 }
diff --git a/libfreerdp/primitives/prim_YUV.c b/libfreerdp/primitives/prim_YUV.c
index 0425c9e8f..24ff1a49a 100644
--- a/libfreerdp/primitives/prim_YUV.c
+++ b/libfreerdp/primitives/prim_YUV.c
@@ -27,6 +27,16 @@
 #include "prim_internal.h"
 #include "prim_YUV.h"
 
+/**
+ * | R |    ( | 256     0    403 | |    Y    | )
+ * | G | = (  | 256   -48   -120 | | U - 128 |  ) >> 8
+ * | B |    ( | 256   475      0 | | V - 128 | )
+ *
+ * | Y |    ( |  54   183     18 | | R | )         |  0  |
+ * | U | = (  | -29   -99    128 | | G |  ) >> 8 + | 128 |
+ * | V |    ( | 128  -116    -12 | | B | )         | 128 |
+ */
+
 pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3],
 		BYTE* pDst, int dstStep, const prim_size_t* roi)
 {
@@ -45,14 +55,14 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3],
 	int Vp403, Vp120;
 	BYTE* pRGB = pDst;
 	int nWidth, nHeight;
-	int last_line, last_column;
+	int lastRow, lastCol;
 
 	pY = pSrc[0];
 	pU = pSrc[1];
 	pV = pSrc[2];
 	
-	last_column = roi->width & 0x01;
-	last_line = roi->height & 0x01;
+	lastCol = roi->width & 0x01;
+	lastRow = roi->height & 0x01;
 	
 	nWidth = (roi->width + 1) & ~0x0001;
 	nHeight = (roi->height + 1) & ~0x0001;
@@ -68,15 +78,13 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3],
 
 	for (y = 0; y < halfHeight; )
 	{
-		y++;
-		if (y == halfHeight)
-			last_line = last_line << 1;
+		if (++y == halfHeight)
+			lastRow <<= 1;
 
 		for (x = 0; x < halfWidth; )
 		{
-			x++;
-			if (x == halfWidth)
-				last_column = last_column << 1;
+			if (++x == halfWidth)
+				lastCol <<= 1;
 
 			U = *pU++;
 			V = *pV++;
@@ -121,7 +129,7 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3],
 
 			/* 2nd pixel */
 
-			if (!(last_column & 0x02))
+			if (!(lastCol & 0x02))
 			{
 				Y = *pY++;
 				Yp = Y << 8;
@@ -154,7 +162,7 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3],
 			{
 				pY++;
 				pRGB += 4;
-				last_column = last_column >> 1;
+				lastCol >>= 1;
 			}
 		}
 
@@ -165,9 +173,8 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3],
 
 		for (x = 0; x < halfWidth; )
 		{
-			x++;
-			if (x == halfWidth)
-				last_column = last_column << 1;
+			if (++x == halfWidth)
+				lastCol <<= 1;
 
 			U = *pU++;
 			V = *pV++;
@@ -212,7 +219,7 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3],
 
 			/* 4th pixel */
 
-			if(!(last_column & 0x02))
+			if (!(lastCol & 0x02))
 			{
 				Y = *pY++;
 				Yp = Y << 8;
@@ -245,7 +252,7 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3],
 			{
 				pY++;
 				pRGB += 4;
-				last_column = last_column >> 1;
+				lastCol >>= 1;
 			}
 		}
 
diff --git a/libfreerdp/primitives/prim_YUV_opt.c b/libfreerdp/primitives/prim_YUV_opt.c
index a8010b9d3..eaf7bf6d7 100644
--- a/libfreerdp/primitives/prim_YUV_opt.c
+++ b/libfreerdp/primitives/prim_YUV_opt.c
@@ -25,73 +25,68 @@
 pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
 		BYTE *pDst, int dstStep, const prim_size_t *roi)
 {
-	char last_line,last_column;
-/* last_line: if the last (U,V doubled) line should be skipped, set to 10B
- * last_column: if it's the last column in a line, set to 10B (for handling line-endings not multiple by four) */
-
-	int i,nWidth,nHeight,VaddDst,VaddY,VaddU,VaddV;
-	
+	int lastRow, lastCol;
 	BYTE *UData,*VData,*YData;
-	
+	int i,nWidth,nHeight,VaddDst,VaddY,VaddU,VaddV;
 	__m128i r0,r1,r2,r3,r4,r5,r6,r7;
 	__m128i *buffer;
 	
+	/* lastRow: set to binary 10 (0x02) when the last (U,V doubled) line must be skipped
+	 * lastCol: set to binary 10 (0x02) while processing the last column of a line (handles widths that are not a multiple of four) */
+
+	buffer = _aligned_malloc(4 * 16, 16);
 	
-	buffer=_aligned_malloc(4*16,16);
+	YData = (BYTE*) pSrc[0];
+	UData = (BYTE*) pSrc[1];
+	VData = (BYTE*) pSrc[2];
 	
+	nWidth = roi->width;
+	nHeight = roi->height;
 	
-	YData=(BYTE *)pSrc[0];
-	UData=(BYTE *)pSrc[1];
-	VData=(BYTE *)pSrc[2];
-	
-	nWidth=roi->width;
-	nHeight=roi->height;
-	
-	
-	if((last_column=nWidth&3)){
-		switch(last_column){
-			case 1: r7=_mm_set_epi32(0,0,0,0xFFFFFFFF); break;
-			case 2: r7=_mm_set_epi32(0,0,0xFFFFFFFF,0xFFFFFFFF); break;
-			case 3: r7=_mm_set_epi32(0,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF); break;
+	if ((lastCol = (nWidth & 3)))
+	{
+		switch (lastCol)
+		{
+			case 1:
+				r7 = _mm_set_epi32(0,0,0,0xFFFFFFFF);
+				break;
+
+			case 2:
+				r7 = _mm_set_epi32(0,0,0xFFFFFFFF,0xFFFFFFFF);
+				break;
+
+			case 3:
+				r7 = _mm_set_epi32(0,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF);
+				break;
 		}
+
 		_mm_store_si128(buffer+3,r7);
-		last_column=1;
+		lastCol = 1;
 	}
 	
-	nWidth+=3;
-	nWidth=nWidth>>2;
+	nWidth += 3;
+	nWidth = nWidth >> 2;
 	
-	
-	last_line=nHeight&1;
+	lastRow = nHeight & 1;
 	nHeight++;
-	nHeight=nHeight>>1;
+	nHeight = nHeight >> 1;
 	
+	VaddDst = (dstStep << 1) - (nWidth << 4);
+	VaddY = (srcStep[0] << 1) - (nWidth << 2);
+	VaddU = srcStep[1] - (((nWidth << 1) + 2) & 0xFFFC);
+	VaddV = srcStep[2] - (((nWidth << 1) + 2) & 0xFFFC);
 	
-	VaddDst=(dstStep<<1)-(nWidth<<4);
-	VaddY=(srcStep[0]<<1)-(nWidth<<2);
-	VaddU=srcStep[1]-(((nWidth<<1)+2)&0xFFFC);
-	VaddV=srcStep[2]-(((nWidth<<1)+2)&0xFFFC);
-	
-	
-	while(nHeight-- >0){
-		if(nHeight==0){
-			last_line=last_line<<1;
-		}
+	while (nHeight-- > 0)
+	{
+		if (nHeight == 0)
+			lastRow <<= 1;
+
+		i = 0;
 		
-		i=0;
-		do{
-/*
- * Well, in the end it should look like this:
- *	C = Y;
- *	D = U - 128;
- *	E = V - 128;
- *
- *	R = clip(( 256 * C           + 403 * E + 128) >> 8);
- *	G = clip(( 256 * C -  48 * D - 120 * E + 128) >> 8);
- *	B = clip(( 256 * C + 475 * D           + 128) >> 8);
- */
-			if(!(i&0x01)){
-				
+		do
+		{
+			if (!(i & 0x01))
+			{
 			/* Y-, U- and V-data is stored in different arrays.
 			* We start with processing U-data.
 			*
@@ -99,50 +94,48 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
 			*	0d0d 0c0c 0b0b 0a0a
 			* we've done two things: converting the values to signed words and duplicating
 			* each value, because always two pixel "share" the same U- (and V-) data */
-				r0=_mm_cvtsi32_si128(*(UINT32 *)UData);
-				r5=_mm_set_epi32(0x80038003,0x80028002,0x80018001,0x80008000);
-				r0=_mm_shuffle_epi8(r0,r5);
+				r0 = _mm_cvtsi32_si128(*(UINT32 *)UData);
+				r5 = _mm_set_epi32(0x80038003,0x80028002,0x80018001,0x80008000);
+				r0 = _mm_shuffle_epi8(r0,r5);
 				
-				UData+=4;
+				UData += 4;
 				
 			/* then we subtract 128 from each value, so we get D */
-				r3=_mm_set_epi16(128,128,128,128,128,128,128,128);
-				r0=_mm_subs_epi16(r0,r3);
+				r3 = _mm_set_epi16(128,128,128,128,128,128,128,128);
+				r0 = _mm_subs_epi16(r0,r3);
 				
 			/* we need to do two things with our D, so let's store it for later use */
-				r2=r0;
+				r2 = r0;
 				
 			/* now we can multiply our D with 48 and unpack it to xmm4:xmm0
 			 * this is what we need to get G data later on */
-				r4=r0;
-				r7=_mm_set_epi16(48,48,48,48,48,48,48,48);
-				r0=_mm_mullo_epi16(r0,r7);
-				r4=_mm_mulhi_epi16(r4,r7);
-				r7=r0;
-				r0=_mm_unpacklo_epi16(r0,r4);
-				r4=_mm_unpackhi_epi16(r7,r4);
-				
+				r4 = r0;
+				r7 = _mm_set_epi16(48,48,48,48,48,48,48,48);
+				r0 = _mm_mullo_epi16(r0,r7);
+				r4 = _mm_mulhi_epi16(r4,r7);
+				r7 = r0;
+				r0 = _mm_unpacklo_epi16(r0,r4);
+				r4 = _mm_unpackhi_epi16(r7,r4);
 				
 			/* to complete this step, add (?) 128 to each value (rounding ?!)
 			 * yeah, add. in the end this will be subtracted from something,
 			 * because it's part of G: 256*C - (48*D + 120*E - 128), 48*D-128 !
 			 * by the way, our values have become signed dwords during multiplication! */
-				r6=_mm_set_epi32(128,128,128,128);
-				r0=_mm_sub_epi32(r0,r6);
-				r4=_mm_sub_epi32(r4,r6);
-				
+				r6 = _mm_set_epi32(128,128,128,128);
+				r0 = _mm_sub_epi32(r0,r6);
+				r4 = _mm_sub_epi32(r4,r6);
 				
 			/* to get B data, we need to prepare a secound value, D*475+128 */
-				r1=r2;
-				r7=_mm_set_epi16(475,475,475,475,475,475,475,475);
-				r1=_mm_mullo_epi16(r1,r7);
-				r2=_mm_mulhi_epi16(r2,r7);
-				r7=r1;
-				r1=_mm_unpacklo_epi16(r1,r2);
-				r7=_mm_unpackhi_epi16(r7,r2);
+				r1 = r2;
+				r7 = _mm_set_epi16(475,475,475,475,475,475,475,475);
+				r1 = _mm_mullo_epi16(r1,r7);
+				r2 = _mm_mulhi_epi16(r2,r7);
+				r7 = r1;
+				r1 = _mm_unpacklo_epi16(r1,r2);
+				r7 = _mm_unpackhi_epi16(r7,r2);
 				
-				r1=_mm_add_epi32(r1,r6);
-				r7=_mm_add_epi32(r7,r6);
+				r1 = _mm_add_epi32(r1,r6);
+				r7 = _mm_add_epi32(r7,r6);
 				
 			/* so we got something like this: xmm7:xmm1
 			 * this pair contains values for 16 pixel:
@@ -151,76 +144,74 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
 				_mm_store_si128(buffer+1,r7);
 				
 			/* Now we've prepared U-data. Preparing V-data is actually the same, just with other coefficients */
-				r2=_mm_cvtsi32_si128(*(UINT32 *)VData);
-				r2=_mm_shuffle_epi8(r2,r5);
+				r2 = _mm_cvtsi32_si128(*(UINT32 *)VData);
+				r2 = _mm_shuffle_epi8(r2,r5);
 				
-				VData+=4;
+				VData += 4;
 				
-				r2=_mm_subs_epi16(r2,r3);
-				
-				r5=r2;
+				r2 = _mm_subs_epi16(r2,r3);
 				
+				r5 = r2;
 				
 			/* this is also known as E*403+128, we need it to convert R data */
-				r3=r2;
-				r7=_mm_set_epi16(403,403,403,403,403,403,403,403);
-				r2=_mm_mullo_epi16(r2,r7);
-				r3=_mm_mulhi_epi16(r3,r7);
-				r7=r2;
-				r2=_mm_unpacklo_epi16(r2,r3);
-				r7=_mm_unpackhi_epi16(r7,r3);
+				r3 = r2;
+				r7 = _mm_set_epi16(403,403,403,403,403,403,403,403);
+				r2 = _mm_mullo_epi16(r2,r7);
+				r3 = _mm_mulhi_epi16(r3,r7);
+				r7 = r2;
+				r2 = _mm_unpacklo_epi16(r2,r3);
+				r7 = _mm_unpackhi_epi16(r7,r3);
 				
-				r2=_mm_add_epi32(r2,r6);
-				r7=_mm_add_epi32(r7,r6);
+				r2 = _mm_add_epi32(r2,r6);
+				r7 = _mm_add_epi32(r7,r6);
 				
 			/* and preserve upper four values for future ... */
 				_mm_store_si128(buffer+2,r7);
 				
-				
-				
 			/* doing this step: E*120 */
-				r3=r5;
-				r7=_mm_set_epi16(120,120,120,120,120,120,120,120);
-				r3=_mm_mullo_epi16(r3,r7);
-				r5=_mm_mulhi_epi16(r5,r7);
-				r7=r3;
-				r3=_mm_unpacklo_epi16(r3,r5);
-				r7=_mm_unpackhi_epi16(r7,r5);
+				r3 = r5;
+				r7 = _mm_set_epi16(120,120,120,120,120,120,120,120);
+				r3 = _mm_mullo_epi16(r3,r7);
+				r5 = _mm_mulhi_epi16(r5,r7);
+				r7 = r3;
+				r3 = _mm_unpacklo_epi16(r3,r5);
+				r7 = _mm_unpackhi_epi16(r7,r5);
 				
 			/* now we complete what we've begun above:
 			 * (48*D-128) + (120*E) = (48*D +120*E -128) */
-				r0=_mm_add_epi32(r0,r3);
-				r4=_mm_add_epi32(r4,r7);
+				r0 = _mm_add_epi32(r0,r3);
+				r4 = _mm_add_epi32(r4,r7);
 				
 			/* and store to memory ! */
 				_mm_store_si128(buffer,r4);
-			}else{
+			}
+			else
+			{
 			/* maybe you've wondered about the conditional above ?
 			 * Well, we prepared UV data for eight pixel in each line, but can only process four
 			 * per loop. So we need to load the upper four pixel data from memory each secound loop! */
-				r1=_mm_load_si128(buffer+1);
-				r2=_mm_load_si128(buffer+2);
-				r0=_mm_load_si128(buffer);
+				r1 = _mm_load_si128(buffer+1);
+				r2 = _mm_load_si128(buffer+2);
+				r0 = _mm_load_si128(buffer);
 			}
 			
-			if(++i==nWidth)
-				last_column=last_column<<1;
+			if (++i == nWidth)
+				lastCol <<= 1;
 			
 		/* We didn't produce any output yet, so let's do so!
 		 * Ok, fetch four pixel from the Y-data array and shuffle them like this:
 		 * 00d0 00c0 00b0 00a0, to get signed dwords and multiply by 256 */
-			r4=_mm_cvtsi32_si128(*(UINT32 *)YData);
-			r7=_mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080);
-			r4=_mm_shuffle_epi8(r4,r7);
+			r4 = _mm_cvtsi32_si128(*(UINT32 *)YData);
+			r7 = _mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080);
+			r4 = _mm_shuffle_epi8(r4,r7);
 			
-			r5=r4;
-			r6=r4;
+			r5 = r4;
+			r6 = r4;
 			
 		/* no we can perform the "real" conversion itself and produce output! */
-			r4=_mm_add_epi32(r4,r2);
-			r5=_mm_sub_epi32(r5,r0);
-			r6=_mm_add_epi32(r6,r1);
-			
+			r4 = _mm_add_epi32(r4,r2);
+			r5 = _mm_sub_epi32(r5,r0);
+			r6 = _mm_add_epi32(r6,r1);
 			
 		/* in the end, we only need bytes for RGB values.
 		 * So, what do we do? right! shifting left makes values bigger and thats always good.
@@ -228,9 +219,9 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
 		 * as packed words, we get not only signed words, but do also divide by 256
 		 * imagine, data is now ordered this way: ddx0 ccx0 bbx0 aax0, and x is the least
 		 * significant byte, that we don't need anymore, because we've done some rounding */
-			r4=_mm_slli_epi32(r4,8);
-			r5=_mm_slli_epi32(r5,8);
-			r6=_mm_slli_epi32(r6,8);
+			r4 = _mm_slli_epi32(r4,8);
+			r5 = _mm_slli_epi32(r5,8);
+			r6 = _mm_slli_epi32(r6,8);
 			
 		/* one thing we still have to face is the clip() function ...
 		 * we have still signed words, and there are those min/max instructions in SSE2 ...
@@ -238,128 +229,125 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
 		 * and it operates with signs !
 		 * if we feed it with our values and zeros, it takes the zeros if our values are smaller than
 		 * zero and otherwise our values */
-			r7=_mm_set_epi32(0,0,0,0);
-			r4=_mm_max_epi16(r4,r7);
-			r5=_mm_max_epi16(r5,r7);
-			r6=_mm_max_epi16(r6,r7);
+			r7 = _mm_set_epi32(0,0,0,0);
+			r4 = _mm_max_epi16(r4,r7);
+			r5 = _mm_max_epi16(r5,r7);
+			r6 = _mm_max_epi16(r6,r7);
 			
 		/* the same thing just completely different can be used to limit our values to 255,
 		 * but now using the min instruction and 255s */
-			r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
-			r4=_mm_min_epi16(r4,r7);
-			r5=_mm_min_epi16(r5,r7);
-			r6=_mm_min_epi16(r6,r7);
+			r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
+			r4 = _mm_min_epi16(r4,r7);
+			r5 = _mm_min_epi16(r5,r7);
+			r6 = _mm_min_epi16(r6,r7);
 			
 		/* Now we got our bytes.
 		 * the moment has come to assemble the three channels R,G and B to the xrgb dwords
 		 * on Red channel we just have to and each futural dword with 00FF0000H */
 			//r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
-			r4=_mm_and_si128(r4,r7);
+			r4 = _mm_and_si128(r4,r7);
 			
 		/* on Green channel we have to shuffle somehow, so we get something like this:
 		 * 00d0 00c0 00b0 00a0 */
-			r7=_mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280);
-			r5=_mm_shuffle_epi8(r5,r7);
+			r7 = _mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280);
+			r5 = _mm_shuffle_epi8(r5,r7);
 			
 		/* and on Blue channel that one:
 		 * 000d 000c 000b 000a */
-			r7=_mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002);
-			r6=_mm_shuffle_epi8(r6,r7);
-			
+			r7 = _mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002);
+			r6 = _mm_shuffle_epi8(r6,r7);
 			
 		/* and at last we or it together and get this one:
 		 * xrgb xrgb xrgb xrgb */
-			r4=_mm_or_si128(r4,r5);
-			r4=_mm_or_si128(r4,r6);
-			
+			r4 = _mm_or_si128(r4,r5);
+			r4 = _mm_or_si128(r4,r6);
 			
 		/* Only thing to do know is writing data to memory, but this gets a bit more
 		 * complicated if the width is not a multiple of four and it is the last column in line. */
-			if(last_column&0x02){
+			if (lastCol & 0x02)
+			{
 			/* let's say, we need to only convert six pixel in width
 			 * Ok, the first 4 pixel will be converted just like every 4 pixel else, but
 			 * if it's the last loop in line, last_column is shifted left by one (curious? have a look above),
 			 * and we land here. Through initialisation a mask was prepared. In this case it looks like
 			 * 0000FFFFH 0000FFFFH 0000FFFFH 0000FFFFH */
-				r6=_mm_load_si128(buffer+3);
+				r6 = _mm_load_si128(buffer+3);
 			/* we and our output data with this mask to get only the valid pixel */
-				r4=_mm_and_si128(r4,r6);
+				r4 = _mm_and_si128(r4,r6);
 			/* then we fetch memory from the destination array ... */
-				r5=_mm_lddqu_si128((__m128i *)pDst);
+				r5 = _mm_lddqu_si128((__m128i *)pDst);
 			/* ... and and it with the inverse mask. We get only those pixel, which should not be updated */
-				r6=_mm_andnot_si128(r6,r5);
+				r6 = _mm_andnot_si128(r6,r5);
 			/* we only have to or the two values together and write it back to the destination array,
 			 * and only the pixel that should be updated really get changed. */
-				r4=_mm_or_si128(r4,r6);
+				r4 = _mm_or_si128(r4,r6);
 			}
 			_mm_storeu_si128((__m128i *)pDst,r4);
 			
-			
-			if(!(last_line&0x02)){
+			if (!(lastRow & 0x02))
+			{
 			/* Because UV data is the same for two lines, we can process the secound line just here,
 			 * in the same loop. Only thing we need to do is to add some offsets to the Y- and destination
 			 * pointer. These offsets are iStride[0] and the target scanline.
 			 * But if we don't need to process the secound line, like if we are in the last line of processing nine lines,
 			 * we just skip all this. */
-				r4=_mm_cvtsi32_si128(*(UINT32 *)(YData+srcStep[0]));
-				r7=_mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080);
-				r4=_mm_shuffle_epi8(r4,r7);
+				r4 = _mm_cvtsi32_si128(*(UINT32 *)(YData+srcStep[0]));
+				r7 = _mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080);
+				r4 = _mm_shuffle_epi8(r4,r7);
 				
-				r5=r4;
-				r6=r4;
+				r5 = r4;
+				r6 = r4;
 				
-				r4=_mm_add_epi32(r4,r2);
-				r5=_mm_sub_epi32(r5,r0);
-				r6=_mm_add_epi32(r6,r1);
+				r4 = _mm_add_epi32(r4,r2);
+				r5 = _mm_sub_epi32(r5,r0);
+				r6 = _mm_add_epi32(r6,r1);
 				
+				r4 = _mm_slli_epi32(r4,8);
+				r5 = _mm_slli_epi32(r5,8);
+				r6 = _mm_slli_epi32(r6,8);
 				
-				r4=_mm_slli_epi32(r4,8);
-				r5=_mm_slli_epi32(r5,8);
-				r6=_mm_slli_epi32(r6,8);
+				r7 = _mm_set_epi32(0,0,0,0);
+				r4 = _mm_max_epi16(r4,r7);
+				r5 = _mm_max_epi16(r5,r7);
+				r6 = _mm_max_epi16(r6,r7);
 				
-				r7=_mm_set_epi32(0,0,0,0);
-				r4=_mm_max_epi16(r4,r7);
-				r5=_mm_max_epi16(r5,r7);
-				r6=_mm_max_epi16(r6,r7);
+				r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
+				r4 = _mm_min_epi16(r4,r7);
+				r5 = _mm_min_epi16(r5,r7);
+				r6 = _mm_min_epi16(r6,r7);
 				
-				r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
-				r4=_mm_min_epi16(r4,r7);
-				r5=_mm_min_epi16(r5,r7);
-				r6=_mm_min_epi16(r6,r7);
+				r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
+				r4 = _mm_and_si128(r4,r7);
 				
-				r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
-				r4=_mm_and_si128(r4,r7);
+				r7 = _mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280);
+				r5 = _mm_shuffle_epi8(r5,r7);
 				
-				r7=_mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280);
-				r5=_mm_shuffle_epi8(r5,r7);
+				r7 = _mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002);
+				r6 = _mm_shuffle_epi8(r6,r7);
 				
-				r7=_mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002);
-				r6=_mm_shuffle_epi8(r6,r7);
+				r4 = _mm_or_si128(r4,r5);
+				r4 = _mm_or_si128(r4,r6);
 				
-				
-				r4=_mm_or_si128(r4,r5);
-				r4=_mm_or_si128(r4,r6);
-				
-				
-				if(last_column&0x02){
-					r6=_mm_load_si128(buffer+3);
-					r4=_mm_and_si128(r4,r6);
-					r5=_mm_lddqu_si128((__m128i *)(pDst+dstStep));
-					r6=_mm_andnot_si128(r6,r5);
-					r4=_mm_or_si128(r4,r6);
+				if (lastCol & 0x02)
+				{
+					r6 = _mm_load_si128(buffer+3);
+					r4 = _mm_and_si128(r4,r6);
+					r5 = _mm_lddqu_si128((__m128i *)(pDst+dstStep));
+					r6 = _mm_andnot_si128(r6,r5);
+					r4 = _mm_or_si128(r4,r6);
 					
 				/* only thing is, we should shift [rbp-42] back here, because we have processed the last column,
 				 * and this "special condition" can be released */
-					last_column=last_column>>1;
+					lastCol >>= 1;
 				}
 				_mm_storeu_si128((__m128i *)(pDst+dstStep),r4);
 			}
 			
 		/* after all we have to increase the destination- and Y-data pointer by four pixel */
-			pDst+=16;
-			YData+=4;
-			
-		}while(i<nWidth);
+			pDst += 16;
+			YData += 4;
+		}
+		while (i < nWidth);
 		
 	/* after each line we have to add the scanline to the destination pointer, because
 	 * we are processing two lines at once, but only increasing the destination pointer
@@ -368,17 +356,17 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
 	 * if we're not converting the full width of the scanline, like only 64 pixel, but the
 	 * output buffer was "designed" for 1920p HD, we have to add the remaining length for each line,
 	 * to get into the next line. */
-		pDst+=VaddDst;
+		pDst += VaddDst;
 		
 	/* same thing has to be done for Y-data, but with iStride[0] instead of the target scanline */
-		YData+=VaddY;
+		YData += VaddY;
 		
 	/* and again for UV data, but here it's enough to add the remaining length, because
 	 * UV data is the same for two lines and there exists only one "UV line" on two "real lines" */
-		UData+=VaddU;
-		VData+=VaddV;
+		UData += VaddU;
+		VData += VaddV;
 	}
-		
+
 	_aligned_free(buffer);
 	
 	return PRIMITIVES_SUCCESS;
@@ -388,9 +376,9 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
 void primitives_init_YUV_opt(primitives_t *prims)
 {
 #ifdef WITH_SSE2
-	if(IsProcessorFeaturePresentEx(PF_EX_SSSE3)&&IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
+	if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) && IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
 	{
-		prims->YUV420ToRGB_8u_P3AC4R=ssse3_YUV420ToRGB_8u_P3AC4R;
+		prims->YUV420ToRGB_8u_P3AC4R = ssse3_YUV420ToRGB_8u_P3AC4R;
 	}
 #endif
 }