Added SSE2 yuv to rgb conversion code. This code needs to move into the media kit though

git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@33200 a95241bf-73f2-0310-859d-f6bbb57e9c96
2009-09-19 23:34:23 +00:00 · 2009-09-19 23:34:23 +00:00 · 6bfbea62cd
commit 6bfbea62cd
parent 6a400ee80f
8 changed files with 593 additions and 70 deletions
--- a/src/add-ons/media/plugins/ffmpeg/AVCodecDecoder.cpp
+++ b/src/add-ons/media/plugins/ffmpeg/AVCodecDecoder.cpp
@ -215,7 +215,6 @@ AVCodecDecoder::Setup(media_format* ioEncodedFormat, const void* infoBuffer,
 					fBlockAlign
 						= ioEncodedFormat->u.encoded_audio.output.buffer_size;
 				}
-printf("XXX extra data size %ld\n", infoSize);
 				if (extraData != NULL && fExtraDataSize > 0) {
 					TRACE("AVCodecDecoder: extra data size %ld\n", infoSize);
 					fExtraData = new(std::nothrow) char[fExtraDataSize];
@ -731,13 +730,13 @@ AVCodecDecoder::_DecodeVideo(void* outBuffer, int64* outFrameCount,
 			profileCounter++;
 			if (!(fFrame % 10)) {
 				if (info) {
-					TRACE("[v] profile: d1 = %lld, d2 = %lld (%Ld) required "
+					printf("[v] profile: d1 = %lld, d2 = %lld (%Ld) required "
 						"%Ld\n",
 						decodingTime / profileCounter,
 						conversionTime / profileCounter,
 						fFrame, info->time_to_decode);
 				} else {
-					TRACE("[v] profile: d1 = %lld, d2 = %lld (%Ld) required "
+					printf("[v] profile: d1 = %lld, d2 = %lld (%Ld) required "
 						"%Ld\n",
 						decodingTime / profileCounter,
 						conversionTime / profileCounter,
--- a/src/add-ons/media/plugins/ffmpeg/CpuCapabilities.cpp
+++ b/src/add-ons/media/plugins/ffmpeg/CpuCapabilities.cpp
@ -0,0 +1,146 @@
+/*
+ * Copyright (C) 2009 David McPaul
+ *
+ * includes code from sysinfo.c which is
+ * Copyright 2004-2008, Axel Dörfler, axeld@pinc-software.de.
+ * Copyright (c) 2002, Carlos Hasan, for Haiku.
+ *
+ * All rights reserved. Distributed under the terms of the MIT License.
+ */
+ 
+#include <string.h>
+#include <cpu_type.h>
+
+#include "CpuCapabilities.h"
+
+CPUCapabilities::~CPUCapabilities()
+{
+}
+
+CPUCapabilities::CPUCapabilities()
+{
+	#ifdef __INTEL__
+		setIntelCapabilities();
+	#endif
+	
+	PrintCapabilities();
+}
+
+void
+CPUCapabilities::setIntelCapabilities()
+{
+	cpuid_info baseInfo;
+	cpuid_info cpuInfo;
+	int32 maxStandardFunction, maxExtendedFunction = 0;
+
+	if (get_cpuid(&baseInfo, 0L, 0L) != B_OK) {
+		// this CPU doesn't support cpuid
+		return;
+	}
+
+	maxStandardFunction = baseInfo.eax_0.max_eax;
+	if (maxStandardFunction >= 500) {
+		maxStandardFunction = 0; /* old Pentium sample chips has cpu signature here */
+	}
+	
+	/* Extended cpuid */
+
+	get_cpuid(&cpuInfo, 0x80000000, 0L);
+
+	// extended cpuid is only supported if max_eax is greater than the service id
+	if (cpuInfo.eax_0.max_eax > 0x80000000) {
+		maxExtendedFunction = cpuInfo.eax_0.max_eax & 0xff;
+	}
+	
+	if (maxStandardFunction > 0) {
+
+		get_cpuid(&cpuInfo, 1L, 0L);
+		if (cpuInfo.eax_1.features & (1UL << 23)) {
+			capabilities = CAPABILITY_MMX;
+		}
+	
+		if (cpuInfo.eax_1.features & (1UL << 25)) {
+			capabilities = CAPABILITY_SSE1;
+		}
+
+		if (cpuInfo.eax_1.features & (1UL << 26)) {
+			capabilities = CAPABILITY_SSE2;
+		}
+
+		if (maxStandardFunction >= 1) {
+			/* Extended features */
+			if (cpuInfo.eax_1.extended_features & (1UL << 0)) {
+				capabilities = CAPABILITY_SSE3;
+			}
+			if (cpuInfo.eax_1.extended_features & (1UL << 9)) {
+				capabilities = CAPABILITY_SSSE3;
+			}
+			if (cpuInfo.eax_1.extended_features & (1UL << 19)) {
+				capabilities = CAPABILITY_SSE41;
+			}
+			if (cpuInfo.eax_1.extended_features & (1UL << 20)) {
+				capabilities = CAPABILITY_SSE42;
+			}
+		}
+	}
+}
+
+bool
+CPUCapabilities::HasMMX()
+{
+	return capabilities >= CAPABILITY_MMX;
+}
+
+bool
+CPUCapabilities::HasSSE1()
+{
+	return capabilities >= CAPABILITY_SSE1;
+}
+
+bool
+CPUCapabilities::HasSSE2()
+{
+	return capabilities >= CAPABILITY_SSE2;
+}
+
+bool
+CPUCapabilities::HasSSE3()
+{
+	return capabilities >= CAPABILITY_SSE3;
+}
+
+bool
+CPUCapabilities::HasSSSE3()
+{
+	return capabilities >= CAPABILITY_SSSE3;
+}
+
+bool
+CPUCapabilities::HasSSE41()
+{
+	return capabilities >= CAPABILITY_SSE41;
+}
+
+bool
+CPUCapabilities::HasSSE42()
+{
+	return capabilities >= CAPABILITY_SSE42;
+}
+
+void
+CPUCapabilities::PrintCapabilities()
+{
+	static const char *CapArray[8] = {
+		"", "MMX", "SSE1", "SSE2", "SSE3", "SSSE3", "SSE4.1", "SSE4.2"
+	};
+
+	printf("CPU is capable of running ");
+	if (capabilities) {
+		for (uint32 i=1;i<=capabilities;i++) {
+			printf("%s ",CapArray[i]);
+		}
+	} else {
+		printf("no extensions");
+	}
+	printf("\n");
+}
--- a/src/add-ons/media/plugins/ffmpeg/CpuCapabilities.h
+++ b/src/add-ons/media/plugins/ffmpeg/CpuCapabilities.h
@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2009 David McPaul
+ *
+ * All rights reserved. Distributed under the terms of the MIT License.
+ */
+ 
+#ifndef __CPU_CAPABILITIES__
+#define __CPU_CAPABILITIES__
+
+#define CAPABILITY_MMX 1
+#define CAPABILITY_SSE1 2
+#define CAPABILITY_SSE2 3
+#define CAPABILITY_SSE3 4
+#define CAPABILITY_SSSE3 5
+#define CAPABILITY_SSE41 6
+#define CAPABILITY_SSE42 7
+
+
+class CPUCapabilities {
+	public:
+		CPUCapabilities();
+		~CPUCapabilities();
+	
+		bool HasMMX();
+		bool HasSSE1();
+		bool HasSSE2();
+		bool HasSSE3();
+		bool HasSSSE3();
+		bool HasSSE41();
+		bool HasSSE42();
+	
+		void PrintCapabilities();
+	
+	private:
+		uint32		capabilities;
+	
+		void setIntelCapabilities();
+};
+
+#endif	//__CPU_CAPABILITIES__
--- a/src/add-ons/media/plugins/ffmpeg/Jamfile
+++ b/src/add-ons/media/plugins/ffmpeg/Jamfile
@ -19,11 +19,13 @@ Addon ffmpeg :
 	EncoderTable.cpp
 	FFmpegPlugin.cpp
 	MuxerTable.cpp
+	CpuCapabilities.cpp

 	gfx_conv_c.cpp
 	gfx_conv_c_lookup.cpp
-#	gfx_conv_mmx.cpp
+	gfx_conv_mmx.cpp
 	gfx_util.cpp
+ 	yuvrgb.nasm
 	:
 	libavformat.a
 	libavcodec.a
--- a/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.cpp
+++ b/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.cpp
@ -0,0 +1,84 @@
+#include "gfx_conv_mmx.h"
+#include "gfx_conv_c.h"
+
+extern "C" void _Convert_YUV420P_RGBA32_SSE2(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width);
+extern "C" void _Convert_YUV422_RGBA32_SSE2(void *fromYPtr, void *toPtr, int width);
+
+void gfx_conv_null_mmx(AVFrame *in, AVFrame *out, int width, int height) {
+	memcpy(out->data[0], in->data[0], height * in->linesize[0]);
+}
+
+void gfx_conv_yuv410p_ycbcr422_mmx(AVFrame *in, AVFrame *out, int width, int height)
+{
+//	img_convert((AVPicture *)out,PIX_FMT_YUV422P,(const AVPicture *)in,PIX_FMT_YUV410P,width,height);
+}
+
+void gfx_conv_yuv411p_ycbcr422_mmx(AVFrame *in, AVFrame *out, int width, int height)
+{
+//	img_convert((AVPicture *)out,PIX_FMT_YUV422P,(const AVPicture *)in,PIX_FMT_YUV411P,width,height);
+}
+
+void gfx_conv_yuv420p_ycbcr422_mmx(AVFrame *in, AVFrame *out, int width, int height)
+{
+//	img_convert((AVPicture *)out,PIX_FMT_YUV422P,(const AVPicture *)in,PIX_FMT_YUV420P,width,height);
+}
+
+void gfx_conv_yuv410p_rgb32_mmx(AVFrame *in, AVFrame *out, int width, int height)
+{
+//	img_convert((AVPicture *)out,PIX_FMT_RGB32,(const AVPicture *)in,PIX_FMT_YUV410P,width,height);
+}
+
+void gfx_conv_yuv411p_rgb32_mmx(AVFrame *in, AVFrame *out, int width, int height)
+{
+//	img_convert((AVPicture *)out,PIX_FMT_RGB32,(const AVPicture *)in,PIX_FMT_YUV411P,width,height);
+}
+
+void gfx_conv_yuv420p_rgb32_mmx(AVFrame *in, AVFrame *out, int width, int height)
+{
+//	img_convert((AVPicture *)out,PIX_FMT_RGB32,(const AVPicture *)in,PIX_FMT_YUV420P,width,height);
+}
+
+// Planar YUV420
+void gfx_conv_yuv420p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
+{
+	// width must be divisibile by 8 and height divisible by 2
+	if (width % 8 == 0 && height % 2 == 0) {
+	
+		uint8 *ybase = (uint8 *)in->data[0];
+		uint8 *ubase = (uint8 *)in->data[1];
+		uint8 *vbase = (uint8 *)in->data[2];
+		uint8 *rgbbase = (uint8 *)out->data[0];
+	
+		for (int i=0;i<height;i+=2) {
+			_Convert_YUV420P_RGBA32_SSE2(ybase, ubase, vbase, rgbbase, width);	// First Y row
+			ybase += in->linesize[0];
+			rgbbase += out->linesize[0];
+		
+			_Convert_YUV420P_RGBA32_SSE2(ybase, ubase, vbase, rgbbase, width);	// Second Y row but same u and v row
+			ybase += in->linesize[0];
+			ubase += in->linesize[1];
+			vbase += in->linesize[2];
+			rgbbase += out->linesize[0];
+		}
+	} else {
+		gfx_conv_YCbCr420p_RGB32_c(in, out, width, height);
+	}
+}
+
+// Packed YUV422
+void gfx_conv_yuv422p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
+{
+	// width must be divisibile by 8 
+	if (width % 8 == 0) {
+		uint8 *ybase = (uint8 *)in->data[0];
+		uint8 *rgbbase = (uint8 *)out->data[0];
+
+		for (int i = 0; i <= height; i++) {
+			_Convert_YUV422_RGBA32_SSE2(ybase, rgbbase, width);
+			ybase += in->linesize[0];
+			rgbbase += out->linesize[0];
+		}
+	} else {
+		gfx_conv_YCbCr422_RGB32_c(in, out, width, height);
+	}
+}
--- a/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.h
+++ b/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.h
@ -5,8 +5,6 @@
 #include <GraphicsDefs.h>
 #include "libavcodec/avcodec.h"

-bool IsMmxCpu();
-
 void gfx_conv_null_mmx(AVFrame *in, AVFrame *out, int width, int height);

 void gfx_conv_yuv410p_ycbcr422_mmx(AVFrame *in, AVFrame *out, int width, int height);
@ -15,6 +13,7 @@ void gfx_conv_yuv420p_ycbcr422_mmx(AVFrame *in, AVFrame *out, int width, int hei

 void gfx_conv_yuv410p_rgb32_mmx(AVFrame *in, AVFrame *out, int width, int height);
 void gfx_conv_yuv411p_rgb32_mmx(AVFrame *in, AVFrame *out, int width, int height);
-void gfx_conv_yuv420p_rgb32_mmx(AVFrame *in, AVFrame *out, int width, int height);
+void gfx_conv_yuv420p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height);
+void gfx_conv_yuv422p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height);

 #endif
--- a/src/add-ons/media/plugins/ffmpeg/gfx_util.cpp
+++ b/src/add-ons/media/plugins/ffmpeg/gfx_util.cpp
@ -3,6 +3,7 @@
 #include "gfx_util.h"
 #include "gfx_conv_c.h"
 #include "gfx_conv_mmx.h"
+#include "CpuCapabilities.h"

 /*
 * ref docs
@ -15,61 +16,51 @@
  #define TRACE(a...)
 #endif

-//#define INCLUDE_MMX 	defined(__INTEL__)
-#define INCLUDE_MMX 	0
-
 // this function will try to find the best colorspaces for both the ff-codec and 
 // the Media Kit sides.
 gfx_convert_func resolve_colorspace(color_space colorSpace, PixelFormat pixelFormat)
 {
-#if INCLUDE_MMX
-	bool mmx = IsMmxCpu();
-#endif
+CPUCapabilities cpu;

 	switch (colorSpace)
 	{
 		case B_RGB32:
 			if (pixelFormat == PIX_FMT_YUV410P) {
-				#if INCLUDE_MMX
-				if (mmx) {
-					TRACE("resolve_colorspace: gfx_conv_yuv410p_rgb32_mmx\n");
-					return gfx_conv_yuv410p_rgb32_mmx;
-				} else
-				#endif
-				{
+//				if (cpu.HasMMX()) {
+//					TRACE("resolve_colorspace: gfx_conv_yuv410p_rgb32_mmx\n");
+//					return gfx_conv_yuv410p_rgb32_mmx;
+//				} else {
 					TRACE("resolve_colorspace: gfx_conv_yuv410p_rgb32_c\n");
 					return gfx_conv_yuv410p_rgb32_c;
-				}
+//				}
 			}
 			
 			if (pixelFormat == PIX_FMT_YUV411P) {
-				#if INCLUDE_MMX
-				if (mmx) {
-					TRACE("resolve_colorspace: gfx_conv_yuv411p_rgb32_mmx\n");
-					return gfx_conv_yuv411p_rgb32_mmx;
-				} else
-				#endif
-				{
+//				if (cpu.HasMMX()) {
+//					TRACE("resolve_colorspace: gfx_conv_yuv411p_rgb32_mmx\n");
+//					return gfx_conv_yuv411p_rgb32_mmx;
+//				} else {
 					TRACE("resolve_colorspace: gfx_conv_yuv411p_rgb32_c\n");
 					return gfx_conv_yuv411p_rgb32_c;
-				}
+//				}
 			}
 					
 			if (pixelFormat == PIX_FMT_YUV420P || pixelFormat == PIX_FMT_YUVJ420P) {
-				#if INCLUDE_MMX
-				if (mmx) {
-					TRACE("resolve_colorspace: gfx_conv_yuv420p_rgb32_mmx\n");
-					return gfx_conv_yuv420p_rgb32_mmx;
-				} else
-				#endif
-				{
-					TRACE("resolve_colorspace: gfx_conv_yuv420p_rgb32_c\n");
+				if (cpu.HasSSE2()) {
+					TRACE("resolve_colorspace: gfx_conv_yuv420p_rgba32_sse2\n");
+					return gfx_conv_yuv420p_rgba32_sse2;
+				} else {
+					TRACE("resolve_colorspace: gfx_conv_YCbCr420p_RGB32_c\n");
 					return gfx_conv_YCbCr420p_RGB32_c;
 				}
 			}
 			
 			if (pixelFormat == PIX_FMT_YUV422P || pixelFormat == PIX_FMT_YUVJ422P) {
-				return gfx_conv_YCbCr422_RGB32_c;
+				if (cpu.HasSSE2()) {
+					return gfx_conv_yuv422p_rgba32_sse2;
+				} else {
+					return gfx_conv_YCbCr422_RGB32_c;
+				}
 			}

 			TRACE("resolve_colorspace: %s => B_RGB32: NULL\n", pixfmt_to_string(pixelFormat));
@ -86,55 +77,43 @@ gfx_convert_func resolve_colorspace(color_space colorSpace, PixelFormat pixelFor
 		case B_YCbCr422:

 			if (pixelFormat == PIX_FMT_YUV410P) {
-				#if INCLUDE_MMX
-				if (mmx) {
-					TRACE("resolve_colorspace: gfx_conv_yuv410p_ycbcr422_mmx\n");
-					return gfx_conv_yuv410p_ycbcr422_mmx;
-				} else
-				#endif
-				{
+//				if (cpu.HasMMX()) {
+//					TRACE("resolve_colorspace: gfx_conv_yuv410p_ycbcr422_mmx\n");
+//					return gfx_conv_yuv410p_ycbcr422_mmx;
+//				} else {
 					TRACE("resolve_colorspace: gfx_conv_yuv410p_ycbcr422_c\n");
 					return gfx_conv_yuv410p_ycbcr422_c;
-				}
+//				}
 			}

 			if (pixelFormat == PIX_FMT_YUV411P) {
-				#if INCLUDE_MMX
-				if (mmx) {
-					TRACE("resolve_colorspace: gfx_conv_yuv411p_ycbcr422_mmx\n");
-					return gfx_conv_yuv411p_ycbcr422_mmx;
-				} else
-				#endif
-				{
+//				if (cpu.HasMMX()) {
+//					TRACE("resolve_colorspace: gfx_conv_yuv411p_ycbcr422_mmx\n");
+//					return gfx_conv_yuv411p_ycbcr422_mmx;
+//				} else {
 					TRACE("resolve_colorspace: gfx_conv_yuv411p_ycbcr422_c\n");
 					return gfx_conv_yuv411p_ycbcr422_c;
-				}
+//				}
 			}
 			
 			if (pixelFormat == PIX_FMT_YUV420P || pixelFormat == PIX_FMT_YUVJ420P) {
-				#if INCLUDE_MMX
-				if (mmx) {
-					TRACE("resolve_colorspace: gfx_conv_yuv420p_ycbcr422_mmx\n");
-					return gfx_conv_yuv420p_ycbcr422_mmx;
-				} else
-				#endif
-				{
+//				if (cpu.HasMMX()) {
+//					TRACE("resolve_colorspace: gfx_conv_yuv420p_ycbcr422_mmx\n");
+//					return gfx_conv_yuv420p_ycbcr422_mmx;
+//				} else {
 					TRACE("resolve_colorspace: gfx_conv_yuv420p_ycbcr422_c\n");
 					return gfx_conv_yuv420p_ycbcr422_c;
-				}
+//				}
 			}
 			
 			if (pixelFormat == PIX_FMT_YUYV422) {
-				#if INCLUDE_MMX
-				if (mmx) {
-					TRACE("resolve_colorspace: PIX_FMT_YUV422 => B_YCbCr422: gfx_conv_null_mmx\n");
-					return gfx_conv_null_mmx;
-				} else
-				#endif
-				{
+//				if (cpu.HasMMX()) {
+//					TRACE("resolve_colorspace: PIX_FMT_YUV422 => B_YCbCr422: gfx_conv_null_mmx\n");
+//					return gfx_conv_null_mmx;
+//				} else {
 					TRACE("resolve_colorspace: PIX_FMT_YUV422 => B_YCbCr422: gfx_conv_null_c\n");
 					return gfx_conv_null_c;
-				}
+//				}
 			}
 			
 			TRACE("resolve_colorspace: %s => B_YCbCr422: NULL\n", pixfmt_to_string(pixelFormat));
--- a/src/add-ons/media/plugins/ffmpeg/yuvrgb.nasm
+++ b/src/add-ons/media/plugins/ffmpeg/yuvrgb.nasm
@ -0,0 +1,274 @@
+/*
+ * Copyright (C) 2009 David McPaul
+ *
+ * All rights reserved. Distributed under the terms of the MIT License.
+ */
+
+; A rather unoptimised set of yuv to rgb converters
+; does 8 pixels at a time
+
+; inputer:
+; reads 128bits of yuv 8 bit data and puts
+; the y values converted to 16 bit in xmm0
+; the u values converted to 16 bit and duplicated into xmm1
+; the v values converted to 16 bit and duplicated into xmm2
+
+; conversion:
+; does the yuv to rgb conversion using 16 bit fixed point and the
+; results are placed into the following registers as 8 bit clamped values
+; r values in xmm3
+; g values in xmm4
+; b values in xmm5
+
+; outputer:
+; writes out the rgba pixels as 8 bit values with 0 for alpha
+
+; xmm6 used for scratch
+; xmm7 used for scratch
+
+%macro  cglobal 1
+	global  _%1
+	%define %1 _%1
+	align 16
+%1:
+%endmacro
+
+; conversion code 
+%macro yuv2rgb 0
+; u = u - 128
+; v = v - 128
+; r = y + v + v >> 2 + v >> 3 + v >> 5 
+; g = y - (u >> 2 + u >> 4 + u >> 5) - (v >> 1 + v >> 3 + v >> 4 + v >> 5)
+; b = y + u + u >> 1 + u >> 2 + u >> 6
+; subtract 16 from y
+	movdqa xmm7, [Const16]			; loads a constant using data cache
+	psubsw xmm0,xmm7				; y = y - 16
+; subtract 128 from u and v
+;	mov eax,128*10001H				; load a constant using instruction cache
+;	movd xmm7,eax					; but requires eax to be saved
+;	pshufd xmm7,xmm7,0				; and uses more instructions
+	movdqa xmm7, [Const128]			; loads a constant using data cache (slower on first fetch but then cached)
+	psubsw xmm1,xmm7				; u = u - 128
+	psubsw xmm2,xmm7				; v = v - 128
+; load r,g,b with y 
+	movdqa xmm3,xmm0				; r = y 
+	pshufd xmm4,xmm0, 0xE4			; g = y 
+	movdqa xmm5,xmm0				; b = y 
+; r = r + v + v >> 2 + v >> 3 + v >> 5
+	paddsw xmm3, xmm2				; add v to r
+	movdqa xmm6, xmm2				; move v to scratch
+	psraw  xmm6,2					; divide by 4
+	paddsw xmm3, xmm6				; and add to r
+	psraw  xmm6,1					; divide by 2
+	paddsw xmm3, xmm6				; and add to r
+	psraw  xmm6,2					; divide by 4
+	paddsw xmm3, xmm6				; and add to r
+	
+; g = y - u >> 2 - u >> 4 - u >> 5 - v >> 1 - v >> 3 - v >> 4 - v >> 5
+	movdqa xmm6,xmm1				; move u to scratch
+	psraw  xmm6,2					; divide by 4
+	psubsw xmm4,xmm6				; subtract from g
+	psraw  xmm6,2					; divide by 4
+	psubsw xmm4,xmm6				; subtract from g
+	psraw  xmm6,1					; divide by 2
+	psubsw xmm4,xmm6				; subtract from g
+
+	movdqa xmm6,xmm2				; move v to scratch
+	psraw  xmm6,1					; divide by 2
+	psubsw xmm4,xmm6				; subtract from g
+	psraw  xmm6,2					; divide by 4
+	psubsw xmm4,xmm6				; subtract from g
+	psraw  xmm6,1					; divide by 2
+	psubsw xmm4,xmm6				; subtract from g
+	psraw  xmm6,1					; divide by 2
+	psubsw xmm4,xmm6				; subtract from g
+; b = y + u + u >> 1 + u >> 2 + u >> 6
+	paddsw xmm5, xmm1				; add u to b
+	movdqa xmm6, xmm1				; move u to scratch
+	psraw  xmm6,1					; divide by 2
+	paddsw xmm5, xmm6				; and add to b
+	psraw  xmm6,1					; divide by 2
+	paddsw xmm5, xmm6				; and add to b
+	psraw  xmm6,4					; divide by 32
+	paddsw xmm5, xmm6				; and add to b
+%endmacro
+
+; outputer
+%macro rgba32output 0
+; clamp values
+	pxor xmm7,xmm7
+	packuswb xmm3,xmm7				; clamp to 0,255 and pack R to 8 bit per pixel
+	packuswb xmm4,xmm7				; clamp to 0,255 and pack G to 8 bit per pixel
+	packuswb xmm5,xmm7				; clamp to 0,255 and pack B to 8 bit per pixel
+; convert to bgra32 packed
+	punpcklbw xmm3,xmm7				; r0r0r0r0r0r0r0r0
+	punpcklbw xmm5,xmm4				; bgbgbgbgbgbgbgbg
+	movdqa xmm0, xmm5				; save gb values
+	punpcklwd xmm5,xmm3				; lower half bgr0bgr0bgr0bgr0
+	punpckhwd xmm0,xmm3				; upper half bgr0bgr0bgr0bgr0
+; write to output ptr
+	movntdq [edi], xmm5				; output first 4 pixels bypassing cache
+	movntdq [edi+16], xmm0			; output second 4 pixels bypassing cache
+%endmacro
+
+SECTION .data align=16
+
+Const16	dw	16
+	dw	16
+	dw	16
+	dw	16
+	dw	16
+	dw	16
+	dw	16
+	dw	16
+
+Const128	dw	128
+	dw	128
+	dw	128
+	dw	128
+	dw	128
+	dw	128
+	dw	128
+	dw	128
+
+; void Convert_YUV422_RGBA32_SSE2(void *fromPtr, void *toPtr, int width)
+width    equ	ebp+16
+toPtr    equ	ebp+12
+fromPtr  equ	ebp+8
+
+; void Convert_YUV420P_RGBA32_SSE2(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
+width1    equ	ebp+24
+toPtr1    equ	ebp+20
+fromVPtr  equ	ebp+16
+fromUPtr  equ	ebp+12
+fromYPtr  equ	ebp+8
+
+SECTION .text align=16
+
+cglobal Convert_YUV422_RGBA32_SSE2
+; reserve variables
+	push ebp
+	mov ebp, esp
+	push edi
+	push esi
+	push ecx
+	
+	mov esi, [fromPtr]
+	mov edi, [toPtr]
+	mov ecx, [width]
+	prefetchnta [esi]	; hint that we will be loading our data outside of cache
+; loop width / 8 times
+	shr ecx,3
+	test ecx,ecx
+	jng ENDLOOP
+REPEATLOOP:							; loop over width / 8
+;	push ecx						; preserve loop counter
+; YUV422 packed inputer
+	movdqa xmm0, [esi]				; should have yuyv yuyv yuyv yuyv
+	pshufd xmm1, xmm0, 0xE4			; copy to xmm1
+	movdqa xmm2, xmm0				; copy to xmm2
+; extract y
+	pxor xmm7,xmm7					; 00000000000000000000000000000000
+	pcmpeqd xmm6,xmm6				; ffffffffffffffffffffffffffffffff
+	punpcklbw xmm6,xmm7				; interleave xmm7 into xmm6 ff00ff00ff00ff00ff00ff00ff00ff00
+	pand xmm0, xmm6					; clear all but y values leaving y0y0 etc
+; extract u and duplicate so each u in yuyv becomes 0u0u
+	psrld xmm6,8					; 00ff0000 00ff0000 00ff0000 00ff0000
+	pand xmm1, xmm6					; clear all yv values leaving 0u00 etc
+	psrld xmm1,8					; rotate u to get u000
+	pshuflw xmm1,xmm1, 0xA0			; copy u values
+	pshufhw xmm1,xmm1, 0xA0			; to get u0u0
+; extract v
+	pslld xmm6,16					; 000000ff000000ff 000000ff000000ff
+	pand xmm2, xmm6					; clear all yu values leaving 000v etc
+	psrld xmm2,8					; rotate v to get 00v0
+	pshuflw xmm2,xmm2, 0xF5			; copy v values
+	pshufhw xmm2,xmm2, 0xF5			; to get v0v0
+
+yuv2rgb
+	
+rgba32output
+
+; endloop
+	add edi,32
+	add esi,16
+;	pop ecx
+	sub ecx, 1				; apparently sub is better than dec
+	jnz REPEATLOOP
+ENDLOOP:
+; Cleanup
+	pop ecx
+	pop esi
+	pop edi
+	mov esp, ebp
+	pop ebp
+	ret
+
+cglobal Convert_YUV420P_RGBA32_SSE2
+; reserve variables
+	push ebp
+	mov ebp, esp
+	push edi
+	push esi
+	push ecx
+	push eax
+	push ebx
+		
+	mov esi, [fromYPtr]
+	mov eax, [fromUPtr]
+	mov ebx, [fromVPtr]
+	mov edi, [toPtr1]
+	mov ecx, [width1]
+; loop width / 8 times
+	shr ecx,3
+	test ecx,ecx
+	jng ENDLOOP1
+REPEATLOOP1:						; loop over width / 8
+;	push ecx						; preserve loop counter
+; YUV420 Planar inputer
+	movq mm0, [esi]					; fetch 8 y values (8 bit) (direct unaligned sse2 loads might be better)
+	movd mm1, [eax]					; fetch 4 u values
+	movd mm2, [ebx]					; fetch 4 v values
+	
+	movq2dq xmm0, mm0				; copy y to sse register  yyyyyyyy00000000
+	movq2dq xmm1, mm1				; copy u to sse register  uuuu000000000000
+	movq2dq xmm2, mm2				; copy v to sse register  vvvv000000000000
+; extract y
+	pxor xmm7,xmm7					; 00000000000000000000000000000000
+	punpcklbw xmm0,xmm7				; interleave xmm7 into xmm0 y0y0y0y0y0y0y0y0
+; extract u and duplicate so each becomes 0u0u
+	punpcklbw xmm1,xmm7				; interleave xmm7 into xmm1 u0u0u0u000000000
+	punpcklwd xmm1,xmm7				; interleave again u000u000u000u000
+	pshuflw xmm1,xmm1, 0xA0			; copy u values
+	pshufhw xmm1,xmm1, 0xA0			; to get u0u0
+; extract v
+	punpcklbw xmm2,xmm7				; interleave xmm7 into xmm1 v0v0v0v000000000
+	punpcklwd xmm2,xmm7				; interleave again v000v000v000v000
+	pshuflw xmm2,xmm2, 0xA0			; copy v values
+	pshufhw xmm2,xmm2, 0xA0			; to get v0v0
+
+yuv2rgb
+	
+rgba32output
+
+; endloop
+	add edi,32
+	add esi,8
+	add eax,4
+	add ebx,4
+;	pop ecx
+	sub ecx, 1				; apparently sub is better than dec
+	jnz REPEATLOOP1
+ENDLOOP1:
+; Cleanup
+	emms
+	pop ebx
+	pop eax
+	pop ecx
+	pop esi
+	pop edi
+	mov esp, ebp
+	pop ebp
+	ret
+
+SECTION .note.GNU-stack noalloc noexec nowrite progbits