From 51d9b4fe14bec902662f90f9ff7dca5df5622527 Mon Sep 17 00:00:00 2001
From: David McPaul
Date: Tue, 21 Dec 2010 07:25:40 +0000
Subject: [PATCH] correct yuv422 planar conversion. Separate sse, sse2 and ssse3 asm code. Add packed converter

git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@39913 a95241bf-73f2-0310-859d-f6bbb57e9c96
---
 src/add-ons/media/plugins/ffmpeg/Jamfile        |   4 +-
 .../media/plugins/ffmpeg/gfx_conv_mmx.cpp       | 258 +++++++++++----
 .../media/plugins/ffmpeg/gfx_conv_mmx.h         |  12 +-
 src/add-ons/media/plugins/ffmpeg/gfx_util.cpp   |  27 +-
 .../media/plugins/ffmpeg/yuvrgb_sse.nasm        | 268 +++++++++++++++
 .../ffmpeg/{yuvrgb.nasm => yuvrgb_sse2.nasm}    | 201 +----------
 .../media/plugins/ffmpeg/yuvrgb_ssse3.nasm      | 307 ++++++++++++++++++
 7 files changed, 816 insertions(+), 261 deletions(-)
 create mode 100644 src/add-ons/media/plugins/ffmpeg/yuvrgb_sse.nasm
 rename src/add-ons/media/plugins/ffmpeg/{yuvrgb.nasm => yuvrgb_sse2.nasm} (56%)
 create mode 100644 src/add-ons/media/plugins/ffmpeg/yuvrgb_ssse3.nasm

diff --git a/src/add-ons/media/plugins/ffmpeg/Jamfile b/src/add-ons/media/plugins/ffmpeg/Jamfile
index e1632547cf..7fc1bc2eeb 100644
--- a/src/add-ons/media/plugins/ffmpeg/Jamfile
+++ b/src/add-ons/media/plugins/ffmpeg/Jamfile
@@ -27,7 +27,9 @@ Addon ffmpeg :
 	gfx_conv_c_lookup.cpp
 	gfx_conv_mmx.cpp
 	gfx_util.cpp
-	yuvrgb.nasm
+	yuvrgb_sse.nasm
+	yuvrgb_sse2.nasm
+	yuvrgb_ssse3.nasm
 	:
 	libavformat.a
 	libavcodec.a

diff --git a/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.cpp b/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.cpp
index c5b984ba1a..fe38498401 100644
--- a/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.cpp
+++ b/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.cpp
@@ -1,22 +1,63 @@
 #include "gfx_conv_mmx.h"
 #include "gfx_conv_c.h"
 
-
-extern "C" void _Convert_YUV420P_RGBA32_SSE2(void *fromYPtr, void *fromUPtr,
-	void *fromVPtr, void *toPtr, int width);
-extern "C" void _Convert_YUV422_RGBA32_SSE2(void *fromYPtr, void *toPtr,
-	int width);
-extern "C" void _Convert_YUV420P_RGBA32_SSE(void *fromYPtr, void *fromUPtr,
-	void *fromVPtr, void *toPtr, int width);
+// Packed
 extern "C" void _Convert_YUV422_RGBA32_SSE(void *fromYPtr, void *toPtr,
 	int width);
+extern "C" void _Convert_YUV422_RGBA32_SSE2(void *fromYPtr, void *toPtr,
+	int width);
+extern "C" void _Convert_YUV422_RGBA32_SSSE3(void *fromYPtr, void *toPtr,
+	int width);
+
+// Planar
+extern "C" void _Convert_YUV420P_RGBA32_SSE(void *fromYPtr, void *fromUPtr,
+	void *fromVPtr, void *toPtr, int width);
+extern "C" void _Convert_YUV420P_RGBA32_SSE2(void *fromYPtr, void *fromUPtr,
+	void *fromVPtr, void *toPtr, int width);
+extern "C" void _Convert_YUV420P_RGBA32_SSSE3(void *fromYPtr, void *fromUPtr,
+	void *fromVPtr, void *toPtr, int width);
+
+// Planar YUV420 means 2 Y lines share a UV line
+void
+gfx_conv_yuv420p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height)
+{
+	// in and out buffers must be aligned to 16 bytes,
+	// in should be as ffmpeg allocates it
+	if ((off_t)out->data[0] % 16 != 0) {
+		gfx_conv_YCbCr420p_RGB32_c(in, out, width, height);
+		return;
+	}
+
+	uint8 *ybase = (uint8 *)in->data[0];
+	uint8 *ubase = (uint8 *)in->data[1];
+	uint8 *vbase = (uint8 *)in->data[2];
+	uint8 *rgbbase = (uint8 *)out->data[0];
+
+	int yBaseInc = in->linesize[0];
+	int uBaseInc = in->linesize[1];
+	int vBaseInc = in->linesize[2];
+	int rgbBaseInc = out->linesize[0];
+
+	for (int i = 0; i < height; i += 2) {
+		// first Y row
+		_Convert_YUV420P_RGBA32_SSE(ybase, ubase, vbase, rgbbase, width);
+		ybase += yBaseInc;
+		rgbbase += rgbBaseInc;
+		// second Y row, sharing the same U and V rows
+		_Convert_YUV420P_RGBA32_SSE(ybase, ubase, vbase, rgbbase, width);
+		ybase += yBaseInc;
+		ubase += uBaseInc;
+		vbase += vBaseInc;
+		rgbbase += rgbBaseInc;
+	}
+}
 
 void
 gfx_conv_yuv420p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
 {
 	// in and out buffers must be aligned to 32 bytes,
 	// in should be as ffmpeg allocates it
 	if ((off_t)out->data[0] % 32 != 0) {
@@ -49,12 +90,157 @@ gfx_conv_yuv420p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
 	}
 }
 
+// Planar YUV420 means 2 Y lines share a UV line
+void
+gfx_conv_yuv420p_rgba32_ssse3(AVFrame *in, AVFrame *out, int width, int height)
+{
+	// in and out buffers must be aligned to 32 bytes,
+	// in should be as ffmpeg allocates it
+	if ((off_t)out->data[0] % 32 != 0) {
+		gfx_conv_YCbCr420p_RGB32_c(in, out, width, height);
+		return;
+	}
+
+	uint8 *ybase = (uint8 *)in->data[0];
+	uint8 *ubase = (uint8 *)in->data[1];
+	uint8 *vbase = (uint8 *)in->data[2];
+	uint8 *rgbbase = (uint8 *)out->data[0];
+
+	int yBaseInc = in->linesize[0];
+	int uBaseInc = in->linesize[1];
+	int vBaseInc = in->linesize[2];
+	int rgbBaseInc = out->linesize[0];
+
+	for (int i = 0; i < height; i += 2) {
+		// first Y row
+		_Convert_YUV420P_RGBA32_SSSE3(ybase, ubase, vbase, rgbbase, width);
+		ybase += yBaseInc;
+		rgbbase += rgbBaseInc;
+		// second Y row, sharing the same U and V rows
+		_Convert_YUV420P_RGBA32_SSSE3(ybase, ubase, vbase, rgbbase, width);
+		ybase += yBaseInc;
+		ubase += uBaseInc;
+		vbase += vBaseInc;
+		rgbbase += rgbBaseInc;
+	}
+}
+
+// Planar YUV422 means each Y line has a UV line
+void
+gfx_conv_yuv422p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height)
+{
+	// in and out buffers must be aligned to 32 bytes,
+	// in should be as ffmpeg allocates it
+	if ((off_t)out->data[0] % 32 != 0) {
+		gfx_conv_YCbCr422_RGB32_c(in, out, width, height);
+		return;
+	}
+
+	uint8 *ybase = (uint8 *)in->data[0];
+	uint8 *ubase = (uint8 *)in->data[1];
+	uint8 *vbase = (uint8 *)in->data[2];
+	uint8 *rgbbase = (uint8 *)out->data[0];
+
+	int yBaseInc = in->linesize[0];
+	int uBaseInc = in->linesize[1];
+	int vBaseInc = in->linesize[2];
+	int rgbBaseInc = out->linesize[0];
+
+	for (int i = 0; i < height; i++) {
+		_Convert_YUV420P_RGBA32_SSE(ybase, ubase, vbase, rgbbase, width);
+		ybase += yBaseInc;
+		ubase += uBaseInc;
+		vbase += vBaseInc;
+		rgbbase += rgbBaseInc;
+	}
+}
+
+// Planar YUV422 means each Y line has a UV line
+void
+gfx_conv_yuv422p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
+{
+	// in and out buffers must be aligned to 32 bytes,
+	// in should be as ffmpeg allocates it
+	if ((off_t)out->data[0] % 32 != 0) {
+		gfx_conv_YCbCr422_RGB32_c(in, out, width, height);
+		return;
+	}
+
+	uint8 *ybase = (uint8 *)in->data[0];
+	uint8 *ubase = (uint8 *)in->data[1];
+	uint8 *vbase = (uint8 *)in->data[2];
+	uint8 *rgbbase = (uint8 *)out->data[0];
+
+	int yBaseInc = in->linesize[0];
+	int uBaseInc = in->linesize[1];
+	int vBaseInc = in->linesize[2];
+	int rgbBaseInc = out->linesize[0];
+
+	for (int i = 0; i < height; i++) {
+		_Convert_YUV420P_RGBA32_SSE2(ybase, ubase, vbase, rgbbase, width);
+		ybase += yBaseInc;
+		ubase += uBaseInc;
+		vbase += vBaseInc;
+		rgbbase += rgbBaseInc;
+	}
+}
+
+// Planar YUV422 means each Y line has a UV line
+void
+gfx_conv_yuv422p_rgba32_ssse3(AVFrame *in, AVFrame *out, int width, int height)
+{
+	// in and out buffers must be aligned to 32 bytes,
+	// in should be as ffmpeg allocates it
+	if ((off_t)out->data[0] % 32 != 0) {
+		gfx_conv_YCbCr422_RGB32_c(in, out, width, height);
+		return;
+	}
+
+	uint8 *ybase = (uint8 *)in->data[0];
+	uint8 *ubase = (uint8 *)in->data[1];
+	uint8 *vbase = (uint8 *)in->data[2];
+	uint8 *rgbbase = (uint8 *)out->data[0];
+
+	int yBaseInc = in->linesize[0];
+	int uBaseInc = in->linesize[1];
+	int vBaseInc = in->linesize[2];
+	int rgbBaseInc = out->linesize[0];
+
+	for (int i = 0; i < height; i++) {
+		_Convert_YUV420P_RGBA32_SSSE3(ybase, ubase, vbase, rgbbase, width);
+		ybase += yBaseInc;
+		ubase += uBaseInc;
+		vbase += vBaseInc;
+		rgbbase += rgbBaseInc;
+	}
+}
+
+// Packed YUV422 (YUYV)
+void
+gfx_conv_yuv422_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height)
+{
+	// in and out buffers must be aligned to 16 bytes,
+	// in should be as ffmpeg allocates it
+	if ((off_t)out->data[0] % 16 != 0) {
+		gfx_conv_YCbCr422_RGB32_c(in, out, width, height);
+		return;
+	}
+
+	uint8 *ybase = (uint8 *)in->data[0];
+	uint8 *rgbbase = (uint8 *)out->data[0];
+
+	for (int i = 0; i <= height; i++) {
+		_Convert_YUV422_RGBA32_SSE(ybase, rgbbase, width);
+		ybase += in->linesize[0];
+		rgbbase += out->linesize[0];
+	}
+}
+
+// Packed YUV422 (YUYV)
+void
+gfx_conv_yuv422_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
+{
 	// in and out buffers must be aligned to 32 bytes,
 	// in should be as ffmpeg allocates it
 	if ((off_t)out->data[0] % 32 != 0) {
@@ -72,53 +258,13 @@ gfx_conv_yuv422p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
 	}
 }
 
-
+// Packed YUV422 (YUYV)
 void
-gfx_conv_yuv420p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height)
+gfx_conv_yuv422_rgba32_ssse3(AVFrame *in, AVFrame *out, int width, int height)
 {
-	// Planar YUV420
-
-	// in and out buffers must be aligned to 16 bytes,
+	// in and out buffers must be aligned to 32 bytes,
 	// in should be as ffmpeg allocates it
-	if ((off_t)out->data[0] % 16 != 0) {
-		gfx_conv_YCbCr420p_RGB32_c(in, out, width, height);
-		return;
-	}
-
-	uint8 *ybase = (uint8 *)in->data[0];
-	uint8 *ubase = (uint8 *)in->data[1];
-	uint8 *vbase = (uint8 *)in->data[2];
-	uint8 *rgbbase = (uint8 *)out->data[0];
-
-	int yBaseInc = in->linesize[0];
-	int uBaseInc = in->linesize[1];
-	int vBaseInc = in->linesize[2];
-	int rgbBaseInc = out->linesize[0];
-
-	for (int i = 0; i < height; i += 2) {
-		_Convert_YUV420P_RGBA32_SSE(ybase, ubase, vbase, rgbbase, width);
-		ybase += yBaseInc;
-		rgbbase += rgbBaseInc;
-		_Convert_YUV420P_RGBA32_SSE(ybase, ubase, vbase, rgbbase, width);
-		ybase += yBaseInc;
-		ubase += uBaseInc;
-		vbase += vBaseInc;
-		rgbbase += rgbBaseInc;
-	}
-}
-
-
-void
-gfx_conv_yuv422p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height)
-{
-	// Packed YUV422 (YUYV)
-
-	// in and out buffers must be aligned to 16 bytes,
-	// in should be as ffmpeg allocates it
-	if ((off_t)out->data[0] % 16 != 0) {
+	if ((off_t)out->data[0] % 32 != 0) {
 		gfx_conv_YCbCr422_RGB32_c(in, out, width, height);
 		return;
 	}
@@ -127,7 +273,7 @@ gfx_conv_yuv422p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height)
 	uint8 *ybase = (uint8 *)in->data[0];
 	uint8 *rgbbase = (uint8 *)out->data[0];
 
 	for (int i = 0; i <= height; i++) {
-		_Convert_YUV422_RGBA32_SSE(ybase, rgbbase, width);
+		_Convert_YUV422_RGBA32_SSSE3(ybase, rgbbase, width);
 		ybase += in->linesize[0];
 		rgbbase += out->linesize[0];
 	}
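[Editorial aside, not part of the patch: the row stepping above is the whole point of the fix. A minimal standalone C++ sketch of the chroma-row arithmetic the two loop shapes implement:]

    // chroma_rows.cpp -- illustrative sketch only.
    // YUV420P has one chroma row per two luma rows, so ubase/vbase above
    // advance every second iteration; YUV422P has one chroma row per luma
    // row, so they advance every iteration.
    #include <cstdio>

    int main()
    {
        const int height = 6;
        for (int row = 0; row < height; row++)
            printf("luma row %d: 420P chroma row %d, 422P chroma row %d\n",
                row, row / 2, row);
        return 0;
    }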
diff --git a/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.h b/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.h
index 3052286600..8fa7bd1960 100644
--- a/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.h
+++ b/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.h
@@ -7,9 +7,17 @@
 
 void gfx_conv_null_mmx(AVFrame *in, AVFrame *out, int width, int height);
 
-void gfx_conv_yuv420p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height);
-void gfx_conv_yuv422p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height);
+// Planar
 void gfx_conv_yuv420p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height);
+void gfx_conv_yuv420p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height);
+void gfx_conv_yuv420p_rgba32_ssse3(AVFrame *in, AVFrame *out, int width, int height);
 void gfx_conv_yuv422p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height);
+void gfx_conv_yuv422p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height);
+void gfx_conv_yuv422p_rgba32_ssse3(AVFrame *in, AVFrame *out, int width, int height);
+
+// Packed
+void gfx_conv_yuv422_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height);
+void gfx_conv_yuv422_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height);
+void gfx_conv_yuv422_rgba32_ssse3(AVFrame *in, AVFrame *out, int width, int height);
 
 #endif

diff --git a/src/add-ons/media/plugins/ffmpeg/gfx_util.cpp b/src/add-ons/media/plugins/ffmpeg/gfx_util.cpp
index 9c1e288e30..9abddce7e5 100644
--- a/src/add-ons/media/plugins/ffmpeg/gfx_util.cpp
+++ b/src/add-ons/media/plugins/ffmpeg/gfx_util.cpp
@@ -29,6 +29,7 @@ resolve_colorspace(color_space colorSpace, PixelFormat pixelFormat, int width,
 
 	switch (colorSpace) {
 		case B_RGB32:
+			// Planar Formats
 			if (pixelFormat == PIX_FMT_YUV410P) {
 				TRACE("resolve_colorspace: gfx_conv_yuv410p_rgb32_c\n");
 				return gfx_conv_yuv410p_rgb32_c;
@@ -57,14 +58,32 @@ resolve_colorspace(color_space colorSpace, PixelFormat pixelFormat, int width,
 
 			if (pixelFormat == PIX_FMT_YUV422P
 				|| pixelFormat == PIX_FMT_YUVJ422P) {
-				if (cpu.HasSSE2() && width % 8 == 0)
+				if (cpu.HasSSE2() && width % 8 == 0) {
+					TRACE("resolve_colorspace: gfx_conv_yuv422p_RGB32_sse2\n");
 					return gfx_conv_yuv422p_rgba32_sse2;
-				else if (cpu.HasSSE1() && width % 4 == 0)
+				} else if (cpu.HasSSE1() && width % 4 == 0) {
+					TRACE("resolve_colorspace: gfx_conv_yuv422p_RGB32_sse\n");
 					return gfx_conv_yuv422p_rgba32_sse;
-				else
+				} else {
+					TRACE("resolve_colorspace: gfx_conv_YCbCr422p_RGB32_c\n");
 					return gfx_conv_YCbCr422_RGB32_c;
+				}
 			}
-
+
+			// Packed Formats
+			if (pixelFormat == PIX_FMT_YUYV422) {
+				if (cpu.HasSSSE3() && width % 8 == 0) {
+					return gfx_conv_yuv422_rgba32_ssse3;
+				} else if (cpu.HasSSE2() && width % 8 == 0) {
+					return gfx_conv_yuv422_rgba32_sse2;
+				} else if (cpu.HasSSE1() && width % 4 == 0
+					&& height % 2 == 0) {
+					return gfx_conv_yuv422_rgba32_sse;
+				} else {
+					return gfx_conv_YCbCr422_RGB32_c;
+				}
+			}
+
 			TRACE("resolve_colorspace: %s => B_RGB32: NULL\n",
 				pixfmt_to_string(pixelFormat));
 			return NULL;
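[Editorial aside, not part of the patch: resolve_colorspace() above follows a common runtime-dispatch pattern -- probe CPU features once, then pick the widest converter whose width/alignment preconditions hold. A condensed, self-contained C++ sketch of the idea; CPUCapabilities here is a stand-in, not Haiku's real class:]

    // dispatch_sketch.cpp -- hypothetical illustration only.
    typedef void (*ConvertFunc)(const void *in, void *out, int width,
        int height);

    struct CPUCapabilities {
        bool HasSSE1() const { return true; }   // stub values for the demo
        bool HasSSE2() const { return true; }
        bool HasSSSE3() const { return false; }
    };

    ConvertFunc
    pick_converter(const CPUCapabilities &cpu, int width, ConvertFunc ssse3,
        ConvertFunc sse2, ConvertFunc sse, ConvertFunc plainC)
    {
        // widest vector path first, falling back to plain C
        if (cpu.HasSSSE3() && width % 8 == 0)
            return ssse3;
        if (cpu.HasSSE2() && width % 8 == 0)
            return sse2;
        if (cpu.HasSSE1() && width % 4 == 0)
            return sse;
        return plainC;
    }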
diff --git a/src/add-ons/media/plugins/ffmpeg/yuvrgb_sse.nasm b/src/add-ons/media/plugins/ffmpeg/yuvrgb_sse.nasm
new file mode 100644
index 0000000000..ff7fc893bd
--- /dev/null
+++ b/src/add-ons/media/plugins/ffmpeg/yuvrgb_sse.nasm
@@ -0,0 +1,268 @@
+;
+; Copyright (C) 2009-2010 David McPaul
+;
+; All rights reserved. Distributed under the terms of the MIT License.
+;
+
+; A rather unoptimised set of sse yuv to rgb converters
+; does 4 pixels per loop
+
+; inputer:
+; reads 64 bits of yuv 8 bit data and puts
+; the y values converted to 16 bit in mm0
+; the u values converted to 16 bit and duplicated into mm1
+; the v values converted to 16 bit and duplicated into mm2
+
+; conversion:
+; does the yuv to rgb conversion using 16 bit fixed point and the
+; results are placed into the following registers as 8 bit clamped values
+; r values in mm3
+; g values in mm4
+; b values in mm5
+
+; outputer:
+; writes out the rgba pixels as 8 bit values with 0 for alpha
+
+; mm6 used for scratch
+; mm7 used for scratch
+
+%macro cglobal 1
+	global _%1
+	%define %1 _%1
+	align 16
+%1:
+%endmacro
+
+; conversion code
+%macro yuv2rgbsse 0
+; u = u - 128
+; v = v - 128
+; r = y + v + v >> 2 + v >> 3 + v >> 5
+; g = y - (u >> 2 + u >> 4 + u >> 5) - (v >> 1 + v >> 3 + v >> 4 + v >> 5)
+; b = y + u + u >> 1 + u >> 2 + u >> 6
+; subtract 16 from y
+	movq mm7, [Const16]		; loads a constant using data cache (slower on first fetch but then cached)
+	psubsw mm0,mm7			; y = y - 16
+; subtract 128 from u and v
+	movq mm7, [Const128]	; loads a constant using data cache (slower on first fetch but then cached)
+	psubsw mm1,mm7			; u = u - 128
+	psubsw mm2,mm7			; v = v - 128
+; load r,g,b with y
+	movq mm3,mm0			; r = y
+	pshufw mm5,mm0, 0xE4	; b = y
+
+; r = r + v + v >> 2 + v >> 3 + v >> 5
+	paddsw mm3, mm2			; add v to r
+	movq mm7, mm1			; move u to scratch
+	pshufw mm6, mm2, 0xE4	; move v to scratch
+
+	psraw mm6,2				; divide v by 4
+	paddsw mm3, mm6			; and add to r
+	psraw mm6,1				; divide v by 2
+	paddsw mm3, mm6			; and add to r
+	psraw mm6,2				; divide v by 4
+	paddsw mm3, mm6			; and add to r
+
+; b = y + u + u >> 1 + u >> 2 + u >> 6
+	paddsw mm5, mm1			; add u to b
+	psraw mm7,1				; divide u by 2
+	paddsw mm5, mm7			; and add to b
+	psraw mm7,1				; divide u by 2
+	paddsw mm5, mm7			; and add to b
+	psraw mm7,4				; divide u by 16 (u/64 in total)
+	paddsw mm5, mm7			; and add to b
+
+; g = y - u >> 2 - u >> 4 - u >> 5 - v >> 1 - v >> 3 - v >> 4 - v >> 5
+	movq mm7,mm2			; move v to scratch
+	pshufw mm6,mm1, 0xE4	; move u to scratch
+	movq mm4,mm0			; g = y
+
+	psraw mm6,2				; divide u by 4
+	psubsw mm4,mm6			; subtract from g
+	psraw mm6,2				; divide u by 4
+	psubsw mm4,mm6			; subtract from g
+	psraw mm6,1				; divide u by 2
+	psubsw mm4,mm6			; subtract from g
+
+	psraw mm7,1				; divide v by 2
+	psubsw mm4,mm7			; subtract from g
+	psraw mm7,2				; divide v by 4
+	psubsw mm4,mm7			; subtract from g
+	psraw mm7,1				; divide v by 2
+	psubsw mm4,mm7			; subtract from g
+	psraw mm7,1				; divide v by 2
+	psubsw mm4,mm7			; subtract from g
+%endmacro
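+
+; The shift sums above approximate the Rec.601 coefficients:
+;   r: v * (1 + 1/4 + 1/8 + 1/32)    = v * 1.40625  (target 1.402)
+;   g: u * (1/4 + 1/16 + 1/32)       = u * 0.34375  (target 0.344)
+;      v * (1/2 + 1/8 + 1/16 + 1/32) = v * 0.71875  (target 0.714)
+;   b: u * (1 + 1/2 + 1/4 + 1/64)    = u * 1.765625 (target 1.772)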
+
+; outputer
+%macro rgba32sseoutput 0
+; clamp values
+	pxor mm7,mm7
+	packuswb mm3,mm7		; clamp to 0,255 and pack R to 8 bit per pixel
+	packuswb mm4,mm7		; clamp to 0,255 and pack G to 8 bit per pixel
+	packuswb mm5,mm7		; clamp to 0,255 and pack B to 8 bit per pixel
+; convert to bgra32 packed
+	punpcklbw mm5,mm4		; bgbgbgbgbgbgbgbg
+	movq mm0, mm5			; save bg values
+	punpcklbw mm3,mm7		; r0r0r0r0
+	punpcklwd mm5,mm3		; lower half bgr0bgr0
+	punpckhwd mm0,mm3		; upper half bgr0bgr0
+; write to output ptr
+	movq [edi], mm5			; output first 2 pixels
+	movq [edi+8], mm0		; output second 2 pixels
+%endmacro
+
+SECTION .data align=16
+
+Const16	dw 16
+	dw 16
+	dw 16
+	dw 16
+	dw 16
+	dw 16
+	dw 16
+	dw 16
+
+Const128	dw 128
+	dw 128
+	dw 128
+	dw 128
+	dw 128
+	dw 128
+	dw 128
+	dw 128
+
+; Packed Convert
+; void Convert_YUV422_RGBA32_SSE(void *fromPtr, void *toPtr, int width)
+width equ ebp+16
+toPtr equ ebp+12
+fromPtr equ ebp+8
+
+; Planar Convert
+; void Convert_YUV420P_RGBA32_SSE(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
+width1 equ ebp+24
+toPtr1 equ ebp+20
+fromVPtr equ ebp+16
+fromUPtr equ ebp+12
+fromYPtr equ ebp+8
+
+SECTION .text align=16
+
+; YUY2 FOURCC
+cglobal Convert_YUV422_RGBA32_SSE
+; reserve variables
+	push ebp
+	mov ebp, esp
+	push edi
+	push esi
+	push ecx
+
+	mov esi, [fromPtr]
+	mov ecx, [width]
+	mov edi, [toPtr]
+; loop width / 4 times
+	shr ecx,2
+	test ecx,ecx
+	jng ENDLOOP2
+REPEATLOOP2:				; loop over width / 4
+; YUV422 packed inputer
+	movq mm0, [esi]			; should have yuyv yuyv
+	pshufw mm1, mm0, 0xE4	; copy to mm1
+	movq mm2, mm0			; copy to mm2
+; extract y
+	pxor mm7,mm7			; 0000000000000000
+	pcmpeqb mm6,mm6			; ffffffffffffffff
+	punpckhbw mm6,mm7		; interleave mm7 into mm6 ff00ff00ff00ff00
+	pand mm0, mm6			; clear all but y values leaving y0y0 etc
+; extract u and duplicate so each u in yuyv becomes 0u0u
+	psrld mm6,8				; 00ff0000 00ff0000
+	pand mm1, mm6			; clear all yv values leaving 0u00 etc
+	psrld mm1,8				; rotate u to get u000
+	pshufw mm1,mm1, 0xA0	; copy u values to get u0u0 (SSE not MMX)
+; extract v
+	pslld mm6,16			; 000000ff000000ff
+	pand mm2, mm6			; clear all yu values leaving 000v etc
+	psrld mm2,8				; rotate v to get 00v0
+	pshufw mm2,mm2, 0xF5	; copy v values to get v0v0 (SSE not MMX)
+
+yuv2rgbsse
+
+rgba32sseoutput
+
+	; endloop
+	add edi,16
+	add esi,8
+	sub ecx, 1				; apparently sub is better than dec
+	jnz REPEATLOOP2
+ENDLOOP2:
+; Cleanup
+	emms					; reset mmx regs back to float
+	pop ecx
+	pop esi
+	pop edi
+	mov esp, ebp
+	pop ebp
+	ret
+
+cglobal Convert_YUV420P_RGBA32_SSE
+; reserve variables
+	push ebp
+	mov ebp, esp
+	push edi
+	push esi
+	push ecx
+	push eax
+	push ebx
+
+	mov esi, [fromYPtr]
+	mov eax, [fromUPtr]
+	mov ebx, [fromVPtr]
+	mov edi, [toPtr1]
+	mov ecx, [width1]
+; loop width / 4 times
+	shr ecx,2
+	test ecx,ecx
+	jng ENDLOOP3
+REPEATLOOP3:				; loop over width / 4
+; YUV420 Planar inputer
+	movq mm0, [esi]			; fetch 4 y values (8 bit) yyyy0000
+	movd mm1, [eax]			; fetch 2 u values (8 bit) uu000000
+	movd mm2, [ebx]			; fetch 2 v values (8 bit) vv000000
+
+; extract y
+	pxor mm7,mm7			; 0000000000000000
+	punpcklbw mm0,mm7		; interleave mm7 into mm0 y0y0y0y0
+; extract u and duplicate so each becomes 0u0u
+	punpcklbw mm1,mm7		; interleave mm7 into mm1 u0u00000
+	punpcklwd mm1,mm7		; interleave again u000u000
+	pshufw mm1,mm1, 0xA0	; copy u values to get u0u0
+; extract v
+	punpcklbw mm2,mm7		; interleave mm7 into mm2 v0v00000
+	punpcklwd mm2,mm7		; interleave again v000v000
+	pshufw mm2,mm2, 0xA0	; copy v values to get v0v0
+
+yuv2rgbsse
+
+rgba32sseoutput
+
+; endloop
+	add edi,16
+	add esi,4
+	add eax,2
+	add ebx,2
+	sub ecx, 1				; apparently sub is better than dec
+	jnz REPEATLOOP3
+ENDLOOP3:
+; Cleanup
+	emms
+	pop ebx
+	pop eax
+	pop ecx
+	pop esi
+	pop edi
+	mov esp, ebp
+	pop ebp
+	ret
+
+SECTION .note.GNU-stack noalloc noexec nowrite progbits
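[Editorial aside, not part of the patch: the "YUV422 packed inputer" above de-interleaves YUYV with masks and shifts. A scalar C++ sketch of the same operation, assuming YUYV byte order as the code's comments do:]

    // yuyv_deinterleave.cpp -- hypothetical scalar equivalent of the MMX
    // register dance: split 4 YUYV pixels (8 bytes) into y values and
    // duplicated u/v values.
    #include <cstdint>

    void
    deinterleave_yuyv4(const uint8_t in[8], int16_t y[4], int16_t u[4],
        int16_t v[4])
    {
        for (int i = 0; i < 2; i++) {       // two YUYV groups of 2 pixels
            y[2 * i]     = in[4 * i];       // Y0 of the pair
            y[2 * i + 1] = in[4 * i + 2];   // Y1 of the pair
            u[2 * i] = u[2 * i + 1] = in[4 * i + 1];  // U shared by the pair
            v[2 * i] = v[2 * i + 1] = in[4 * i + 3];  // V shared by the pair
        }
    }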
diff --git a/src/add-ons/media/plugins/ffmpeg/yuvrgb.nasm b/src/add-ons/media/plugins/ffmpeg/yuvrgb_sse2.nasm
similarity index 56%
rename from src/add-ons/media/plugins/ffmpeg/yuvrgb.nasm
rename to src/add-ons/media/plugins/ffmpeg/yuvrgb_sse2.nasm
index 0cabc75c4a..890855b6ce 100644
--- a/src/add-ons/media/plugins/ffmpeg/yuvrgb.nasm
+++ b/src/add-ons/media/plugins/ffmpeg/yuvrgb_sse2.nasm
@@ -4,11 +4,11 @@
 ; All rights reserved. Distributed under the terms of the MIT License.
 ;
 
-; A rather unoptimised set of yuv to rgb converters
-; does 8 pixels at a time
+; A rather unoptimised set of sse2 yuv to rgb converters
+; does 8 pixels per loop
 
 ; inputer:
-; reads 128bits of yuv 8 bit data and puts
+; reads 128 bits of yuv 8 bit data and puts
 ; the y values converted to 16 bit in xmm0
 ; the u values converted to 16 bit and duplicated into xmm1
 ; the v values converted to 16 bit and duplicated into xmm2
@@ -94,67 +94,6 @@
 	psubsw xmm4,xmm7		; subtract from g
 %endmacro
 
-; conversion code
-%macro yuv2rgbsse 0
-; u = u - 128
-; v = v - 128
-; r = y + v + v >> 2 + v >> 3 + v >> 5
-; g = y - (u >> 2 + u >> 4 + u >> 5) - (v >> 1 + v >> 3 + v >> 4 + v >> 5)
-; b = y + u + u >> 1 + u >> 2 + u >> 6
-; subtract 16 from y
-	movq mm7, [Const16]		; loads a constant using data cache (slower on first fetch but then cached)
-	psubsw mm0,mm7			; y = y - 16
-; subtract 128 from u and v
-	movq mm7, [Const128]	; loads a constant using data cache (slower on first fetch but then cached)
-	psubsw mm1,mm7			; u = u - 128
-	psubsw mm2,mm7			; v = v - 128
-; load r,g,b with y
-	movq mm3,mm0			; r = y
-	pshufw mm5,mm0, 0xE4	; b = y
-
-; r = r + v + v >> 2 + v >> 3 + v >> 5
-	paddsw mm3, mm2			; add v to r
-	movq mm7, mm1			; move u to scratch
-	pshufw mm6, mm2, 0xE4	; move v to scratch
-
-	psraw mm6,2				; divide v by 4
-	paddsw mm3, mm6			; and add to r
-	psraw mm6,1				; divide v by 2
-	paddsw mm3, mm6			; and add to r
-	psraw mm6,2				; divide v by 4
-	paddsw mm3, mm6			; and add to r
-
-; b = y + u + u >> 1 + u >> 2 + u >> 6
-	paddsw mm5, mm1			; add u to b
-	psraw mm7,1				; divide u by 2
-	paddsw mm5, mm7			; and add to b
-	psraw mm7,1				; divide u by 2
-	paddsw mm5, mm7			; and add to b
-	psraw mm7,4				; divide u by 32
-	paddsw mm5, mm7			; and add to b
-
-; g = y - u >> 2 - u >> 4 - u >> 5 - v >> 1 - v >> 3 - v >> 4 - v >> 5
-	movq mm7,mm2			; move v to scratch
-	pshufw mm6,mm1, 0xE4	; move u to scratch
-	movq mm4,mm0			; g = y
-
-	psraw mm6,2				; divide u by 4
-	psubsw mm4,mm6			; subtract from g
-	psraw mm6,2				; divide u by 4
-	psubsw mm4,mm6			; subtract from g
-	psraw mm6,1				; divide u by 2
-	psubsw mm4,mm6			; subtract from g
-
-	psraw mm7,1				; divide v by 2
-	psubsw mm4,mm7			; subtract from g
-	psraw mm7,2				; divide v by 4
-	psubsw mm4,mm7			; subtract from g
-	psraw mm7,1				; divide v by 2
-	psubsw mm4,mm7			; subtract from g
-	psraw mm7,1				; divide v by 2
-	psubsw mm4,mm7			; subtract from g
-%endmacro
-
 ; outputer
 %macro rgba32sse2output 0
 ; clamp values
@@ -173,24 +112,6 @@
 	movntdq [edi+16], xmm0	; output second 4 pixels bypassing cache
 %endmacro
 
-; outputer
-%macro rgba32sseoutput 0
-; clamp values
-	pxor mm7,mm7
-	packuswb mm3,mm7		; clamp to 0,255 and pack R to 8 bit per pixel
-	packuswb mm4,mm7		; clamp to 0,255 and pack G to 8 bit per pixel
-	packuswb mm5,mm7		; clamp to 0,255 and pack B to 8 bit per pixel
-; convert to bgra32 packed
-	punpcklbw mm5,mm4		; bgbgbgbgbgbgbgbg
-	movq mm0, mm5			; save bg values
-	punpcklbw mm3,mm7		; r0r0r0r0
-	punpcklwd mm5,mm3		; lower half bgr0bgr0
-	punpckhwd mm0,mm3		; upper half bgr0bgr0
-; write to output ptr
-	movq [edi], mm5			; output first 2 pixels
-	movq [edi+8], mm0		; output second 2 pixels
-%endmacro
-
 SECTION .data align=16
 
 Const16	dw 16
@@ -342,120 +263,4 @@ ENDLOOP1:
 	pop ebp
 	ret
 
-cglobal Convert_YUV422_RGBA32_SSE
-; reserve variables
-	push ebp
-	mov ebp, esp
-	push edi
-	push esi
-	push ecx
-
-	mov esi, [fromPtr]
-	mov ecx, [width]
-	mov edi, [toPtr]
-; loop width / 4 times
-	shr ecx,2
-	test ecx,ecx
-	jng ENDLOOP2
-REPEATLOOP2:				; loop over width / 4
-
-; YUV422 packed inputer
-	movq mm0, [esi]			; should have yuyv yuyv
-	pshufw mm1, mm0, 0xE4	; copy to mm1
-	movq mm2, mm0			; copy to mm2
-; extract y
-	pxor mm7,mm7			; 0000000000000000
-	pcmpeqb mm6,mm6			; ffffffffffffffff
-	punpckhbw mm6,mm7		; interleave mm7 into mm6 ff00ff00ff00ff00
-	pand mm0, mm6			; clear all but y values leaving y0y0 etc
-; extract u and duplicate so each u in yuyv becomes 0u0u
-	psrld mm6,8				; 00ff0000 00ff0000
-	pand mm1, mm6			; clear all yv values leaving 0u00 etc
-	psrld mm1,8				; rotate u to get u000
-	pshufw mm1,mm1, 0xA0	; copy u values to get u0u0 (SSE not MMX)
-; extract v
-	pslld mm6,16			; 000000ff000000ff
-	pand mm2, mm6			; clear all yu values leaving 000v etc
-	psrld mm2,8				; rotate v to get 00v0
-	pshufw mm2,mm2, 0xF5	; copy v values to get v0v0 (SSE not MMX)
-
-yuv2rgbsse
-
-rgba32sseoutput
-
-	; endloop
-	add edi,16
-	add esi,8
-	sub ecx, 1				; apparently sub is better than dec
-	jnz REPEATLOOP2
-ENDLOOP2:
-; Cleanup
-	emms					; reset mmx regs back to float
-	pop ecx
-	pop esi
-	pop edi
-	mov esp, ebp
-	pop ebp
-	ret
-
-cglobal Convert_YUV420P_RGBA32_SSE
-; reserve variables
-	push ebp
-	mov ebp, esp
-	push edi
-	push esi
-	push ecx
-	push eax
-	push ebx
-
-	mov esi, [fromYPtr]
-	mov eax, [fromUPtr]
-	mov ebx, [fromVPtr]
-	mov edi, [toPtr1]
-	mov ecx, [width1]
-; loop width / 4 times
-	shr ecx,2
-	test ecx,ecx
-	jng ENDLOOP3
-REPEATLOOP3:				; loop over width / 4
-; YUV420 Planar inputer
-	movq mm0, [esi]			; fetch 4 y values (8 bit) yyyy0000
-	movd mm1, [eax]			; fetch 2 u values (8 bit) uu000000
-	movd mm2, [ebx]			; fetch 2 v values (8 bit) vv000000
-
-; extract y
-	pxor mm7,mm7			; 0000000000000000
-	punpcklbw mm0,mm7		; interleave mm7 into mm0 y0y0y0y0
-; extract u and duplicate so each becomes 0u0u
-	punpcklbw mm1,mm7		; interleave mm7 into mm1 u0u00000
-	punpcklwd mm1,mm7		; interleave again u000u000
-	pshufw mm1,mm1, 0xA0	; copy u values to get u0u0
-; extract v
-	punpcklbw mm2,mm7		; interleave mm7 into mm2 v0v00000
-	punpcklwd mm2,mm7		; interleave again v000v000
-	pshufw mm2,mm2, 0xA0	; copy v values to get v0v0
-
-yuv2rgbsse
-
-rgba32sseoutput
-
-; endloop
-	add edi,16
-	add esi,4
-	add eax,2
-	add ebx,2
-	sub ecx, 1				; apparently sub is better than dec
-	jnz REPEATLOOP3
-ENDLOOP3:
-; Cleanup
-	emms
-	pop ebx
-	pop eax
-	pop ecx
-	pop esi
-	pop edi
-	mov esp, ebp
-	pop ebp
-	ret
-
 SECTION .note.GNU-stack noalloc noexec nowrite progbits
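[Editorial aside, not part of the patch: the SSSE3 file that follows replaces the SSE2 mask-and-shift extraction with single pshufb byte shuffles. A small C++ model of the pshufb semantics the masks rely on:]

    // pshufb_model.cpp -- hypothetical illustration only.
    // Each mask byte selects a source byte; a mask byte with the top bit
    // set (0x80) yields zero.
    #include <cstdint>

    void
    pshufb_model(const uint8_t src[16], const uint8_t mask[16],
        uint8_t dst[16])
    {
        for (int i = 0; i < 16; i++)
            dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0F];
    }
    // With UMask = {0x01,0x80,0x01,0x80,0x05,...} below, one instruction
    // pulls each U byte out of packed YUYV, duplicates it, and widens it
    // to 16 bit -- replacing the pand/psrld/pshufw sequence of the SSE2
    // path.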
+; + +; A rather unoptimised set of ssse3 yuv to rgb converters +; does 8 pixels per loop + +; inputer: +; reads 128 bits of yuv 8 bit data and puts +; the y values converted to 16 bit in xmm0 +; the u values converted to 16 bit and duplicated into xmm1 +; the v values converted to 16 bit and duplicated into xmm2 + +; conversion: +; does the yuv to rgb conversion using 16 bit fixed point and the +; results are placed into the following registers as 8 bit clamped values +; r values in xmm3 +; g values in xmm4 +; b values in xmm5 + +; outputer: +; writes out the rgba pixels as 8 bit values with 0 for alpha + +; xmm6 used for scratch +; xmm7 used for scratch + +%macro cglobal 1 + global _%1 + %define %1 _%1 + align 16 +%1: +%endmacro + +; conversion code +%macro yuv2rgbsse2 0 +; u = u - 128 +; v = v - 128 +; r = y + v + v >> 2 + v >> 3 + v >> 5 +; g = y - (u >> 2 + u >> 4 + u >> 5) - (v >> 1 + v >> 3 + v >> 4 + v >> 5) +; b = y + u + u >> 1 + u >> 2 + u >> 6 +; subtract 16 from y + movdqa xmm7, [Const16] ; loads a constant using data cache (slower on first fetch but then cached) + psubsw xmm0,xmm7 ; y = y - 16 +; subtract 128 from u and v + movdqa xmm7, [Const128] ; loads a constant using data cache (slower on first fetch but then cached) + psubsw xmm1,xmm7 ; u = u - 128 + psubsw xmm2,xmm7 ; v = v - 128 +; load r,b with y + movdqa xmm3,xmm0 ; r = y + pshufd xmm5,xmm0, 0xE4 ; b = y + +; r = y + v + v >> 2 + v >> 3 + v >> 5 + paddsw xmm3, xmm2 ; add v to r + movdqa xmm7, xmm1 ; move u to scratch + pshufd xmm6, xmm2, 0xE4 ; move v to scratch + + psraw xmm6,2 ; divide v by 4 + paddsw xmm3, xmm6 ; and add to r + psraw xmm6,1 ; divide v by 2 + paddsw xmm3, xmm6 ; and add to r + psraw xmm6,2 ; divide v by 4 + paddsw xmm3, xmm6 ; and add to r + +; b = y + u + u >> 1 + u >> 2 + u >> 6 + paddsw xmm5, xmm1 ; add u to b + psraw xmm7,1 ; divide u by 2 + paddsw xmm5, xmm7 ; and add to b + psraw xmm7,1 ; divide u by 2 + paddsw xmm5, xmm7 ; and add to b + psraw xmm7,4 ; divide u by 32 + paddsw xmm5, xmm7 ; and add to b + +; g = y - u >> 2 - u >> 4 - u >> 5 - v >> 1 - v >> 3 - v >> 4 - v >> 5 + movdqa xmm7,xmm2 ; move v to scratch + pshufd xmm6,xmm1, 0xE4 ; move u to scratch + movdqa xmm4,xmm0 ; g = y + + psraw xmm6,2 ; divide u by 4 + psubsw xmm4,xmm6 ; subtract from g + psraw xmm6,2 ; divide u by 4 + psubsw xmm4,xmm6 ; subtract from g + psraw xmm6,1 ; divide u by 2 + psubsw xmm4,xmm6 ; subtract from g + + psraw xmm7,1 ; divide v by 2 + psubsw xmm4,xmm7 ; subtract from g + psraw xmm7,2 ; divide v by 4 + psubsw xmm4,xmm7 ; subtract from g + psraw xmm7,1 ; divide v by 2 + psubsw xmm4,xmm7 ; subtract from g + psraw xmm7,1 ; divide v by 2 + psubsw xmm4,xmm7 ; subtract from g +%endmacro + +; outputer +%macro rgba32sse2output 0 +; clamp values + pxor xmm7,xmm7 + packuswb xmm3,xmm7 ; clamp to 0,255 and pack R to 8 bit per pixel + packuswb xmm4,xmm7 ; clamp to 0,255 and pack G to 8 bit per pixel + packuswb xmm5,xmm7 ; clamp to 0,255 and pack B to 8 bit per pixel +; convert to bgra32 packed + punpcklbw xmm5,xmm4 ; bgbgbgbgbgbgbgbg + movdqa xmm0, xmm5 ; save bg values + punpcklbw xmm3,xmm7 ; r0r0r0r0r0r0r0r0 + punpcklwd xmm5,xmm3 ; lower half bgr0bgr0bgr0bgr0 + punpckhwd xmm0,xmm3 ; upper half bgr0bgr0bgr0bgr0 +; write to output ptr + movntdq [edi], xmm5 ; output first 4 pixels bypassing cache + movntdq [edi+16], xmm0 ; output second 4 pixels bypassing cache +%endmacro + +SECTION .data align=16 + +Const16 dw 16 + dw 16 + dw 16 + dw 16 + dw 16 + dw 16 + dw 16 + dw 16 + +Const128 dw 128 + dw 128 + dw 128 + dw 128 + dw 
128 + dw 128 + dw 128 + dw 128 + +UMask db 0x01 + db 0x80 + db 0x01 + db 0x80 + db 0x05 + db 0x80 + db 0x05 + db 0x80 + db 0x09 + db 0x80 + db 0x09 + db 0x80 + db 0x0d + db 0x80 + db 0x0d + db 0x80 + +VMask db 0x03 + db 0x80 + db 0x03 + db 0x80 + db 0x07 + db 0x80 + db 0x07 + db 0x80 + db 0x0b + db 0x80 + db 0x0b + db 0x80 + db 0x0f + db 0x80 + db 0x0f + db 0x80 + +YMask db 0x00 + db 0x80 + db 0x02 + db 0x80 + db 0x04 + db 0x80 + db 0x06 + db 0x80 + db 0x08 + db 0x80 + db 0x0a + db 0x80 + db 0x0c + db 0x80 + db 0x0e + db 0x80 + + +; void Convert_YUV422_RGBA32_SSSE3(void *fromPtr, void *toPtr, int width) +width equ ebp+16 +toPtr equ ebp+12 +fromPtr equ ebp+8 + +; void Convert_YUV420P_RGBA32_SSSE3(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width) +width1 equ ebp+24 +toPtr1 equ ebp+20 +fromVPtr equ ebp+16 +fromUPtr equ ebp+12 +fromYPtr equ ebp+8 + +SECTION .text align=16 + +cglobal Convert_YUV422_RGBA32_SSSE3 +; reserve variables + push ebp + mov ebp, esp + push edi + push esi + push ecx + + mov esi, [fromPtr] + mov edi, [toPtr] + mov ecx, [width] +; loop width / 8 times + shr ecx,3 + test ecx,ecx + jng ENDLOOP +REPEATLOOP: ; loop over width / 8 +; YUV422 packed inputer + movdqa xmm0, [esi] ; should have yuyv yuyv yuyv yuyv + pshufd xmm1, xmm0, 0xE4 ; copy to xmm1 + movdqa xmm2, xmm0 ; copy to xmm2 +; extract both y giving y0y0 + pshufb xmm0, [YMask] +; extract u and duplicate so each u in yuyv becomes u0u0 + pshufb xmm1, [UMask] +; extract v and duplicate so each v in yuyv becomes v0v0 + pshufb xmm2, [VMask] + +yuv2rgbsse2 + +rgba32sse2output + +; endloop + add edi,32 + add esi,16 + sub ecx, 1 ; apparently sub is better than dec + jnz REPEATLOOP +ENDLOOP: +; Cleanup + pop ecx + pop esi + pop edi + mov esp, ebp + pop ebp + ret + +cglobal Convert_YUV420P_RGBA32_SSSE3 +; reserve variables + push ebp + mov ebp, esp + push edi + push esi + push ecx + push eax + push ebx + + mov esi, [fromYPtr] + mov eax, [fromUPtr] + mov ebx, [fromVPtr] + mov edi, [toPtr1] + mov ecx, [width1] +; loop width / 8 times + shr ecx,3 + test ecx,ecx + jng ENDLOOP1 +REPEATLOOP1: ; loop over width / 8 +; YUV420 Planar inputer + movq xmm0, [esi] ; fetch 8 y values (8 bit) yyyyyyyy00000000 + movd xmm1, [eax] ; fetch 4 u values (8 bit) uuuu000000000000 + movd xmm2, [ebx] ; fetch 4 v values (8 bit) vvvv000000000000 + +; extract y + pxor xmm7,xmm7 ; 00000000000000000000000000000000 + punpcklbw xmm0,xmm7 ; interleave xmm7 into xmm0 y0y0y0y0y0y0y0y0 +; extract u and duplicate so each becomes 0u0u + punpcklbw xmm1,xmm7 ; interleave xmm7 into xmm1 u0u0u0u000000000 + punpcklwd xmm1,xmm7 ; interleave again u000u000u000u000 + pshuflw xmm1,xmm1, 0xA0 ; copy u values + pshufhw xmm1,xmm1, 0xA0 ; to get u0u0 +; extract v + punpcklbw xmm2,xmm7 ; interleave xmm7 into xmm1 v0v0v0v000000000 + punpcklwd xmm2,xmm7 ; interleave again v000v000v000v000 + pshuflw xmm2,xmm2, 0xA0 ; copy v values + pshufhw xmm2,xmm2, 0xA0 ; to get v0v0 + +yuv2rgbsse2 + +rgba32sse2output + +; endloop + add edi,32 + add esi,8 + add eax,4 + add ebx,4 + sub ecx, 1 ; apparently sub is better than dec + jnz REPEATLOOP1 +ENDLOOP1: +; Cleanup + pop ebx + pop eax + pop ecx + pop esi + pop edi + mov esp, ebp + pop ebp + ret + +SECTION .note.GNU-stack noalloc noexec nowrite progbits
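
[Editorial aside, not part of the patch: anyone validating these SIMD paths can compare them against a scalar reference that uses the same fixed-point shifts. A self-contained C++ sketch; names are hypothetical:]

    // reference_yuv2rgb.cpp -- hypothetical scalar reference mirroring the
    // shift-based approximation in the asm above, handy for checking SIMD
    // output pixel-for-pixel.
    #include <cstdint>

    static inline uint8_t
    clamp255(int v)
    {
        return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    void
    yuv_to_bgra(int y, int u, int v, uint8_t bgra[4])
    {
        // relies on >> being an arithmetic shift for negative ints,
        // matching the asm's psraw
        y -= 16;
        u -= 128;
        v -= 128;
        int r = y + v + (v >> 2) + (v >> 3) + (v >> 5);
        int g = y - ((u >> 2) + (u >> 4) + (u >> 5))
                  - ((v >> 1) + (v >> 3) + (v >> 4) + (v >> 5));
        int b = y + u + (u >> 1) + (u >> 2) + (u >> 6);
        bgra[0] = clamp255(b);  // byte order matches the outputers: b,g,r,0
        bgra[1] = clamp255(g);
        bgra[2] = clamp255(r);
        bgra[3] = 0;            // alpha written as 0, as the asm does
    }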