From 6608bd7c78bd1fec5a856e9dcc50d62375c54c93 Mon Sep 17 00:00:00 2001
From: David McPaul <dlmcpaul@gmail.com>
Date: Sun, 15 Mar 2009 01:34:21 +0000
Subject: [PATCH] sync with ffmpeg 0.5 release

git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@29534 a95241bf-73f2-0310-859d-f6bbb57e9c96
---
 .../media/plugins/avcodec/libswscale/Jamfile  |    3 +-
 .../plugins/avcodec/libswscale/cs_test.c      |  175 --
 .../plugins/avcodec/libswscale/rgb2rgb.c      |  206 +-
 .../plugins/avcodec/libswscale/rgb2rgb.h      |   69 +-
 .../avcodec/libswscale/rgb2rgb_template.c     |  488 ++--
 .../avcodec/libswscale/swscale-example.c      |  229 --
 .../plugins/avcodec/libswscale/swscale.c      | 1230 ++++++----
 .../plugins/avcodec/libswscale/swscale.h      |  130 +-
 .../libswscale/swscale_altivec_template.c     |   10 +-
 .../avcodec/libswscale/swscale_avoption.c     |    5 +-
 .../plugins/avcodec/libswscale/swscale_bfin.c |   15 +-
 .../avcodec/libswscale/swscale_internal.h     |   75 +-
 .../avcodec/libswscale/swscale_template.c     | 2078 ++++++++---------
 .../plugins/avcodec/libswscale/yuv2rgb.c      | 1099 ++++-----
 .../avcodec/libswscale/yuv2rgb_altivec.c      |   43 +-
 .../plugins/avcodec/libswscale/yuv2rgb_bfin.c |   21 +-
 .../plugins/avcodec/libswscale/yuv2rgb_mlib.c |    2 +-
 .../avcodec/libswscale/yuv2rgb_template.c     |  343 +--
 .../plugins/avcodec/libswscale/yuv2rgb_vis.c  |   12 +-
 19 files changed, 2804 insertions(+), 3429 deletions(-)
 delete mode 100644 src/add-ons/media/plugins/avcodec/libswscale/cs_test.c
 delete mode 100644 src/add-ons/media/plugins/avcodec/libswscale/swscale-example.c

diff --git a/src/add-ons/media/plugins/avcodec/libswscale/Jamfile b/src/add-ons/media/plugins/avcodec/libswscale/Jamfile
index 10267c0230..d2cea3e719 100644
--- a/src/add-ons/media/plugins/avcodec/libswscale/Jamfile
+++ b/src/add-ons/media/plugins/avcodec/libswscale/Jamfile
@@ -1,5 +1,6 @@
 SubDir HAIKU_TOP src add-ons media plugins avcodec libswscale ;
 
+SubDirHdrs [ FDirName $(SUBDIR) .. ] ;
 SubDirHdrs [ FDirName $(SUBDIR) ../libavutil ] ;
 SubDirHdrs [ FDirName $(SUBDIR) ../libavcodec ] ;
 
@@ -8,7 +9,7 @@ TARGET_WARNING_CCFLAGS = [ FFilter $(TARGET_WARNING_CCFLAGS)
 	: -Wall -Wmissing-prototypes -Wsign-compare -Wpointer-arith ] ;
 
 SubDirCcFlags -fomit-frame-pointer -DPIC ;
-SubDirCcFlags -DHAVE_AV_CONFIG_H=1 ;
+#SubDirCcFlags -DHAVE_AV_CONFIG_H=1 ;
 
 StaticLibrary libswscale.a :
 	rgb2rgb.c
diff --git a/src/add-ons/media/plugins/avcodec/libswscale/cs_test.c b/src/add-ons/media/plugins/avcodec/libswscale/cs_test.c
deleted file mode 100644
index d49a60582f..0000000000
--- a/src/add-ons/media/plugins/avcodec/libswscale/cs_test.c
+++ /dev/null
@@ -1,175 +0,0 @@
-/*
- * Copyright (C) 2002 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdio.h>
-#include <string.h>              /* for memset() */
-#include <unistd.h>
-#include <stdlib.h>
-#include <inttypes.h>
-
-#include "swscale.h"
-#include "rgb2rgb.h"
-
-#define SIZE 1000
-#define srcByte 0x55
-#define dstByte 0xBB
-
-#define FUNC(s,d,n) {s,d,#n,n}
-
-static int cpu_caps;
-
-static char *args_parse(int argc, char *argv[])
-{
-    int o;
-
-    while ((o = getopt(argc, argv, "m23")) != -1) {
-        switch (o) {
-            case 'm':
-                cpu_caps |= SWS_CPU_CAPS_MMX;
-                break;
-            case '2':
-                cpu_caps |= SWS_CPU_CAPS_MMX2;
-                break;
-            case '3':
-                cpu_caps |= SWS_CPU_CAPS_3DNOW;
-                break;
-            default:
-                av_log(NULL, AV_LOG_ERROR, "Unknown option %c\n", o);
-        }
-    }
-
-    return argv[optind];
-}
-
-int main(int argc, char **argv)
-{
-    int i, funcNum;
-    uint8_t *srcBuffer= (uint8_t*)av_malloc(SIZE);
-    uint8_t *dstBuffer= (uint8_t*)av_malloc(SIZE);
-    int failedNum=0;
-    int passedNum=0;
-
-    av_log(NULL, AV_LOG_INFO, "memory corruption test ...\n");
-    args_parse(argc, argv);
-    av_log(NULL, AV_LOG_INFO, "CPU capabilities forced to %x\n", cpu_caps);
-    sws_rgb2rgb_init(cpu_caps);
-
-    for(funcNum=0; ; funcNum++){
-        struct func_info_s {
-            int src_bpp;
-            int dst_bpp;
-            char *name;
-            void (*func)(const uint8_t *src, uint8_t *dst, long src_size);
-        } func_info[] = {
-            FUNC(2, 2, rgb15to16),
-            FUNC(2, 3, rgb15to24),
-            FUNC(2, 4, rgb15to32),
-            FUNC(2, 3, rgb16to24),
-            FUNC(2, 4, rgb16to32),
-            FUNC(3, 2, rgb24to15),
-            FUNC(3, 2, rgb24to16),
-            FUNC(3, 4, rgb24to32),
-            FUNC(4, 2, rgb32to15),
-            FUNC(4, 2, rgb32to16),
-            FUNC(4, 3, rgb32to24),
-            FUNC(2, 2, rgb16to15),
-            FUNC(2, 2, rgb15tobgr15),
-            FUNC(2, 2, rgb15tobgr16),
-            FUNC(2, 3, rgb15tobgr24),
-            FUNC(2, 4, rgb15tobgr32),
-            FUNC(2, 2, rgb16tobgr15),
-            FUNC(2, 2, rgb16tobgr16),
-            FUNC(2, 3, rgb16tobgr24),
-            FUNC(2, 4, rgb16tobgr32),
-            FUNC(3, 2, rgb24tobgr15),
-            FUNC(3, 2, rgb24tobgr16),
-            FUNC(3, 3, rgb24tobgr24),
-            FUNC(3, 4, rgb24tobgr32),
-            FUNC(4, 2, rgb32tobgr15),
-            FUNC(4, 2, rgb32tobgr16),
-            FUNC(4, 3, rgb32tobgr24),
-            FUNC(4, 4, rgb32tobgr32),
-            FUNC(0, 0, NULL)
-        };
-        int width;
-        int failed=0;
-        int srcBpp=0;
-        int dstBpp=0;
-
-        if (!func_info[funcNum].func) break;
-
-        av_log(NULL, AV_LOG_INFO,".");
-        memset(srcBuffer, srcByte, SIZE);
-
-        for(width=63; width>0; width--){
-            int dstOffset;
-            for(dstOffset=128; dstOffset<196; dstOffset+=4){
-                int srcOffset;
-                memset(dstBuffer, dstByte, SIZE);
-
-                for(srcOffset=128; srcOffset<196; srcOffset+=4){
-                    uint8_t *src= srcBuffer+srcOffset;
-                    uint8_t *dst= dstBuffer+dstOffset;
-                    char *name=NULL;
-
-                    if(failed) break; //don't fill the screen with shit ...
-
-                    srcBpp = func_info[funcNum].src_bpp;
-                    dstBpp = func_info[funcNum].dst_bpp;
-                    name   = func_info[funcNum].name;
-
-                    func_info[funcNum].func(src, dst, width*srcBpp);
-
-                    if(!srcBpp) break;
-
-                    for(i=0; i<SIZE; i++){
-                        if(srcBuffer[i]!=srcByte){
-                            av_log(NULL, AV_LOG_INFO, "src damaged at %d w:%d src:%d dst:%d %s\n",
-                                   i, width, srcOffset, dstOffset, name);
-                            failed=1;
-                            break;
-                        }
-                    }
-                    for(i=0; i<dstOffset; i++){
-                        if(dstBuffer[i]!=dstByte){
-                            av_log(NULL, AV_LOG_INFO, "dst damaged at %d w:%d src:%d dst:%d %s\n",
-                                   i, width, srcOffset, dstOffset, name);
-                            failed=1;
-                            break;
-                        }
-                    }
-                    for(i=dstOffset + width*dstBpp; i<SIZE; i++){
-                        if(dstBuffer[i]!=dstByte){
-                            av_log(NULL, AV_LOG_INFO, "dst damaged at %d w:%d src:%d dst:%d %s\n",
-                                   i, width, srcOffset, dstOffset, name);
-                            failed=1;
-                            break;
-                        }
-                    }
-                }
-            }
-        }
-        if(failed) failedNum++;
-        else if(srcBpp) passedNum++;
-    }
-
-    av_log(NULL, AV_LOG_INFO, "\n%d converters passed, %d converters randomly overwrote memory\n", passedNum, failedNum);
-    return failedNum;
-}
diff --git a/src/add-ons/media/plugins/avcodec/libswscale/rgb2rgb.c b/src/add-ons/media/plugins/avcodec/libswscale/rgb2rgb.c
index e82d5ada38..ad69265c37 100644
--- a/src/add-ons/media/plugins/avcodec/libswscale/rgb2rgb.c
+++ b/src/add-ons/media/plugins/avcodec/libswscale/rgb2rgb.c
@@ -27,32 +27,30 @@
  */
 #include <inttypes.h>
 #include "config.h"
-#include "x86_cpu.h"
-#include "bswap.h"
+#include "libavutil/x86_cpu.h"
+#include "libavutil/bswap.h"
 #include "rgb2rgb.h"
 #include "swscale.h"
 #include "swscale_internal.h"
 
 #define FAST_BGR2YV12 // use 7-bit instead of 15-bit coefficients
 
-void (*rgb24to32)(const uint8_t *src, uint8_t *dst, long src_size);
-void (*rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size);
-void (*rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size);
-void (*rgb32to24)(const uint8_t *src, uint8_t *dst, long src_size);
+void (*rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size);
+void (*rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size);
+void (*rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size);
+void (*rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size);
 void (*rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size);
 void (*rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size);
 void (*rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size);
-void (*rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size);
+void (*rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size);
 void (*rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size);
 void (*rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size);
-void (*rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size);
+void (*rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size);
 void (*rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size);
-//void (*rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size);
 void (*rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size);
-void (*rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size);
-void (*rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size);
+void (*rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size);
+void (*rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size);
 void (*rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size);
-//void (*rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size);
 void (*rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size);
 void (*rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size);
 
@@ -65,6 +63,9 @@ void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc
 void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                       long width, long height,
                       long lumStride, long chromStride, long dstStride);
+void (*yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
+                      long width, long height,
+                      long lumStride, long chromStride, long dstStride);
 void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                    long width, long height,
                    long lumStride, long chromStride, long srcStride);
@@ -87,12 +88,13 @@ void (*yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *sr
                      long srcStride1, long srcStride2,
                      long srcStride3, long dstStride);
 
-#if defined(ARCH_X86) && defined(CONFIG_GPL)
+#if ARCH_X86 && CONFIG_GPL
 DECLARE_ASM_CONST(8, uint64_t, mmx_null)     = 0x0000000000000000ULL;
 DECLARE_ASM_CONST(8, uint64_t, mmx_one)      = 0xFFFFFFFFFFFFFFFFULL;
 DECLARE_ASM_CONST(8, uint64_t, mask32b)      = 0x000000FF000000FFULL;
 DECLARE_ASM_CONST(8, uint64_t, mask32g)      = 0x0000FF000000FF00ULL;
 DECLARE_ASM_CONST(8, uint64_t, mask32r)      = 0x00FF000000FF0000ULL;
+DECLARE_ASM_CONST(8, uint64_t, mask32a)      = 0xFF000000FF000000ULL;
 DECLARE_ASM_CONST(8, uint64_t, mask32)       = 0x00FFFFFF00FFFFFFULL;
 DECLARE_ASM_CONST(8, uint64_t, mask3216br)   = 0x00F800F800F800F8ULL;
 DECLARE_ASM_CONST(8, uint64_t, mask3216g)    = 0x0000FC000000FC00ULL;
@@ -121,22 +123,7 @@ DECLARE_ASM_CONST(8, uint64_t, blue_16mask)  = 0x0000001f0000001fULL;
 DECLARE_ASM_CONST(8, uint64_t, red_15mask)   = 0x00007c0000007c00ULL;
 DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL;
 DECLARE_ASM_CONST(8, uint64_t, blue_15mask)  = 0x0000001f0000001fULL;
-
-#if 0
-static volatile uint64_t __attribute__((aligned(8))) b5Dither;
-static volatile uint64_t __attribute__((aligned(8))) g5Dither;
-static volatile uint64_t __attribute__((aligned(8))) g6Dither;
-static volatile uint64_t __attribute__((aligned(8))) r5Dither;
-
-static uint64_t __attribute__((aligned(8))) dither4[2]={
-    0x0103010301030103LL,
-    0x0200020002000200LL,};
-
-static uint64_t __attribute__((aligned(8))) dither8[2]={
-    0x0602060206020602LL,
-    0x0004000400040004LL,};
-#endif
-#endif /* defined(ARCH_X86) */
+#endif /* ARCH_X86 */
 
 #define RGB2YUV_SHIFT 8
 #define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
@@ -153,37 +140,37 @@ static uint64_t __attribute__((aligned(8))) dither8[2]={
 //plain C versions
 #undef HAVE_MMX
 #undef HAVE_MMX2
-#undef HAVE_3DNOW
+#undef HAVE_AMD3DNOW
 #undef HAVE_SSE2
+#define HAVE_MMX 0
+#define HAVE_MMX2 0
+#define HAVE_AMD3DNOW 0
+#define HAVE_SSE2 0
 #define RENAME(a) a ## _C
 #include "rgb2rgb_template.c"
 
-#if defined(ARCH_X86) && defined(CONFIG_GPL)
+#if ARCH_X86 && CONFIG_GPL
 
 //MMX versions
 #undef RENAME
-#define HAVE_MMX
-#undef HAVE_MMX2
-#undef HAVE_3DNOW
-#undef HAVE_SSE2
+#undef HAVE_MMX
+#define HAVE_MMX 1
 #define RENAME(a) a ## _MMX
 #include "rgb2rgb_template.c"
 
 //MMX2 versions
 #undef RENAME
-#define HAVE_MMX
-#define HAVE_MMX2
-#undef HAVE_3DNOW
-#undef HAVE_SSE2
+#undef HAVE_MMX2
+#define HAVE_MMX2 1
 #define RENAME(a) a ## _MMX2
 #include "rgb2rgb_template.c"
 
 //3DNOW versions
 #undef RENAME
-#define HAVE_MMX
 #undef HAVE_MMX2
-#define HAVE_3DNOW
-#undef HAVE_SSE2
+#undef HAVE_AMD3DNOW
+#define HAVE_MMX2 0
+#define HAVE_AMD3DNOW 1
 #define RENAME(a) a ## _3DNOW
 #include "rgb2rgb_template.c"
 
@@ -197,7 +184,7 @@ static uint64_t __attribute__((aligned(8))) dither8[2]={
 */
 
 void sws_rgb2rgb_init(int flags){
-#if (defined(HAVE_MMX2) || defined(HAVE_3DNOW) || defined(HAVE_MMX))  && defined(CONFIG_GPL)
+#if (HAVE_MMX2 || HAVE_AMD3DNOW || HAVE_MMX)  && CONFIG_GPL
     if (flags & SWS_CPU_CAPS_MMX2)
         rgb2rgb_init_MMX2();
     else if (flags & SWS_CPU_CAPS_3DNOW)
@@ -205,89 +192,28 @@ void sws_rgb2rgb_init(int flags){
     else if (flags & SWS_CPU_CAPS_MMX)
         rgb2rgb_init_MMX();
     else
-#endif /* defined(HAVE_MMX2) || defined(HAVE_3DNOW) || defined(HAVE_MMX) */
+#endif /* HAVE_MMX2 || HAVE_AMD3DNOW || HAVE_MMX */
         rgb2rgb_init_C();
 }
 
 /**
- * Palette is assumed to contain BGR32.
+ * Convert palette-indexed pixels to the same packed 32-bit format as the palette
  */
-void palette8torgb32(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette)
+void palette8topacked32(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette)
 {
     long i;
 
-/*
     for (i=0; i<num_pixels; i++)
-        ((unsigned *)dst)[i] = ((unsigned *)palette)[src[i]];
-*/
-
-    for (i=0; i<num_pixels; i++)
-    {
-        #ifdef WORDS_BIGENDIAN
-            dst[3]= palette[src[i]*4+2];
-            dst[2]= palette[src[i]*4+1];
-            dst[1]= palette[src[i]*4+0];
-        #else
-        //FIXME slow?
-            dst[0]= palette[src[i]*4+2];
-            dst[1]= palette[src[i]*4+1];
-            dst[2]= palette[src[i]*4+0];
-            //dst[3]= 0; /* do we need this cleansing? */
-        #endif
-        dst+= 4;
-    }
-}
-
-void palette8tobgr32(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette)
-{
-    long i;
-    for (i=0; i<num_pixels; i++)
-    {
-        #ifdef WORDS_BIGENDIAN
-            dst[3]= palette[src[i]*4+0];
-            dst[2]= palette[src[i]*4+1];
-            dst[1]= palette[src[i]*4+2];
-        #else
-            //FIXME slow?
-            dst[0]= palette[src[i]*4+0];
-            dst[1]= palette[src[i]*4+1];
-            dst[2]= palette[src[i]*4+2];
-            //dst[3]= 0; /* do we need this cleansing? */
-        #endif
-
-        dst+= 4;
-    }
+        ((uint32_t *) dst)[i] = ((const uint32_t *) palette)[src[i]];
 }
 
 /**
- * Palette is assumed to contain BGR32.
+ * Palette format: ABCD -> dst format: ABC
  */
-void palette8torgb24(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette)
+void palette8topacked24(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette)
 {
     long i;
-/*
-    Writes 1 byte too much and might cause alignment issues on some architectures?
-    for (i=0; i<num_pixels; i++)
-        ((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[src[i]];
-*/
-    for (i=0; i<num_pixels; i++)
-    {
-        //FIXME slow?
-        dst[0]= palette[src[i]*4+2];
-        dst[1]= palette[src[i]*4+1];
-        dst[2]= palette[src[i]*4+0];
-        dst+= 3;
-    }
-}
 
-void palette8tobgr24(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette)
-{
-    long i;
-/*
-    Writes 1 byte too much and might cause alignment issues on some architectures?
-    for (i=0; i<num_pixels; i++)
-        ((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[src[i]];
-*/
     for (i=0; i<num_pixels; i++)
     {
         //FIXME slow?
@@ -330,7 +256,7 @@ void palette8tobgr15(const uint8_t *src, uint8_t *dst, long num_pixels, const ui
         ((uint16_t *)dst)[i] = bswap_16(((const uint16_t *)palette)[src[i]]);
 }
 
-void rgb32tobgr24(const uint8_t *src, uint8_t *dst, long src_size)
+void rgb32to24(const uint8_t *src, uint8_t *dst, long src_size)
 {
     long i;
     long num_pixels = src_size >> 2;
@@ -349,14 +275,14 @@ void rgb32tobgr24(const uint8_t *src, uint8_t *dst, long src_size)
     }
 }
 
-void rgb24tobgr32(const uint8_t *src, uint8_t *dst, long src_size)
+void rgb24to32(const uint8_t *src, uint8_t *dst, long src_size)
 {
     long i;
     for (i=0; 3*i<src_size; i++)
     {
         #ifdef WORDS_BIGENDIAN
             /* RGB24 (= R,G,B) -> BGR32 (= A,R,G,B) */
-            dst[4*i + 0] = 0;
+            dst[4*i + 0] = 255;
             dst[4*i + 1] = src[3*i + 0];
             dst[4*i + 2] = src[3*i + 1];
             dst[4*i + 3] = src[3*i + 2];
@@ -364,7 +290,7 @@ void rgb24tobgr32(const uint8_t *src, uint8_t *dst, long src_size)
             dst[4*i + 0] = src[3*i + 2];
             dst[4*i + 1] = src[3*i + 1];
             dst[4*i + 2] = src[3*i + 0];
-            dst[4*i + 3] = 0;
+            dst[4*i + 3] = 255;
         #endif
     }
 }
@@ -380,7 +306,7 @@ void rgb16tobgr32(const uint8_t *src, uint8_t *dst, long src_size)
         register uint16_t bgr;
         bgr = *s++;
         #ifdef WORDS_BIGENDIAN
-            *d++ = 0;
+            *d++ = 255;
             *d++ = (bgr&0x1F)<<3;
             *d++ = (bgr&0x7E0)>>3;
             *d++ = (bgr&0xF800)>>8;
@@ -388,12 +314,12 @@ void rgb16tobgr32(const uint8_t *src, uint8_t *dst, long src_size)
             *d++ = (bgr&0xF800)>>8;
             *d++ = (bgr&0x7E0)>>3;
             *d++ = (bgr&0x1F)<<3;
-            *d++ = 0;
+            *d++ = 255;
         #endif
     }
 }
 
-void rgb16tobgr24(const uint8_t *src, uint8_t *dst, long src_size)
+void rgb16to24(const uint8_t *src, uint8_t *dst, long src_size)
 {
     const uint16_t *end;
     uint8_t *d = dst;
@@ -416,13 +342,8 @@ void rgb16tobgr16(const uint8_t *src, uint8_t *dst, long src_size)
 
     for (i=0; i<num_pixels; i++)
     {
-        unsigned b,g,r;
-        register uint16_t rgb;
-        rgb = src[2*i];
-        r = rgb&0x1F;
-        g = (rgb&0x7E0)>>5;
-        b = (rgb&0xF800)>>11;
-        dst[2*i] = (b&0x1F) | ((g&0x3F)<<5) | ((r&0x1F)<<11);
+        unsigned rgb = ((const uint16_t*)src)[i];
+        ((uint16_t*)dst)[i] = (rgb>>11) | (rgb&0x7E0) | (rgb<<11);
     }
 }
 
@@ -433,13 +354,8 @@ void rgb16tobgr15(const uint8_t *src, uint8_t *dst, long src_size)
 
     for (i=0; i<num_pixels; i++)
     {
-        unsigned b,g,r;
-        register uint16_t rgb;
-        rgb = src[2*i];
-        r = rgb&0x1F;
-        g = (rgb&0x7E0)>>5;
-        b = (rgb&0xF800)>>11;
-        dst[2*i] = (b&0x1F) | ((g&0x1F)<<5) | ((r&0x1F)<<10);
+        unsigned rgb = ((const uint16_t*)src)[i];
+        ((uint16_t*)dst)[i] = (rgb>>11) | ((rgb&0x7C0)>>1) | ((rgb&0x1F)<<10);
     }
 }
 
@@ -454,7 +370,7 @@ void rgb15tobgr32(const uint8_t *src, uint8_t *dst, long src_size)
         register uint16_t bgr;
         bgr = *s++;
         #ifdef WORDS_BIGENDIAN
-            *d++ = 0;
+            *d++ = 255;
             *d++ = (bgr&0x1F)<<3;
             *d++ = (bgr&0x3E0)>>2;
             *d++ = (bgr&0x7C00)>>7;
@@ -462,12 +378,12 @@ void rgb15tobgr32(const uint8_t *src, uint8_t *dst, long src_size)
             *d++ = (bgr&0x7C00)>>7;
             *d++ = (bgr&0x3E0)>>2;
             *d++ = (bgr&0x1F)<<3;
-            *d++ = 0;
+            *d++ = 255;
         #endif
     }
 }
 
-void rgb15tobgr24(const uint8_t *src, uint8_t *dst, long src_size)
+void rgb15to24(const uint8_t *src, uint8_t *dst, long src_size)
 {
     const uint16_t *end;
     uint8_t *d = dst;
@@ -490,13 +406,8 @@ void rgb15tobgr16(const uint8_t *src, uint8_t *dst, long src_size)
 
     for (i=0; i<num_pixels; i++)
     {
-        unsigned b,g,r;
-        register uint16_t rgb;
-        rgb = src[2*i];
-        r = rgb&0x1F;
-        g = (rgb&0x3E0)>>5;
-        b = (rgb&0x7C00)>>10;
-        dst[2*i] = (b&0x1F) | ((g&0x3F)<<5) | ((r&0x1F)<<11);
+        unsigned rgb = ((const uint16_t*)src)[i];
+        ((uint16_t*)dst)[i] = ((rgb&0x7C00)>>10) | ((rgb&0x3E0)<<1) | (rgb<<11);
     }
 }
 
@@ -507,17 +418,14 @@ void rgb15tobgr15(const uint8_t *src, uint8_t *dst, long src_size)
 
     for (i=0; i<num_pixels; i++)
     {
-        unsigned b,g,r;
-        register uint16_t rgb;
-        rgb = src[2*i];
-        r = rgb&0x1F;
-        g = (rgb&0x3E0)>>5;
-        b = (rgb&0x7C00)>>10;
-        dst[2*i] = (b&0x1F) | ((g&0x1F)<<5) | ((r&0x1F)<<10);
+        unsigned br;
+        unsigned rgb = ((const uint16_t*)src)[i];
+        br = rgb&0x7c1F;
+        ((uint16_t*)dst)[i] = (br>>10) | (rgb&0x3E0) | (br<<10);
     }
 }
 
-void rgb8tobgr8(const uint8_t *src, uint8_t *dst, long src_size)
+void bgr8torgb8(const uint8_t *src, uint8_t *dst, long src_size)
 {
     long i;
     long num_pixels = src_size;
diff --git a/src/add-ons/media/plugins/avcodec/libswscale/rgb2rgb.h b/src/add-ons/media/plugins/avcodec/libswscale/rgb2rgb.h
index f2697c65d6..df912c8533 100644
--- a/src/add-ons/media/plugins/avcodec/libswscale/rgb2rgb.h
+++ b/src/add-ons/media/plugins/avcodec/libswscale/rgb2rgb.h
@@ -23,58 +23,56 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef FFMPEG_RGB2RGB_H
-#define FFMPEG_RGB2RGB_H
+#ifndef SWSCALE_RGB2RGB_H
+#define SWSCALE_RGB2RGB_H
 
 #include <inttypes.h>
 
 /* A full collection of RGB to RGB(BGR) converters */
-extern void (*rgb24to32)   (const uint8_t *src, uint8_t *dst, long src_size);
-extern void (*rgb24to16)   (const uint8_t *src, uint8_t *dst, long src_size);
-extern void (*rgb24to15)   (const uint8_t *src, uint8_t *dst, long src_size);
-extern void (*rgb32to24)   (const uint8_t *src, uint8_t *dst, long src_size);
+extern void (*rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size);
+extern void (*rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size);
+extern void (*rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size);
+extern void (*rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size);
 extern void (*rgb32to16)   (const uint8_t *src, uint8_t *dst, long src_size);
 extern void (*rgb32to15)   (const uint8_t *src, uint8_t *dst, long src_size);
 extern void (*rgb15to16)   (const uint8_t *src, uint8_t *dst, long src_size);
-extern void (*rgb15to24)   (const uint8_t *src, uint8_t *dst, long src_size);
+extern void (*rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size);
 extern void (*rgb15to32)   (const uint8_t *src, uint8_t *dst, long src_size);
 extern void (*rgb16to15)   (const uint8_t *src, uint8_t *dst, long src_size);
-extern void (*rgb16to24)   (const uint8_t *src, uint8_t *dst, long src_size);
+extern void (*rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size);
 extern void (*rgb16to32)   (const uint8_t *src, uint8_t *dst, long src_size);
 extern void (*rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size);
-extern void (*rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size);
-extern void (*rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size);
+extern void (*rgb24to16)   (const uint8_t *src, uint8_t *dst, long src_size);
+extern void (*rgb24to15)   (const uint8_t *src, uint8_t *dst, long src_size);
 extern void (*rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size);
 extern void (*rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size);
 extern void (*rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size);
 
-extern void rgb24tobgr32(const uint8_t *src, uint8_t *dst, long src_size);
-extern void rgb32tobgr24(const uint8_t *src, uint8_t *dst, long src_size);
-extern void rgb16tobgr32(const uint8_t *src, uint8_t *dst, long src_size);
-extern void rgb16tobgr24(const uint8_t *src, uint8_t *dst, long src_size);
-extern void rgb16tobgr16(const uint8_t *src, uint8_t *dst, long src_size);
-extern void rgb16tobgr15(const uint8_t *src, uint8_t *dst, long src_size);
-extern void rgb15tobgr32(const uint8_t *src, uint8_t *dst, long src_size);
-extern void rgb15tobgr24(const uint8_t *src, uint8_t *dst, long src_size);
-extern void rgb15tobgr16(const uint8_t *src, uint8_t *dst, long src_size);
-extern void rgb15tobgr15(const uint8_t *src, uint8_t *dst, long src_size);
-extern void rgb8tobgr8  (const uint8_t *src, uint8_t *dst, long src_size);
+void rgb24to32   (const uint8_t *src, uint8_t *dst, long src_size);
+void rgb32to24   (const uint8_t *src, uint8_t *dst, long src_size);
+void rgb16tobgr32(const uint8_t *src, uint8_t *dst, long src_size);
+void rgb16to24   (const uint8_t *src, uint8_t *dst, long src_size);
+void rgb16tobgr16(const uint8_t *src, uint8_t *dst, long src_size);
+void rgb16tobgr15(const uint8_t *src, uint8_t *dst, long src_size);
+void rgb15tobgr32(const uint8_t *src, uint8_t *dst, long src_size);
+void rgb15to24   (const uint8_t *src, uint8_t *dst, long src_size);
+void rgb15tobgr16(const uint8_t *src, uint8_t *dst, long src_size);
+void rgb15tobgr15(const uint8_t *src, uint8_t *dst, long src_size);
+void bgr8torgb8  (const uint8_t *src, uint8_t *dst, long src_size);
 
 
-extern void palette8torgb32(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
-extern void palette8tobgr32(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
-extern void palette8torgb24(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
-extern void palette8tobgr24(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
-extern void palette8torgb16(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
-extern void palette8tobgr16(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
-extern void palette8torgb15(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
-extern void palette8tobgr15(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
+void palette8topacked32(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
+void palette8topacked24(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
+void palette8torgb16(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
+void palette8tobgr16(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
+void palette8torgb15(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
+void palette8tobgr15(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
 
 /**
  * Height should be a multiple of 2 and width should be a multiple of 16.
  * (If this is a problem for anyone then tell me, and I will fix it.)
  * Chrominance data is only taken from every second line, others are ignored.
- * FIXME: Write HQ version.
+ * FIXME: Write high quality version.
  */
 //void uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
 
@@ -109,11 +107,18 @@ extern void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_
                           long width, long height,
                           long lumStride, long chromStride, long dstStride);
 
+/**
+ * Width should be a multiple of 16.
+ */
+extern void (*yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
+                             long width, long height,
+                             long lumStride, long chromStride, long dstStride);
+
 /**
  * Height should be a multiple of 2 and width should be a multiple of 2.
  * (If this is a problem for anyone then tell me, and I will fix it.)
  * Chrominance data is only taken from every second line, others are ignored.
- * FIXME: Write HQ version.
+ * FIXME: Write high quality version.
  */
 extern void (*rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                            long width, long height,
@@ -139,4 +144,4 @@ extern void (*yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint
 
 void sws_rgb2rgb_init(int flags);
 
-#endif /* FFMPEG_RGB2RGB_H */
+#endif /* SWSCALE_RGB2RGB_H */
diff --git a/src/add-ons/media/plugins/avcodec/libswscale/rgb2rgb_template.c b/src/add-ons/media/plugins/avcodec/libswscale/rgb2rgb_template.c
index ffbf2c734b..e95b628049 100644
--- a/src/add-ons/media/plugins/avcodec/libswscale/rgb2rgb_template.c
+++ b/src/add-ons/media/plugins/avcodec/libswscale/rgb2rgb_template.c
@@ -28,12 +28,6 @@
  */
 
 #include <stddef.h>
-#include <inttypes.h> /* for __WORDSIZE */
-
-#ifndef __WORDSIZE
-// #warning You have a misconfigured system and will probably lose performance!
-#define __WORDSIZE MP_WORDSIZE
-#endif
 
 #undef PREFETCH
 #undef MOVNTQ
@@ -43,38 +37,33 @@
 #undef PREFETCHW
 #undef PAVGB
 
-#ifdef HAVE_SSE2
+#if HAVE_SSE2
 #define MMREG_SIZE 16
 #else
 #define MMREG_SIZE 8
 #endif
 
-#ifdef HAVE_3DNOW
+#if HAVE_AMD3DNOW
 #define PREFETCH  "prefetch"
 #define PREFETCHW "prefetchw"
 #define PAVGB     "pavgusb"
-#elif defined (HAVE_MMX2)
+#elif HAVE_MMX2
 #define PREFETCH "prefetchnta"
 #define PREFETCHW "prefetcht0"
 #define PAVGB     "pavgb"
 #else
-#ifdef __APPLE__
-#define PREFETCH "#"
-#define PREFETCHW "#"
-#else
 #define PREFETCH  " # nop"
 #define PREFETCHW " # nop"
 #endif
-#endif
 
-#ifdef HAVE_3DNOW
-/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
+#if HAVE_AMD3DNOW
+/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
 #define EMMS     "femms"
 #else
 #define EMMS     "emms"
 #endif
 
-#ifdef HAVE_MMX2
+#if HAVE_MMX2
 #define MOVNTQ "movntq"
 #define SFENCE "sfence"
 #else
@@ -82,22 +71,22 @@
 #define SFENCE " # nop"
 #endif
 
-static inline void RENAME(rgb24to32)(const uint8_t *src, uint8_t *dst, long src_size)
+static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
 {
     uint8_t *dest = dst;
     const uint8_t *s = src;
     const uint8_t *end;
-    #ifdef HAVE_MMX
+    #if HAVE_MMX
         const uint8_t *mm_end;
     #endif
     end = s + src_size;
-    #ifdef HAVE_MMX
-        asm volatile(PREFETCH"    %0"::"m"(*s):"memory");
+    #if HAVE_MMX
+        __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
         mm_end = end - 23;
-        asm volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
+        __asm__ volatile("movq        %0, %%mm7"::"m"(mask32a):"memory");
         while (s < mm_end)
         {
-            asm volatile(
+            __asm__ volatile(
             PREFETCH"    32%1           \n\t"
             "movd          %1, %%mm0    \n\t"
             "punpckldq    3%1, %%mm0    \n\t"
@@ -107,10 +96,10 @@ static inline void RENAME(rgb24to32)(const uint8_t *src, uint8_t *dst, long src_
             "punpckldq   15%1, %%mm2    \n\t"
             "movd        18%1, %%mm3    \n\t"
             "punpckldq   21%1, %%mm3    \n\t"
-            "pand       %%mm7, %%mm0    \n\t"
-            "pand       %%mm7, %%mm1    \n\t"
-            "pand       %%mm7, %%mm2    \n\t"
-            "pand       %%mm7, %%mm3    \n\t"
+            "por        %%mm7, %%mm0    \n\t"
+            "por        %%mm7, %%mm1    \n\t"
+            "por        %%mm7, %%mm2    \n\t"
+            "por        %%mm7, %%mm3    \n\t"
             MOVNTQ"     %%mm0,   %0     \n\t"
             MOVNTQ"     %%mm1,  8%0     \n\t"
             MOVNTQ"     %%mm2, 16%0     \n\t"
@@ -121,14 +110,14 @@ static inline void RENAME(rgb24to32)(const uint8_t *src, uint8_t *dst, long src_
             dest += 32;
             s += 24;
         }
-        asm volatile(SFENCE:::"memory");
-        asm volatile(EMMS:::"memory");
+        __asm__ volatile(SFENCE:::"memory");
+        __asm__ volatile(EMMS:::"memory");
     #endif
     while (s < end)
     {
     #ifdef WORDS_BIGENDIAN
         /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
-        *dest++ = 0;
+        *dest++ = 255;
         *dest++ = s[2];
         *dest++ = s[1];
         *dest++ = s[0];
@@ -137,26 +126,26 @@ static inline void RENAME(rgb24to32)(const uint8_t *src, uint8_t *dst, long src_
         *dest++ = *s++;
         *dest++ = *s++;
         *dest++ = *s++;
-        *dest++ = 0;
+        *dest++ = 255;
     #endif
     }
 }
 
-static inline void RENAME(rgb32to24)(const uint8_t *src, uint8_t *dst, long src_size)
+static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
 {
     uint8_t *dest = dst;
     const uint8_t *s = src;
     const uint8_t *end;
-#ifdef HAVE_MMX
+#if HAVE_MMX
     const uint8_t *mm_end;
 #endif
     end = s + src_size;
-#ifdef HAVE_MMX
-    asm volatile(PREFETCH"    %0"::"m"(*s):"memory");
+#if HAVE_MMX
+    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
     mm_end = end - 31;
     while (s < mm_end)
     {
-        asm volatile(
+        __asm__ volatile(
         PREFETCH"    32%1           \n\t"
         "movq          %1, %%mm0    \n\t"
         "movq         8%1, %%mm1    \n\t"
@@ -207,8 +196,8 @@ static inline void RENAME(rgb32to24)(const uint8_t *src, uint8_t *dst, long src_
         dest += 24;
         s += 32;
     }
-    asm volatile(SFENCE:::"memory");
-    asm volatile(EMMS:::"memory");
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
 #endif
     while (s < end)
     {
@@ -241,13 +230,13 @@ static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_
     register const uint8_t *end;
     const uint8_t *mm_end;
     end = s + src_size;
-#ifdef HAVE_MMX
-    asm volatile(PREFETCH"    %0"::"m"(*s));
-    asm volatile("movq        %0, %%mm4"::"m"(mask15s));
+#if HAVE_MMX
+    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
+    __asm__ volatile("movq        %0, %%mm4"::"m"(mask15s));
     mm_end = end - 15;
     while (s<mm_end)
     {
-        asm volatile(
+        __asm__ volatile(
         PREFETCH"  32%1         \n\t"
         "movq        %1, %%mm0  \n\t"
         "movq       8%1, %%mm2  \n\t"
@@ -265,8 +254,8 @@ static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_
         d+=16;
         s+=16;
     }
-    asm volatile(SFENCE:::"memory");
-    asm volatile(EMMS:::"memory");
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
 #endif
     mm_end = end - 3;
     while (s < mm_end)
@@ -290,14 +279,14 @@ static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_
     register const uint8_t *end;
     const uint8_t *mm_end;
     end = s + src_size;
-#ifdef HAVE_MMX
-    asm volatile(PREFETCH"    %0"::"m"(*s));
-    asm volatile("movq        %0, %%mm7"::"m"(mask15rg));
-    asm volatile("movq        %0, %%mm6"::"m"(mask15b));
+#if HAVE_MMX
+    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
+    __asm__ volatile("movq        %0, %%mm7"::"m"(mask15rg));
+    __asm__ volatile("movq        %0, %%mm6"::"m"(mask15b));
     mm_end = end - 15;
     while (s<mm_end)
     {
-        asm volatile(
+        __asm__ volatile(
         PREFETCH"  32%1         \n\t"
         "movq        %1, %%mm0  \n\t"
         "movq       8%1, %%mm2  \n\t"
@@ -319,8 +308,8 @@ static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_
         d+=16;
         s+=16;
     }
-    asm volatile(SFENCE:::"memory");
-    asm volatile(EMMS:::"memory");
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
 #endif
     mm_end = end - 3;
     while (s < mm_end)
@@ -343,15 +332,15 @@ static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_
 {
     const uint8_t *s = src;
     const uint8_t *end;
-#ifdef HAVE_MMX
+#if HAVE_MMX
     const uint8_t *mm_end;
 #endif
     uint16_t *d = (uint16_t *)dst;
     end = s + src_size;
-#ifdef HAVE_MMX
+#if HAVE_MMX
     mm_end = end - 15;
 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
-    asm volatile(
+    __asm__ volatile(
     "movq           %3, %%mm5   \n\t"
     "movq           %4, %%mm6   \n\t"
     "movq           %5, %%mm7   \n\t"
@@ -386,14 +375,14 @@ static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_
     : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
     );
 #else
-    asm volatile(PREFETCH"    %0"::"m"(*src):"memory");
-    asm volatile(
+    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
+    __asm__ volatile(
         "movq    %0, %%mm7    \n\t"
         "movq    %1, %%mm6    \n\t"
         ::"m"(red_16mask),"m"(green_16mask));
     while (s < mm_end)
     {
-        asm volatile(
+        __asm__ volatile(
         PREFETCH"    32%1           \n\t"
         "movd          %1, %%mm0    \n\t"
         "movd         4%1, %%mm3    \n\t"
@@ -427,8 +416,8 @@ static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_
         s += 16;
     }
 #endif
-    asm volatile(SFENCE:::"memory");
-    asm volatile(EMMS:::"memory");
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
 #endif
     while (s < end)
     {
@@ -441,21 +430,21 @@ static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long s
 {
     const uint8_t *s = src;
     const uint8_t *end;
-#ifdef HAVE_MMX
+#if HAVE_MMX
     const uint8_t *mm_end;
 #endif
     uint16_t *d = (uint16_t *)dst;
     end = s + src_size;
-#ifdef HAVE_MMX
-    asm volatile(PREFETCH"    %0"::"m"(*src):"memory");
-    asm volatile(
+#if HAVE_MMX
+    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
+    __asm__ volatile(
         "movq          %0, %%mm7    \n\t"
         "movq          %1, %%mm6    \n\t"
         ::"m"(red_16mask),"m"(green_16mask));
     mm_end = end - 15;
     while (s < mm_end)
     {
-        asm volatile(
+        __asm__ volatile(
         PREFETCH"    32%1           \n\t"
         "movd          %1, %%mm0    \n\t"
         "movd         4%1, %%mm3    \n\t"
@@ -488,8 +477,8 @@ static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long s
         d += 4;
         s += 16;
     }
-    asm volatile(SFENCE:::"memory");
-    asm volatile(EMMS:::"memory");
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
 #endif
     while (s < end)
     {
@@ -502,15 +491,15 @@ static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_
 {
     const uint8_t *s = src;
     const uint8_t *end;
-#ifdef HAVE_MMX
+#if HAVE_MMX
     const uint8_t *mm_end;
 #endif
     uint16_t *d = (uint16_t *)dst;
     end = s + src_size;
-#ifdef HAVE_MMX
+#if HAVE_MMX
     mm_end = end - 15;
 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
-    asm volatile(
+    __asm__ volatile(
     "movq           %3, %%mm5   \n\t"
     "movq           %4, %%mm6   \n\t"
     "movq           %5, %%mm7   \n\t"
@@ -545,14 +534,14 @@ static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_
     : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
     );
 #else
-    asm volatile(PREFETCH"    %0"::"m"(*src):"memory");
-    asm volatile(
+    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
+    __asm__ volatile(
         "movq          %0, %%mm7    \n\t"
         "movq          %1, %%mm6    \n\t"
         ::"m"(red_15mask),"m"(green_15mask));
     while (s < mm_end)
     {
-        asm volatile(
+        __asm__ volatile(
         PREFETCH"    32%1           \n\t"
         "movd          %1, %%mm0    \n\t"
         "movd         4%1, %%mm3    \n\t"
@@ -586,8 +575,8 @@ static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_
         s += 16;
     }
 #endif
-    asm volatile(SFENCE:::"memory");
-    asm volatile(EMMS:::"memory");
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
 #endif
     while (s < end)
     {
@@ -600,21 +589,21 @@ static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long s
 {
     const uint8_t *s = src;
     const uint8_t *end;
-#ifdef HAVE_MMX
+#if HAVE_MMX
     const uint8_t *mm_end;
 #endif
     uint16_t *d = (uint16_t *)dst;
     end = s + src_size;
-#ifdef HAVE_MMX
-    asm volatile(PREFETCH"    %0"::"m"(*src):"memory");
-    asm volatile(
+#if HAVE_MMX
+    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
+    __asm__ volatile(
         "movq          %0, %%mm7    \n\t"
         "movq          %1, %%mm6    \n\t"
         ::"m"(red_15mask),"m"(green_15mask));
     mm_end = end - 15;
     while (s < mm_end)
     {
-        asm volatile(
+        __asm__ volatile(
         PREFETCH"    32%1           \n\t"
         "movd          %1, %%mm0    \n\t"
         "movd         4%1, %%mm3    \n\t"
@@ -647,8 +636,8 @@ static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long s
         d += 4;
         s += 16;
     }
-    asm volatile(SFENCE:::"memory");
-    asm volatile(EMMS:::"memory");
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
 #endif
     while (s < end)
     {
@@ -657,25 +646,25 @@ static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long s
     }
 }
 
-static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
+static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
 {
     const uint8_t *s = src;
     const uint8_t *end;
-#ifdef HAVE_MMX
+#if HAVE_MMX
     const uint8_t *mm_end;
 #endif
     uint16_t *d = (uint16_t *)dst;
     end = s + src_size;
-#ifdef HAVE_MMX
-    asm volatile(PREFETCH"    %0"::"m"(*src):"memory");
-    asm volatile(
+#if HAVE_MMX
+    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
+    __asm__ volatile(
         "movq         %0, %%mm7     \n\t"
         "movq         %1, %%mm6     \n\t"
         ::"m"(red_16mask),"m"(green_16mask));
     mm_end = end - 11;
     while (s < mm_end)
     {
-        asm volatile(
+        __asm__ volatile(
         PREFETCH"    32%1           \n\t"
         "movd          %1, %%mm0    \n\t"
         "movd         3%1, %%mm3    \n\t"
@@ -708,8 +697,8 @@ static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_
         d += 4;
         s += 12;
     }
-    asm volatile(SFENCE:::"memory");
-    asm volatile(EMMS:::"memory");
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
 #endif
     while (s < end)
     {
@@ -720,25 +709,25 @@ static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_
     }
 }
 
-static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
+static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
 {
     const uint8_t *s = src;
     const uint8_t *end;
-#ifdef HAVE_MMX
+#if HAVE_MMX
     const uint8_t *mm_end;
 #endif
     uint16_t *d = (uint16_t *)dst;
     end = s + src_size;
-#ifdef HAVE_MMX
-    asm volatile(PREFETCH"    %0"::"m"(*src):"memory");
-    asm volatile(
+#if HAVE_MMX
+    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
+    __asm__ volatile(
         "movq         %0, %%mm7     \n\t"
         "movq         %1, %%mm6     \n\t"
         ::"m"(red_16mask),"m"(green_16mask));
     mm_end = end - 15;
     while (s < mm_end)
     {
-        asm volatile(
+        __asm__ volatile(
         PREFETCH"    32%1           \n\t"
         "movd          %1, %%mm0    \n\t"
         "movd         3%1, %%mm3    \n\t"
@@ -771,8 +760,8 @@ static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long s
         d += 4;
         s += 12;
     }
-    asm volatile(SFENCE:::"memory");
-    asm volatile(EMMS:::"memory");
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
 #endif
     while (s < end)
     {
@@ -783,25 +772,25 @@ static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long s
     }
 }
 
-static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
+static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
 {
     const uint8_t *s = src;
     const uint8_t *end;
-#ifdef HAVE_MMX
+#if HAVE_MMX
     const uint8_t *mm_end;
 #endif
     uint16_t *d = (uint16_t *)dst;
     end = s + src_size;
-#ifdef HAVE_MMX
-    asm volatile(PREFETCH"    %0"::"m"(*src):"memory");
-    asm volatile(
+#if HAVE_MMX
+    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
+    __asm__ volatile(
         "movq          %0, %%mm7    \n\t"
         "movq          %1, %%mm6    \n\t"
         ::"m"(red_15mask),"m"(green_15mask));
     mm_end = end - 11;
     while (s < mm_end)
     {
-        asm volatile(
+        __asm__ volatile(
         PREFETCH"    32%1           \n\t"
         "movd          %1, %%mm0    \n\t"
         "movd         3%1, %%mm3    \n\t"
@@ -834,8 +823,8 @@ static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_
         d += 4;
         s += 12;
     }
-    asm volatile(SFENCE:::"memory");
-    asm volatile(EMMS:::"memory");
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
 #endif
     while (s < end)
     {
@@ -846,25 +835,25 @@ static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_
     }
 }
 
-static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
+static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
 {
     const uint8_t *s = src;
     const uint8_t *end;
-#ifdef HAVE_MMX
+#if HAVE_MMX
     const uint8_t *mm_end;
 #endif
     uint16_t *d = (uint16_t *)dst;
     end = s + src_size;
-#ifdef HAVE_MMX
-    asm volatile(PREFETCH"    %0"::"m"(*src):"memory");
-    asm volatile(
+#if HAVE_MMX
+    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
+    __asm__ volatile(
         "movq         %0, %%mm7     \n\t"
         "movq         %1, %%mm6     \n\t"
         ::"m"(red_15mask),"m"(green_15mask));
     mm_end = end - 15;
     while (s < mm_end)
     {
-        asm volatile(
+        __asm__ volatile(
         PREFETCH"   32%1            \n\t"
         "movd         %1, %%mm0     \n\t"
         "movd        3%1, %%mm3     \n\t"
@@ -897,8 +886,8 @@ static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long s
         d += 4;
         s += 12;
     }
-    asm volatile(SFENCE:::"memory");
-    asm volatile(EMMS:::"memory");
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
 #endif
     while (s < end)
     {
@@ -930,21 +919,21 @@ static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long s
        |
    original bits
 */
-static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
+static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
 {
     const uint16_t *end;
-#ifdef HAVE_MMX
+#if HAVE_MMX
     const uint16_t *mm_end;
 #endif
     uint8_t *d = dst;
     const uint16_t *s = (const uint16_t*)src;
     end = s + src_size/2;
-#ifdef HAVE_MMX
-    asm volatile(PREFETCH"    %0"::"m"(*s):"memory");
+#if HAVE_MMX
+    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
     mm_end = end - 7;
     while (s < mm_end)
     {
-        asm volatile(
+        __asm__ volatile(
         PREFETCH"    32%1           \n\t"
         "movq          %1, %%mm0    \n\t"
         "movq          %1, %%mm1    \n\t"
@@ -1007,7 +996,7 @@ static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_
         :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
         :"memory");
         /* borrowed 32 to 24 */
-        asm volatile(
+        __asm__ volatile(
         "movq       %%mm0, %%mm4    \n\t"
         "movq       %%mm3, %%mm5    \n\t"
         "movq       %%mm6, %%mm0    \n\t"
@@ -1059,8 +1048,8 @@ static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_
         d += 24;
         s += 8;
     }
-    asm volatile(SFENCE:::"memory");
-    asm volatile(EMMS:::"memory");
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
 #endif
     while (s < end)
     {
@@ -1072,21 +1061,21 @@ static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_
     }
 }
 
-static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
+static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
 {
     const uint16_t *end;
-#ifdef HAVE_MMX
+#if HAVE_MMX
     const uint16_t *mm_end;
 #endif
     uint8_t *d = (uint8_t *)dst;
     const uint16_t *s = (const uint16_t *)src;
     end = s + src_size/2;
-#ifdef HAVE_MMX
-    asm volatile(PREFETCH"    %0"::"m"(*s):"memory");
+#if HAVE_MMX
+    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
     mm_end = end - 7;
     while (s < mm_end)
     {
-        asm volatile(
+        __asm__ volatile(
         PREFETCH"    32%1           \n\t"
         "movq          %1, %%mm0    \n\t"
         "movq          %1, %%mm1    \n\t"
@@ -1148,7 +1137,7 @@ static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_
         :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
         :"memory");
         /* borrowed 32 to 24 */
-        asm volatile(
+        __asm__ volatile(
         "movq       %%mm0, %%mm4    \n\t"
         "movq       %%mm3, %%mm5    \n\t"
         "movq       %%mm6, %%mm0    \n\t"
@@ -1200,8 +1189,8 @@ static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_
         d += 24;
         s += 8;
     }
-    asm volatile(SFENCE:::"memory");
-    asm volatile(EMMS:::"memory");
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
 #endif
     while (s < end)
     {
@@ -1216,19 +1205,19 @@ static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_
 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
 {
     const uint16_t *end;
-#ifdef HAVE_MMX
+#if HAVE_MMX
     const uint16_t *mm_end;
 #endif
     uint8_t *d = dst;
     const uint16_t *s = (const uint16_t *)src;
     end = s + src_size/2;
-#ifdef HAVE_MMX
-    asm volatile(PREFETCH"    %0"::"m"(*s):"memory");
-    asm volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
+#if HAVE_MMX
+    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
+    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
     mm_end = end - 3;
     while (s < mm_end)
     {
-        asm volatile(
+        __asm__ volatile(
         PREFETCH"    32%1           \n\t"
         "movq          %1, %%mm0    \n\t"
         "movq          %1, %%mm1    \n\t"
@@ -1264,8 +1253,8 @@ static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_
         d += 16;
         s += 4;
     }
-    asm volatile(SFENCE:::"memory");
-    asm volatile(EMMS:::"memory");
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
 #endif
     while (s < end)
     {
@@ -1276,7 +1265,7 @@ static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_
         register uint16_t bgr;
         bgr = *s++;
 #ifdef WORDS_BIGENDIAN
-        *d++ = 0;
+        *d++ = 255;
         *d++ = (bgr&0x7C00)>>7;
         *d++ = (bgr&0x3E0)>>2;
         *d++ = (bgr&0x1F)<<3;
@@ -1284,7 +1273,7 @@ static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_
         *d++ = (bgr&0x1F)<<3;
         *d++ = (bgr&0x3E0)>>2;
         *d++ = (bgr&0x7C00)>>7;
-        *d++ = 0;
+        *d++ = 255;
 #endif
 
 #endif
@@ -1294,19 +1283,19 @@ static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_
 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
 {
     const uint16_t *end;
-#ifdef HAVE_MMX
+#if HAVE_MMX
     const uint16_t *mm_end;
 #endif
     uint8_t *d = dst;
     const uint16_t *s = (const uint16_t*)src;
     end = s + src_size/2;
-#ifdef HAVE_MMX
-    asm volatile(PREFETCH"    %0"::"m"(*s):"memory");
-    asm volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
+#if HAVE_MMX
+    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
+    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
     mm_end = end - 3;
     while (s < mm_end)
     {
-        asm volatile(
+        __asm__ volatile(
         PREFETCH"    32%1           \n\t"
         "movq          %1, %%mm0    \n\t"
         "movq          %1, %%mm1    \n\t"
@@ -1342,15 +1331,15 @@ static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_
         d += 16;
         s += 4;
     }
-    asm volatile(SFENCE:::"memory");
-    asm volatile(EMMS:::"memory");
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
 #endif
     while (s < end)
     {
         register uint16_t bgr;
         bgr = *s++;
 #ifdef WORDS_BIGENDIAN
-        *d++ = 0;
+        *d++ = 255;
         *d++ = (bgr&0xF800)>>8;
         *d++ = (bgr&0x7E0)>>3;
         *d++ = (bgr&0x1F)<<3;
@@ -1358,7 +1347,7 @@ static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_
         *d++ = (bgr&0x1F)<<3;
         *d++ = (bgr&0x7E0)>>3;
         *d++ = (bgr&0xF800)>>8;
-        *d++ = 0;
+        *d++ = 255;
 #endif
     }
 }
@@ -1368,8 +1357,8 @@ static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long s
     long idx = 15 - src_size;
     const uint8_t *s = src-idx;
     uint8_t *d = dst-idx;
-#ifdef HAVE_MMX
-    asm volatile(
+#if HAVE_MMX
+    __asm__ volatile(
     "test          %0, %0           \n\t"
     "jns           2f               \n\t"
     PREFETCH"       (%1, %0)        \n\t"
@@ -1382,7 +1371,7 @@ static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long s
     PREFETCH"     32(%1, %0)        \n\t"
     "movq           (%1, %0), %%mm0 \n\t"
     "movq          8(%1, %0), %%mm1 \n\t"
-# ifdef HAVE_MMX2
+# if HAVE_MMX2
     "pshufw      $177, %%mm0, %%mm3 \n\t"
     "pshufw      $177, %%mm1, %%mm5 \n\t"
     "pand       %%mm7, %%mm0        \n\t"
@@ -1430,9 +1419,9 @@ static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long s
 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
 {
     unsigned i;
-#ifdef HAVE_MMX
+#if HAVE_MMX
     long mmx_size= 23 - src_size;
-    asm volatile (
+    __asm__ volatile (
     "test             %%"REG_a", %%"REG_a"          \n\t"
     "jns                     2f                     \n\t"
     "movq     "MANGLE(mask24r)", %%mm5              \n\t"
@@ -1476,8 +1465,8 @@ static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long s
     : "r" (src-mmx_size), "r"(dst-mmx_size)
     );
 
-    asm volatile(SFENCE:::"memory");
-    asm volatile(EMMS:::"memory");
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
 
     if (mmx_size==23) return; //finished, was multiple of 8
 
@@ -1505,9 +1494,9 @@ static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *u
     const long chromWidth= width>>1;
     for (y=0; y<height; y++)
     {
-#ifdef HAVE_MMX
+#if HAVE_MMX
 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
-        asm volatile(
+        __asm__ volatile(
         "xor                 %%"REG_a", %%"REG_a"   \n\t"
         ASMALIGN(4)
         "1:                                         \n\t"
@@ -1542,16 +1531,16 @@ static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *u
         );
 #else
 
-#if defined ARCH_ALPHA && defined HAVE_MVI
+#if ARCH_ALPHA && HAVE_MVI
 #define pl2yuy2(n)                  \
     y1 = yc[n];                     \
     y2 = yc2[n];                    \
     u = uc[n];                      \
     v = vc[n];                      \
-    asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1));  \
-    asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2));  \
-    asm("unpkbl %1, %0" : "=r"(u) : "r"(u));    \
-    asm("unpkbl %1, %0" : "=r"(v) : "r"(v));    \
+    __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1));  \
+    __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2));  \
+    __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u));    \
+    __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v));    \
     yuv1 = (u << 8) + (v << 24);                \
     yuv2 = yuv1 + y2;               \
     yuv1 += y1;                     \
@@ -1568,10 +1557,10 @@ static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *u
             uint64_t y1, y2, yuv1, yuv2;
             uint64_t u, v;
             /* Prefetch */
-            asm("ldq $31,64(%0)" :: "r"(yc));
-            asm("ldq $31,64(%0)" :: "r"(yc2));
-            asm("ldq $31,64(%0)" :: "r"(uc));
-            asm("ldq $31,64(%0)" :: "r"(vc));
+            __asm__("ldq $31,64(%0)" :: "r"(yc));
+            __asm__("ldq $31,64(%0)" :: "r"(yc2));
+            __asm__("ldq $31,64(%0)" :: "r"(uc));
+            __asm__("ldq $31,64(%0)" :: "r"(vc));
 
             pl2yuy2(0);
             pl2yuy2(1);
@@ -1589,7 +1578,7 @@ static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *u
         ysrc += lumStride;
         dst += dstStride;
 
-#elif __WORDSIZE >= 64
+#elif HAVE_FAST_64BIT
         int i;
         uint64_t *ldst = (uint64_t *) dst;
         const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
@@ -1630,8 +1619,8 @@ static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *u
         ysrc += lumStride;
         dst  += dstStride;
     }
-#ifdef HAVE_MMX
-asm(    EMMS"       \n\t"
+#if HAVE_MMX
+__asm__(    EMMS"       \n\t"
         SFENCE"     \n\t"
         :::"memory");
 #endif
@@ -1657,9 +1646,9 @@ static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *u
     const long chromWidth= width>>1;
     for (y=0; y<height; y++)
     {
-#ifdef HAVE_MMX
+#if HAVE_MMX
 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
-        asm volatile(
+        __asm__ volatile(
         "xor                %%"REG_a", %%"REG_a"    \n\t"
         ASMALIGN(4)
         "1:                                         \n\t"
@@ -1695,7 +1684,7 @@ static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *u
 #else
 //FIXME adapt the Alpha ASM code from yv12->yuy2
 
-#if __WORDSIZE >= 64
+#if HAVE_FAST_64BIT
         int i;
         uint64_t *ldst = (uint64_t *) dst;
         const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
@@ -1736,8 +1725,8 @@ static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *u
         ysrc += lumStride;
         dst += dstStride;
     }
-#ifdef HAVE_MMX
-asm(    EMMS"       \n\t"
+#if HAVE_MMX
+__asm__(    EMMS"       \n\t"
         SFENCE"     \n\t"
         :::"memory");
 #endif
@@ -1755,6 +1744,16 @@ static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc,
     RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
 }
 
+/**
+ * Thin wrapper: calls yuvPlanartouyvy with a final argument of 1. Width should be a multiple of 16.
+ */
+static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
+                                         long width, long height,
+                                         long lumStride, long chromStride, long dstStride)
+{
+    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
+}
+
 /**
  * Width should be a multiple of 16.
  */
@@ -1777,8 +1776,8 @@ static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t
     const long chromWidth= width>>1;
     for (y=0; y<height; y+=2)
     {
-#ifdef HAVE_MMX
-        asm volatile(
+#if HAVE_MMX
+        __asm__ volatile(
         "xor                 %%"REG_a", %%"REG_a"   \n\t"
         "pcmpeqw                 %%mm7, %%mm7       \n\t"
         "psrlw                      $8, %%mm7       \n\t" // FF,00,FF,00...
@@ -1833,7 +1832,7 @@ static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t
         ydst += lumStride;
         src  += srcStride;
 
-        asm volatile(
+        __asm__ volatile(
         "xor                 %%"REG_a", %%"REG_a"   \n\t"
         ASMALIGN(4)
         "1:                                         \n\t"
@@ -1882,8 +1881,8 @@ static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t
         ydst += lumStride;
         src  += srcStride;
     }
-#ifdef HAVE_MMX
-asm volatile(   EMMS"       \n\t"
+#if HAVE_MMX
+__asm__ volatile(   EMMS"       \n\t"
                 SFENCE"     \n\t"
                 :::"memory");
 #endif
@@ -1915,9 +1914,9 @@ static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWi
         dst+= dstStride;
 
     for (y=1; y<srcHeight; y++){
-#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
+#if HAVE_MMX2 || HAVE_AMD3DNOW
         const long mmxSize= srcWidth&~15;
-        asm volatile(
+        __asm__ volatile(
         "mov           %4, %%"REG_a"            \n\t"
         "1:                                     \n\t"
         "movq         (%0, %%"REG_a"), %%mm0    \n\t"
@@ -1994,8 +1993,8 @@ static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWi
     }
 #endif
 
-#ifdef HAVE_MMX
-asm volatile(   EMMS"       \n\t"
+#if HAVE_MMX
+__asm__ volatile(   EMMS"       \n\t"
                 SFENCE"     \n\t"
                 :::"memory");
 #endif
@@ -2015,16 +2014,16 @@ static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t
     const long chromWidth= width>>1;
     for (y=0; y<height; y+=2)
     {
-#ifdef HAVE_MMX
-        asm volatile(
-        "xorl                %%eax, %%eax   \n\t"
+#if HAVE_MMX
+        __asm__ volatile(
+        "xor                 %%"REG_a", %%"REG_a"   \n\t"
         "pcmpeqw             %%mm7, %%mm7   \n\t"
         "psrlw                  $8, %%mm7   \n\t" // FF,00,FF,00...
         ASMALIGN(4)
         "1:                                 \n\t"
-        PREFETCH" 64(%0, %%eax, 4)          \n\t"
-        "movq       (%0, %%eax, 4), %%mm0   \n\t" // UYVY UYVY(0)
-        "movq      8(%0, %%eax, 4), %%mm1   \n\t" // UYVY UYVY(4)
+        PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
+        "movq       (%0, %%"REG_a", 4), %%mm0   \n\t" // UYVY UYVY(0)
+        "movq      8(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(4)
         "movq                %%mm0, %%mm2   \n\t" // UYVY UYVY(0)
         "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(4)
         "pand                %%mm7, %%mm0   \n\t" // U0V0 U0V0(0)
@@ -2034,10 +2033,10 @@ static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t
         "packuswb            %%mm1, %%mm0   \n\t" // UVUV UVUV(0)
         "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(0)
 
-        MOVNTQ"              %%mm2,  (%1, %%eax, 2) \n\t"
+        MOVNTQ"              %%mm2,  (%1, %%"REG_a", 2) \n\t"
 
-        "movq     16(%0, %%eax, 4), %%mm1   \n\t" // UYVY UYVY(8)
-        "movq     24(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(12)
+        "movq     16(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(8)
+        "movq     24(%0, %%"REG_a", 4), %%mm2   \n\t" // UYVY UYVY(12)
         "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(8)
         "movq                %%mm2, %%mm4   \n\t" // UYVY UYVY(12)
         "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(8)
@@ -2047,7 +2046,7 @@ static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t
         "packuswb            %%mm2, %%mm1   \n\t" // UVUV UVUV(8)
         "packuswb            %%mm4, %%mm3   \n\t" // YYYY YYYY(8)
 
-        MOVNTQ"              %%mm3, 8(%1, %%eax, 2) \n\t"
+        MOVNTQ"              %%mm3, 8(%1, %%"REG_a", 2) \n\t"
 
         "movq                %%mm0, %%mm2   \n\t" // UVUV UVUV(0)
         "movq                %%mm1, %%mm3   \n\t" // UVUV UVUV(8)
@@ -2058,28 +2057,28 @@ static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t
         "packuswb            %%mm1, %%mm0   \n\t" // VVVV VVVV(0)
         "packuswb            %%mm3, %%mm2   \n\t" // UUUU UUUU(0)
 
-        MOVNTQ"              %%mm0, (%3, %%eax) \n\t"
-        MOVNTQ"              %%mm2, (%2, %%eax) \n\t"
+        MOVNTQ"              %%mm0, (%3, %%"REG_a") \n\t"
+        MOVNTQ"              %%mm2, (%2, %%"REG_a") \n\t"
 
-        "addl                   $8, %%eax   \n\t"
-        "cmpl                   %4, %%eax   \n\t"
+        "add                    $8, %%"REG_a"   \n\t"
+        "cmp                    %4, %%"REG_a"   \n\t"
         " jb                    1b          \n\t"
         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
-        : "memory", "%eax"
+        : "memory", "%"REG_a
         );
 
         ydst += lumStride;
         src  += srcStride;
 
-        asm volatile(
-        "xorl                %%eax, %%eax   \n\t"
+        __asm__ volatile(
+        "xor                 %%"REG_a", %%"REG_a"   \n\t"
         ASMALIGN(4)
         "1:                                 \n\t"
-        PREFETCH" 64(%0, %%eax, 4)          \n\t"
-        "movq       (%0, %%eax, 4), %%mm0   \n\t" // YUYV YUYV(0)
-        "movq      8(%0, %%eax, 4), %%mm1   \n\t" // YUYV YUYV(4)
-        "movq     16(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(8)
-        "movq     24(%0, %%eax, 4), %%mm3   \n\t" // YUYV YUYV(12)
+        PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
+        "movq       (%0, %%"REG_a", 4), %%mm0   \n\t" // YUYV YUYV(0)
+        "movq      8(%0, %%"REG_a", 4), %%mm1   \n\t" // YUYV YUYV(4)
+        "movq     16(%0, %%"REG_a", 4), %%mm2   \n\t" // YUYV YUYV(8)
+        "movq     24(%0, %%"REG_a", 4), %%mm3   \n\t" // YUYV YUYV(12)
         "psrlw                  $8, %%mm0   \n\t" // Y0Y0 Y0Y0(0)
         "psrlw                  $8, %%mm1   \n\t" // Y0Y0 Y0Y0(4)
         "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(8)
@@ -2087,15 +2086,15 @@ static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t
         "packuswb            %%mm1, %%mm0   \n\t" // YYYY YYYY(0)
         "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(8)
 
-        MOVNTQ"              %%mm0,  (%1, %%eax, 2) \n\t"
-        MOVNTQ"              %%mm2, 8(%1, %%eax, 2) \n\t"
+        MOVNTQ"              %%mm0,  (%1, %%"REG_a", 2) \n\t"
+        MOVNTQ"              %%mm2, 8(%1, %%"REG_a", 2) \n\t"
 
-        "addl                   $8, %%eax   \n\t"
-        "cmpl                   %4, %%eax   \n\t"
+        "add                    $8, %%"REG_a"   \n\t"
+        "cmp                    %4, %%"REG_a"   \n\t"
         " jb                    1b          \n\t"
 
         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
-        : "memory", "%eax"
+        : "memory", "%"REG_a
         );
 #else
         long i;
@@ -2120,8 +2119,8 @@ static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t
         ydst += lumStride;
         src  += srcStride;
     }
-#ifdef HAVE_MMX
-asm volatile(   EMMS"       \n\t"
+#if HAVE_MMX
+__asm__ volatile(   EMMS"       \n\t"
                 SFENCE"     \n\t"
                 :::"memory");
 #endif
@@ -2140,13 +2139,13 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
 {
     long y;
     const long chromWidth= width>>1;
-#ifdef HAVE_MMX
+#if HAVE_MMX
     for (y=0; y<height-2; y+=2)
     {
         long i;
         for (i=0; i<2; i++)
         {
-            asm volatile(
+            __asm__ volatile(
             "mov                        %2, %%"REG_a"   \n\t"
             "movq  "MANGLE(ff_bgr2YCoeff)", %%mm6       \n\t"
             "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
@@ -2219,7 +2218,7 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
             src  += srcStride;
         }
         src -= srcStride*2;
-        asm volatile(
+        __asm__ volatile(
         "mov                        %4, %%"REG_a"   \n\t"
         "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
         "movq  "MANGLE(ff_bgr2UCoeff)", %%mm6       \n\t"
@@ -2230,7 +2229,7 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
         "1:                                         \n\t"
         PREFETCH"    64(%0, %%"REG_d")              \n\t"
         PREFETCH"    64(%1, %%"REG_d")              \n\t"
-#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
+#if HAVE_MMX2 || HAVE_AMD3DNOW
         "movq          (%0, %%"REG_d"), %%mm0       \n\t"
         "movq          (%1, %%"REG_d"), %%mm1       \n\t"
         "movq         6(%0, %%"REG_d"), %%mm2       \n\t"
@@ -2291,7 +2290,7 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
         "packssdw                %%mm1, %%mm0       \n\t" // V1 V0 U1 U0
         "psraw                      $7, %%mm0       \n\t"
 
-#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
+#if HAVE_MMX2 || HAVE_AMD3DNOW
         "movq        12(%0, %%"REG_d"), %%mm4       \n\t"
         "movq        12(%1, %%"REG_d"), %%mm1       \n\t"
         "movq        18(%0, %%"REG_d"), %%mm2       \n\t"
@@ -2373,7 +2372,7 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
         src  += srcStride*2;
     }
 
-    asm volatile(   EMMS"       \n\t"
+    __asm__ volatile(   EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");
 #else
@@ -2430,7 +2429,7 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
     }
 }
 
-void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
+static void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
                              long width, long height, long src1Stride,
                              long src2Stride, long dstStride){
     long h;
@@ -2439,9 +2438,9 @@ void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
     {
         long w;
 
-#ifdef HAVE_MMX
-#ifdef HAVE_SSE2
-        asm(
+#if HAVE_MMX
+#if HAVE_SSE2
+        __asm__(
         "xor              %%"REG_a", %%"REG_a"  \n\t"
         "1:                                     \n\t"
         PREFETCH" 64(%1, %%"REG_a")             \n\t"
@@ -2460,7 +2459,7 @@ void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
         : "memory", "%"REG_a""
         );
 #else
-        asm(
+        __asm__(
         "xor %%"REG_a", %%"REG_a"               \n\t"
         "1:                                     \n\t"
         PREFETCH" 64(%1, %%"REG_a")             \n\t"
@@ -2502,8 +2501,8 @@ void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
                 src1 += src1Stride;
                 src2 += src2Stride;
     }
-#ifdef HAVE_MMX
-    asm(
+#if HAVE_MMX
+    __asm__(
         EMMS"       \n\t"
         SFENCE"     \n\t"
         ::: "memory"
@@ -2519,8 +2518,8 @@ static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
 {
     long y,x,w,h;
     w=width/2; h=height/2;
-#ifdef HAVE_MMX
-    asm volatile(
+#if HAVE_MMX
+    __asm__ volatile(
     PREFETCH" %0    \n\t"
     PREFETCH" %1    \n\t"
     ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
@@ -2529,10 +2528,10 @@ static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
     const uint8_t* s1=src1+srcStride1*(y>>1);
     uint8_t* d=dst1+dstStride1*y;
     x=0;
-#ifdef HAVE_MMX
+#if HAVE_MMX
     for (;x<w-31;x+=32)
     {
-        asm volatile(
+        __asm__ volatile(
         PREFETCH"   32%1        \n\t"
         "movq         %1, %%mm0 \n\t"
         "movq        8%1, %%mm2 \n\t"
@@ -2569,10 +2568,10 @@ static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
     const uint8_t* s2=src2+srcStride2*(y>>1);
     uint8_t* d=dst2+dstStride2*y;
     x=0;
-#ifdef HAVE_MMX
+#if HAVE_MMX
     for (;x<w-31;x+=32)
     {
-        asm volatile(
+        __asm__ volatile(
         PREFETCH"   32%1        \n\t"
         "movq         %1, %%mm0 \n\t"
         "movq        8%1, %%mm2 \n\t"
@@ -2605,8 +2604,8 @@ static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
 #endif
     for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
     }
-#ifdef HAVE_MMX
-    asm(
+#if HAVE_MMX
+    __asm__(
         EMMS"       \n\t"
         SFENCE"     \n\t"
         ::: "memory"
@@ -2628,10 +2627,10 @@ static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2
     const uint8_t* vp=src3+srcStride3*(y>>2);
     uint8_t* d=dst+dstStride*y;
     x=0;
-#ifdef HAVE_MMX
+#if HAVE_MMX
     for (;x<w-7;x+=8)
     {
-        asm volatile(
+        __asm__ volatile(
         PREFETCH"   32(%1, %0)          \n\t"
         PREFETCH"   32(%2, %0)          \n\t"
         PREFETCH"   32(%3, %0)          \n\t"
@@ -2696,8 +2695,8 @@ static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2
         d[8*x+7] = vp[x];
     }
     }
-#ifdef HAVE_MMX
-    asm(
+#if HAVE_MMX
+    __asm__(
         EMMS"       \n\t"
         SFENCE"     \n\t"
         ::: "memory"
@@ -2707,19 +2706,19 @@ static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2
 
 static inline void RENAME(rgb2rgb_init)(void){
     rgb15to16       = RENAME(rgb15to16);
-    rgb15to24       = RENAME(rgb15to24);
+    rgb15tobgr24    = RENAME(rgb15tobgr24);
     rgb15to32       = RENAME(rgb15to32);
-    rgb16to24       = RENAME(rgb16to24);
+    rgb16tobgr24    = RENAME(rgb16tobgr24);
     rgb16to32       = RENAME(rgb16to32);
     rgb16to15       = RENAME(rgb16to15);
-    rgb24to16       = RENAME(rgb24to16);
-    rgb24to15       = RENAME(rgb24to15);
-    rgb24to32       = RENAME(rgb24to32);
+    rgb24tobgr16    = RENAME(rgb24tobgr16);
+    rgb24tobgr15    = RENAME(rgb24tobgr15);
+    rgb24tobgr32    = RENAME(rgb24tobgr32);
     rgb32to16       = RENAME(rgb32to16);
     rgb32to15       = RENAME(rgb32to15);
-    rgb32to24       = RENAME(rgb32to24);
-    rgb24tobgr15    = RENAME(rgb24tobgr15);
-    rgb24tobgr16    = RENAME(rgb24tobgr16);
+    rgb32tobgr24    = RENAME(rgb32tobgr24);
+    rgb24to15       = RENAME(rgb24to15);
+    rgb24to16       = RENAME(rgb24to16);
     rgb24tobgr24    = RENAME(rgb24tobgr24);
     rgb32tobgr32    = RENAME(rgb32tobgr32);
     rgb32tobgr16    = RENAME(rgb32tobgr16);
@@ -2727,6 +2726,7 @@ static inline void RENAME(rgb2rgb_init)(void){
     yv12toyuy2      = RENAME(yv12toyuy2);
     yv12touyvy      = RENAME(yv12touyvy);
     yuv422ptoyuy2   = RENAME(yuv422ptoyuy2);
+    yuv422ptouyvy   = RENAME(yuv422ptouyvy);
     yuy2toyv12      = RENAME(yuy2toyv12);
 //    uyvytoyv12      = RENAME(uyvytoyv12);
 //    yvu9toyv12      = RENAME(yvu9toyv12);
diff --git a/src/add-ons/media/plugins/avcodec/libswscale/swscale-example.c b/src/add-ons/media/plugins/avcodec/libswscale/swscale-example.c
deleted file mode 100644
index 79e21a922a..0000000000
--- a/src/add-ons/media/plugins/avcodec/libswscale/swscale-example.c
+++ /dev/null
@@ -1,229 +0,0 @@
-/*
- * Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <inttypes.h>
-#include <stdarg.h>
-
-#undef HAVE_AV_CONFIG_H
-#include "libavutil/avutil.h"
-#include "swscale.h"
-#include "swscale_internal.h"
-#include "rgb2rgb.h"
-
-static uint64_t getSSD(uint8_t *src1, uint8_t *src2, int stride1, int stride2, int w, int h){
-    int x,y;
-    uint64_t ssd=0;
-
-//printf("%d %d\n", w, h);
-
-    for (y=0; y<h; y++){
-        for (x=0; x<w; x++){
-            int d= src1[x + y*stride1] - src2[x + y*stride2];
-            ssd+= d*d;
-//printf("%d", abs(src1[x + y*stride1] - src2[x + y*stride2])/26 );
-        }
-//printf("\n");
-    }
-    return ssd;
-}
-
-// test by ref -> src -> dst -> out & compare out against ref
-// ref & out are YV12
-static int doTest(uint8_t *ref[3], int refStride[3], int w, int h, int srcFormat, int dstFormat,
-                  int srcW, int srcH, int dstW, int dstH, int flags){
-    uint8_t *src[3];
-    uint8_t *dst[3];
-    uint8_t *out[3];
-    int srcStride[3], dstStride[3];
-    int i;
-    uint64_t ssdY, ssdU, ssdV;
-    struct SwsContext *srcContext, *dstContext, *outContext;
-    int res;
-
-    res = 0;
-    for (i=0; i<3; i++){
-        // avoid stride % bpp != 0
-        if (srcFormat==PIX_FMT_RGB24 || srcFormat==PIX_FMT_BGR24)
-            srcStride[i]= srcW*3;
-        else
-            srcStride[i]= srcW*4;
-
-        if (dstFormat==PIX_FMT_RGB24 || dstFormat==PIX_FMT_BGR24)
-            dstStride[i]= dstW*3;
-        else
-            dstStride[i]= dstW*4;
-
-        src[i]= (uint8_t*) malloc(srcStride[i]*srcH);
-        dst[i]= (uint8_t*) malloc(dstStride[i]*dstH);
-        out[i]= (uint8_t*) malloc(refStride[i]*h);
-        if (!src[i] || !dst[i] || !out[i]) {
-            perror("Malloc");
-            res = -1;
-
-            goto end;
-        }
-    }
-
-    dstContext = outContext = NULL;
-    srcContext= sws_getContext(w, h, PIX_FMT_YUV420P, srcW, srcH, srcFormat, flags, NULL, NULL, NULL);
-    if (!srcContext) {
-        fprintf(stderr, "Failed to get %s ---> %s\n",
-                sws_format_name(PIX_FMT_YUV420P),
-                sws_format_name(srcFormat));
-        res = -1;
-
-        goto end;
-    }
-    dstContext= sws_getContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat, flags, NULL, NULL, NULL);
-    if (!dstContext) {
-        fprintf(stderr, "Failed to get %s ---> %s\n",
-                sws_format_name(srcFormat),
-                sws_format_name(dstFormat));
-        res = -1;
-
-        goto end;
-    }
-    outContext= sws_getContext(dstW, dstH, dstFormat, w, h, PIX_FMT_YUV420P, flags, NULL, NULL, NULL);
-    if (!outContext) {
-        fprintf(stderr, "Failed to get %s ---> %s\n",
-                sws_format_name(dstFormat),
-                sws_format_name(PIX_FMT_YUV420P));
-        res = -1;
-
-        goto end;
-    }
-//    printf("test %X %X %X -> %X %X %X\n", (int)ref[0], (int)ref[1], (int)ref[2],
-//        (int)src[0], (int)src[1], (int)src[2]);
-
-    sws_scale(srcContext, ref, refStride, 0, h   , src, srcStride);
-    sws_scale(dstContext, src, srcStride, 0, srcH, dst, dstStride);
-    sws_scale(outContext, dst, dstStride, 0, dstH, out, refStride);
-
-#if defined(ARCH_X86)
-    asm volatile ("emms\n\t");
-#endif
-
-    ssdY= getSSD(ref[0], out[0], refStride[0], refStride[0], w, h);
-    ssdU= getSSD(ref[1], out[1], refStride[1], refStride[1], (w+1)>>1, (h+1)>>1);
-    ssdV= getSSD(ref[2], out[2], refStride[2], refStride[2], (w+1)>>1, (h+1)>>1);
-
-    if (srcFormat == PIX_FMT_GRAY8 || dstFormat==PIX_FMT_GRAY8) ssdU=ssdV=0; //FIXME check that output is really gray
-
-    ssdY/= w*h;
-    ssdU/= w*h/4;
-    ssdV/= w*h/4;
-
-    if (ssdY>100 || ssdU>100 || ssdV>100){
-        printf(" %s %dx%d -> %s %4dx%4d flags=%2d SSD=%5lld,%5lld,%5lld\n",
-               sws_format_name(srcFormat), srcW, srcH,
-               sws_format_name(dstFormat), dstW, dstH,
-               flags,
-               ssdY, ssdU, ssdV);
-    }
-
-    end:
-
-    sws_freeContext(srcContext);
-    sws_freeContext(dstContext);
-    sws_freeContext(outContext);
-
-    for (i=0; i<3; i++){
-        free(src[i]);
-        free(dst[i]);
-        free(out[i]);
-    }
-
-    return res;
-}
-
-void fast_memcpy(void *a, void *b, int s){ //FIXME
-    memcpy(a, b, s);
-}
-
-static void selfTest(uint8_t *src[3], int stride[3], int w, int h){
-    enum PixelFormat srcFormat, dstFormat;
-    int srcW, srcH, dstW, dstH;
-    int flags;
-
-    for (srcFormat = 0; srcFormat < PIX_FMT_NB; srcFormat++) {
-        for (dstFormat = 0; dstFormat < PIX_FMT_NB; dstFormat++) {
-            printf("%s -> %s\n",
-                   sws_format_name(srcFormat),
-                   sws_format_name(dstFormat));
-
-            srcW= w;
-            srcH= h;
-            for (dstW=w - w/3; dstW<= 4*w/3; dstW+= w/3){
-                for (dstH=h - h/3; dstH<= 4*h/3; dstH+= h/3){
-                    for (flags=1; flags<33; flags*=2) {
-                        int res;
-
-                        res = doTest(src, stride, w, h, srcFormat, dstFormat,
-                                     srcW, srcH, dstW, dstH, flags);
-                        if (res < 0) {
-                            dstW = 4 * w / 3;
-                            dstH = 4 * h / 3;
-                            flags = 33;
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-#define W 96
-#define H 96
-
-int main(int argc, char **argv){
-    uint8_t *rgb_data = malloc (W*H*4);
-    uint8_t *rgb_src[3]= {rgb_data, NULL, NULL};
-    int rgb_stride[3]={4*W, 0, 0};
-    uint8_t *data = malloc (3*W*H);
-    uint8_t *src[3]= {data, data+W*H, data+W*H*2};
-    int stride[3]={W, W, W};
-    int x, y;
-    struct SwsContext *sws;
-
-    sws= sws_getContext(W/12, H/12, PIX_FMT_RGB32, W, H, PIX_FMT_YUV420P, 2, NULL, NULL, NULL);
-
-    for (y=0; y<H; y++){
-        for (x=0; x<W*4; x++){
-            rgb_data[ x + y*4*W]= random();
-        }
-    }
-#if defined(ARCH_X86)
-    sws_rgb2rgb_init(SWS_CPU_CAPS_MMX*0);
-#else
-    sws_rgb2rgb_init(0);
-#endif
-    sws_scale(sws, rgb_src, rgb_stride, 0, H, src, stride);
-
-#if defined(ARCH_X86)
-    asm volatile ("emms\n\t");
-#endif
-
-    selfTest(src, stride, W, H);
-
-    return 123;
-}
diff --git a/src/add-ons/media/plugins/avcodec/libswscale/swscale.c b/src/add-ons/media/plugins/avcodec/libswscale/swscale.c
index b2afb71946..7c335f1680 100644
--- a/src/add-ons/media/plugins/avcodec/libswscale/swscale.c
+++ b/src/add-ons/media/plugins/avcodec/libswscale/swscale.c
@@ -22,7 +22,7 @@
  */
 
 /*
-  supported Input formats: YV12, I420/IYUV, YUY2, UYVY, BGR32, BGR24, BGR16, BGR15, RGB32, RGB24, Y8/Y800, YVU9/IF09, PAL8
+  supported Input formats: YV12, I420/IYUV, YUY2, UYVY, BGR32, BGR32_1, BGR24, BGR16, BGR15, RGB32, RGB32_1, RGB24, Y8/Y800, YVU9/IF09, PAL8
   supported output formats: YV12, I420/IYUV, YUY2, UYVY, {BGR,RGB}{1,4,8,15,16,24,32}, Y8/Y800, YVU9/IF09
   {BGR,RGB}{1,4,8,15,16} support dithering
 
@@ -46,7 +46,7 @@ tested special converters (most are tested actually, but I did not write it down
  YVU9 -> YV12
 
 untested special converters
-  YV12/I420 -> BGR15/BGR24/BGR32 (it is the yuv2rgb stuff, so it should be ok)
+  YV12/I420 -> BGR15/BGR24/BGR32 (it is the yuv2rgb stuff, so it should be OK)
   YV12/I420 -> YV12/I420
   YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
   BGR24 -> BGR32 & RGB24 -> RGB32
@@ -54,6 +54,7 @@ untested special converters
   BGR24 -> YV12
 */
 
+#define _SVID_SOURCE //needed for MAP_ANONYMOUS
 #include <inttypes.h>
 #include <string.h>
 #include <math.h>
@@ -61,7 +62,7 @@ untested special converters
 #include <unistd.h>
 #include "config.h"
 #include <assert.h>
-#ifdef HAVE_SYS_MMAN_H
+#if HAVE_SYS_MMAN_H
 #include <sys/mman.h>
 #if defined(MAP_ANON) && !defined(MAP_ANONYMOUS)
 #define MAP_ANONYMOUS MAP_ANON
@@ -70,22 +71,27 @@ untested special converters
 #include "swscale.h"
 #include "swscale_internal.h"
 #include "rgb2rgb.h"
-#include "x86_cpu.h"
-#include "bswap.h"
+#include "libavutil/x86_cpu.h"
+#include "libavutil/bswap.h"
+
+unsigned swscale_version(void)
+{
+    return LIBSWSCALE_VERSION_INT;
+}
 
 #undef MOVNTQ
 #undef PAVGB
 
 //#undef HAVE_MMX2
-//#define HAVE_3DNOW
+//#define HAVE_AMD3DNOW
 //#undef HAVE_MMX
 //#undef ARCH_X86
 //#define WORDS_BIGENDIAN
 #define DITHER1XBPP
 
-#define FAST_BGR2YV12 // use 7 bit coeffs instead of 15bit
+#define FAST_BGR2YV12 // use 7 bit coefficients instead of 15 bit
 
-#define RET 0xC3 //near return opcode for X86
+#define RET 0xC3 //near return opcode for x86
 
 #ifdef M_PI
 #define PI M_PI
@@ -99,15 +105,18 @@ untested special converters
         || (x)==PIX_FMT_YUYV422     \
         || (x)==PIX_FMT_UYVY422     \
         || (x)==PIX_FMT_RGB32       \
+        || (x)==PIX_FMT_RGB32_1     \
         || (x)==PIX_FMT_BGR24       \
         || (x)==PIX_FMT_BGR565      \
         || (x)==PIX_FMT_BGR555      \
         || (x)==PIX_FMT_BGR32       \
+        || (x)==PIX_FMT_BGR32_1     \
         || (x)==PIX_FMT_RGB24       \
         || (x)==PIX_FMT_RGB565      \
         || (x)==PIX_FMT_RGB555      \
         || (x)==PIX_FMT_GRAY8       \
         || (x)==PIX_FMT_YUV410P     \
+        || (x)==PIX_FMT_YUV440P     \
         || (x)==PIX_FMT_GRAY16BE    \
         || (x)==PIX_FMT_GRAY16LE    \
         || (x)==PIX_FMT_YUV444P     \
@@ -119,6 +128,8 @@ untested special converters
         || (x)==PIX_FMT_BGR4_BYTE   \
         || (x)==PIX_FMT_RGB4_BYTE   \
         || (x)==PIX_FMT_YUV440P     \
+        || (x)==PIX_FMT_MONOWHITE   \
+        || (x)==PIX_FMT_MONOBLACK   \
     )
 #define isSupportedOut(x)   (       \
            (x)==PIX_FMT_YUV420P     \
@@ -135,6 +146,7 @@ untested special converters
         || (x)==PIX_FMT_GRAY16LE    \
         || (x)==PIX_FMT_GRAY8       \
         || (x)==PIX_FMT_YUV410P     \
+        || (x)==PIX_FMT_YUV440P     \
     )
 #define isPacked(x)         (       \
            (x)==PIX_FMT_PAL8        \
@@ -143,19 +155,37 @@ untested special converters
         || isRGB(x)                 \
         || isBGR(x)                 \
     )
+#define usePal(x)           (       \
+           (x)==PIX_FMT_PAL8        \
+        || (x)==PIX_FMT_BGR4_BYTE   \
+        || (x)==PIX_FMT_RGB4_BYTE   \
+        || (x)==PIX_FMT_BGR8        \
+        || (x)==PIX_FMT_RGB8        \
+    )
 
-#define RGB2YUV_SHIFT 16
-#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
-#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
-#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
-#define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
-#define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
-#define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
-#define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
-#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
-#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
+#define RGB2YUV_SHIFT 15
+#define BY ( (int)(0.114*219/255*(1<<RGB2YUV_SHIFT)+0.5))
+#define BV (-(int)(0.081*224/255*(1<<RGB2YUV_SHIFT)+0.5))
+#define BU ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
+#define GY ( (int)(0.587*219/255*(1<<RGB2YUV_SHIFT)+0.5))
+#define GV (-(int)(0.419*224/255*(1<<RGB2YUV_SHIFT)+0.5))
+#define GU (-(int)(0.331*224/255*(1<<RGB2YUV_SHIFT)+0.5))
+#define RY ( (int)(0.299*219/255*(1<<RGB2YUV_SHIFT)+0.5))
+#define RV ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
+#define RU (-(int)(0.169*224/255*(1<<RGB2YUV_SHIFT)+0.5))
 
-extern const int32_t Inverse_Table_6_9[8][4];
+extern const int32_t ff_yuv2rgb_coeffs[8][4];
+
+static const double rgb2yuv_table[8][9]={ //9 coefficients per row; rows presumably indexed by colorspace like ff_yuv2rgb_coeffs - TODO confirm
+    {0.7152, 0.0722, 0.2126, -0.386, 0.5, -0.115, -0.454, -0.046, 0.5},
+    {0.7152, 0.0722, 0.2126, -0.386, 0.5, -0.115, -0.454, -0.046, 0.5},
+    {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5},
+    {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5},
+    {0.59  , 0.11  , 0.30  , -0.331, 0.5, -0.169, -0.421, -0.079, 0.5}, //FCC
+    {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5},
+    {0.587 , 0.114 , 0.299 , -0.331, 0.5, -0.169, -0.419, -0.081, 0.5}, //SMPTE 170M
+    {0.701 , 0.087 , 0.212 , -0.384, 0.5, -0.116, -0.445, -0.055, 0.5}, //SMPTE 240M
+};
 
 /*
 NOTES
@@ -164,15 +194,15 @@ Special versions: fast Y 1:1 scaling (no interpolation in y direction)
 TODO
 more intelligent misalignment avoidance for the horizontal scaler
 write special vertical cubic upscale version
-Optimize C code (yv12 / minmax)
-add support for packed pixel yuv input & output
+optimize C code (YV12 / minmax)
+add support for packed pixel YUV input & output
 add support for Y8 output
-optimize bgr24 & bgr32
+optimize BGR24 & BGR32
 add BGR4 output support
 write special BGR->BGR scaler
 */
 
-#if defined(ARCH_X86) && defined (CONFIG_GPL)
+#if ARCH_X86 && CONFIG_GPL
 DECLARE_ASM_CONST(8, uint64_t, bF8)=       0xF8F8F8F8F8F8F8F8LL;
 DECLARE_ASM_CONST(8, uint64_t, bFC)=       0xFCFCFCFCFCFCFCFCLL;
 DECLARE_ASM_CONST(8, uint64_t, w10)=       0x0010001000100010LL;
@@ -182,11 +212,6 @@ DECLARE_ASM_CONST(8, uint64_t, bm00000111)=0x0000000000FFFFFFLL;
 DECLARE_ASM_CONST(8, uint64_t, bm11111000)=0xFFFFFFFFFF000000LL;
 DECLARE_ASM_CONST(8, uint64_t, bm01010101)=0x00FF00FF00FF00FFLL;
 
-static volatile uint64_t attribute_used __attribute__((aligned(8))) b5Dither;
-static volatile uint64_t attribute_used __attribute__((aligned(8))) g5Dither;
-static volatile uint64_t attribute_used __attribute__((aligned(8))) g6Dither;
-static volatile uint64_t attribute_used __attribute__((aligned(8))) r5Dither;
-
 const DECLARE_ALIGNED(8, uint64_t, ff_dither4[2]) = {
         0x0103010301030103LL,
         0x0200020002000200LL,};
@@ -218,18 +243,133 @@ DECLARE_ALIGNED(8, const uint64_t, ff_bgr2VCoeff)   = 0x00003831D0E6F6EAULL;
 DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YOffset)  = 0x1010101010101010ULL;
 DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL;
 DECLARE_ALIGNED(8, const uint64_t, ff_w1111)        = 0x0001000100010001ULL;
-#endif /* defined(ARCH_X86) */
+
+DECLARE_ASM_CONST(8, uint64_t, ff_bgr24toY1Coeff) = 0x0C88000040870C88ULL;
+DECLARE_ASM_CONST(8, uint64_t, ff_bgr24toY2Coeff) = 0x20DE4087000020DEULL;
+DECLARE_ASM_CONST(8, uint64_t, ff_rgb24toY1Coeff) = 0x20DE0000408720DEULL;
+DECLARE_ASM_CONST(8, uint64_t, ff_rgb24toY2Coeff) = 0x0C88408700000C88ULL;
+DECLARE_ASM_CONST(8, uint64_t, ff_bgr24toYOffset) = 0x0008400000084000ULL;
+
+DECLARE_ASM_CONST(8, uint64_t, ff_bgr24toUV[2][4]) = {
+    {0x38380000DAC83838ULL, 0xECFFDAC80000ECFFULL, 0xF6E40000D0E3F6E4ULL, 0x3838D0E300003838ULL},
+    {0xECFF0000DAC8ECFFULL, 0x3838DAC800003838ULL, 0x38380000D0E33838ULL, 0xF6E4D0E30000F6E4ULL},
+};
+
+DECLARE_ASM_CONST(8, uint64_t, ff_bgr24toUVOffset)= 0x0040400000404000ULL;
+
+#endif /* ARCH_X86 && CONFIG_GPL */
 
 // clipping helper table for C implementations:
 static unsigned char clip_table[768];
 
 static SwsVector *sws_getConvVec(SwsVector *a, SwsVector *b);
 
-extern const uint8_t dither_2x2_4[2][8];
-extern const uint8_t dither_2x2_8[2][8];
-extern const uint8_t dither_8x8_32[8][8];
-extern const uint8_t dither_8x8_73[8][8];
-extern const uint8_t dither_8x8_220[8][8];
+static const uint8_t  __attribute__((aligned(8))) dither_2x2_4[2][8]={
+{  1,   3,   1,   3,   1,   3,   1,   3, },
+{  2,   0,   2,   0,   2,   0,   2,   0, },
+};
+
+static const uint8_t  __attribute__((aligned(8))) dither_2x2_8[2][8]={
+{  6,   2,   6,   2,   6,   2,   6,   2, },
+{  0,   4,   0,   4,   0,   4,   0,   4, },
+};
+
+const uint8_t  __attribute__((aligned(8))) dither_8x8_32[8][8]={
+{ 17,   9,  23,  15,  16,   8,  22,  14, },
+{  5,  29,   3,  27,   4,  28,   2,  26, },
+{ 21,  13,  19,  11,  20,  12,  18,  10, },
+{  0,  24,   6,  30,   1,  25,   7,  31, },
+{ 16,   8,  22,  14,  17,   9,  23,  15, },
+{  4,  28,   2,  26,   5,  29,   3,  27, },
+{ 20,  12,  18,  10,  21,  13,  19,  11, },
+{  1,  25,   7,  31,   0,  24,   6,  30, },
+};
+
+#if 0
+const uint8_t  __attribute__((aligned(8))) dither_8x8_64[8][8]={
+{  0,  48,  12,  60,   3,  51,  15,  63, },
+{ 32,  16,  44,  28,  35,  19,  47,  31, },
+{  8,  56,   4,  52,  11,  59,   7,  55, },
+{ 40,  24,  36,  20,  43,  27,  39,  23, },
+{  2,  50,  14,  62,   1,  49,  13,  61, },
+{ 34,  18,  46,  30,  33,  17,  45,  29, },
+{ 10,  58,   6,  54,   9,  57,   5,  53, },
+{ 42,  26,  38,  22,  41,  25,  37,  21, },
+};
+#endif
+
+const uint8_t  __attribute__((aligned(8))) dither_8x8_73[8][8]={
+{  0,  55,  14,  68,   3,  58,  17,  72, },
+{ 37,  18,  50,  32,  40,  22,  54,  35, },
+{  9,  64,   5,  59,  13,  67,   8,  63, },
+{ 46,  27,  41,  23,  49,  31,  44,  26, },
+{  2,  57,  16,  71,   1,  56,  15,  70, },
+{ 39,  21,  52,  34,  38,  19,  51,  33, },
+{ 11,  66,   7,  62,  10,  65,   6,  60, },
+{ 48,  30,  43,  25,  47,  29,  42,  24, },
+};
+
+#if 0
+const uint8_t  __attribute__((aligned(8))) dither_8x8_128[8][8]={
+{ 68,  36,  92,  60,  66,  34,  90,  58, },
+{ 20, 116,  12, 108,  18, 114,  10, 106, },
+{ 84,  52,  76,  44,  82,  50,  74,  42, },
+{  0,  96,  24, 120,   6, 102,  30, 126, },
+{ 64,  32,  88,  56,  70,  38,  94,  62, },
+{ 16, 112,   8, 104,  22, 118,  14, 110, },
+{ 80,  48,  72,  40,  86,  54,  78,  46, },
+{  4, 100,  28, 124,   2,  98,  26, 122, },
+};
+#endif
+
+#if 1
+const uint8_t  __attribute__((aligned(8))) dither_8x8_220[8][8]={
+{117,  62, 158, 103, 113,  58, 155, 100, },
+{ 34, 199,  21, 186,  31, 196,  17, 182, },
+{144,  89, 131,  76, 141,  86, 127,  72, },
+{  0, 165,  41, 206,  10, 175,  52, 217, },
+{110,  55, 151,  96, 120,  65, 162, 107, },
+{ 28, 193,  14, 179,  38, 203,  24, 189, },
+{138,  83, 124,  69, 148,  93, 134,  79, },
+{  7, 172,  48, 213,   3, 168,  45, 210, },
+};
+#elif 1
+// tries to correct a gamma of 1.5
+const uint8_t  __attribute__((aligned(8))) dither_8x8_220[8][8]={
+{  0, 143,  18, 200,   2, 156,  25, 215, },
+{ 78,  28, 125,  64,  89,  36, 138,  74, },
+{ 10, 180,   3, 161,  16, 195,   8, 175, },
+{109,  51,  93,  38, 121,  60, 105,  47, },
+{  1, 152,  23, 210,   0, 147,  20, 205, },
+{ 85,  33, 134,  71,  81,  30, 130,  67, },
+{ 14, 190,   6, 171,  12, 185,   5, 166, },
+{117,  57, 101,  44, 113,  54,  97,  41, },
+};
+#elif 1
+// tries to correct a gamma of 2.0
+const uint8_t  __attribute__((aligned(8))) dither_8x8_220[8][8]={
+{  0, 124,   8, 193,   0, 140,  12, 213, },
+{ 55,  14, 104,  42,  66,  19, 119,  52, },
+{  3, 168,   1, 145,   6, 187,   3, 162, },
+{ 86,  31,  70,  21,  99,  39,  82,  28, },
+{  0, 134,  11, 206,   0, 129,   9, 200, },
+{ 62,  17, 114,  48,  58,  16, 109,  45, },
+{  5, 181,   2, 157,   4, 175,   1, 151, },
+{ 95,  36,  78,  26,  90,  34,  74,  24, },
+};
+#else
+// tries to correct a gamma of 2.5
+const uint8_t  __attribute__((aligned(8))) dither_8x8_220[8][8]={
+{  0, 107,   3, 187,   0, 125,   6, 212, },
+{ 39,   7,  86,  28,  49,  11, 102,  36, },
+{  1, 158,   0, 131,   3, 180,   1, 151, },
+{ 68,  19,  52,  12,  81,  25,  64,  17, },
+{  0, 119,   5, 203,   0, 113,   4, 195, },
+{ 45,   9,  96,  33,  42,   8,  91,  30, },
+{  2, 172,   1, 144,   2, 165,   0, 137, },
+{ 77,  23,  60,  15,  72,  21,  56,  14, },
+};
+#endif
 
 const char *sws_format_name(enum PixelFormat format)
 {
@@ -312,6 +452,16 @@ const char *sws_format_name(enum PixelFormat format)
             return "nv21";
         case PIX_FMT_YUV440P:
             return "yuv440p";
+        case PIX_FMT_VDPAU_H264:
+            return "vdpau_h264";
+        case PIX_FMT_VDPAU_MPEG1:
+            return "vdpau_mpeg1";
+        case PIX_FMT_VDPAU_MPEG2:
+            return "vdpau_mpeg2";
+        case PIX_FMT_VDPAU_WMV3:
+            return "vdpau_wmv3";
+        case PIX_FMT_VDPAU_VC1:
+            return "vdpau_vc1";
         default:
             return "Unknown format";
     }
@@ -321,7 +471,7 @@ static inline void yuv2yuvXinC(int16_t *lumFilter, int16_t **lumSrc, int lumFilt
                                int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
 {
-    //FIXME Optimize (just quickly writen not opti..)
+    //FIXME Optimize (just quickly written not optimized..)
     int i;
     for (i=0; i<dstW; i++)
     {
@@ -354,7 +504,7 @@ static inline void yuv2nv12XinC(int16_t *lumFilter, int16_t **lumSrc, int lumFil
                                 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
 {
-    //FIXME Optimize (just quickly writen not opti..)
+    //FIXME Optimize (just quickly written not optimized..)
     int i;
     for (i=0; i<dstW; i++)
     {
@@ -401,7 +551,7 @@ static inline void yuv2nv12XinC(int16_t *lumFilter, int16_t **lumSrc, int lumFil
         }
 }
 
-#define YSCALE_YUV_2_PACKEDX_C(type) \
+#define YSCALE_YUV_2_PACKEDX_NOCLIP_C(type) \
     for (i=0; i<(dstW>>1); i++){\
         int j;\
         int Y1 = 1<<18;\
@@ -425,6 +575,9 @@ static inline void yuv2nv12XinC(int16_t *lumFilter, int16_t **lumSrc, int lumFil
         Y2>>=19;\
         U >>=19;\
         V >>=19;\
+
+#define YSCALE_YUV_2_PACKEDX_C(type) \
+        YSCALE_YUV_2_PACKEDX_NOCLIP_C(type)\
         if ((Y1|Y2|U|V)&256)\
         {\
             if (Y1>255)   Y1=255; \
@@ -437,8 +590,70 @@ static inline void yuv2nv12XinC(int16_t *lumFilter, int16_t **lumSrc, int lumFil
             else if (V<0) V=0;    \
         }
 
+#define YSCALE_YUV_2_PACKEDX_FULL_C /* opens a loop over every output pixel and leaves filtered Y,U,V in scope; the user closes the loop with '}' */ \
+    for (i=0; i<dstW; i++){\
+        int j;\
+        int Y = 0;\
+        int U = -(128<<19); /* negate after shifting: left-shifting a negative value is undefined in C */\
+        int V = -(128<<19); /* same bias as U */\
+        int R,G,B; /* not set here; declared for the macros layered on top of this one */\
+        \
+        for (j=0; j<lumFilterSize; j++){\
+            Y += lumSrc[j][i     ] * lumFilter[j];\
+        }\
+        for (j=0; j<chrFilterSize; j++){\
+            U += chrSrc[j][i     ] * chrFilter[j];\
+            V += chrSrc[j][i+VOFW] * chrFilter[j]; /* V samples are read VOFW entries after the U samples of the same line */\
+        }\
+        Y >>=10;\
+        U >>=10;\
+        V >>=10;\
+
+#define YSCALE_YUV_2_RGBX_FULL_C(rnd) /* YSCALE_YUV_2_PACKEDX_FULL_C plus conversion to clamped R,G,B; the loop body is still open afterwards */ \
+    YSCALE_YUV_2_PACKEDX_FULL_C\
+        Y-= c->yuv2rgb_y_offset;\
+        Y*= c->yuv2rgb_y_coeff;\
+        Y+= rnd; /* added to luma once, so it ends up in R, G and B alike */\
+        R= Y + V*c->yuv2rgb_v2r_coeff;\
+        G= Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff;\
+        B= Y +                          U*c->yuv2rgb_u2b_coeff;\
+        if ((R|G|B)&(0xC0000000)){ /* clamp only when some channel has one of its top two bits set */\
+            if (R>=(256<<22))   R=(256<<22)-1; \
+            else if (R<0)R=0;   \
+            if (G>=(256<<22))   G=(256<<22)-1; \
+            else if (G<0)G=0;   \
+            if (B>=(256<<22))   B=(256<<22)-1; \
+            else if (B<0)B=0;   \
+        }\
+
+
+#define YSCALE_YUV_2_GRAY16_C /* opens a loop over output pixel pairs and leaves Y1,Y2 clipped to 0..65535; the user closes the loop with '}' */ \
+    for (i=0; i<(dstW>>1); i++){\
+        int j;\
+        int Y1 = 1<<18;\
+        int Y2 = 1<<18;\
+        int U  = 1<<18; /* never modified below */\
+        int V  = 1<<18; /* never modified below */\
+        \
+        const int i2= 2*i;\
+        \
+        for (j=0; j<lumFilterSize; j++)\
+        {\
+            Y1 += lumSrc[j][i2] * lumFilter[j];\
+            Y2 += lumSrc[j][i2+1] * lumFilter[j];\
+        }\
+        Y1>>=11;\
+        Y2>>=11;\
+        if ((Y1|Y2|U|V)&65536) /* U and V are the constant 1<<18 here, so only Y1/Y2 can set bit 16 */\
+        {\
+            if (Y1>65535)   Y1=65535; \
+            else if (Y1<0)Y1=0;   \
+            if (Y2>65535)   Y2=65535; \
+            else if (Y2<0)Y2=0;   \
+        }
+
 #define YSCALE_YUV_2_RGBX_C(type) \
-    YSCALE_YUV_2_PACKEDX_C(type)  \
+    YSCALE_YUV_2_PACKEDX_C(type)  /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/\
     r = (type *)c->table_rV[V];   \
     g = (type *)(c->table_gU[U] + c->table_gV[V]); \
     b = (type *)c->table_bU[U];   \
@@ -451,6 +666,12 @@ static inline void yuv2nv12XinC(int16_t *lumFilter, int16_t **lumSrc, int lumFil
         int U= (uvbuf0[i     ]*uvalpha1+uvbuf1[i     ]*uvalpha)>>19;  \
         int V= (uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19;  \
 
+#define YSCALE_YUV_2_GRAY16_2_C   /* Y1,Y2 = buf0/buf1 samples weighted by yalpha1/yalpha, shifted right by 11; loop left open for the user to close */ \
+    for (i=0; i<(dstW>>1); i++){ \
+        const int i2= 2*i;       \
+        int Y1= (buf0[i2  ]*yalpha1+buf1[i2  ]*yalpha)>>11;           \
+        int Y2= (buf0[i2+1]*yalpha1+buf1[i2+1]*yalpha)>>11;           \
+
 #define YSCALE_YUV_2_RGB2_C(type) \
     YSCALE_YUV_2_PACKED2_C\
     type *r, *b, *g;\
@@ -466,6 +687,12 @@ static inline void yuv2nv12XinC(int16_t *lumFilter, int16_t **lumSrc, int lumFil
         int U= (uvbuf1[i     ])>>7;\
         int V= (uvbuf1[i+VOFW])>>7;\
 
+#define YSCALE_YUV_2_GRAY16_1_C /* single-buffer variant: Y1,Y2 are buf0 samples shifted left by one; loop left open for the user to close */ \
+    for (i=0; i<(dstW>>1); i++){\
+        const int i2= 2*i;\
+        int Y1= buf0[i2  ]<<1;\
+        int Y2= buf0[i2+1]<<1;\
+
 #define YSCALE_YUV_2_RGB1_C(type) \
     YSCALE_YUV_2_PACKED1_C\
     type *r, *b, *g;\
@@ -488,11 +715,63 @@ static inline void yuv2nv12XinC(int16_t *lumFilter, int16_t **lumSrc, int lumFil
     g = (type *)(c->table_gU[U] + c->table_gV[V]);\
     b = (type *)c->table_bU[U];\
 
-#define YSCALE_YUV_2_ANYRGB_C(func, func2)\
+#define YSCALE_YUV_2_MONO2_C /* packs 8 dithered luma samples (blend of buf0/buf1) into each output byte */ \
+    const uint8_t * const d128=dither_8x8_220[y&7]; /* dither row chosen by the low 3 bits of y */\
+    uint8_t *g= c->table_gU[128] + c->table_gV[128];\
+    for (i=0; i<dstW-7; i+=8){ /* whole groups of 8 pixels only; a trailing partial group is not written */\
+        int acc;\
+        acc =       g[((buf0[i  ]*yalpha1+buf1[i  ]*yalpha)>>19) + d128[0]];\
+        acc+= acc + g[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19) + d128[1]]; /* acc+= acc + x  ==  acc = 2*acc + x */\
+        acc+= acc + g[((buf0[i+2]*yalpha1+buf1[i+2]*yalpha)>>19) + d128[2]];\
+        acc+= acc + g[((buf0[i+3]*yalpha1+buf1[i+3]*yalpha)>>19) + d128[3]];\
+        acc+= acc + g[((buf0[i+4]*yalpha1+buf1[i+4]*yalpha)>>19) + d128[4]];\
+        acc+= acc + g[((buf0[i+5]*yalpha1+buf1[i+5]*yalpha)>>19) + d128[5]];\
+        acc+= acc + g[((buf0[i+6]*yalpha1+buf1[i+6]*yalpha)>>19) + d128[6]];\
+        acc+= acc + g[((buf0[i+7]*yalpha1+buf1[i+7]*yalpha)>>19) + d128[7]];\
+        ((uint8_t*)dest)[0]= c->dstFormat == PIX_FMT_MONOBLACK ? acc : ~acc; /* any other format gets the inverted bits */\
+        dest++;\
+    }\
+
+
+#define YSCALE_YUV_2_MONOX_C /* vertically filters luma, dithers it and packs 8 pixels into each output byte */ \
+    const uint8_t * const d128=dither_8x8_220[y&7]; /* dither row chosen by the low 3 bits of y */\
+    uint8_t *g= c->table_gU[128] + c->table_gV[128];\
+    int acc=0; /* never reset; only its low 8 bits survive the uint8_t store below */\
+    for (i=0; i<dstW-1; i+=2){ /* two pixels per iteration */\
+        int j;\
+        int Y1=1<<18;\
+        int Y2=1<<18;\
+\
+        for (j=0; j<lumFilterSize; j++)\
+        {\
+            Y1 += lumSrc[j][i] * lumFilter[j];\
+            Y2 += lumSrc[j][i+1] * lumFilter[j];\
+        }\
+        Y1>>=19;\
+        Y2>>=19;\
+        if ((Y1|Y2)&256) /* bit 8 set in either value: clip both to 0..255 */\
+        {\
+            if (Y1>255)   Y1=255;\
+            else if (Y1<0)Y1=0;\
+            if (Y2>255)   Y2=255;\
+            else if (Y2<0)Y2=0;\
+        }\
+        acc+= acc + g[Y1+d128[(i+0)&7]]; /* acc+= acc + x  ==  acc = 2*acc + x */\
+        acc+= acc + g[Y2+d128[(i+1)&7]];\
+        if ((i&7)==6){ /* i is even, so this is true once pixels i and i+1 complete a group of 8 */\
+            ((uint8_t*)dest)[0]= c->dstFormat == PIX_FMT_MONOBLACK ? acc : ~acc;\
+            dest++;\
+        }\
+    }
+
+
+#define YSCALE_YUV_2_ANYRGB_C(func, func2, func_g16, func_monoblack)\
     switch(c->dstFormat)\
     {\
     case PIX_FMT_RGB32:\
     case PIX_FMT_BGR32:\
+    case PIX_FMT_RGB32_1:\
+    case PIX_FMT_BGR32_1:\
         func(uint32_t)\
             ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1];\
             ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2];\
@@ -584,67 +863,9 @@ static inline void yuv2nv12XinC(int16_t *lumFilter, int16_t **lumSrc, int lumFil
         }\
         break;\
     case PIX_FMT_MONOBLACK:\
+    case PIX_FMT_MONOWHITE:\
         {\
-            const uint8_t * const d128=dither_8x8_220[y&7];\
-            uint8_t *g= c->table_gU[128] + c->table_gV[128];\
-            for (i=0; i<dstW-7; i+=8){\
-                int acc;\
-                acc =       g[((buf0[i  ]*yalpha1+buf1[i  ]*yalpha)>>19) + d128[0]];\
-                acc+= acc + g[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19) + d128[1]];\
-                acc+= acc + g[((buf0[i+2]*yalpha1+buf1[i+2]*yalpha)>>19) + d128[2]];\
-                acc+= acc + g[((buf0[i+3]*yalpha1+buf1[i+3]*yalpha)>>19) + d128[3]];\
-                acc+= acc + g[((buf0[i+4]*yalpha1+buf1[i+4]*yalpha)>>19) + d128[4]];\
-                acc+= acc + g[((buf0[i+5]*yalpha1+buf1[i+5]*yalpha)>>19) + d128[5]];\
-                acc+= acc + g[((buf0[i+6]*yalpha1+buf1[i+6]*yalpha)>>19) + d128[6]];\
-                acc+= acc + g[((buf0[i+7]*yalpha1+buf1[i+7]*yalpha)>>19) + d128[7]];\
-                ((uint8_t*)dest)[0]= acc;\
-                dest++;\
-            }\
-\
-/*\
-((uint8_t*)dest)-= dstW>>4;\
-{\
-            int acc=0;\
-            int left=0;\
-            static int top[1024];\
-            static int last_new[1024][1024];\
-            static int last_in3[1024][1024];\
-            static int drift[1024][1024];\
-            int topLeft=0;\
-            int shift=0;\
-            int count=0;\
-            const uint8_t * const d128=dither_8x8_220[y&7];\
-            int error_new=0;\
-            int error_in3=0;\
-            int f=0;\
-            \
-            for (i=dstW>>1; i<dstW; i++){\
-                int in= ((buf0[i  ]*yalpha1+buf1[i  ]*yalpha)>>19);\
-                int in2 = (76309 * (in - 16) + 32768) >> 16;\
-                int in3 = (in2 < 0) ? 0 : ((in2 > 255) ? 255 : in2);\
-                int old= (left*7 + topLeft + top[i]*5 + top[i+1]*3)/20 + in3\
-                         + (last_new[y][i] - in3)*f/256;\
-                int new= old> 128 ? 255 : 0;\
-\
-                error_new+= FFABS(last_new[y][i] - new);\
-                error_in3+= FFABS(last_in3[y][i] - in3);\
-                f= error_new - error_in3*4;\
-                if (f<0) f=0;\
-                if (f>256) f=256;\
-\
-                topLeft= top[i];\
-                left= top[i]= old - new;\
-                last_new[y][i]= new;\
-                last_in3[y][i]= in3;\
-\
-                acc+= acc + (new&1);\
-                if ((i&7)==6){\
-                    ((uint8_t*)dest)[0]= acc;\
-                    ((uint8_t*)dest)++;\
-                }\
-            }\
-}\
-*/\
+            func_monoblack\
         }\
         break;\
     case PIX_FMT_YUYV422:\
@@ -663,6 +884,22 @@ static inline void yuv2nv12XinC(int16_t *lumFilter, int16_t **lumSrc, int lumFil
             ((uint8_t*)dest)[2*i2+3]= Y2;\
         }                \
         break;\
+    case PIX_FMT_GRAY16BE:\
+        func_g16\
+            ((uint8_t*)dest)[2*i2+0]= Y1>>8;\
+            ((uint8_t*)dest)[2*i2+1]= Y1;\
+            ((uint8_t*)dest)[2*i2+2]= Y2>>8;\
+            ((uint8_t*)dest)[2*i2+3]= Y2;\
+        }                \
+        break;\
+    case PIX_FMT_GRAY16LE:\
+        func_g16\
+            ((uint8_t*)dest)[2*i2+0]= Y1;\
+            ((uint8_t*)dest)[2*i2+1]= Y1>>8;\
+            ((uint8_t*)dest)[2*i2+2]= Y2;\
+            ((uint8_t*)dest)[2*i2+3]= Y2>>8;\
+        }                \
+        break;\
     }\
 
 
@@ -671,208 +908,109 @@ static inline void yuv2packedXinC(SwsContext *c, int16_t *lumFilter, int16_t **l
                                   uint8_t *dest, int dstW, int y)
 {
     int i;
-    switch(c->dstFormat)
-    {
-    case PIX_FMT_BGR32:
-    case PIX_FMT_RGB32:
-        YSCALE_YUV_2_RGBX_C(uint32_t)
-            ((uint32_t*)dest)[i2+0]= r[Y1] + g[Y1] + b[Y1];
-            ((uint32_t*)dest)[i2+1]= r[Y2] + g[Y2] + b[Y2];
-        }
-        break;
-    case PIX_FMT_RGB24:
-        YSCALE_YUV_2_RGBX_C(uint8_t)
-            ((uint8_t*)dest)[0]= r[Y1];
-            ((uint8_t*)dest)[1]= g[Y1];
-            ((uint8_t*)dest)[2]= b[Y1];
-            ((uint8_t*)dest)[3]= r[Y2];
-            ((uint8_t*)dest)[4]= g[Y2];
-            ((uint8_t*)dest)[5]= b[Y2];
-            dest+=6;
-        }
-        break;
-    case PIX_FMT_BGR24:
-        YSCALE_YUV_2_RGBX_C(uint8_t)
-            ((uint8_t*)dest)[0]= b[Y1];
-            ((uint8_t*)dest)[1]= g[Y1];
-            ((uint8_t*)dest)[2]= r[Y1];
-            ((uint8_t*)dest)[3]= b[Y2];
-            ((uint8_t*)dest)[4]= g[Y2];
-            ((uint8_t*)dest)[5]= r[Y2];
-            dest+=6;
-        }
-        break;
-    case PIX_FMT_RGB565:
-    case PIX_FMT_BGR565:
-        {
-            const int dr1= dither_2x2_8[y&1    ][0];
-            const int dg1= dither_2x2_4[y&1    ][0];
-            const int db1= dither_2x2_8[(y&1)^1][0];
-            const int dr2= dither_2x2_8[y&1    ][1];
-            const int dg2= dither_2x2_4[y&1    ][1];
-            const int db2= dither_2x2_8[(y&1)^1][1];
-            YSCALE_YUV_2_RGBX_C(uint16_t)
-                ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];
-                ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];
-            }
-        }
-        break;
-    case PIX_FMT_RGB555:
-    case PIX_FMT_BGR555:
-        {
-            const int dr1= dither_2x2_8[y&1    ][0];
-            const int dg1= dither_2x2_8[y&1    ][1];
-            const int db1= dither_2x2_8[(y&1)^1][0];
-            const int dr2= dither_2x2_8[y&1    ][1];
-            const int dg2= dither_2x2_8[y&1    ][0];
-            const int db2= dither_2x2_8[(y&1)^1][1];
-            YSCALE_YUV_2_RGBX_C(uint16_t)
-                ((uint16_t*)dest)[i2+0]= r[Y1+dr1] + g[Y1+dg1] + b[Y1+db1];
-                ((uint16_t*)dest)[i2+1]= r[Y2+dr2] + g[Y2+dg2] + b[Y2+db2];
-            }
-        }
-        break;
-    case PIX_FMT_RGB8:
-    case PIX_FMT_BGR8:
-        {
-            const uint8_t * const d64= dither_8x8_73[y&7];
-            const uint8_t * const d32= dither_8x8_32[y&7];
-            YSCALE_YUV_2_RGBX_C(uint8_t)
-                ((uint8_t*)dest)[i2+0]= r[Y1+d32[(i2+0)&7]] + g[Y1+d32[(i2+0)&7]] + b[Y1+d64[(i2+0)&7]];
-                ((uint8_t*)dest)[i2+1]= r[Y2+d32[(i2+1)&7]] + g[Y2+d32[(i2+1)&7]] + b[Y2+d64[(i2+1)&7]];
-            }
-        }
-        break;
-    case PIX_FMT_RGB4:
-    case PIX_FMT_BGR4:
-        {
-            const uint8_t * const d64= dither_8x8_73 [y&7];
-            const uint8_t * const d128=dither_8x8_220[y&7];
-            YSCALE_YUV_2_RGBX_C(uint8_t)
-                ((uint8_t*)dest)[i]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]]
-                                  +((r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]])<<4);
-            }
-        }
-        break;
-    case PIX_FMT_RGB4_BYTE:
-    case PIX_FMT_BGR4_BYTE:
-        {
-            const uint8_t * const d64= dither_8x8_73 [y&7];
-            const uint8_t * const d128=dither_8x8_220[y&7];
-            YSCALE_YUV_2_RGBX_C(uint8_t)
-                ((uint8_t*)dest)[i2+0]= r[Y1+d128[(i2+0)&7]] + g[Y1+d64[(i2+0)&7]] + b[Y1+d128[(i2+0)&7]];
-                ((uint8_t*)dest)[i2+1]= r[Y2+d128[(i2+1)&7]] + g[Y2+d64[(i2+1)&7]] + b[Y2+d128[(i2+1)&7]];
-            }
-        }
-        break;
-    case PIX_FMT_MONOBLACK:
-        {
-            const uint8_t * const d128=dither_8x8_220[y&7];
-            uint8_t *g= c->table_gU[128] + c->table_gV[128];
-            int acc=0;
-            for (i=0; i<dstW-1; i+=2){
-                int j;
-                int Y1=1<<18;
-                int Y2=1<<18;
+    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGBX_C, YSCALE_YUV_2_PACKEDX_C(void), YSCALE_YUV_2_GRAY16_C, YSCALE_YUV_2_MONOX_C)
+}
 
-                for (j=0; j<lumFilterSize; j++)
-                {
-                    Y1 += lumSrc[j][i] * lumFilter[j];
-                    Y2 += lumSrc[j][i+1] * lumFilter[j];
-                }
-                Y1>>=19;
-                Y2>>=19;
-                if ((Y1|Y2)&256)
-                {
-                    if (Y1>255)   Y1=255;
-                    else if (Y1<0)Y1=0;
-                    if (Y2>255)   Y2=255;
-                    else if (Y2<0)Y2=0;
-                }
-                acc+= acc + g[Y1+d128[(i+0)&7]];
-                acc+= acc + g[Y2+d128[(i+1)&7]];
-                if ((i&7)==6){
-                    ((uint8_t*)dest)[0]= acc;
-                    dest++;
-                }
-            }
+static inline void yuv2rgbXinC_full(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
+                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
+                                    uint8_t *dest, int dstW, int y)
+{
+    int i;
+    int step= fmt_depth(c->dstFormat)/8; /* dest advances by this many bytes per pixel */
+    int aidx= 3; /* offset of the alpha byte relative to dest; 3 suits RGBA/BGRA */
+    /* Filter one output line and store it as packed 24/32-bit RGB or BGR; the case labels below fall through on purpose. */
+    switch(c->dstFormat){
+    case PIX_FMT_ARGB:
+        dest++; /* step over the leading alpha byte so dest[0..2] are the colour bytes */
+        aidx= 0; /* becomes -1 after the decrement below: alpha is the byte just before dest (was -1 -> -2, clobbering the previous pixel) */
+    case PIX_FMT_RGB24:
+        aidx--; /* plain RGB24: 3 -> 2, a scratch store that dest[2] overwrites right after */
+    case PIX_FMT_RGBA:
+        YSCALE_YUV_2_RGBX_FULL_C(1<<21)
+            dest[aidx]= 255;
+            dest[0]= R>>22;
+            dest[1]= G>>22;
+            dest[2]= B>>22;
+            dest+= step;
         }
         break;
-    case PIX_FMT_YUYV422:
-        YSCALE_YUV_2_PACKEDX_C(void)
-            ((uint8_t*)dest)[2*i2+0]= Y1;
-            ((uint8_t*)dest)[2*i2+1]= U;
-            ((uint8_t*)dest)[2*i2+2]= Y2;
-            ((uint8_t*)dest)[2*i2+3]= V;
-        }
-        break;
-    case PIX_FMT_UYVY422:
-        YSCALE_YUV_2_PACKEDX_C(void)
-            ((uint8_t*)dest)[2*i2+0]= U;
-            ((uint8_t*)dest)[2*i2+1]= Y1;
-            ((uint8_t*)dest)[2*i2+2]= V;
-            ((uint8_t*)dest)[2*i2+3]= Y2;
+    case PIX_FMT_ABGR:
+        dest++; /* step over the leading alpha byte so dest[0..2] are the colour bytes */
+        aidx= 0; /* becomes -1 after the decrement below, same reasoning as for ARGB */
+    case PIX_FMT_BGR24:
+        aidx--; /* plain BGR24: 3 -> 2, a scratch store that dest[2] overwrites right after */
+    case PIX_FMT_BGRA:
+        YSCALE_YUV_2_RGBX_FULL_C(1<<21)
+            dest[aidx]= 255;
+            dest[0]= B>>22;
+            dest[1]= G>>22;
+            dest[2]= R>>22;
+            dest+= step;
         }
         break;
+    default:
+        assert(0);
     }
 }
 
-
-//Note: we have C, X86, MMX, MMX2, 3DNOW version therse no 3DNOW+MMX2 one
+//Note: we have C, X86, MMX, MMX2, 3DNOW versions, there is no 3DNOW+MMX2 one
 //Plain C versions
-#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT) || !defined(CONFIG_GPL)
+#if !HAVE_MMX || defined (RUNTIME_CPUDETECT) || !CONFIG_GPL
 #define COMPILE_C
 #endif
 
-#ifdef ARCH_POWERPC
-#if (defined (HAVE_ALTIVEC) || defined (RUNTIME_CPUDETECT)) && defined (CONFIG_GPL)
+#if ARCH_PPC
+#if (HAVE_ALTIVEC || defined (RUNTIME_CPUDETECT)) && CONFIG_GPL
+#undef COMPILE_C
 #define COMPILE_ALTIVEC
-#endif //HAVE_ALTIVEC
-#endif //ARCH_POWERPC
+#endif
+#endif //ARCH_PPC
 
-#if defined(ARCH_X86)
+#if ARCH_X86
 
-#if ((defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)) && defined (CONFIG_GPL)
+#if ((HAVE_MMX && !HAVE_AMD3DNOW && !HAVE_MMX2) || defined (RUNTIME_CPUDETECT)) && CONFIG_GPL
 #define COMPILE_MMX
 #endif
 
-#if (defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)) && defined (CONFIG_GPL)
+#if (HAVE_MMX2 || defined (RUNTIME_CPUDETECT)) && CONFIG_GPL
 #define COMPILE_MMX2
 #endif
 
-#if ((defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)) && defined (CONFIG_GPL)
+#if ((HAVE_AMD3DNOW && !HAVE_MMX2) || defined (RUNTIME_CPUDETECT)) && CONFIG_GPL
 #define COMPILE_3DNOW
 #endif
-#endif //ARCH_X86 || ARCH_X86_64
+#endif //ARCH_X86
 
 #undef HAVE_MMX
 #undef HAVE_MMX2
-#undef HAVE_3DNOW
+#undef HAVE_AMD3DNOW
+#undef HAVE_ALTIVEC
+#define HAVE_MMX 0
+#define HAVE_MMX2 0
+#define HAVE_AMD3DNOW 0
+#define HAVE_ALTIVEC 0
 
 #ifdef COMPILE_C
-#undef HAVE_MMX
-#undef HAVE_MMX2
-#undef HAVE_3DNOW
-#undef HAVE_ALTIVEC
 #define RENAME(a) a ## _C
 #include "swscale_template.c"
 #endif
 
 #ifdef COMPILE_ALTIVEC
 #undef RENAME
-#define HAVE_ALTIVEC
+#undef HAVE_ALTIVEC
+#define HAVE_ALTIVEC 1
 #define RENAME(a) a ## _altivec
 #include "swscale_template.c"
 #endif
 
-#if defined(ARCH_X86)
+#if ARCH_X86
 
-//X86 versions
+//x86 versions
 /*
 #undef RENAME
 #undef HAVE_MMX
 #undef HAVE_MMX2
-#undef HAVE_3DNOW
+#undef HAVE_AMD3DNOW
 #define ARCH_X86
 #define RENAME(a) a ## _X86
 #include "swscale_template.c"
@@ -880,9 +1018,12 @@ static inline void yuv2packedXinC(SwsContext *c, int16_t *lumFilter, int16_t **l
 //MMX versions
 #ifdef COMPILE_MMX
 #undef RENAME
-#define HAVE_MMX
+#undef HAVE_MMX
 #undef HAVE_MMX2
-#undef HAVE_3DNOW
+#undef HAVE_AMD3DNOW
+#define HAVE_MMX 1
+#define HAVE_MMX2 0
+#define HAVE_AMD3DNOW 0
 #define RENAME(a) a ## _MMX
 #include "swscale_template.c"
 #endif
@@ -890,9 +1031,12 @@ static inline void yuv2packedXinC(SwsContext *c, int16_t *lumFilter, int16_t **l
 //MMX2 versions
 #ifdef COMPILE_MMX2
 #undef RENAME
-#define HAVE_MMX
-#define HAVE_MMX2
-#undef HAVE_3DNOW
+#undef HAVE_MMX
+#undef HAVE_MMX2
+#undef HAVE_AMD3DNOW
+#define HAVE_MMX 1
+#define HAVE_MMX2 1
+#define HAVE_AMD3DNOW 0
 #define RENAME(a) a ## _MMX2
 #include "swscale_template.c"
 #endif
@@ -900,16 +1044,19 @@ static inline void yuv2packedXinC(SwsContext *c, int16_t *lumFilter, int16_t **l
 //3DNOW versions
 #ifdef COMPILE_3DNOW
 #undef RENAME
-#define HAVE_MMX
+#undef HAVE_MMX
 #undef HAVE_MMX2
-#define HAVE_3DNOW
+#undef HAVE_AMD3DNOW
+#define HAVE_MMX 1
+#define HAVE_MMX2 0
+#define HAVE_AMD3DNOW 1
 #define RENAME(a) a ## _3DNow
 #include "swscale_template.c"
 #endif
 
-#endif //ARCH_X86 || ARCH_X86_64
+#endif //ARCH_X86
 
-// minor note: the HAVE_xyz is messed up after that line so don't use it
+// minor note: the HAVE_xyz are messed up after this line so don't use them
 
 static double getSplineCoeff(double a, double b, double c, double d, double dist)
 {
@@ -930,27 +1077,27 @@ static inline int initFilter(int16_t **outFilter, int16_t **filterPos, int *outF
     int filterSize;
     int filter2Size;
     int minFilterSize;
-    double *filter=NULL;
-    double *filter2=NULL;
+    int64_t *filter=NULL;
+    int64_t *filter2=NULL;
+    const int64_t fone= 1LL<<54;
     int ret= -1;
-#if defined(ARCH_X86)
+#if ARCH_X86
     if (flags & SWS_CPU_CAPS_MMX)
-        asm volatile("emms\n\t"::: "memory"); //FIXME this should not be required but it IS (even for non-MMX versions)
+        __asm__ volatile("emms\n\t"::: "memory"); //FIXME this should not be required but it IS (even for non-MMX versions)
 #endif
 
-    // Note the +1 is for the MMXscaler which reads over the end
+    // NOTE: the +1 is for the MMX scaler which reads over the end
     *filterPos = av_malloc((dstW+1)*sizeof(int16_t));
 
     if (FFABS(xInc - 0x10000) <10) // unscaled
     {
         int i;
         filterSize= 1;
-        filter= av_malloc(dstW*sizeof(double)*filterSize);
-        for (i=0; i<dstW*filterSize; i++) filter[i]=0;
+        filter= av_mallocz(dstW*sizeof(*filter)*filterSize);
 
         for (i=0; i<dstW; i++)
         {
-            filter[i*filterSize]=1;
+            filter[i*filterSize]= fone;
             (*filterPos)[i]=i;
         }
 
@@ -960,7 +1107,7 @@ static inline int initFilter(int16_t **outFilter, int16_t **filterPos, int *outF
         int i;
         int xDstInSrc;
         filterSize= 1;
-        filter= av_malloc(dstW*sizeof(double)*filterSize);
+        filter= av_malloc(dstW*sizeof(*filter)*filterSize);
 
         xDstInSrc= xInc/2 - 0x8000;
         for (i=0; i<dstW; i++)
@@ -968,7 +1115,7 @@ static inline int initFilter(int16_t **outFilter, int16_t **filterPos, int *outF
             int xx= (xDstInSrc - ((filterSize-1)<<15) + (1<<15))>>16;
 
             (*filterPos)[i]= xx;
-            filter[i]= 1.0;
+            filter[i]= fone;
             xDstInSrc+= xInc;
         }
     }
@@ -979,7 +1126,7 @@ static inline int initFilter(int16_t **outFilter, int16_t **filterPos, int *outF
         if      (flags&SWS_BICUBIC) filterSize= 4;
         else if (flags&SWS_X      ) filterSize= 4;
         else                        filterSize= 2; // SWS_BILINEAR / SWS_AREA
-        filter= av_malloc(dstW*sizeof(double)*filterSize);
+        filter= av_malloc(dstW*sizeof(*filter)*filterSize);
 
         xDstInSrc= xInc/2 - 0x8000;
         for (i=0; i<dstW; i++)
@@ -988,11 +1135,10 @@ static inline int initFilter(int16_t **outFilter, int16_t **filterPos, int *outF
             int j;
 
             (*filterPos)[i]= xx;
-                //Bilinear upscale / linear interpolate / Area averaging
+                //bilinear upscale / linear interpolate / area averaging
                 for (j=0; j<filterSize; j++)
                 {
-                    double d= FFABS((xx<<16) - xDstInSrc)/(double)(1<<16);
-                    double coeff= 1.0 - d;
+                    int64_t coeff= fone - FFABS((xx<<16) - xDstInSrc)*(fone>>16);
                     if (coeff<0) coeff=0;
                     filter[i*filterSize + j]= coeff;
                     xx++;
@@ -1002,52 +1148,59 @@ static inline int initFilter(int16_t **outFilter, int16_t **filterPos, int *outF
     }
     else
     {
-        double xDstInSrc;
-        double sizeFactor, filterSizeInSrc;
-        const double xInc1= (double)xInc / (double)(1<<16);
+        int xDstInSrc;
+        int sizeFactor;
 
-        if      (flags&SWS_BICUBIC)      sizeFactor=  4.0;
-        else if (flags&SWS_X)            sizeFactor=  8.0;
-        else if (flags&SWS_AREA)         sizeFactor=  1.0; //downscale only, for upscale it is bilinear
-        else if (flags&SWS_GAUSS)        sizeFactor=  8.0;   // infinite ;)
-        else if (flags&SWS_LANCZOS)      sizeFactor= param[0] != SWS_PARAM_DEFAULT ? 2.0*param[0] : 6.0;
-        else if (flags&SWS_SINC)         sizeFactor= 20.0; // infinite ;)
-        else if (flags&SWS_SPLINE)       sizeFactor= 20.0;  // infinite ;)
-        else if (flags&SWS_BILINEAR)     sizeFactor=  2.0;
+        if      (flags&SWS_BICUBIC)      sizeFactor=  4;
+        else if (flags&SWS_X)            sizeFactor=  8;
+        else if (flags&SWS_AREA)         sizeFactor=  1; //downscale only, for upscale it is bilinear
+        else if (flags&SWS_GAUSS)        sizeFactor=  8;   // infinite ;)
+        else if (flags&SWS_LANCZOS)      sizeFactor= param[0] != SWS_PARAM_DEFAULT ? ceil(2*param[0]) : 6;
+        else if (flags&SWS_SINC)         sizeFactor= 20; // infinite ;)
+        else if (flags&SWS_SPLINE)       sizeFactor= 20;  // infinite ;)
+        else if (flags&SWS_BILINEAR)     sizeFactor=  2;
         else {
-            sizeFactor= 0.0; //GCC warning killer
+            sizeFactor= 0; //GCC warning killer
             assert(0);
         }
 
-        if (xInc1 <= 1.0)       filterSizeInSrc= sizeFactor; // upscale
-        else                    filterSizeInSrc= sizeFactor*srcW / (double)dstW;
+        if (xInc <= 1<<16)      filterSize= 1 + sizeFactor; // upscale
+        else                    filterSize= 1 + (sizeFactor*srcW + dstW - 1)/ dstW;
 
-        filterSize= (int)ceil(1 + filterSizeInSrc); // will be reduced later if possible
         if (filterSize > srcW-2) filterSize=srcW-2;
 
-        filter= av_malloc(dstW*sizeof(double)*filterSize);
+        filter= av_malloc(dstW*sizeof(*filter)*filterSize);
 
-        xDstInSrc= xInc1 / 2.0 - 0.5;
+        xDstInSrc= xInc - 0x10000;
         for (i=0; i<dstW; i++)
         {
-            int xx= (int)(xDstInSrc - (filterSize-1)*0.5 + 0.5);
+            int xx= (xDstInSrc - ((filterSize-2)<<16)) / (1<<17);
             int j;
             (*filterPos)[i]= xx;
             for (j=0; j<filterSize; j++)
             {
-                double d= FFABS(xx - xDstInSrc)/filterSizeInSrc*sizeFactor;
-                double coeff;
+                int64_t d= ((int64_t)FFABS((xx<<17) - xDstInSrc))<<13;
+                double floatd;
+                int64_t coeff;
+
+                if (xInc > 1<<16)
+                    d= d*dstW/srcW;
+                floatd= d * (1.0/(1<<30));
+
                 if (flags & SWS_BICUBIC)
                 {
-                    double B= param[0] != SWS_PARAM_DEFAULT ? param[0] : 0.0;
-                    double C= param[1] != SWS_PARAM_DEFAULT ? param[1] : 0.6;
+                    int64_t B= (param[0] != SWS_PARAM_DEFAULT ? param[0] :   0) * (1<<24);
+                    int64_t C= (param[1] != SWS_PARAM_DEFAULT ? param[1] : 0.6) * (1<<24);
+                    int64_t dd = ( d*d)>>30;
+                    int64_t ddd= (dd*d)>>30;
 
-                    if (d<1.0)
-                        coeff = (12-9*B-6*C)*d*d*d + (-18+12*B+6*C)*d*d + 6-2*B;
-                    else if (d<2.0)
-                        coeff = (-B-6*C)*d*d*d + (6*B+30*C)*d*d + (-12*B-48*C)*d +8*B+24*C;
+                    if      (d < 1LL<<30)
+                        coeff = (12*(1<<24)-9*B-6*C)*ddd + (-18*(1<<24)+12*B+6*C)*dd + (6*(1<<24)-2*B)*(1<<30);
+                    else if (d < 1LL<<31)
+                        coeff = (-B-6*C)*ddd + (6*B+30*C)*dd + (-12*B-48*C)*d + (8*B+24*C)*(1<<30);
                     else
                         coeff=0.0;
+                    coeff *= fone>>(30+24);
                 }
 /*                else if (flags & SWS_X)
                 {
@@ -1058,46 +1211,49 @@ static inline int initFilter(int16_t **outFilter, int16_t **filterPos, int *outF
                 else if (flags & SWS_X)
                 {
                     double A= param[0] != SWS_PARAM_DEFAULT ? param[0] : 1.0;
+                    double c;
 
-                    if (d<1.0)
-                        coeff = cos(d*PI);
+                    if (floatd<1.0)
+                        c = cos(floatd*PI);
                     else
-                        coeff=-1.0;
-                    if (coeff<0.0)      coeff= -pow(-coeff, A);
-                    else                coeff=  pow( coeff, A);
-                    coeff= coeff*0.5 + 0.5;
+                        c=-1.0;
+                    if (c<0.0)      c= -pow(-c, A);
+                    else            c=  pow( c, A);
+                    coeff= (c*0.5 + 0.5)*fone;
                 }
                 else if (flags & SWS_AREA)
                 {
-                    double srcPixelSize= 1.0/xInc1;
-                    if      (d + srcPixelSize/2 < 0.5) coeff= 1.0;
-                    else if (d - srcPixelSize/2 < 0.5) coeff= (0.5-d)/srcPixelSize + 0.5;
+                    int64_t d2= d - (1<<29);
+                    if      (d2*xInc < -(1LL<<(29+16))) coeff= 1.0 * (1LL<<(30+16));
+                    else if (d2*xInc <  (1LL<<(29+16))) coeff= -d2*xInc + (1LL<<(29+16));
                     else coeff=0.0;
+                    coeff *= fone>>(30+16);
                 }
                 else if (flags & SWS_GAUSS)
                 {
                     double p= param[0] != SWS_PARAM_DEFAULT ? param[0] : 3.0;
-                    coeff = pow(2.0, - p*d*d);
+                    coeff = (pow(2.0, - p*floatd*floatd))*fone;
                 }
                 else if (flags & SWS_SINC)
                 {
-                    coeff = d ? sin(d*PI)/(d*PI) : 1.0;
+                    coeff = (d ? sin(floatd*PI)/(floatd*PI) : 1.0)*fone;
                 }
                 else if (flags & SWS_LANCZOS)
                 {
                     double p= param[0] != SWS_PARAM_DEFAULT ? param[0] : 3.0;
-                    coeff = d ? sin(d*PI)*sin(d*PI/p)/(d*d*PI*PI/p) : 1.0;
-                    if (d>p) coeff=0;
+                    coeff = (d ? sin(floatd*PI)*sin(floatd*PI/p)/(floatd*floatd*PI*PI/p) : 1.0)*fone;
+                    if (floatd>p) coeff=0;
                 }
                 else if (flags & SWS_BILINEAR)
                 {
-                    coeff= 1.0 - d;
+                    coeff= (1<<30) - d;
                     if (coeff<0) coeff=0;
+                    coeff *= fone >> 30;
                 }
                 else if (flags & SWS_SPLINE)
                 {
                     double p=-2.196152422706632;
-                    coeff = getSplineCoeff(1.0, 0.0, p, -p-1.0, d);
+                    coeff = getSplineCoeff(1.0, 0.0, p, -p-1.0, floatd) * fone;
                 }
                 else {
                     coeff= 0.0; //GCC warning killer
@@ -1107,7 +1263,7 @@ static inline int initFilter(int16_t **outFilter, int16_t **filterPos, int *outF
                 filter[i*filterSize + j]= coeff;
                 xx++;
             }
-            xDstInSrc+= xInc1;
+            xDstInSrc+= 2*xInc;
         }
     }
 
@@ -1119,31 +1275,24 @@ static inline int initFilter(int16_t **outFilter, int16_t **filterPos, int *outF
     if (srcFilter) filter2Size+= srcFilter->length - 1;
     if (dstFilter) filter2Size+= dstFilter->length - 1;
     assert(filter2Size>0);
-    filter2= av_malloc(filter2Size*dstW*sizeof(double));
+    filter2= av_mallocz(filter2Size*dstW*sizeof(*filter2));
 
     for (i=0; i<dstW; i++)
     {
-        int j;
-        SwsVector scaleFilter;
-        SwsVector *outVec;
+        int j, k;
 
-        scaleFilter.coeff= filter + i*filterSize;
-        scaleFilter.length= filterSize;
-
-        if (srcFilter) outVec= sws_getConvVec(srcFilter, &scaleFilter);
-        else           outVec= &scaleFilter;
-
-        assert(outVec->length == filter2Size);
+        if(srcFilter){
+            for (k=0; k<srcFilter->length; k++){
+                for (j=0; j<filterSize; j++)
+                    filter2[i*filter2Size + k + j] += srcFilter->coeff[k]*filter[i*filterSize + j];
+            }
+        }else{
+            for (j=0; j<filterSize; j++)
+                filter2[i*filter2Size + j]= filter[i*filterSize + j];
+        }
         //FIXME dstFilter
 
-        for (j=0; j<outVec->length; j++)
-        {
-            filter2[i*filter2Size + j]= outVec->coeff[j];
-        }
-
         (*filterPos)[i]+= (filterSize-1)/2 - (filter2Size-1)/2;
-
-        if (outVec != &scaleFilter) sws_freeVec(outVec);
     }
     av_freep(&filter);
 
@@ -1154,7 +1303,7 @@ static inline int initFilter(int16_t **outFilter, int16_t **filterPos, int *outF
     {
         int min= filter2Size;
         int j;
-        double cutOff=0.0;
+        int64_t cutOff=0.0;
 
         /* get rid off near zero elements on the left by shifting left */
         for (j=0; j<filter2Size; j++)
@@ -1162,25 +1311,25 @@ static inline int initFilter(int16_t **outFilter, int16_t **filterPos, int *outF
             int k;
             cutOff += FFABS(filter2[i*filter2Size]);
 
-            if (cutOff > SWS_MAX_REDUCE_CUTOFF) break;
+            if (cutOff > SWS_MAX_REDUCE_CUTOFF*fone) break;
 
             /* preserve monotonicity because the core can't handle the filter otherwise */
             if (i<dstW-1 && (*filterPos)[i] >= (*filterPos)[i+1]) break;
 
-            // Move filter coeffs left
+            // move filter coefficients left
             for (k=1; k<filter2Size; k++)
                 filter2[i*filter2Size + k - 1]= filter2[i*filter2Size + k];
-            filter2[i*filter2Size + k - 1]= 0.0;
+            filter2[i*filter2Size + k - 1]= 0;
             (*filterPos)[i]++;
         }
 
-        cutOff=0.0;
+        cutOff=0;
         /* count near zeros on the right */
         for (j=filter2Size-1; j>0; j--)
         {
             cutOff += FFABS(filter2[i*filter2Size + j]);
 
-            if (cutOff > SWS_MAX_REDUCE_CUTOFF) break;
+            if (cutOff > SWS_MAX_REDUCE_CUTOFF*fone) break;
             min--;
         }
 
@@ -1193,10 +1342,10 @@ static inline int initFilter(int16_t **outFilter, int16_t **filterPos, int *outF
         if (minFilterSize < 5)
             filterAlign = 4;
 
-        // we really don't want to waste our time
-        // doing useless computation, so fall-back on
-        // the scalar C code for very small filter.
-        // vectorizing is worth it only if you have
+        // We really don't want to waste our time
+        // doing useless computation, so fall back on
+        // the scalar C code for very small filters.
+        // Vectorizing is worth it only if you have a
         // decent-sized vector.
         if (minFilterSize < 3)
             filterAlign = 1;
@@ -1211,8 +1360,8 @@ static inline int initFilter(int16_t **outFilter, int16_t **filterPos, int *outF
     assert(minFilterSize > 0);
     filterSize= (minFilterSize +(filterAlign-1)) & (~(filterAlign-1));
     assert(filterSize > 0);
-    filter= av_malloc(filterSize*dstW*sizeof(double));
-    if (filterSize >= MAX_FILTER_SIZE || !filter)
+    filter= av_malloc(filterSize*dstW*sizeof(*filter));
+    if (filterSize >= MAX_FILTER_SIZE*16/((flags&SWS_ACCURATE_RND) ? APCK_SIZE : 16) || !filter)
         goto error;
     *outFilterSize= filterSize;
 
@@ -1225,13 +1374,15 @@ static inline int initFilter(int16_t **outFilter, int16_t **filterPos, int *outF
 
         for (j=0; j<filterSize; j++)
         {
-            if (j>=filter2Size) filter[i*filterSize + j]= 0.0;
+            if (j>=filter2Size) filter[i*filterSize + j]= 0;
             else               filter[i*filterSize + j]= filter2[i*filter2Size + j];
+            if((flags & SWS_BITEXACT) && j>=minFilterSize)
+                filter[i*filterSize + j]= 0;
         }
     }
 
 
-    //FIXME try to align filterpos if possible
+    //FIXME try to align filterPos if possible
 
     //fix borders
     for (i=0; i<dstW; i++)
@@ -1239,7 +1390,7 @@ static inline int initFilter(int16_t **outFilter, int16_t **filterPos, int *outF
         int j;
         if ((*filterPos)[i] < 0)
         {
-            // Move filter coeffs left to compensate for filterPos
+            // move filter coefficients left to compensate for filterPos
             for (j=1; j<filterSize; j++)
             {
                 int left= FFMAX(j + (*filterPos)[i], 0);
@@ -1252,7 +1403,7 @@ static inline int initFilter(int16_t **outFilter, int16_t **filterPos, int *outF
         if ((*filterPos)[i] + filterSize > srcW)
         {
             int shift= (*filterPos)[i] + filterSize - srcW;
-            // Move filter coeffs right to compensate for filterPos
+            // move filter coefficients right to compensate for filterPos
             for (j=filterSize-2; j>=0; j--)
             {
                 int right= FFMIN(j + shift, filterSize-1);
@@ -1263,29 +1414,28 @@ static inline int initFilter(int16_t **outFilter, int16_t **filterPos, int *outF
         }
     }
 
-    // Note the +1 is for the MMXscaler which reads over the end
+    // Note the +1 is for the MMX scaler which reads over the end
     /* align at 16 for AltiVec (needed by hScale_altivec_real) */
     *outFilter= av_mallocz(*outFilterSize*(dstW+1)*sizeof(int16_t));
 
-    /* Normalize & Store in outFilter */
+    /* normalize & store in outFilter */
     for (i=0; i<dstW; i++)
     {
         int j;
-        double error=0;
-        double sum=0;
-        double scale= one;
+        int64_t error=0;
+        int64_t sum=0;
 
         for (j=0; j<filterSize; j++)
         {
             sum+= filter[i*filterSize + j];
         }
-        scale/= sum;
+        sum= (sum + one/2)/ one;
         for (j=0; j<*outFilterSize; j++)
         {
-            double v= filter[i*filterSize + j]*scale + error;
-            int intV= floor(v + 0.5);
+            int64_t v= filter[i*filterSize + j] + error;
+            int intV= ROUNDED_DIV(v, sum);
             (*outFilter)[i*(*outFilterSize) + j]= intV;
-            error = v - intV;
+            error= v - intV*sum;
         }
     }
 
@@ -1322,7 +1472,7 @@ static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode, int16_t *fil
 
     //code fragment
 
-    asm volatile(
+    __asm__ volatile(
         "jmp                         9f                 \n\t"
     // Begin
         "0:                                             \n\t"
@@ -1362,7 +1512,7 @@ static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode, int16_t *fil
         "=r" (fragmentLengthA)
     );
 
-    asm volatile(
+    __asm__ volatile(
         "jmp                         9f                 \n\t"
     // Begin
         "0:                                             \n\t"
@@ -1487,8 +1637,8 @@ static void globalInit(void){
 
 static SwsFunc getSwsFunc(int flags){
 
-#if defined(RUNTIME_CPUDETECT) && defined (CONFIG_GPL)
-#if defined(ARCH_X86)
+#if defined(RUNTIME_CPUDETECT) && CONFIG_GPL
+#if ARCH_X86
     // ordered per speed fastest first
     if (flags & SWS_CPU_CAPS_MMX2)
         return swScale_MMX2;
@@ -1500,22 +1650,22 @@ static SwsFunc getSwsFunc(int flags){
         return swScale_C;
 
 #else
-#ifdef ARCH_POWERPC
+#if ARCH_PPC
     if (flags & SWS_CPU_CAPS_ALTIVEC)
         return swScale_altivec;
     else
         return swScale_C;
 #endif
     return swScale_C;
-#endif /* defined(ARCH_X86) */
+#endif /* ARCH_X86 */
 #else //RUNTIME_CPUDETECT
-#ifdef HAVE_MMX2
+#if   HAVE_MMX2
     return swScale_MMX2;
-#elif defined (HAVE_3DNOW)
+#elif HAVE_AMD3DNOW
     return swScale_3DNow;
-#elif defined (HAVE_MMX)
+#elif HAVE_MMX
     return swScale_MMX;
-#elif defined (HAVE_ALTIVEC)
+#elif HAVE_ALTIVEC
     return swScale_altivec;
 #else
     return swScale_C;
@@ -1568,11 +1718,64 @@ static int PlanarToUyvyWrapper(SwsContext *c, uint8_t* src[], int srcStride[], i
     return srcSliceH;
 }
 
-/* {RGB,BGR}{15,16,24,32} -> {RGB,BGR}{15,16,24,32} */
+static int YUV422PToYuy2Wrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
+                                int srcSliceH, uint8_t* dstParam[], int dstStride[]){
+    /* Unscaled YUV422P -> YUYV422 special case; output starts at row srcSliceY of dstParam[0]. */
+    uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
+    /* Only srcStride[1] is passed for chroma; srcStride[2] is not consulted by this wrapper. */
+    yuv422ptoyuy2(src[0],src[1],src[2],dst,c->srcW,srcSliceH,srcStride[0],srcStride[1],dstStride[0]);
+    return srcSliceH;
+}
+
+static int YUV422PToUyvyWrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
+                                int srcSliceH, uint8_t* dstParam[], int dstStride[]){
+    /* Unscaled YUV422P -> UYVY422 special case; output starts at row srcSliceY of dstParam[0]. */
+    uint8_t *dst=dstParam[0] + dstStride[0]*srcSliceY;
+    /* Only srcStride[1] is passed for chroma; srcStride[2] is not consulted by this wrapper. */
+    yuv422ptouyvy(src[0],src[1],src[2],dst,c->srcW,srcSliceH,srcStride[0],srcStride[1],dstStride[0]);
+    return srcSliceH;
+}
+
+/* Unscaled palettized -> packed RGB24/32: converts srcSliceH rows through c->pal_rgb; returns srcSliceH. */
+static int pal2rgbWrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
+                          int srcSliceH, uint8_t* dst[], int dstStride[]){
+    const enum PixelFormat srcFormat= c->srcFormat;
+    const enum PixelFormat dstFormat= c->dstFormat;
+    void (*conv)(const uint8_t *src, uint8_t *dst, long num_pixels,
+                 const uint8_t *palette)=NULL;
+    int i;
+    uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
+    uint8_t *srcPtr= src[0];
+
+    if (!usePal(srcFormat))
+        av_log(c, AV_LOG_ERROR, "internal error %s -> %s converter\n",
+               sws_format_name(srcFormat), sws_format_name(dstFormat));
+
+    switch(dstFormat){
+    case PIX_FMT_RGB32  : case PIX_FMT_BGR32  :
+    case PIX_FMT_RGB32_1: case PIX_FMT_BGR32_1: conv = palette8topacked32; break;
+    case PIX_FMT_RGB24  : case PIX_FMT_BGR24  : conv = palette8topacked24; break;
+    default: av_log(c, AV_LOG_ERROR, "internal error %s -> %s converter\n",
+                    sws_format_name(srcFormat), sws_format_name(dstFormat)); break;
+    }
+
+    /* No converter was selected: the error is already logged above, so skip
+     * the copy instead of calling through a NULL function pointer. */
+    if (!conv)
+        return srcSliceH;
+    for (i=0; i<srcSliceH; i++) {
+        conv(srcPtr, dstPtr, c->srcW, (uint8_t *) c->pal_rgb);
+        srcPtr+= srcStride[0];
+        dstPtr+= dstStride[0];
+    }
+    return srcSliceH;
+}
+
+/* {RGB,BGR}{15,16,24,32,32_1} -> {RGB,BGR}{15,16,24,32} */
 static int rgb2rgbWrapper(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[]){
-    const int srcFormat= c->srcFormat;
-    const int dstFormat= c->dstFormat;
+    const enum PixelFormat srcFormat= c->srcFormat;
+    const enum PixelFormat dstFormat= c->dstFormat;
     const int srcBpp= (fmt_depth(srcFormat) + 7) >> 3;
     const int dstBpp= (fmt_depth(dstFormat) + 7) >> 3;
     const int srcId= fmt_depth(srcFormat) >> 2; /* 1:0, 4:1, 8:2, 15:3, 16:4, 24:6, 32:8 */
@@ -1627,12 +1830,15 @@ static int rgb2rgbWrapper(SwsContext *c, uint8_t* src[], int srcStride[], int sr
 
     if(conv)
     {
+        uint8_t *srcPtr= src[0];
+        if(srcFormat == PIX_FMT_RGB32_1 || srcFormat == PIX_FMT_BGR32_1)
+            srcPtr += ALT32_CORR;
+
         if (dstStride[0]*srcBpp == srcStride[0]*dstBpp && srcStride[0] > 0)
-            conv(src[0], dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
+            conv(srcPtr, dst[0] + dstStride[0]*srcSliceY, srcSliceH*srcStride[0]);
         else
         {
             int i;
-            uint8_t *srcPtr= src[0];
             uint8_t *dstPtr= dst[0] + dstStride[0]*srcSliceY;
 
             for (i=0; i<srcSliceH; i++)
@@ -1870,7 +2076,7 @@ static uint16_t roundToInt16(int64_t f){
 }
 
 /**
- * @param inv_table the yuv2rgb coeffs, normally Inverse_Table_6_9[x]
+ * @param inv_table the yuv2rgb coefficients, normally ff_yuv2rgb_coeffs[x]
  * @param fullRange if 1 then the luma range is 0..255 if 0 it is 16..235
  * @return -1 if not supported
  */
@@ -1882,7 +2088,6 @@ int sws_setColorspaceDetails(SwsContext *c, const int inv_table[4], int srcRange
     int64_t cy  = 1<<16;
     int64_t oy  = 0;
 
-    if (isYUV(c->dstFormat) || isGray(c->dstFormat)) return -1;
     memcpy(c->srcColorspaceTable, inv_table, sizeof(int)*4);
     memcpy(c->dstColorspaceTable,     table, sizeof(int)*4);
 
@@ -1891,6 +2096,7 @@ int sws_setColorspaceDetails(SwsContext *c, const int inv_table[4], int srcRange
     c->saturation= saturation;
     c->srcRange  = srcRange;
     c->dstRange  = dstRange;
+    if (isYUV(c->dstFormat) || isGray(c->dstFormat)) return 0;
 
     c->uOffset=   0x0400040004000400LL;
     c->vOffset=   0x0400040004000400LL;
@@ -1920,12 +2126,19 @@ int sws_setColorspaceDetails(SwsContext *c, const int inv_table[4], int srcRange
     c->ugCoeff=   roundToInt16(cgu*8192) * 0x0001000100010001ULL;
     c->yOffset=   roundToInt16(oy *   8) * 0x0001000100010001ULL;
 
-    yuv2rgb_c_init_tables(c, inv_table, srcRange, brightness, contrast, saturation);
+    c->yuv2rgb_y_coeff  = (int16_t)roundToInt16(cy <<13);
+    c->yuv2rgb_y_offset = (int16_t)roundToInt16(oy << 9);
+    c->yuv2rgb_v2r_coeff= (int16_t)roundToInt16(crv<<13);
+    c->yuv2rgb_v2g_coeff= (int16_t)roundToInt16(cgv<<13);
+    c->yuv2rgb_u2g_coeff= (int16_t)roundToInt16(cgu<<13);
+    c->yuv2rgb_u2b_coeff= (int16_t)roundToInt16(cbu<<13);
+
+    sws_yuv2rgb_c_init_tables(c, inv_table, srcRange, brightness, contrast, saturation);
     //FIXME factorize
 
 #ifdef COMPILE_ALTIVEC
     if (c->flags & SWS_CPU_CAPS_ALTIVEC)
-        yuv2rgb_altivec_init_tables (c, inv_table, brightness, contrast, saturation);
+        sws_yuv2rgb_altivec_init_tables (c, inv_table, brightness, contrast, saturation);
 #endif
     return 0;
 }
@@ -1947,7 +2160,7 @@ int sws_getColorspaceDetails(SwsContext *c, int **inv_table, int *srcRange, int
     return 0;
 }
 
-static int handle_jpeg(int *format)
+static int handle_jpeg(enum PixelFormat *format)
 {
     switch (*format) {
         case PIX_FMT_YUVJ420P:
@@ -1967,7 +2180,7 @@ static int handle_jpeg(int *format)
     }
 }
 
-SwsContext *sws_getContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags,
+SwsContext *sws_getContext(int srcW, int srcH, enum PixelFormat srcFormat, int dstW, int dstH, enum PixelFormat dstFormat, int flags,
                            SwsFilter *srcFilter, SwsFilter *dstFilter, double *param){
 
     SwsContext *c;
@@ -1976,22 +2189,22 @@ SwsContext *sws_getContext(int srcW, int srcH, int srcFormat, int dstW, int dstH
     int unscaled, needsDither;
     int srcRange, dstRange;
     SwsFilter dummyFilter= {NULL, NULL, NULL, NULL};
-#if defined(ARCH_X86)
+#if ARCH_X86
     if (flags & SWS_CPU_CAPS_MMX)
-        asm volatile("emms\n\t"::: "memory");
+        __asm__ volatile("emms\n\t"::: "memory");
 #endif
 
-#if !defined(RUNTIME_CPUDETECT) || !defined (CONFIG_GPL) //ensure that the flags match the compiled variant if cpudetect is off
+#if !defined(RUNTIME_CPUDETECT) || !CONFIG_GPL //ensure that the flags match the compiled variant if cpudetect is off
     flags &= ~(SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_MMX2|SWS_CPU_CAPS_3DNOW|SWS_CPU_CAPS_ALTIVEC|SWS_CPU_CAPS_BFIN);
-#ifdef HAVE_MMX2
+#if   HAVE_MMX2
     flags |= SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_MMX2;
-#elif defined (HAVE_3DNOW)
+#elif HAVE_AMD3DNOW
     flags |= SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_3DNOW;
-#elif defined (HAVE_MMX)
+#elif HAVE_MMX
     flags |= SWS_CPU_CAPS_MMX;
-#elif defined (HAVE_ALTIVEC)
+#elif HAVE_ALTIVEC
     flags |= SWS_CPU_CAPS_ALTIVEC;
-#elif defined (ARCH_BFIN)
+#elif ARCH_BFIN
     flags |= SWS_CPU_CAPS_BFIN;
 #endif
 #endif /* RUNTIME_CPUDETECT */
@@ -2030,11 +2243,10 @@ SwsContext *sws_getContext(int srcW, int srcH, int srcFormat, int dstW, int dstH
                 |SWS_BICUBLIN);
     if(!i || (i & (i-1)))
     {
-        av_log(NULL, AV_LOG_ERROR, "swScaler: Exactly one scaler algorithm must be choosen\n");
+        av_log(NULL, AV_LOG_ERROR, "swScaler: Exactly one scaler algorithm must be chosen\n");
         return NULL;
     }
 
-
     /* sanity check */
     if (srcW<4 || srcH<1 || dstW<8 || dstH<1) //FIXME check if these are enough and try to lowwer them after fixing the relevant parts of the code
     {
@@ -2043,7 +2255,7 @@ SwsContext *sws_getContext(int srcW, int srcH, int srcFormat, int dstW, int dstH
         return NULL;
     }
     if(srcW > VOFW || dstW > VOFW){
-        av_log(NULL, AV_LOG_ERROR, "swScaler: Compile time max width is "AV_STRINGIFY(VOFW)" change VOF/VOFW and recompile\n");
+        av_log(NULL, AV_LOG_ERROR, "swScaler: Compile-time maximum width is "AV_STRINGIFY(VOFW)" change VOF/VOFW and recompile\n");
         return NULL;
     }
 
@@ -2077,18 +2289,19 @@ SwsContext *sws_getContext(int srcW, int srcH, int srcFormat, int dstW, int dstH
     getSubSampleFactors(&c->chrSrcHSubSample, &c->chrSrcVSubSample, srcFormat);
     getSubSampleFactors(&c->chrDstHSubSample, &c->chrDstVSubSample, dstFormat);
 
-    // reuse chroma for 2 pixles rgb/bgr unless user wants full chroma interpolation
+    // reuse chroma for 2 pixels RGB/BGR unless user wants full chroma interpolation
     if ((isBGR(dstFormat) || isRGB(dstFormat)) && !(flags&SWS_FULL_CHR_H_INT)) c->chrDstHSubSample=1;
 
     // drop some chroma lines if the user wants it
     c->vChrDrop= (flags&SWS_SRC_V_CHR_DROP_MASK)>>SWS_SRC_V_CHR_DROP_SHIFT;
     c->chrSrcVSubSample+= c->vChrDrop;
 
-    // drop every 2. pixel for chroma calculation unless user wants full chroma
+    // drop every other pixel for chroma calculation unless user wants full chroma
     if ((isBGR(srcFormat) || isRGB(srcFormat)) && !(flags&SWS_FULL_CHR_H_INP)
       && srcFormat!=PIX_FMT_RGB8      && srcFormat!=PIX_FMT_BGR8
       && srcFormat!=PIX_FMT_RGB4      && srcFormat!=PIX_FMT_BGR4
-      && srcFormat!=PIX_FMT_RGB4_BYTE && srcFormat!=PIX_FMT_BGR4_BYTE)
+      && srcFormat!=PIX_FMT_RGB4_BYTE && srcFormat!=PIX_FMT_BGR4_BYTE
+      && ((dstW>>c->chrDstHSubSample) <= (srcW>>1) || (flags&(SWS_FAST_BILINEAR|SWS_POINT))))
         c->chrSrcHSubSample=1;
 
     if (param){
@@ -2108,34 +2321,33 @@ SwsContext *sws_getContext(int srcW, int srcH, int srcFormat, int dstW, int dstH
     c->chrDstW= -((-dstW) >> c->chrDstHSubSample);
     c->chrDstH= -((-dstH) >> c->chrDstVSubSample);
 
-    sws_setColorspaceDetails(c, Inverse_Table_6_9[SWS_CS_DEFAULT], srcRange, Inverse_Table_6_9[SWS_CS_DEFAULT] /* FIXME*/, dstRange, 0, 1<<16, 1<<16);
+    sws_setColorspaceDetails(c, ff_yuv2rgb_coeffs[SWS_CS_DEFAULT], srcRange, ff_yuv2rgb_coeffs[SWS_CS_DEFAULT] /* FIXME*/, dstRange, 0, 1<<16, 1<<16);
 
-    /* unscaled special Cases */
-    if (unscaled && !usesHFilter && !usesVFilter)
+    /* unscaled special cases */
+    if (unscaled && !usesHFilter && !usesVFilter && (srcRange == dstRange || isBGR(dstFormat) || isRGB(dstFormat)))
     {
         /* yv12_to_nv12 */
-        if (srcFormat == PIX_FMT_YUV420P && (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21))
+        if ((srcFormat == PIX_FMT_YUV420P || srcFormat == PIX_FMT_YUVA420P) && (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21))
         {
             c->swScale= PlanarToNV12Wrapper;
         }
-#ifdef CONFIG_GPL
         /* yuv2bgr */
-        if ((srcFormat==PIX_FMT_YUV420P || srcFormat==PIX_FMT_YUV422P) && (isBGR(dstFormat) || isRGB(dstFormat)))
+        if ((srcFormat==PIX_FMT_YUV420P || srcFormat==PIX_FMT_YUV422P || srcFormat==PIX_FMT_YUVA420P) && (isBGR(dstFormat) || isRGB(dstFormat))
+            && !(flags & SWS_ACCURATE_RND) && !(dstH&1))
         {
-            c->swScale= yuv2rgb_get_func_ptr(c);
+            c->swScale= sws_yuv2rgb_get_func_ptr(c);
         }
-#endif
 
-        if (srcFormat==PIX_FMT_YUV410P && dstFormat==PIX_FMT_YUV420P)
+        if (srcFormat==PIX_FMT_YUV410P && dstFormat==PIX_FMT_YUV420P && !(flags & SWS_BITEXACT))
         {
             c->swScale= yvu9toyv12Wrapper;
         }
 
         /* bgr24toYV12 */
-        if (srcFormat==PIX_FMT_BGR24 && dstFormat==PIX_FMT_YUV420P)
+        if (srcFormat==PIX_FMT_BGR24 && dstFormat==PIX_FMT_YUV420P && !(flags & SWS_ACCURATE_RND))
             c->swScale= bgr24toyv12Wrapper;
 
-        /* rgb/bgr -> rgb/bgr (no dither needed forms) */
+        /* RGB/BGR -> RGB/BGR (no dither needed forms) */
         if (  (isBGR(srcFormat) || isRGB(srcFormat))
            && (isBGR(dstFormat) || isRGB(dstFormat))
            && srcFormat != PIX_FMT_BGR8      && dstFormat != PIX_FMT_BGR8
@@ -2145,42 +2357,56 @@ SwsContext *sws_getContext(int srcW, int srcH, int srcFormat, int dstW, int dstH
            && srcFormat != PIX_FMT_BGR4_BYTE && dstFormat != PIX_FMT_BGR4_BYTE
            && srcFormat != PIX_FMT_RGB4_BYTE && dstFormat != PIX_FMT_RGB4_BYTE
            && srcFormat != PIX_FMT_MONOBLACK && dstFormat != PIX_FMT_MONOBLACK
-           && !needsDither)
+           && srcFormat != PIX_FMT_MONOWHITE && dstFormat != PIX_FMT_MONOWHITE
+                                             && dstFormat != PIX_FMT_RGB32_1
+                                             && dstFormat != PIX_FMT_BGR32_1
+           && (!needsDither || (c->flags&(SWS_FAST_BILINEAR|SWS_POINT))))
              c->swScale= rgb2rgbWrapper;
 
+        if ((usePal(srcFormat) && (
+                 dstFormat == PIX_FMT_RGB32   ||
+                 dstFormat == PIX_FMT_RGB32_1 ||
+                 dstFormat == PIX_FMT_RGB24   ||
+                 dstFormat == PIX_FMT_BGR32   ||
+                 dstFormat == PIX_FMT_BGR32_1 ||
+                 dstFormat == PIX_FMT_BGR24)))
+             c->swScale= pal2rgbWrapper;
+
+        if (srcFormat == PIX_FMT_YUV422P)
+        {
+            if (dstFormat == PIX_FMT_YUYV422)
+                c->swScale= YUV422PToYuy2Wrapper;
+            else if (dstFormat == PIX_FMT_UYVY422)
+                c->swScale= YUV422PToUyvyWrapper;
+        }
+
         /* LQ converters if -sws 0 or -sws 4*/
         if (c->flags&(SWS_FAST_BILINEAR|SWS_POINT)){
-            /* rgb/bgr -> rgb/bgr (dither needed forms) */
-            if ( (isBGR(srcFormat) || isRGB(srcFormat))
-              && (isBGR(dstFormat) || isRGB(dstFormat))
-              && needsDither)
-                c->swScale= rgb2rgbWrapper;
-
             /* yv12_to_yuy2 */
-            if (srcFormat == PIX_FMT_YUV420P &&
-                (dstFormat == PIX_FMT_YUYV422 || dstFormat == PIX_FMT_UYVY422))
+            if (srcFormat == PIX_FMT_YUV420P || srcFormat == PIX_FMT_YUVA420P)
             {
                 if (dstFormat == PIX_FMT_YUYV422)
                     c->swScale= PlanarToYuy2Wrapper;
-                else
+                else if (dstFormat == PIX_FMT_UYVY422)
                     c->swScale= PlanarToUyvyWrapper;
             }
         }
 
 #ifdef COMPILE_ALTIVEC
         if ((c->flags & SWS_CPU_CAPS_ALTIVEC) &&
-            ((srcFormat == PIX_FMT_YUV420P &&
-             (dstFormat == PIX_FMT_YUYV422 || dstFormat == PIX_FMT_UYVY422)))) {
+            !(c->flags & SWS_BITEXACT) &&
+            srcFormat == PIX_FMT_YUV420P) {
           // unscaled YV12 -> packed YUV, we want speed
           if (dstFormat == PIX_FMT_YUYV422)
               c->swScale= yv12toyuy2_unscaled_altivec;
-          else
+          else if (dstFormat == PIX_FMT_UYVY422)
               c->swScale= yv12touyvy_unscaled_altivec;
         }
 #endif
 
         /* simple copy */
         if (  srcFormat == dstFormat
+            || (srcFormat == PIX_FMT_YUVA420P && dstFormat == PIX_FMT_YUV420P)
             || (isPlanarYUV(srcFormat) && isGray(dstFormat))
             || (isPlanarYUV(dstFormat) && isGray(srcFormat)))
         {
@@ -2204,7 +2430,7 @@ SwsContext *sws_getContext(int srcW, int srcH, int srcFormat, int dstW, int dstH
             c->swScale= gray16swap;
         }
 
-#ifdef ARCH_BFIN
+#if ARCH_BFIN
         if (flags & SWS_CPU_CAPS_BFIN)
             ff_bfin_get_unscaled_swscale (c);
 #endif
@@ -2223,7 +2449,7 @@ SwsContext *sws_getContext(int srcW, int srcH, int srcFormat, int dstW, int dstH
         if (!c->canMMX2BeUsed && dstW >=srcW && (srcW&15)==0 && (flags&SWS_FAST_BILINEAR))
         {
             if (flags&SWS_PRINT_INFO)
-                av_log(c, AV_LOG_INFO, "output Width is not a multiple of 32 -> no MMX2 scaler\n");
+                av_log(c, AV_LOG_INFO, "output width is not a multiple of 32 -> no MMX2 scaler\n");
         }
         if (usesHFilter) c->canMMX2BeUsed=0;
     }
@@ -2246,7 +2472,7 @@ SwsContext *sws_getContext(int srcW, int srcH, int srcFormat, int dstW, int dstH
             c->lumXInc+= 20;
             c->chrXInc+= 20;
         }
-        //we don't use the x86asm scaler if mmx is available
+        //we don't use the x86 asm scaler if MMX is available
         else if (flags & SWS_CPU_CAPS_MMX)
         {
             c->lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
@@ -2292,7 +2518,7 @@ SwsContext *sws_getContext(int srcW, int srcH, int srcFormat, int dstW, int dstH
             initMMX2HScaler(c->chrDstW, c->chrXInc, c->funnyUVCode, c->chrMmx2Filter, c->chrMmx2FilterPos, 4);
         }
 #endif /* defined(COMPILE_MMX2) */
-    } // Init Horizontal stuff
+    } // initialize horizontal stuff
 
 
 
@@ -2304,15 +2530,15 @@ SwsContext *sws_getContext(int srcW, int srcH, int srcFormat, int dstW, int dstH
             1;
 
         initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc,
-                   srcH      ,        dstH, filterAlign, (1<<12)-4,
+                   srcH      ,        dstH, filterAlign, (1<<12),
                    (flags&SWS_BICUBLIN) ? (flags|SWS_BICUBIC)  : flags,
                    srcFilter->lumV, dstFilter->lumV, c->param);
         initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc,
-                   c->chrSrcH, c->chrDstH, filterAlign, (1<<12)-4,
+                   c->chrSrcH, c->chrDstH, filterAlign, (1<<12),
                    (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags,
                    srcFilter->chrV, dstFilter->chrV, c->param);
 
-#ifdef HAVE_ALTIVEC
+#if HAVE_ALTIVEC
         c->vYCoeffsBank = av_malloc(sizeof (vector signed short)*c->vLumFilterSize*c->dstH);
         c->vCCoeffsBank = av_malloc(sizeof (vector signed short)*c->vChrFilterSize*c->chrDstH);
 
@@ -2332,7 +2558,7 @@ SwsContext *sws_getContext(int srcW, int srcH, int srcFormat, int dstW, int dstH
 #endif
     }
 
-    // Calculate Buffer Sizes so that they won't run out while handling these damn slices
+    // calculate buffer sizes so that they won't run out while handling these damn slices
     c->vLumBufSize= c->vLumFilterSize;
     c->vChrBufSize= c->vChrFilterSize;
     for (i=0; i<dstH; i++)
@@ -2352,7 +2578,7 @@ SwsContext *sws_getContext(int srcW, int srcH, int srcFormat, int dstW, int dstH
     // allocate pixbufs (we use dynamic allocation because otherwise we would need to
     c->lumPixBuf= av_malloc(c->vLumBufSize*2*sizeof(int16_t*));
     c->chrPixBuf= av_malloc(c->vChrBufSize*2*sizeof(int16_t*));
-    //Note we need at least one pixel more at the end because of the mmx code (just in case someone wanna replace the 4000/8000)
+    //Note we need at least one pixel more at the end because of the MMX code (just in case someone wants to replace the 4000/8000)
     /* align at 16 bytes for AltiVec */
     for (i=0; i<c->vLumBufSize; i++)
         c->lumPixBuf[i]= c->lumPixBuf[i+c->vLumBufSize]= av_mallocz(VOF+1);
@@ -2442,8 +2668,8 @@ SwsContext *sws_getContext(int srcW, int srcH, int srcFormat, int dstW, int dstH
         }
         else
         {
-#if defined(ARCH_X86)
-            av_log(c, AV_LOG_VERBOSE, "using X86-Asm scaler for horizontal scaling\n");
+#if ARCH_X86
+            av_log(c, AV_LOG_VERBOSE, "using x86 asm scaler for horizontal scaling\n");
 #else
             if (flags & SWS_FAST_BILINEAR)
                 av_log(c, AV_LOG_VERBOSE, "using FAST_BILINEAR C scaler for horizontal scaling\n");
@@ -2470,22 +2696,22 @@ SwsContext *sws_getContext(int srcW, int srcH, int srcFormat, int dstW, int dstH
         }
 
         if (dstFormat==PIX_FMT_BGR24)
-            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR24 Converter\n",
+            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR24 converter\n",
                    (flags & SWS_CPU_CAPS_MMX2) ? "MMX2" : ((flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C"));
         else if (dstFormat==PIX_FMT_RGB32)
-            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR32 Converter\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
+            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR32 converter\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
         else if (dstFormat==PIX_FMT_BGR565)
-            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR16 Converter\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
+            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR16 converter\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
         else if (dstFormat==PIX_FMT_BGR555)
-            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR15 Converter\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
+            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR15 converter\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
 
         av_log(c, AV_LOG_VERBOSE, "%dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
     }
     if (flags & SWS_PRINT_INFO)
     {
-        av_log(c, AV_LOG_DEBUG, "Lum srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
+        av_log(c, AV_LOG_DEBUG, "lum srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
                c->srcW, c->srcH, c->dstW, c->dstH, c->lumXInc, c->lumYInc);
-        av_log(c, AV_LOG_DEBUG, "Chr srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
+        av_log(c, AV_LOG_DEBUG, "chr srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
                c->chrSrcW, c->chrSrcH, c->chrDstW, c->chrDstH, c->chrXInc, c->chrYInc);
     }
 
@@ -2495,13 +2721,13 @@ SwsContext *sws_getContext(int srcW, int srcH, int srcFormat, int dstW, int dstH
 
 /**
  * swscale wrapper, so we don't need to export the SwsContext.
- * assumes planar YUV to be in YUV order instead of YVU
+ * Assumes planar YUV to be in YUV order instead of YVU.
  */
 int sws_scale(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
               int srcSliceH, uint8_t* dst[], int dstStride[]){
     int i;
     uint8_t* src2[4]= {src[0], src[1], src[2]};
-    uint32_t pal[256];
+
     if (c->sliceDir == 0 && srcSliceY != 0 && srcSliceY + srcSliceH != c->srcH) {
         av_log(c, AV_LOG_ERROR, "Slices start in the middle!\n");
         return 0;
@@ -2510,18 +2736,65 @@ int sws_scale(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
         if (srcSliceY == 0) c->sliceDir = 1; else c->sliceDir = -1;
     }
 
-    if (c->srcFormat == PIX_FMT_PAL8){
+    if (usePal(c->srcFormat)){
         for (i=0; i<256; i++){
-            int p= ((uint32_t*)(src[1]))[i];
-            int r= (p>>16)&0xFF;
-            int g= (p>> 8)&0xFF;
-            int b=  p     &0xFF;
-            int y= av_clip_uint8(((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16 );
-            int u= av_clip_uint8(((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128);
-            int v= av_clip_uint8(((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128);
-            pal[i]= y + (u<<8) + (v<<16);
+            int p, r, g, b,y,u,v;
+            if(c->srcFormat == PIX_FMT_PAL8){
+                p=((uint32_t*)(src[1]))[i];
+                r= (p>>16)&0xFF;
+                g= (p>> 8)&0xFF;
+                b=  p     &0xFF;
+            }else if(c->srcFormat == PIX_FMT_RGB8){
+                r= (i>>5    )*36;
+                g= ((i>>2)&7)*36;
+                b= (i&3     )*85;
+            }else if(c->srcFormat == PIX_FMT_BGR8){
+                b= (i>>6    )*85;
+                g= ((i>>3)&7)*36;
+                r= (i&7     )*36;
+            }else if(c->srcFormat == PIX_FMT_RGB4_BYTE){
+                r= (i>>3    )*255;
+                g= ((i>>1)&3)*85;
+                b= (i&1     )*255;
+            }else {
+                assert(c->srcFormat == PIX_FMT_BGR4_BYTE);
+                b= (i>>3    )*255;
+                g= ((i>>1)&3)*85;
+                r= (i&1     )*255;
+            }
+            y= av_clip_uint8((RY*r + GY*g + BY*b + ( 33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
+            u= av_clip_uint8((RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
+            v= av_clip_uint8((RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
+            c->pal_yuv[i]= y + (u<<8) + (v<<16);
+
+
+            switch(c->dstFormat) {
+            case PIX_FMT_BGR32:
+#ifndef WORDS_BIGENDIAN
+            case PIX_FMT_RGB24:
+#endif
+                c->pal_rgb[i]=  r + (g<<8) + (b<<16);
+                break;
+            case PIX_FMT_BGR32_1:
+#ifdef  WORDS_BIGENDIAN
+            case PIX_FMT_BGR24:
+#endif
+                c->pal_rgb[i]= (r + (g<<8) + (b<<16)) << 8;
+                break;
+            case PIX_FMT_RGB32_1:
+#ifdef  WORDS_BIGENDIAN
+            case PIX_FMT_RGB24:
+#endif
+                c->pal_rgb[i]= (b + (g<<8) + (r<<16)) << 8;
+                break;
+            case PIX_FMT_RGB32:
+#ifndef WORDS_BIGENDIAN
+            case PIX_FMT_BGR24:
+#endif
+            default:
+                c->pal_rgb[i]=  b + (g<<8) + (r<<16);
+            }
         }
-        src2[1]= (uint8_t*)pal;
     }
 
     // copy strides, so they can safely be modified
@@ -2539,7 +2812,7 @@ int sws_scale(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
         int dstStride2[4]= {-dstStride[0], -dstStride[1], -dstStride[2]};
 
         src2[0] += (srcSliceH-1)*srcStride[0];
-        if (c->srcFormat != PIX_FMT_PAL8)
+        if (!usePal(c->srcFormat))
             src2[1] += ((srcSliceH>>c->chrSrcVSubSample)-1)*srcStride[1];
         src2[2] += ((srcSliceH>>c->chrSrcVSubSample)-1)*srcStride[2];
 
@@ -2547,13 +2820,12 @@ int sws_scale(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
     }
 }
 
-/**
- * swscale wrapper, so we don't need to export the SwsContext
- */
+#if LIBSWSCALE_VERSION_MAJOR < 1
 int sws_scale_ordered(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                       int srcSliceH, uint8_t* dst[], int dstStride[]){
     return sws_scale(c, src, srcStride, srcSliceY, srcSliceH, dst, dstStride);
 }
+#endif
 
 SwsFilter *sws_getDefaultFilter(float lumaGBlur, float chromaGBlur,
                                 float lumaSharpen, float chromaSharpen,
@@ -2607,16 +2879,12 @@ SwsFilter *sws_getDefaultFilter(float lumaGBlur, float chromaGBlur,
     sws_normalizeVec(filter->lumH, 1.0);
     sws_normalizeVec(filter->lumV, 1.0);
 
-    if (verbose) sws_printVec(filter->chrH);
-    if (verbose) sws_printVec(filter->lumH);
+    if (verbose) sws_printVec2(filter->chrH, NULL, AV_LOG_DEBUG);
+    if (verbose) sws_printVec2(filter->lumH, NULL, AV_LOG_DEBUG);
 
     return filter;
 }
 
-/**
- * returns a normalized gaussian curve used to filter stuff
- * quality=3 is high quality, lowwer is lowwer quality
- */
 SwsVector *sws_getGaussianVec(double variance, double quality){
     const int length= (int)(variance*quality + 0.5) | 1;
     int i;
@@ -2799,7 +3067,7 @@ SwsVector *sws_cloneVec(SwsVector *a){
     return vec;
 }
 
-void sws_printVec(SwsVector *a){
+void sws_printVec2(SwsVector *a, AVClass *log_ctx, int log_level){
     int i;
     double max=0;
     double min=0;
@@ -2816,12 +3084,18 @@ void sws_printVec(SwsVector *a){
     for (i=0; i<a->length; i++)
     {
         int x= (int)((a->coeff[i]-min)*60.0/range +0.5);
-        av_log(NULL, AV_LOG_DEBUG, "%1.3f ", a->coeff[i]);
-        for (;x>0; x--) av_log(NULL, AV_LOG_DEBUG, " ");
-        av_log(NULL, AV_LOG_DEBUG, "|\n");
+        av_log(log_ctx, log_level, "%1.3f ", a->coeff[i]);
+        for (;x>0; x--) av_log(log_ctx, log_level, " ");
+        av_log(log_ctx, log_level, "|\n");
     }
 }
 
+#if LIBSWSCALE_VERSION_MAJOR < 1
+void sws_printVec(SwsVector *a){
+    sws_printVec2(a, NULL, AV_LOG_DEBUG);
+}
+#endif
+
 void sws_freeVec(SwsVector *a){
     if (!a) return;
     av_freep(&a->coeff);
@@ -2862,7 +3136,7 @@ void sws_freeContext(SwsContext *c){
     av_freep(&c->vChrFilter);
     av_freep(&c->hLumFilter);
     av_freep(&c->hChrFilter);
-#ifdef HAVE_ALTIVEC
+#if HAVE_ALTIVEC
     av_freep(&c->vYCoeffsBank);
     av_freep(&c->vCCoeffsBank);
 #endif
@@ -2872,7 +3146,7 @@ void sws_freeContext(SwsContext *c){
     av_freep(&c->hLumFilterPos);
     av_freep(&c->hChrFilterPos);
 
-#if defined(ARCH_X86) && defined(CONFIG_GPL)
+#if ARCH_X86 && CONFIG_GPL
 #ifdef MAP_ANONYMOUS
     if (c->funnyYCode) munmap(c->funnyYCode, MAX_FUNNY_CODE_SIZE);
     if (c->funnyUVCode) munmap(c->funnyUVCode, MAX_FUNNY_CODE_SIZE);
@@ -2882,7 +3156,7 @@ void sws_freeContext(SwsContext *c){
 #endif
     c->funnyYCode=NULL;
     c->funnyUVCode=NULL;
-#endif /* defined(ARCH_X86) */
+#endif /* ARCH_X86 && CONFIG_GPL */
 
     av_freep(&c->lumMmx2Filter);
     av_freep(&c->chrMmx2Filter);
@@ -2893,19 +3167,9 @@ void sws_freeContext(SwsContext *c){
     av_free(c);
 }
 
-/**
- * Checks if context is valid or reallocs a new one instead.
- * If context is NULL, just calls sws_getContext() to get a new one.
- * Otherwise, checks if the parameters are the same already saved in context.
- * If that is the case, returns the current context.
- * Otherwise, frees context and gets a new one.
- *
- * Be warned that srcFilter, dstFilter are not checked, they are
- * asumed to remain valid.
- */
 struct SwsContext *sws_getCachedContext(struct SwsContext *context,
-                                        int srcW, int srcH, int srcFormat,
-                                        int dstW, int dstH, int dstFormat, int flags,
+                                        int srcW, int srcH, enum PixelFormat srcFormat,
+                                        int dstW, int dstH, enum PixelFormat dstFormat, int flags,
                                         SwsFilter *srcFilter, SwsFilter *dstFilter, double *param)
 {
     static const double default_param[2] = {SWS_PARAM_DEFAULT, SWS_PARAM_DEFAULT};
diff --git a/src/add-ons/media/plugins/avcodec/libswscale/swscale.h b/src/add-ons/media/plugins/avcodec/libswscale/swscale.h
index b58d358cf3..124a623338 100644
--- a/src/add-ons/media/plugins/avcodec/libswscale/swscale.h
+++ b/src/add-ons/media/plugins/avcodec/libswscale/swscale.h
@@ -18,19 +18,20 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef FFMPEG_SWSCALE_H
-#define FFMPEG_SWSCALE_H
+#ifndef SWSCALE_SWSCALE_H
+#define SWSCALE_SWSCALE_H
 
 /**
- * @file swscale.h
+ * @file libswscale/swscale.h
  * @brief
  *     external api for the swscale stuff
  */
 
-#include "avutil.h"
+#include "libavutil/avutil.h"
+#include "libavutil/internal.h"
 
 #define LIBSWSCALE_VERSION_MAJOR 0
-#define LIBSWSCALE_VERSION_MINOR 5
+#define LIBSWSCALE_VERSION_MINOR 7
 #define LIBSWSCALE_VERSION_MICRO 1
 
 #define LIBSWSCALE_VERSION_INT  AV_VERSION_INT(LIBSWSCALE_VERSION_MAJOR, \
@@ -43,6 +44,11 @@
 
 #define LIBSWSCALE_IDENT        "SwS" AV_STRINGIFY(LIBSWSCALE_VERSION)
 
+/**
+ * Returns the LIBSWSCALE_VERSION_INT constant.
+ */
+unsigned swscale_version(void);
+
 /* values for the flags, the stuff on the command line is different */
 #define SWS_FAST_BILINEAR     1
 #define SWS_BILINEAR          2
@@ -70,6 +76,7 @@
 #define SWS_FULL_CHR_H_INP    0x4000
 #define SWS_DIRECT_BGR        0x8000
 #define SWS_ACCURATE_RND      0x40000
+#define SWS_BITEXACT          0x80000
 
 #define SWS_CPU_CAPS_MMX      0x80000000
 #define SWS_CPU_CAPS_MMX2     0x20000000
@@ -92,8 +99,8 @@
 // when used for filters they must have an odd number of elements
 // coeffs cannot be shared between vectors
 typedef struct {
-    double *coeff;
-    int length;
+    double *coeff;              ///< pointer to the list of coefficients
+    int length;                 ///< number of coefficients in the vector
 } SwsVector;
 
 // vectors can be shared
@@ -108,39 +115,134 @@ struct SwsContext;
 
 void sws_freeContext(struct SwsContext *swsContext);
 
-struct SwsContext *sws_getContext(int srcW, int srcH, int srcFormat, int dstW, int dstH, int dstFormat, int flags,
+/**
+ * Allocates and returns a SwsContext. You need it to perform
+ * scaling/conversion operations using sws_scale().
+ *
+ * @param srcW the width of the source image
+ * @param srcH the height of the source image
+ * @param srcFormat the source image format
+ * @param dstW the width of the destination image
+ * @param dstH the height of the destination image
+ * @param dstFormat the destination image format
+ * @param flags specify which algorithm and options to use for rescaling
+ * @return a pointer to an allocated context, or NULL in case of error
+ */
+struct SwsContext *sws_getContext(int srcW, int srcH, enum PixelFormat srcFormat, int dstW, int dstH, enum PixelFormat dstFormat, int flags,
                                   SwsFilter *srcFilter, SwsFilter *dstFilter, double *param);
-int sws_scale(struct SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
+
+/**
+ * Scales the image slice in \p srcSlice and puts the resulting scaled
+ * slice in the image in \p dst. A slice is a sequence of consecutive
+ * rows in an image.
+ *
+ * @param context   the scaling context previously created with
+ *                  sws_getContext()
+ * @param srcSlice  the array containing the pointers to the planes of
+ *                  the source slice
+ * @param srcStride the array containing the strides for each plane of
+ *                  the source image
+ * @param srcSliceY the position in the source image of the slice to
+ *                  process, that is the number (counted starting from
+ *                  zero) in the image of the first row of the slice
+ * @param srcSliceH the height of the source slice, that is the number
+ *                  of rows in the slice
+ * @param dst       the array containing the pointers to the planes of
+ *                  the destination image
+ * @param dstStride the array containing the strides for each plane of
+ *                  the destination image
+ * @return          the height of the output slice
+ */
+int sws_scale(struct SwsContext *context, uint8_t* srcSlice[], int srcStride[], int srcSliceY,
               int srcSliceH, uint8_t* dst[], int dstStride[]);
+#if LIBSWSCALE_VERSION_MAJOR < 1
+/**
+ * @deprecated Use sws_scale() instead.
+ */
 int sws_scale_ordered(struct SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
                       int srcSliceH, uint8_t* dst[], int dstStride[]) attribute_deprecated;
+#endif
 
 
 int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4], int srcRange, const int table[4], int dstRange, int brightness, int contrast, int saturation);
 int sws_getColorspaceDetails(struct SwsContext *c, int **inv_table, int *srcRange, int **table, int *dstRange, int *brightness, int *contrast, int *saturation);
+
+/**
+ * Returns a normalized Gaussian curve used to filter stuff.
+ * quality=3 is high quality, lower is lower quality.
+ */
 SwsVector *sws_getGaussianVec(double variance, double quality);
+
+/**
+ * Allocates and returns a vector with \p length coefficients, all
+ * with the same value \p c.
+ */
 SwsVector *sws_getConstVec(double c, int length);
+
+/**
+ * Allocates and returns a vector with just one coefficient, with
+ * value 1.0.
+ */
 SwsVector *sws_getIdentityVec(void);
+
+/**
+ * Scales all the coefficients of \p a by the \p scalar value.
+ */
 void sws_scaleVec(SwsVector *a, double scalar);
+
+/**
+ * Scales all the coefficients of \p a so that their sum equals \p
+ * height.
+ */
 void sws_normalizeVec(SwsVector *a, double height);
 void sws_convVec(SwsVector *a, SwsVector *b);
 void sws_addVec(SwsVector *a, SwsVector *b);
 void sws_subVec(SwsVector *a, SwsVector *b);
 void sws_shiftVec(SwsVector *a, int shift);
+
+/**
+ * Allocates and returns a clone of the vector \p a, that is a vector
+ * with the same coefficients as \p a.
+ */
 SwsVector *sws_cloneVec(SwsVector *a);
 
-void sws_printVec(SwsVector *a);
+#if LIBSWSCALE_VERSION_MAJOR < 1
+/**
+ * @deprecated Use sws_printVec2() instead.
+ */
+attribute_deprecated void sws_printVec(SwsVector *a);
+#endif
+
+/**
+ * Prints with av_log() a textual representation of the vector \p a
+ * if \p log_level <= av_log_level.
+ */
+void sws_printVec2(SwsVector *a, AVClass *log_ctx, int log_level);
+
 void sws_freeVec(SwsVector *a);
 
 SwsFilter *sws_getDefaultFilter(float lumaGBlur, float chromaGBlur,
-                                float lumaSarpen, float chromaSharpen,
+                                float lumaSharpen, float chromaSharpen,
                                 float chromaHShift, float chromaVShift,
                                 int verbose);
 void sws_freeFilter(SwsFilter *filter);
 
+/**
+ * Checks if \p context can be reused, otherwise reallocates a new
+ * one.
+ *
+ * If \p context is NULL, just calls sws_getContext() to get a new
+ * context. Otherwise, checks if the parameters are the ones already
+ * saved in \p context. If that is the case, returns the current
+ * context. Otherwise, frees \p context and gets a new context with
+ * the new parameters.
+ *
+ * Be warned that \p srcFilter and \p dstFilter are not checked, they
+ * are assumed to remain the same.
+ */
 struct SwsContext *sws_getCachedContext(struct SwsContext *context,
-                                        int srcW, int srcH, int srcFormat,
-                                        int dstW, int dstH, int dstFormat, int flags,
+                                        int srcW, int srcH, enum PixelFormat srcFormat,
+                                        int dstW, int dstH, enum PixelFormat dstFormat, int flags,
                                         SwsFilter *srcFilter, SwsFilter *dstFilter, double *param);
 
-#endif /* FFMPEG_SWSCALE_H */
+#endif /* SWSCALE_SWSCALE_H */
diff --git a/src/add-ons/media/plugins/avcodec/libswscale/swscale_altivec_template.c b/src/add-ons/media/plugins/avcodec/libswscale/swscale_altivec_template.c
index 2111cec410..a008b966e8 100644
--- a/src/add-ons/media/plugins/avcodec/libswscale/swscale_altivec_template.c
+++ b/src/add-ons/media/plugins/avcodec/libswscale/swscale_altivec_template.c
@@ -220,7 +220,7 @@ static inline void hScale_altivec_real(int16_t *dst, int dstW, uint8_t *src, int
             for (j=0; j<filterSize; j++) {
                 val += ((int)src[srcPos + j])*filter[filterSize*i + j];
             }
-            dst[i] = av_clip(val>>7, 0, (1<<15)-1);
+            dst[i] = FFMIN(val>>7, (1<<15)-1);
         }
     }
     else
@@ -259,7 +259,7 @@ static inline void hScale_altivec_real(int16_t *dst, int dstW, uint8_t *src, int
         val_vEven = vec_mule(src_v, filter_v);
         val_s = vec_sums(val_vEven, vzero);
         vec_st(val_s, 0, tempo);
-        dst[i] = av_clip(tempo[3]>>7, 0, (1<<15)-1);
+        dst[i] = FFMIN(tempo[3]>>7, (1<<15)-1);
     }
     }
     break;
@@ -286,7 +286,7 @@ static inline void hScale_altivec_real(int16_t *dst, int dstW, uint8_t *src, int
         val_v = vec_msums(src_v, filter_v, (vector signed int)vzero);
         val_s = vec_sums(val_v, vzero);
         vec_st(val_s, 0, tempo);
-        dst[i] = av_clip(tempo[3]>>7, 0, (1<<15)-1);
+        dst[i] = FFMIN(tempo[3]>>7, (1<<15)-1);
     }
     }
     break;
@@ -315,7 +315,7 @@ static inline void hScale_altivec_real(int16_t *dst, int dstW, uint8_t *src, int
             vector signed int val_s = vec_sums(val_v, vzero);
 
             vec_st(val_s, 0, tempo);
-            dst[i] = av_clip(tempo[3]>>7, 0, (1<<15)-1);
+            dst[i] = FFMIN(tempo[3]>>7, (1<<15)-1);
         }
     }
     break;
@@ -377,7 +377,7 @@ static inline void hScale_altivec_real(int16_t *dst, int dstW, uint8_t *src, int
         val_s = vec_sums(val_v, vzero);
 
         vec_st(val_s, 0, tempo);
-        dst[i] = av_clip(tempo[3]>>7, 0, (1<<15)-1);
+        dst[i] = FFMIN(tempo[3]>>7, (1<<15)-1);
     }
 
     }
diff --git a/src/add-ons/media/plugins/avcodec/libswscale/swscale_avoption.c b/src/add-ons/media/plugins/avcodec/libswscale/swscale_avoption.c
index c16258d254..996843df1d 100644
--- a/src/add-ons/media/plugins/avcodec/libswscale/swscale_avoption.c
+++ b/src/add-ons/media/plugins/avcodec/libswscale/swscale_avoption.c
@@ -18,8 +18,8 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "avutil.h"
-#include "opt.h"
+#include "libavutil/avutil.h"
+#include "libavcodec/opt.h"
 #include "swscale.h"
 #include "swscale_internal.h"
 
@@ -53,6 +53,7 @@ static const AVOption options[] = {
     { "bfin", "Blackfin SIMD acceleration", 0, FF_OPT_TYPE_CONST, SWS_CPU_CAPS_BFIN, INT_MIN, INT_MAX, VE, "sws_flags" },
     { "full_chroma_int", "full chroma interpolation", 0 , FF_OPT_TYPE_CONST, SWS_FULL_CHR_H_INT, INT_MIN, INT_MAX, VE, "sws_flags" },
     { "full_chroma_inp", "full chroma input", 0 , FF_OPT_TYPE_CONST, SWS_FULL_CHR_H_INP, INT_MIN, INT_MAX, VE, "sws_flags" },
+    { "bitexact", "", 0 , FF_OPT_TYPE_CONST, SWS_BITEXACT, INT_MIN, INT_MAX, VE, "sws_flags" },
     { NULL }
 };
 
diff --git a/src/add-ons/media/plugins/avcodec/libswscale/swscale_bfin.c b/src/add-ons/media/plugins/avcodec/libswscale/swscale_bfin.c
index 3e63bbd638..ed7d9579b6 100644
--- a/src/add-ons/media/plugins/avcodec/libswscale/swscale_bfin.c
+++ b/src/add-ons/media/plugins/avcodec/libswscale/swscale_bfin.c
@@ -26,9 +26,6 @@
 #include <inttypes.h>
 #include <assert.h>
 #include "config.h"
-#ifdef HAVE_MALLOC_H
-#include <malloc.h>
-#endif
 #include <unistd.h>
 #include "rgb2rgb.h"
 #include "swscale.h"
@@ -40,13 +37,13 @@
 #define L1CODE
 #endif
 
-extern int ff_bfin_uyvytoyv12 (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
-                               long width, long height,
-                               long lumStride, long chromStride, long srcStride) L1CODE;
+int ff_bfin_uyvytoyv12 (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                        long width, long height,
+                        long lumStride, long chromStride, long srcStride) L1CODE;
 
-extern int ff_bfin_yuyvtoyv12 (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
-                               long width, long height,
-                               long lumStride, long chromStride, long srcStride) L1CODE;
+int ff_bfin_yuyvtoyv12 (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                        long width, long height,
+                        long lumStride, long chromStride, long srcStride) L1CODE;
 
 static int uyvytoyv12_unscaled (SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                                 int srcSliceH, uint8_t* dst[], int dstStride[])
diff --git a/src/add-ons/media/plugins/avcodec/libswscale/swscale_internal.h b/src/add-ons/media/plugins/avcodec/libswscale/swscale_internal.h
index 7aa3f9babd..cdf3754d14 100644
--- a/src/add-ons/media/plugins/avcodec/libswscale/swscale_internal.h
+++ b/src/add-ons/media/plugins/avcodec/libswscale/swscale_internal.h
@@ -18,22 +18,42 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef FFMPEG_SWSCALE_INTERNAL_H
-#define FFMPEG_SWSCALE_INTERNAL_H
+#ifndef SWSCALE_SWSCALE_INTERNAL_H
+#define SWSCALE_SWSCALE_INTERNAL_H
 
 #include "config.h"
 
-#ifdef HAVE_ALTIVEC_H
+#if HAVE_ALTIVEC_H
 #include <altivec.h>
 #endif
 
-#include "avutil.h"
+#include "libavutil/avutil.h"
+
+#define STR(s)         AV_TOSTRING(s) //AV_STRINGIFY is too long
 
 #define MAX_FILTER_SIZE 256
 
 #define VOFW 2048
 #define VOF  (VOFW*2)
 
+#ifdef WORDS_BIGENDIAN
+#define ALT32_CORR (-1)
+#else
+#define ALT32_CORR   1
+#endif
+
+#if ARCH_X86_64
+#   define APCK_PTR2 8
+#   define APCK_COEF 16
+#   define APCK_SIZE 24
+#else
+#   define APCK_PTR2 4
+#   define APCK_COEF 8
+#   define APCK_SIZE 16
+#endif
+
+struct SwsContext;
+
 typedef int (*SwsFunc)(struct SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
              int srcSliceH, uint8_t* dst[], int dstStride[]);
 
@@ -53,7 +73,7 @@ typedef struct SwsContext{
     int chrSrcW, chrSrcH, chrDstW, chrDstH;
     int lumXInc, chrXInc;
     int lumYInc, chrYInc;
-    int dstFormat, srcFormat;               ///< format 4:2:0 type is always YV12
+    enum PixelFormat dstFormat, srcFormat;  ///< format 4:2:0 type is always YV12
     int origDstFormat, origSrcFormat;       ///< format
     int chrSrcHSubSample, chrSrcVSubSample;
     int chrIntHSubSample, chrIntVSubSample;
@@ -62,6 +82,9 @@ typedef struct SwsContext{
     int sliceDir;
     double param[2];
 
+    uint32_t pal_yuv[256];
+    uint32_t pal_rgb[256];
+
     int16_t **lumPixBuf;
     int16_t **chrPixBuf;
     int16_t *hLumFilter;
@@ -108,6 +131,12 @@ typedef struct SwsContext{
     int srcColorspaceTable[4];
     int dstColorspaceTable[4];
     int srcRange, dstRange;
+    int yuv2rgb_y_offset;
+    int yuv2rgb_y_coeff;
+    int yuv2rgb_v2r_coeff;
+    int yuv2rgb_v2g_coeff;
+    int yuv2rgb_u2g_coeff;
+    int yuv2rgb_u2b_coeff;
 
 #define RED_DITHER            "0*8"
 #define GREEN_DITHER          "1*8"
@@ -148,7 +177,7 @@ typedef struct SwsContext{
     uint64_t u_temp       __attribute__((aligned(8)));
     uint64_t v_temp       __attribute__((aligned(8)));
 
-#ifdef HAVE_ALTIVEC
+#if HAVE_ALTIVEC
 
   vector signed short   CY;
   vector signed short   CRV;
@@ -162,7 +191,7 @@ typedef struct SwsContext{
 #endif
 
 
-#ifdef ARCH_BFIN
+#if ARCH_BFIN
     uint32_t oy           __attribute__((aligned(4)));
     uint32_t oc           __attribute__((aligned(4)));
     uint32_t zero         __attribute__((aligned(4)));
@@ -176,18 +205,18 @@ typedef struct SwsContext{
     uint32_t gmask        __attribute__((aligned(4)));
 #endif
 
-#ifdef HAVE_VIS
+#if HAVE_VIS
     uint64_t sparc_coeffs[10] __attribute__((aligned(8)));
 #endif
 
 } SwsContext;
 //FIXME check init (where 0)
 
-SwsFunc yuv2rgb_get_func_ptr (SwsContext *c);
-int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation);
+SwsFunc sws_yuv2rgb_get_func_ptr (SwsContext *c);
+int sws_yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation);
 
-void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation);
-SwsFunc yuv2rgb_init_altivec (SwsContext *c);
+void sws_yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation);
+SwsFunc sws_yuv2rgb_init_altivec (SwsContext *c);
 void altivec_yuv2packedX (SwsContext *c,
                           int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                           int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
@@ -221,7 +250,8 @@ const char *sws_format_name(int format);
         || (x)==PIX_FMT_GRAY16LE    \
     )
 #define isRGB(x)        (           \
-           (x)==PIX_FMT_BGR32       \
+           (x)==PIX_FMT_RGB32       \
+        || (x)==PIX_FMT_RGB32_1     \
         || (x)==PIX_FMT_RGB24       \
         || (x)==PIX_FMT_RGB565      \
         || (x)==PIX_FMT_RGB555      \
@@ -229,9 +259,11 @@ const char *sws_format_name(int format);
         || (x)==PIX_FMT_RGB4        \
         || (x)==PIX_FMT_RGB4_BYTE   \
         || (x)==PIX_FMT_MONOBLACK   \
+        || (x)==PIX_FMT_MONOWHITE   \
     )
 #define isBGR(x)        (           \
-           (x)==PIX_FMT_RGB32       \
+           (x)==PIX_FMT_BGR32       \
+        || (x)==PIX_FMT_BGR32_1     \
         || (x)==PIX_FMT_BGR24       \
         || (x)==PIX_FMT_BGR565      \
         || (x)==PIX_FMT_BGR555      \
@@ -239,6 +271,14 @@ const char *sws_format_name(int format);
         || (x)==PIX_FMT_BGR4        \
         || (x)==PIX_FMT_BGR4_BYTE   \
         || (x)==PIX_FMT_MONOBLACK   \
+        || (x)==PIX_FMT_MONOWHITE   \
+    )
+#define isALPHA(x)      (           \
+           (x)==PIX_FMT_BGR32       \
+        || (x)==PIX_FMT_BGR32_1     \
+        || (x)==PIX_FMT_RGB32       \
+        || (x)==PIX_FMT_RGB32_1     \
+        || (x)==PIX_FMT_YUVA420P    \
     )
 
 static inline int fmt_depth(int fmt)
@@ -269,15 +309,16 @@ static inline int fmt_depth(int fmt)
         case PIX_FMT_RGB4_BYTE:
             return 4;
         case PIX_FMT_MONOBLACK:
+        case PIX_FMT_MONOWHITE:
             return 1;
         default:
             return 0;
     }
 }
 
-extern const DECLARE_ALIGNED(8, uint64_t, ff_dither4[2]);
-extern const DECLARE_ALIGNED(8, uint64_t, ff_dither8[2]);
+extern const uint64_t ff_dither4[2];
+extern const uint64_t ff_dither8[2];
 
 extern const AVClass sws_context_class;
 
-#endif /* FFMPEG_SWSCALE_INTERNAL_H */
+#endif /* SWSCALE_SWSCALE_INTERNAL_H */
diff --git a/src/add-ons/media/plugins/avcodec/libswscale/swscale_template.c b/src/add-ons/media/plugins/avcodec/libswscale/swscale_template.c
index 1280ba6c02..3262b6ee85 100644
--- a/src/add-ons/media/plugins/avcodec/libswscale/swscale_template.c
+++ b/src/add-ons/media/plugins/avcodec/libswscale/swscale_template.c
@@ -29,17 +29,17 @@
 #undef EMMS
 #undef SFENCE
 
-#ifdef HAVE_3DNOW
-/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
+#if HAVE_AMD3DNOW
+/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
 #define EMMS     "femms"
 #else
 #define EMMS     "emms"
 #endif
 
-#ifdef HAVE_3DNOW
+#if HAVE_AMD3DNOW
 #define PREFETCH  "prefetch"
 #define PREFETCHW "prefetchw"
-#elif defined (HAVE_MMX2)
+#elif HAVE_MMX2
 #define PREFETCH "prefetchnta"
 #define PREFETCHW "prefetcht0"
 #else
@@ -47,31 +47,31 @@
 #define PREFETCHW " # nop"
 #endif
 
-#ifdef HAVE_MMX2
+#if HAVE_MMX2
 #define SFENCE "sfence"
 #else
 #define SFENCE " # nop"
 #endif
 
-#ifdef HAVE_MMX2
+#if HAVE_MMX2
 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
-#elif defined (HAVE_3DNOW)
+#elif HAVE_AMD3DNOW
 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
 #endif
 
-#ifdef HAVE_MMX2
+#if HAVE_MMX2
 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
 #else
 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
 #endif
 #define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
 
-#ifdef HAVE_ALTIVEC
+#if HAVE_ALTIVEC
 #include "swscale_altivec_template.c"
 #endif
 
 #define YSCALEYUV2YV12X(x, offset, dest, width) \
-    asm volatile(\
+    __asm__ volatile(\
     "xor                          %%"REG_a", %%"REG_a"  \n\t"\
     "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
     "movq                             %%mm3, %%mm4      \n\t"\
@@ -107,7 +107,7 @@
     );
 
 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
-    asm volatile(\
+    __asm__ volatile(\
     "lea                     " offset "(%0), %%"REG_d"  \n\t"\
     "xor                          %%"REG_a", %%"REG_a"  \n\t"\
     "pxor                             %%mm4, %%mm4      \n\t"\
@@ -119,19 +119,19 @@
     "1:                                                 \n\t"\
     "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm0      \n\t" /* srcData */\
     "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
-    "mov                       4(%%"REG_d"), %%"REG_S"  \n\t"\
+    "mov        "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"  \n\t"\
     "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm1      \n\t" /* srcData */\
     "movq                             %%mm0, %%mm3      \n\t"\
     "punpcklwd                        %%mm1, %%mm0      \n\t"\
     "punpckhwd                        %%mm1, %%mm3      \n\t"\
-    "movq                      8(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
+    "movq       "STR(APCK_COEF)"(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
     "pmaddwd                          %%mm1, %%mm0      \n\t"\
     "pmaddwd                          %%mm1, %%mm3      \n\t"\
     "paddd                            %%mm0, %%mm4      \n\t"\
     "paddd                            %%mm3, %%mm5      \n\t"\
     "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm3      \n\t" /* srcData */\
-    "mov                      16(%%"REG_d"), %%"REG_S"  \n\t"\
-    "add                                $16, %%"REG_d"  \n\t"\
+    "mov        "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"  \n\t"\
+    "add                  $"STR(APCK_SIZE)", %%"REG_d"  \n\t"\
     "test                         %%"REG_S", %%"REG_S"  \n\t"\
     "movq                             %%mm2, %%mm0      \n\t"\
     "punpcklwd                        %%mm3, %%mm2      \n\t"\
@@ -190,8 +190,8 @@
     "1:                                   \n\t"\
     "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
     "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
-    "paddw              %%mm7, %%mm0      \n\t"\
-    "paddw              %%mm7, %%mm1      \n\t"\
+    "paddsw             %%mm7, %%mm0      \n\t"\
+    "paddsw             %%mm7, %%mm1      \n\t"\
     "psraw                 $7, %%mm0      \n\t"\
     "psraw                 $7, %%mm1      \n\t"\
     "packuswb           %%mm1, %%mm0      \n\t"\
@@ -206,8 +206,8 @@
        "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
     : "%eax", "%ebx", "%ecx", "%edx", "%esi"
 */
-#define YSCALEYUV2PACKEDX \
-    asm volatile(\
+#define YSCALEYUV2PACKEDX_UV \
+    __asm__ volatile(\
     "xor                   %%"REG_a", %%"REG_a"     \n\t"\
     ASMALIGN(4)\
     "nop                                            \n\t"\
@@ -229,8 +229,9 @@
     "paddw                     %%mm5, %%mm4         \n\t"\
     "test                  %%"REG_S", %%"REG_S"     \n\t"\
     " jnz                         2b                \n\t"\
-\
-    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
+
+#define YSCALEYUV2PACKEDX_YA(offset) \
+    "lea                "offset"(%0), %%"REG_d"     \n\t"\
     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
     "movq      "VROUNDER_OFFSET"(%0), %%mm1         \n\t"\
     "movq                      %%mm1, %%mm7         \n\t"\
@@ -248,6 +249,10 @@
     "test                  %%"REG_S", %%"REG_S"     \n\t"\
     " jnz                         2b                \n\t"\
 
+#define YSCALEYUV2PACKEDX \
+    YSCALEYUV2PACKEDX_UV \
+    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET) \
+
 #define YSCALEYUV2PACKEDX_END                 \
     :: "r" (&c->redDither),                   \
         "m" (dummy), "m" (dummy), "m" (dummy),\
@@ -255,8 +260,8 @@
     : "%"REG_a, "%"REG_d, "%"REG_S            \
     );
 
-#define YSCALEYUV2PACKEDX_ACCURATE \
-    asm volatile(\
+#define YSCALEYUV2PACKEDX_ACCURATE_UV \
+    __asm__ volatile(\
     "xor %%"REG_a", %%"REG_a"                       \n\t"\
     ASMALIGN(4)\
     "nop                                            \n\t"\
@@ -271,19 +276,19 @@
     "2:                                             \n\t"\
     "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
     "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
-    "mov                4(%%"REG_d"), %%"REG_S"     \n\t"\
+    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
     "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
     "movq                      %%mm0, %%mm3         \n\t"\
     "punpcklwd                 %%mm1, %%mm0         \n\t"\
     "punpckhwd                 %%mm1, %%mm3         \n\t"\
-    "movq               8(%%"REG_d"), %%mm1         \n\t" /* filterCoeff */\
+    "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1         \n\t" /* filterCoeff */\
     "pmaddwd                   %%mm1, %%mm0         \n\t"\
     "pmaddwd                   %%mm1, %%mm3         \n\t"\
     "paddd                     %%mm0, %%mm4         \n\t"\
     "paddd                     %%mm3, %%mm5         \n\t"\
     "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
-    "mov               16(%%"REG_d"), %%"REG_S"     \n\t"\
-    "add                         $16, %%"REG_d"     \n\t"\
+    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
+    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
     "test                  %%"REG_S", %%"REG_S"     \n\t"\
     "movq                      %%mm2, %%mm0         \n\t"\
     "punpcklwd                 %%mm3, %%mm2         \n\t"\
@@ -304,8 +309,9 @@
     "paddw                     %%mm0, %%mm6         \n\t"\
     "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
     "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\
-\
-    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
+
+#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
+    "lea                "offset"(%0), %%"REG_d"     \n\t"\
     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
     "pxor                      %%mm1, %%mm1         \n\t"\
     "pxor                      %%mm5, %%mm5         \n\t"\
@@ -315,19 +321,19 @@
     "2:                                             \n\t"\
     "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
     "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
-    "mov                4(%%"REG_d"), %%"REG_S"     \n\t"\
+    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
     "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
     "movq                      %%mm0, %%mm3         \n\t"\
     "punpcklwd                 %%mm4, %%mm0         \n\t"\
     "punpckhwd                 %%mm4, %%mm3         \n\t"\
-    "movq               8(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
+    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
     "pmaddwd                   %%mm4, %%mm0         \n\t"\
     "pmaddwd                   %%mm4, %%mm3         \n\t"\
     "paddd                     %%mm0, %%mm1         \n\t"\
     "paddd                     %%mm3, %%mm5         \n\t"\
     "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
-    "mov               16(%%"REG_d"), %%"REG_S"     \n\t"\
-    "add                         $16, %%"REG_d"     \n\t"\
+    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
+    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
     "test                  %%"REG_S", %%"REG_S"     \n\t"\
     "movq                      %%mm2, %%mm0         \n\t"\
     "punpcklwd                 %%mm3, %%mm2         \n\t"\
@@ -349,6 +355,10 @@
     "movq               "U_TEMP"(%0), %%mm3         \n\t"\
     "movq               "V_TEMP"(%0), %%mm4         \n\t"\
 
+#define YSCALEYUV2PACKEDX_ACCURATE /* _ACCURATE_UV part (opens the asm statement), then _ACCURATE_YA part with the luma filter offset */ \
+    YSCALEYUV2PACKEDX_ACCURATE_UV \
+    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
+
 #define YSCALEYUV2RGBX \
     "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
     "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
@@ -384,61 +394,6 @@
     "packuswb        %%mm0, %%mm2       \n\t"\
     "packuswb        %%mm6, %%mm5       \n\t"\
     "packuswb        %%mm3, %%mm4       \n\t"\
-    "pxor            %%mm7, %%mm7       \n\t"
-#if 0
-#define FULL_YSCALEYUV2RGB \
-    "pxor                 %%mm7, %%mm7  \n\t"\
-    "movd                    %6, %%mm6  \n\t" /*yalpha1*/\
-    "punpcklwd            %%mm6, %%mm6  \n\t"\
-    "punpcklwd            %%mm6, %%mm6  \n\t"\
-    "movd                    %7, %%mm5  \n\t" /*uvalpha1*/\
-    "punpcklwd            %%mm5, %%mm5  \n\t"\
-    "punpcklwd            %%mm5, %%mm5  \n\t"\
-    "xor              %%"REG_a", %%"REG_a"  \n\t"\
-    ASMALIGN(4)\
-    "1:                                 \n\t"\
-    "movq     (%0, %%"REG_a",2), %%mm0  \n\t" /*buf0[eax]*/\
-    "movq     (%1, %%"REG_a",2), %%mm1  \n\t" /*buf1[eax]*/\
-    "movq     (%2, %%"REG_a",2), %%mm2  \n\t" /* uvbuf0[eax]*/\
-    "movq     (%3, %%"REG_a",2), %%mm3  \n\t" /* uvbuf1[eax]*/\
-    "psubw                %%mm1, %%mm0  \n\t" /* buf0[eax] - buf1[eax]*/\
-    "psubw                %%mm3, %%mm2  \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
-    "pmulhw               %%mm6, %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
-    "pmulhw               %%mm5, %%mm2  \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
-    "psraw                   $4, %%mm1  \n\t" /* buf0[eax] - buf1[eax] >>4*/\
-    "movq "AV_STRINGIFY(VOF)"(%2, %%"REG_a",2), %%mm4  \n\t" /* uvbuf0[eax+2048]*/\
-    "psraw                   $4, %%mm3  \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
-    "paddw                %%mm0, %%mm1  \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
-    "movq "AV_STRINGIFY(VOF)"(%3, %%"REG_a",2), %%mm0  \n\t" /* uvbuf1[eax+2048]*/\
-    "paddw                %%mm2, %%mm3  \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
-    "psubw                %%mm0, %%mm4  \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
-    "psubw        "MANGLE(w80)", %%mm1  \n\t" /* 8(Y-16)*/\
-    "psubw       "MANGLE(w400)", %%mm3  \n\t" /* 8(U-128)*/\
-    "pmulhw    "MANGLE(yCoeff)", %%mm1  \n\t"\
-\
-\
-    "pmulhw               %%mm5, %%mm4  \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
-    "movq                 %%mm3, %%mm2  \n\t" /* (U-128)8*/\
-    "pmulhw   "MANGLE(ubCoeff)", %%mm3  \n\t"\
-    "psraw                   $4, %%mm0  \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
-    "pmulhw   "MANGLE(ugCoeff)", %%mm2  \n\t"\
-    "paddw                %%mm4, %%mm0  \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
-    "psubw       "MANGLE(w400)", %%mm0  \n\t" /* (V-128)8*/\
-\
-\
-    "movq                 %%mm0, %%mm4  \n\t" /* (V-128)8*/\
-    "pmulhw   "MANGLE(vrCoeff)", %%mm0  \n\t"\
-    "pmulhw   "MANGLE(vgCoeff)", %%mm4  \n\t"\
-    "paddw                %%mm1, %%mm3  \n\t" /* B*/\
-    "paddw                %%mm1, %%mm0  \n\t" /* R*/\
-    "packuswb             %%mm3, %%mm3  \n\t"\
-\
-    "packuswb             %%mm0, %%mm0  \n\t"\
-    "paddw                %%mm4, %%mm2  \n\t"\
-    "paddw                %%mm2, %%mm1  \n\t" /* G*/\
-\
-    "packuswb             %%mm1, %%mm1  \n\t"
-#endif
 
 #define REAL_YSCALEYUV2PACKED(index, c) \
     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
@@ -478,7 +433,7 @@
 
 #define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
 
-#define REAL_YSCALEYUV2RGB(index, c) \
+#define REAL_YSCALEYUV2RGB_UV(index, c) \
     "xor            "#index", "#index"  \n\t"\
     ASMALIGN(4)\
     "1:                                 \n\t"\
@@ -502,6 +457,8 @@
     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
+
+#define REAL_YSCALEYUV2RGB_YA(index, c) \
     "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
     "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
     "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
@@ -514,6 +471,8 @@
     "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
     "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
     "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
+
+#define REAL_YSCALEYUV2RGB_COEFF(c) \
     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
@@ -541,8 +500,13 @@
     "packuswb          %%mm0, %%mm2     \n\t"\
     "packuswb          %%mm6, %%mm5     \n\t"\
     "packuswb          %%mm3, %%mm4     \n\t"\
-    "pxor              %%mm7, %%mm7     \n\t"
-#define YSCALEYUV2RGB(index, c)  REAL_YSCALEYUV2RGB(index, c)
+
+#define YSCALEYUV2RGB_YA(index, c) REAL_YSCALEYUV2RGB_YA(index, c)
+
+#define YSCALEYUV2RGB(index, c) /* concatenates the three REAL_ pieces in order: _UV, _YA, _COEFF */ \
+    REAL_YSCALEYUV2RGB_UV(index, c) \
+    REAL_YSCALEYUV2RGB_YA(index, c) \
+    REAL_YSCALEYUV2RGB_COEFF(c)
 
 #define REAL_YSCALEYUV2PACKED1(index, c) \
     "xor            "#index", "#index"  \n\t"\
@@ -605,7 +569,7 @@
     "packuswb          %%mm0, %%mm2     \n\t"\
     "packuswb          %%mm6, %%mm5     \n\t"\
     "packuswb          %%mm3, %%mm4     \n\t"\
-    "pxor              %%mm7, %%mm7     \n\t"
+
 #define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
 
 #define REAL_YSCALEYUV2PACKED1b(index, c) \
@@ -677,35 +641,34 @@
     "packuswb          %%mm0, %%mm2     \n\t"\
     "packuswb          %%mm6, %%mm5     \n\t"\
     "packuswb          %%mm3, %%mm4     \n\t"\
-    "pxor              %%mm7, %%mm7     \n\t"
+
 #define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
 
-#define REAL_WRITEBGR32(dst, dstw, index) \
-    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
-    "movq      %%mm2, %%mm1     \n\t" /* B */\
-    "movq      %%mm5, %%mm6     \n\t" /* R */\
-    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
-    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
-    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
-    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
-    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
-    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
-    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
-    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
-    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
-    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
+#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) /* in: b,g,r,a byte registers (b and r are overwritten); scratch: q0,q2,q3,t; stores q0,b,q2,q3 */ \
+    "movq       "#b", "#q2"     \n\t" /* q2 = copy of B */\
+    "movq       "#r", "#t"      \n\t" /* t = copy of R */\
+    "punpcklbw  "#g", "#b"      \n\t" /* GBGBGBGB 0 */\
+    "punpcklbw  "#a", "#r"      \n\t" /* ARARARAR 0 */\
+    "punpckhbw  "#g", "#q2"     \n\t" /* GBGBGBGB 2 */\
+    "punpckhbw  "#a", "#t"      \n\t" /* ARARARAR 2 */\
+    "movq       "#b", "#q0"     \n\t" /* GBGBGBGB 0 */\
+    "movq      "#q2", "#q3"     \n\t" /* GBGBGBGB 2 */\
+    "punpcklwd  "#r", "#q0"     \n\t" /* ARGBARGB 0 */\
+    "punpckhwd  "#r", "#b"      \n\t" /* ARGBARGB 1 */\
+    "punpcklwd  "#t", "#q2"     \n\t" /* ARGBARGB 2 */\
+    "punpckhwd  "#t", "#q3"     \n\t" /* ARGBARGB 3 */\
 \
-    MOVNTQ(%%mm0,   (dst, index, 4))\
-    MOVNTQ(%%mm2,  8(dst, index, 4))\
-    MOVNTQ(%%mm1, 16(dst, index, 4))\
-    MOVNTQ(%%mm3, 24(dst, index, 4))\
+    MOVNTQ(   q0,   (dst, index, 4))\
+    MOVNTQ(    b,  8(dst, index, 4))\
+    MOVNTQ(   q2, 16(dst, index, 4))\
+    MOVNTQ(   q3, 24(dst, index, 4))\
 \
     "add      $8, "#index"      \n\t"\
     "cmp "#dstw", "#index"      \n\t"\
     " jb      1b                \n\t"
-#define WRITEBGR32(dst, dstw, index)  REAL_WRITEBGR32(dst, dstw, index)
+#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)  REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
 
-#define REAL_WRITEBGR16(dst, dstw, index) \
+#define REAL_WRITERGB16(dst, dstw, index) \
     "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
     "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
     "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
@@ -731,9 +694,9 @@
     "add             $8, "#index"   \n\t"\
     "cmp        "#dstw", "#index"   \n\t"\
     " jb             1b             \n\t"
-#define WRITEBGR16(dst, dstw, index)  REAL_WRITEBGR16(dst, dstw, index)
+#define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)
 
-#define REAL_WRITEBGR15(dst, dstw, index) \
+#define REAL_WRITERGB15(dst, dstw, index) \
     "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
     "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
     "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
@@ -760,7 +723,7 @@
     "add             $8, "#index"   \n\t"\
     "cmp        "#dstw", "#index"   \n\t"\
     " jb             1b             \n\t"
-#define WRITEBGR15(dst, dstw, index)  REAL_WRITEBGR15(dst, dstw, index)
+#define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)
 
 #define WRITEBGR24OLD(dst, dstw, index) \
     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
@@ -919,7 +882,7 @@
     "cmp  "#dstw", "#index"     \n\t"\
     " jb       1b               \n\t"
 
-#ifdef HAVE_MMX2
+#if HAVE_MMX2
 #undef WRITEBGR24
 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
 #else
@@ -949,24 +912,27 @@ static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t *
                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
 {
-#ifdef HAVE_MMX
-    if (c->flags & SWS_ACCURATE_RND){
-        if (uDest){
-            YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
-            YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
-        }
+#if HAVE_MMX
+    if(!(c->flags & SWS_BITEXACT)){
+        if (c->flags & SWS_ACCURATE_RND){
+            if (uDest){
+                YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
+                YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
+            }
 
-        YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
-    }else{
-        if (uDest){
-            YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
-            YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
-        }
+            YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
+        }else{
+            if (uDest){
+                YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
+                YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
+            }
 
-        YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
+            YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
+        }
+        return;
     }
-#else
-#ifdef HAVE_ALTIVEC
+#endif
+#if HAVE_ALTIVEC
 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                       chrFilter, chrSrc, chrFilterSize,
                       dest, uDest, vDest, dstW, chrDstW);
@@ -975,7 +941,6 @@ yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
             chrFilter, chrSrc, chrFilterSize,
             dest, uDest, vDest, dstW, chrDstW);
 #endif //!HAVE_ALTIVEC
-#endif /* HAVE_MMX */
 }
 
 static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
@@ -990,34 +955,36 @@ yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
 static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc,
                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
 {
-#ifdef HAVE_MMX
-    long p= uDest ? 3 : 1;
-    uint8_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
-    uint8_t *dst[3]= {dest, uDest, vDest};
-    long counter[3] = {dstW, chrDstW, chrDstW};
-
-    if (c->flags & SWS_ACCURATE_RND){
-        while(p--){
-            asm volatile(
-                YSCALEYUV2YV121_ACCURATE
-                :: "r" (src[p]), "r" (dst[p] + counter[p]),
-                "g" (-counter[p])
-                : "%"REG_a
-            );
-        }
-    }else{
-        while(p--){
-            asm volatile(
-                YSCALEYUV2YV121
-                :: "r" (src[p]), "r" (dst[p] + counter[p]),
-                "g" (-counter[p])
-                : "%"REG_a
-            );
-        }
-    }
-
-#else
     int i;
+#if HAVE_MMX
+    if(!(c->flags & SWS_BITEXACT)){
+        long p= uDest ? 3 : 1;
+        uint8_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
+        uint8_t *dst[3]= {dest, uDest, vDest};
+        long counter[3] = {dstW, chrDstW, chrDstW};
+
+        if (c->flags & SWS_ACCURATE_RND){
+            while(p--){
+                __asm__ volatile(
+                    YSCALEYUV2YV121_ACCURATE
+                    :: "r" (src[p]), "r" (dst[p] + counter[p]),
+                    "g" (-counter[p])
+                    : "%"REG_a
+                );
+            }
+        }else{
+            while(p--){
+                __asm__ volatile(
+                    YSCALEYUV2YV121
+                    :: "r" (src[p]), "r" (dst[p] + counter[p]),
+                    "g" (-counter[p])
+                    : "%"REG_a
+                );
+            }
+        }
+        return;
+    }
+#endif
     for (i=0; i<dstW; i++)
     {
         int val= (lumSrc[i]+64)>>7;
@@ -1046,7 +1013,6 @@ static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chr
             uDest[i]= u;
             vDest[i]= v;
         }
-#endif
 }
 
 
@@ -1057,137 +1023,148 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_
                                        int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                        uint8_t *dest, long dstW, long dstY)
 {
-#ifdef HAVE_MMX
+#if HAVE_MMX
     long dummy=0;
-    if (c->flags & SWS_ACCURATE_RND){
-        switch(c->dstFormat){
-        case PIX_FMT_RGB32:
-            YSCALEYUV2PACKEDX_ACCURATE
-            YSCALEYUV2RGBX
-            WRITEBGR32(%4, %5, %%REGa)
+    if(!(c->flags & SWS_BITEXACT)){
+        if (c->flags & SWS_ACCURATE_RND){
+            switch(c->dstFormat){
+            case PIX_FMT_RGB32:
+                YSCALEYUV2PACKEDX_ACCURATE
+                YSCALEYUV2RGBX
+                "pcmpeqd %%mm7, %%mm7 \n\t"
+                WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
 
-            YSCALEYUV2PACKEDX_END
-            return;
-        case PIX_FMT_BGR24:
-            YSCALEYUV2PACKEDX_ACCURATE
-            YSCALEYUV2RGBX
-            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
-            "add %4, %%"REG_c"                        \n\t"
-            WRITEBGR24(%%REGc, %5, %%REGa)
+                YSCALEYUV2PACKEDX_END
+                return;
+            case PIX_FMT_BGR24:
+                YSCALEYUV2PACKEDX_ACCURATE
+                YSCALEYUV2RGBX
+                "pxor %%mm7, %%mm7 \n\t"
+                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
+                "add %4, %%"REG_c"                        \n\t"
+                WRITEBGR24(%%REGc, %5, %%REGa)
 
 
-            :: "r" (&c->redDither),
-               "m" (dummy), "m" (dummy), "m" (dummy),
-               "r" (dest), "m" (dstW)
-            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
-            );
-            return;
-        case PIX_FMT_BGR555:
-            YSCALEYUV2PACKEDX_ACCURATE
-            YSCALEYUV2RGBX
-            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+                :: "r" (&c->redDither),
+                "m" (dummy), "m" (dummy), "m" (dummy),
+                "r" (dest), "m" (dstW)
+                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
+                );
+                return;
+            case PIX_FMT_RGB555:
+                YSCALEYUV2PACKEDX_ACCURATE
+                YSCALEYUV2RGBX
+                "pxor %%mm7, %%mm7 \n\t"
+                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
 #ifdef DITHER1XBPP
-            "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
-            "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
-            "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
+                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
+                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
+                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
 #endif
 
-            WRITEBGR15(%4, %5, %%REGa)
-            YSCALEYUV2PACKEDX_END
-            return;
-        case PIX_FMT_BGR565:
-            YSCALEYUV2PACKEDX_ACCURATE
-            YSCALEYUV2RGBX
-            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+                WRITERGB15(%4, %5, %%REGa)
+                YSCALEYUV2PACKEDX_END
+                return;
+            case PIX_FMT_RGB565:
+                YSCALEYUV2PACKEDX_ACCURATE
+                YSCALEYUV2RGBX
+                "pxor %%mm7, %%mm7 \n\t"
+                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
 #ifdef DITHER1XBPP
-            "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
-            "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
-            "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
+                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
+                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
+                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
 #endif
 
-            WRITEBGR16(%4, %5, %%REGa)
-            YSCALEYUV2PACKEDX_END
-            return;
-        case PIX_FMT_YUYV422:
-            YSCALEYUV2PACKEDX_ACCURATE
-            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+                WRITERGB16(%4, %5, %%REGa)
+                YSCALEYUV2PACKEDX_END
+                return;
+            case PIX_FMT_YUYV422:
+                YSCALEYUV2PACKEDX_ACCURATE
+                /* no RGB conversion on this path: %%mm3=U, %%mm4=V; %%mm1/%%mm7 presumably Y1/Y2 */
 
-            "psraw $3, %%mm3    \n\t"
-            "psraw $3, %%mm4    \n\t"
-            "psraw $3, %%mm1    \n\t"
-            "psraw $3, %%mm7    \n\t"
-            WRITEYUY2(%4, %5, %%REGa)
-            YSCALEYUV2PACKEDX_END
-            return;
-    }
-    }else{
-        switch(c->dstFormat)
-        {
-        case PIX_FMT_RGB32:
-            YSCALEYUV2PACKEDX
-            YSCALEYUV2RGBX
-            WRITEBGR32(%4, %5, %%REGa)
-            YSCALEYUV2PACKEDX_END
-            return;
-        case PIX_FMT_BGR24:
-            YSCALEYUV2PACKEDX
-            YSCALEYUV2RGBX
-            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
-            "add                        %4, %%"REG_c"   \n\t"
-            WRITEBGR24(%%REGc, %5, %%REGa)
+                "psraw $3, %%mm3    \n\t"
+                "psraw $3, %%mm4    \n\t"
+                "psraw $3, %%mm1    \n\t"
+                "psraw $3, %%mm7    \n\t"
+                WRITEYUY2(%4, %5, %%REGa)
+                YSCALEYUV2PACKEDX_END
+                return;
+            }
+        }else{
+            switch(c->dstFormat)
+            {
+            case PIX_FMT_RGB32:
+                YSCALEYUV2PACKEDX
+                YSCALEYUV2RGBX
+                "pcmpeqd %%mm7, %%mm7 \n\t"
+                WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+                YSCALEYUV2PACKEDX_END
+                return;
+            case PIX_FMT_BGR24:
+                YSCALEYUV2PACKEDX
+                YSCALEYUV2RGBX
+                "pxor                    %%mm7, %%mm7       \n\t"
+                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
+                "add                        %4, %%"REG_c"   \n\t"
+                WRITEBGR24(%%REGc, %5, %%REGa)
 
-            :: "r" (&c->redDither),
-               "m" (dummy), "m" (dummy), "m" (dummy),
-               "r" (dest),  "m" (dstW)
-            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
-            );
-            return;
-        case PIX_FMT_BGR555:
-            YSCALEYUV2PACKEDX
-            YSCALEYUV2RGBX
-            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+                :: "r" (&c->redDither),
+                "m" (dummy), "m" (dummy), "m" (dummy),
+                "r" (dest),  "m" (dstW)
+                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
+                );
+                return;
+            case PIX_FMT_RGB555:
+                YSCALEYUV2PACKEDX
+                YSCALEYUV2RGBX
+                "pxor %%mm7, %%mm7 \n\t"
+                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
 #ifdef DITHER1XBPP
-            "paddusb "MANGLE(b5Dither)", %%mm2  \n\t"
-            "paddusb "MANGLE(g5Dither)", %%mm4  \n\t"
-            "paddusb "MANGLE(r5Dither)", %%mm5  \n\t"
+                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
+                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
+                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
 #endif
 
-            WRITEBGR15(%4, %5, %%REGa)
-            YSCALEYUV2PACKEDX_END
-            return;
-        case PIX_FMT_BGR565:
-            YSCALEYUV2PACKEDX
-            YSCALEYUV2RGBX
-            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+                WRITERGB15(%4, %5, %%REGa)
+                YSCALEYUV2PACKEDX_END
+                return;
+            case PIX_FMT_RGB565:
+                YSCALEYUV2PACKEDX
+                YSCALEYUV2RGBX
+                "pxor %%mm7, %%mm7 \n\t"
+                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
 #ifdef DITHER1XBPP
-            "paddusb "MANGLE(b5Dither)", %%mm2  \n\t"
-            "paddusb "MANGLE(g6Dither)", %%mm4  \n\t"
-            "paddusb "MANGLE(r5Dither)", %%mm5  \n\t"
+                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
+                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
+                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
 #endif
 
-            WRITEBGR16(%4, %5, %%REGa)
-            YSCALEYUV2PACKEDX_END
-            return;
-        case PIX_FMT_YUYV422:
-            YSCALEYUV2PACKEDX
-            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+                WRITERGB16(%4, %5, %%REGa)
+                YSCALEYUV2PACKEDX_END
+                return;
+            case PIX_FMT_YUYV422:
+                YSCALEYUV2PACKEDX
+                /* no RGB conversion on this path: %%mm3=U, %%mm4=V; %%mm1/%%mm7 presumably Y1/Y2 */
 
-            "psraw $3, %%mm3    \n\t"
-            "psraw $3, %%mm4    \n\t"
-            "psraw $3, %%mm1    \n\t"
-            "psraw $3, %%mm7    \n\t"
-            WRITEYUY2(%4, %5, %%REGa)
-            YSCALEYUV2PACKEDX_END
-            return;
+                "psraw $3, %%mm3    \n\t"
+                "psraw $3, %%mm4    \n\t"
+                "psraw $3, %%mm1    \n\t"
+                "psraw $3, %%mm7    \n\t"
+                WRITEYUY2(%4, %5, %%REGa)
+                YSCALEYUV2PACKEDX_END
+                return;
+            }
         }
     }
 #endif /* HAVE_MMX */
-#ifdef HAVE_ALTIVEC
+#if HAVE_ALTIVEC
     /* The following list of supported dstFormat values should
        match what's found in the body of altivec_yuv2packedX() */
-    if (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
+    if (!(c->flags & SWS_BITEXACT) &&
+       (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
         c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
-        c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB)
+        c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB))
             altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
                                  chrFilter, chrSrc, chrFilterSize,
                                  dest, dstW, dstY);
@@ -1204,236 +1181,23 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_
 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
                           uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
 {
-    int yalpha1=yalpha^4095;
-    int uvalpha1=uvalpha^4095;
+    int  yalpha1=4095- yalpha;
+    int uvalpha1=4095-uvalpha;
     int i;
 
-#if 0 //isn't used
-    if (flags&SWS_FULL_CHR_H_INT)
-    {
-        switch(dstFormat)
-        {
-#ifdef HAVE_MMX
-        case PIX_FMT_RGB32:
-            asm volatile(
-
-
-FULL_YSCALEYUV2RGB
-            "punpcklbw %%mm1, %%mm3    \n\t" // BGBGBGBG
-            "punpcklbw %%mm7, %%mm0    \n\t" // R0R0R0R0
-
-            "movq      %%mm3, %%mm1    \n\t"
-            "punpcklwd %%mm0, %%mm3    \n\t" // BGR0BGR0
-            "punpckhwd %%mm0, %%mm1    \n\t" // BGR0BGR0
-
-            MOVNTQ(%%mm3,  (%4, %%REGa, 4))
-            MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
-
-            "add $4, %%"REG_a"  \n\t"
-            "cmp %5, %%"REG_a"  \n\t"
-            " jb 1b             \n\t"
-
-            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
-            "m" (yalpha1), "m" (uvalpha1)
-            : "%"REG_a
-            );
-            break;
-        case PIX_FMT_BGR24:
-            asm volatile(
-
-FULL_YSCALEYUV2RGB
-
-                                              // lsb ... msb
-            "punpcklbw %%mm1, %%mm3     \n\t" // BGBGBGBG
-            "punpcklbw %%mm7, %%mm0     \n\t" // R0R0R0R0
-
-            "movq      %%mm3, %%mm1     \n\t"
-            "punpcklwd %%mm0, %%mm3     \n\t" // BGR0BGR0
-            "punpckhwd %%mm0, %%mm1     \n\t" // BGR0BGR0
-
-            "movq      %%mm3, %%mm2     \n\t" // BGR0BGR0
-            "psrlq        $8, %%mm3     \n\t" // GR0BGR00
-            "pand "MANGLE(bm00000111)", %%mm2   \n\t" // BGR00000
-            "pand "MANGLE(bm11111000)", %%mm3   \n\t" // 000BGR00
-            "por       %%mm2, %%mm3     \n\t" // BGRBGR00
-            "movq      %%mm1, %%mm2     \n\t"
-            "psllq       $48, %%mm1     \n\t" // 000000BG
-            "por       %%mm1, %%mm3     \n\t" // BGRBGRBG
-
-            "movq      %%mm2, %%mm1     \n\t" // BGR0BGR0
-            "psrld       $16, %%mm2     \n\t" // R000R000
-            "psrlq       $24, %%mm1     \n\t" // 0BGR0000
-            "por       %%mm2, %%mm1     \n\t" // RBGRR000
-
-            "mov          %4, %%"REG_b" \n\t"
-            "add   %%"REG_a", %%"REG_b" \n\t"
-
-#ifdef HAVE_MMX2
-            //FIXME Alignment
-            "movntq %%mm3,  (%%"REG_b", %%"REG_a", 2)   \n\t"
-            "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)   \n\t"
-#else
-            "movd %%mm3,  (%%"REG_b", %%"REG_a", 2)     \n\t"
-            "psrlq  $32, %%mm3                          \n\t"
-            "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2)     \n\t"
-            "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2)     \n\t"
-#endif
-            "add     $4, %%"REG_a"                      \n\t"
-            "cmp     %5, %%"REG_a"                      \n\t"
-            " jb     1b                                 \n\t"
-
-            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
-            "m" (yalpha1), "m" (uvalpha1)
-            : "%"REG_a, "%"REG_b
-            );
-            break;
-        case PIX_FMT_BGR555:
-            asm volatile(
-
-FULL_YSCALEYUV2RGB
-#ifdef DITHER1XBPP
-            "paddusb "MANGLE(g5Dither)", %%mm1  \n\t"
-            "paddusb "MANGLE(r5Dither)", %%mm0  \n\t"
-            "paddusb "MANGLE(b5Dither)", %%mm3  \n\t"
-#endif
-            "punpcklbw            %%mm7, %%mm1  \n\t" // 0G0G0G0G
-            "punpcklbw            %%mm7, %%mm3  \n\t" // 0B0B0B0B
-            "punpcklbw            %%mm7, %%mm0  \n\t" // 0R0R0R0R
-
-            "psrlw                   $3, %%mm3  \n\t"
-            "psllw                   $2, %%mm1  \n\t"
-            "psllw                   $7, %%mm0  \n\t"
-            "pand     "MANGLE(g15Mask)", %%mm1  \n\t"
-            "pand     "MANGLE(r15Mask)", %%mm0  \n\t"
-
-            "por                  %%mm3, %%mm1  \n\t"
-            "por                  %%mm1, %%mm0  \n\t"
-
-            MOVNTQ(%%mm0, (%4, %%REGa, 2))
-
-            "add $4, %%"REG_a"  \n\t"
-            "cmp %5, %%"REG_a"  \n\t"
-            " jb 1b             \n\t"
-
-            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
-            "m" (yalpha1), "m" (uvalpha1)
-            : "%"REG_a
-            );
-            break;
-        case PIX_FMT_BGR565:
-            asm volatile(
-
-FULL_YSCALEYUV2RGB
-#ifdef DITHER1XBPP
-            "paddusb "MANGLE(g6Dither)", %%mm1  \n\t"
-            "paddusb "MANGLE(r5Dither)", %%mm0  \n\t"
-            "paddusb "MANGLE(b5Dither)", %%mm3  \n\t"
-#endif
-            "punpcklbw            %%mm7, %%mm1  \n\t" // 0G0G0G0G
-            "punpcklbw            %%mm7, %%mm3  \n\t" // 0B0B0B0B
-            "punpcklbw            %%mm7, %%mm0  \n\t" // 0R0R0R0R
-
-            "psrlw                   $3, %%mm3  \n\t"
-            "psllw                   $3, %%mm1  \n\t"
-            "psllw                   $8, %%mm0  \n\t"
-            "pand     "MANGLE(g16Mask)", %%mm1  \n\t"
-            "pand     "MANGLE(r16Mask)", %%mm0  \n\t"
-
-            "por                  %%mm3, %%mm1  \n\t"
-            "por                  %%mm1, %%mm0  \n\t"
-
-            MOVNTQ(%%mm0, (%4, %%REGa, 2))
-
-            "add $4, %%"REG_a"  \n\t"
-            "cmp %5, %%"REG_a"  \n\t"
-            " jb 1b             \n\t"
-
-            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
-            "m" (yalpha1), "m" (uvalpha1)
-            : "%"REG_a
-            );
-            break;
-#endif /* HAVE_MMX */
-        case PIX_FMT_BGR32:
-#ifndef HAVE_MMX
-        case PIX_FMT_RGB32:
-#endif
-            if (dstFormat==PIX_FMT_RGB32)
-            {
-                int i;
-#ifdef WORDS_BIGENDIAN
-                dest++;
-#endif
-                for (i=0;i<dstW;i++){
-                    // vertical linear interpolation && yuv2rgb in a single step:
-                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
-                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
-                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
-                    dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
-                    dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
-                    dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
-                    dest+= 4;
-                }
-            }
-            else if (dstFormat==PIX_FMT_BGR24)
-            {
-                int i;
-                for (i=0;i<dstW;i++){
-                    // vertical linear interpolation && yuv2rgb in a single step:
-                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
-                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
-                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
-                    dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
-                    dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
-                    dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
-                    dest+= 3;
-                }
-            }
-            else if (dstFormat==PIX_FMT_BGR565)
-            {
-                int i;
-                for (i=0;i<dstW;i++){
-                    // vertical linear interpolation && yuv2rgb in a single step:
-                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
-                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
-                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
-
-                    ((uint16_t*)dest)[i] =
-                        clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
-                        clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
-                        clip_table16r[(Y + yuvtab_3343[V]) >>13];
-                }
-            }
-            else if (dstFormat==PIX_FMT_BGR555)
-            {
-                int i;
-                for (i=0;i<dstW;i++){
-                    // vertical linear interpolation && yuv2rgb in a single step:
-                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
-                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
-                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
-
-                    ((uint16_t*)dest)[i] =
-                        clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
-                        clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
-                        clip_table15r[(Y + yuvtab_3343[V]) >>13];
-                }
-            }
-        }//FULL_UV_IPOL
-    else
-    {
-#endif // if 0
-#ifdef HAVE_MMX
+#if HAVE_MMX
+    if(!(c->flags & SWS_BITEXACT)){
         switch(c->dstFormat)
         {
             //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
             case PIX_FMT_RGB32:
-                asm volatile(
+                __asm__ volatile(
                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                 "mov        %4, %%"REG_b"               \n\t"
                 "push %%"REG_BP"                        \n\t"
                 YSCALEYUV2RGB(%%REGBP, %5)
-                WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
+                "pcmpeqd %%mm7, %%mm7                   \n\t"
+                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                 "pop %%"REG_BP"                         \n\t"
                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
 
@@ -1442,11 +1206,12 @@ FULL_YSCALEYUV2RGB
                 );
                 return;
             case PIX_FMT_BGR24:
-                asm volatile(
+                __asm__ volatile(
                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                 "mov        %4, %%"REG_b"               \n\t"
                 "push %%"REG_BP"                        \n\t"
                 YSCALEYUV2RGB(%%REGBP, %5)
+                "pxor    %%mm7, %%mm7                   \n\t"
                 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                 "pop %%"REG_BP"                         \n\t"
                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
@@ -1454,20 +1219,21 @@ FULL_YSCALEYUV2RGB
                 "a" (&c->redDither)
                 );
                 return;
-            case PIX_FMT_BGR555:
-                asm volatile(
+            case PIX_FMT_RGB555:
+                __asm__ volatile(
                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                 "mov        %4, %%"REG_b"               \n\t"
                 "push %%"REG_BP"                        \n\t"
                 YSCALEYUV2RGB(%%REGBP, %5)
+                "pxor    %%mm7, %%mm7                   \n\t"
                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
 #ifdef DITHER1XBPP
-                "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
-                "paddusb "MANGLE(g5Dither)", %%mm4      \n\t"
-                "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
+                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
+                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
+                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
 #endif
 
-                WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
+                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                 "pop %%"REG_BP"                         \n\t"
                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
 
@@ -1475,20 +1241,21 @@ FULL_YSCALEYUV2RGB
                 "a" (&c->redDither)
                 );
                 return;
-            case PIX_FMT_BGR565:
-                asm volatile(
+            case PIX_FMT_RGB565:
+                __asm__ volatile(
                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                 "mov        %4, %%"REG_b"               \n\t"
                 "push %%"REG_BP"                        \n\t"
                 YSCALEYUV2RGB(%%REGBP, %5)
+                "pxor    %%mm7, %%mm7                   \n\t"
                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
 #ifdef DITHER1XBPP
-                "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
-                "paddusb "MANGLE(g6Dither)", %%mm4      \n\t"
-                "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
+                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
+                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
+                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
 #endif
 
-                WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
+                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                 "pop %%"REG_BP"                         \n\t"
                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
@@ -1496,7 +1263,7 @@ FULL_YSCALEYUV2RGB
                 );
                 return;
             case PIX_FMT_YUYV422:
-                asm volatile(
+                __asm__ volatile(
                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                 "mov %4, %%"REG_b"                        \n\t"
                 "push %%"REG_BP"                        \n\t"
@@ -1510,8 +1277,9 @@ FULL_YSCALEYUV2RGB
                 return;
             default: break;
         }
+    }
 #endif //HAVE_MMX
-YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
+YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C, YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
 }
 
 /**
@@ -1532,200 +1300,210 @@ static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *
         return;
     }
 
-#ifdef HAVE_MMX
-    if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
-    {
-        switch(dstFormat)
+#if HAVE_MMX
+    if(!(flags & SWS_BITEXACT)){
+        if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
         {
-        case PIX_FMT_RGB32:
-            asm volatile(
-            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
-            "mov        %4, %%"REG_b"               \n\t"
-            "push %%"REG_BP"                        \n\t"
-            YSCALEYUV2RGB1(%%REGBP, %5)
-            WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
-            "pop %%"REG_BP"                         \n\t"
-            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
+            switch(dstFormat)
+            {
+            case PIX_FMT_RGB32:
+                __asm__ volatile(
+                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
+                "mov        %4, %%"REG_b"               \n\t"
+                "push %%"REG_BP"                        \n\t"
+                YSCALEYUV2RGB1(%%REGBP, %5)
+                "pcmpeqd %%mm7, %%mm7                   \n\t"
+                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+                "pop %%"REG_BP"                         \n\t"
+                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
 
-            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
-            "a" (&c->redDither)
-            );
-            return;
-        case PIX_FMT_BGR24:
-            asm volatile(
-            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
-            "mov        %4, %%"REG_b"               \n\t"
-            "push %%"REG_BP"                        \n\t"
-            YSCALEYUV2RGB1(%%REGBP, %5)
-            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
-            "pop %%"REG_BP"                         \n\t"
-            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
+                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
+                "a" (&c->redDither)
+                );
+                return;
+            case PIX_FMT_BGR24:
+                __asm__ volatile(
+                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
+                "mov        %4, %%"REG_b"               \n\t"
+                "push %%"REG_BP"                        \n\t"
+                YSCALEYUV2RGB1(%%REGBP, %5)
+                "pxor    %%mm7, %%mm7                   \n\t"
+                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
+                "pop %%"REG_BP"                         \n\t"
+                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
 
-            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
-            "a" (&c->redDither)
-            );
-            return;
-        case PIX_FMT_BGR555:
-            asm volatile(
-            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
-            "mov        %4, %%"REG_b"               \n\t"
-            "push %%"REG_BP"                        \n\t"
-            YSCALEYUV2RGB1(%%REGBP, %5)
-            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
+                "a" (&c->redDither)
+                );
+                return;
+            case PIX_FMT_RGB555:
+                __asm__ volatile(
+                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
+                "mov        %4, %%"REG_b"               \n\t"
+                "push %%"REG_BP"                        \n\t"
+                YSCALEYUV2RGB1(%%REGBP, %5)
+                "pxor    %%mm7, %%mm7                   \n\t"
+                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
 #ifdef DITHER1XBPP
-            "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
-            "paddusb "MANGLE(g5Dither)", %%mm4      \n\t"
-            "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
+                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
+                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
+                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
 #endif
-            WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
-            "pop %%"REG_BP"                         \n\t"
-            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
+                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
+                "pop %%"REG_BP"                         \n\t"
+                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
 
-            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
-            "a" (&c->redDither)
-            );
-            return;
-        case PIX_FMT_BGR565:
-            asm volatile(
-            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
-            "mov        %4, %%"REG_b"               \n\t"
-            "push %%"REG_BP"                        \n\t"
-            YSCALEYUV2RGB1(%%REGBP, %5)
-            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
+                "a" (&c->redDither)
+                );
+                return;
+            case PIX_FMT_RGB565:
+                __asm__ volatile(
+                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
+                "mov        %4, %%"REG_b"               \n\t"
+                "push %%"REG_BP"                        \n\t"
+                YSCALEYUV2RGB1(%%REGBP, %5)
+                "pxor    %%mm7, %%mm7                   \n\t"
+                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
 #ifdef DITHER1XBPP
-            "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
-            "paddusb "MANGLE(g6Dither)", %%mm4      \n\t"
-            "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
+                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
+                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
+                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
 #endif
 
-            WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
-            "pop %%"REG_BP"                         \n\t"
-            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
+                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
+                "pop %%"REG_BP"                         \n\t"
+                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
 
-            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
-            "a" (&c->redDither)
-            );
-            return;
-        case PIX_FMT_YUYV422:
-            asm volatile(
-            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
-            "mov        %4, %%"REG_b"               \n\t"
-            "push %%"REG_BP"                        \n\t"
-            YSCALEYUV2PACKED1(%%REGBP, %5)
-            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
-            "pop %%"REG_BP"                         \n\t"
-            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
+                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
+                "a" (&c->redDither)
+                );
+                return;
+            case PIX_FMT_YUYV422:
+                __asm__ volatile(
+                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
+                "mov        %4, %%"REG_b"               \n\t"
+                "push %%"REG_BP"                        \n\t"
+                YSCALEYUV2PACKED1(%%REGBP, %5)
+                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
+                "pop %%"REG_BP"                         \n\t"
+                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
 
-            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
-            "a" (&c->redDither)
-            );
-            return;
+                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
+                "a" (&c->redDither)
+                );
+                return;
+            }
         }
-    }
-    else
-    {
-        switch(dstFormat)
+        else
         {
-        case PIX_FMT_RGB32:
-            asm volatile(
-            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
-            "mov        %4, %%"REG_b"               \n\t"
-            "push %%"REG_BP"                        \n\t"
-            YSCALEYUV2RGB1b(%%REGBP, %5)
-            WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
-            "pop %%"REG_BP"                         \n\t"
-            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
+            switch(dstFormat)
+            {
+            case PIX_FMT_RGB32:
+                __asm__ volatile(
+                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
+                "mov        %4, %%"REG_b"               \n\t"
+                "push %%"REG_BP"                        \n\t"
+                YSCALEYUV2RGB1b(%%REGBP, %5)
+                "pcmpeqd %%mm7, %%mm7                   \n\t"
+                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+                "pop %%"REG_BP"                         \n\t"
+                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
 
-            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
-            "a" (&c->redDither)
-            );
-            return;
-        case PIX_FMT_BGR24:
-            asm volatile(
-            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
-            "mov        %4, %%"REG_b"               \n\t"
-            "push %%"REG_BP"                        \n\t"
-            YSCALEYUV2RGB1b(%%REGBP, %5)
-            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
-            "pop %%"REG_BP"                         \n\t"
-            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
+                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
+                "a" (&c->redDither)
+                );
+                return;
+            case PIX_FMT_BGR24:
+                __asm__ volatile(
+                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
+                "mov        %4, %%"REG_b"               \n\t"
+                "push %%"REG_BP"                        \n\t"
+                YSCALEYUV2RGB1b(%%REGBP, %5)
+                "pxor    %%mm7, %%mm7                   \n\t"
+                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
+                "pop %%"REG_BP"                         \n\t"
+                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
 
-            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
-            "a" (&c->redDither)
-            );
-            return;
-        case PIX_FMT_BGR555:
-            asm volatile(
-            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
-            "mov        %4, %%"REG_b"               \n\t"
-            "push %%"REG_BP"                        \n\t"
-            YSCALEYUV2RGB1b(%%REGBP, %5)
-            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
+                "a" (&c->redDither)
+                );
+                return;
+            case PIX_FMT_RGB555:
+                __asm__ volatile(
+                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
+                "mov        %4, %%"REG_b"               \n\t"
+                "push %%"REG_BP"                        \n\t"
+                YSCALEYUV2RGB1b(%%REGBP, %5)
+                "pxor    %%mm7, %%mm7                   \n\t"
+                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
 #ifdef DITHER1XBPP
-            "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
-            "paddusb "MANGLE(g5Dither)", %%mm4      \n\t"
-            "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
+                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
+                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
+                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
 #endif
-            WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
-            "pop %%"REG_BP"                         \n\t"
-            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
+                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
+                "pop %%"REG_BP"                         \n\t"
+                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
 
-            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
-            "a" (&c->redDither)
-            );
-            return;
-        case PIX_FMT_BGR565:
-            asm volatile(
-            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
-            "mov        %4, %%"REG_b"               \n\t"
-            "push %%"REG_BP"                        \n\t"
-            YSCALEYUV2RGB1b(%%REGBP, %5)
-            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
+                "a" (&c->redDither)
+                );
+                return;
+            case PIX_FMT_RGB565:
+                __asm__ volatile(
+                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
+                "mov        %4, %%"REG_b"               \n\t"
+                "push %%"REG_BP"                        \n\t"
+                YSCALEYUV2RGB1b(%%REGBP, %5)
+                "pxor    %%mm7, %%mm7                   \n\t"
+                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
 #ifdef DITHER1XBPP
-            "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
-            "paddusb "MANGLE(g6Dither)", %%mm4      \n\t"
-            "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
+                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
+                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
+                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
 #endif
 
-            WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
-            "pop %%"REG_BP"                         \n\t"
-            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
+                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
+                "pop %%"REG_BP"                         \n\t"
+                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
 
-            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
-            "a" (&c->redDither)
-            );
-            return;
-        case PIX_FMT_YUYV422:
-            asm volatile(
-            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
-            "mov        %4, %%"REG_b"               \n\t"
-            "push %%"REG_BP"                        \n\t"
-            YSCALEYUV2PACKED1b(%%REGBP, %5)
-            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
-            "pop %%"REG_BP"                         \n\t"
-            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
+                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
+                "a" (&c->redDither)
+                );
+                return;
+            case PIX_FMT_YUYV422:
+                __asm__ volatile(
+                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
+                "mov        %4, %%"REG_b"               \n\t"
+                "push %%"REG_BP"                        \n\t"
+                YSCALEYUV2PACKED1b(%%REGBP, %5)
+                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
+                "pop %%"REG_BP"                         \n\t"
+                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
 
-            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
-            "a" (&c->redDither)
-            );
-            return;
+                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
+                "a" (&c->redDither)
+                );
+                return;
+            }
         }
     }
 #endif /* HAVE_MMX */
     if (uvalpha < 2048)
     {
-        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
+        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
     }else{
-        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
+        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
     }
 }
 
 //FIXME yuy2* can read up to 7 samples too much
 
-static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
+static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
 {
-#ifdef HAVE_MMX
-    asm volatile(
+#if HAVE_MMX
+    __asm__ volatile(
     "movq "MANGLE(bm01010101)", %%mm2           \n\t"
     "mov                    %0, %%"REG_a"       \n\t"
     "1:                                         \n\t"
@@ -1747,10 +1525,10 @@ static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
 #endif
 }
 
-static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
+static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
 {
-#ifdef HAVE_MMX
-    asm volatile(
+#if HAVE_MMX
+    __asm__ volatile(
     "movq "MANGLE(bm01010101)", %%mm4           \n\t"
     "mov                    %0, %%"REG_a"       \n\t"
     "1:                                         \n\t"
@@ -1784,10 +1562,10 @@ static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1,
 
 /* This is almost identical to the previous, end exists only because
  * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
-static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
+static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
 {
-#ifdef HAVE_MMX
-    asm volatile(
+#if HAVE_MMX
+    __asm__ volatile(
     "mov                  %0, %%"REG_a"         \n\t"
     "1:                                         \n\t"
     "movq  (%1, %%"REG_a",2), %%mm0             \n\t"
@@ -1808,10 +1586,10 @@ static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
 #endif
 }
 
-static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
+static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
 {
-#ifdef HAVE_MMX
-    asm volatile(
+#if HAVE_MMX
+    __asm__ volatile(
     "movq "MANGLE(bm01010101)", %%mm4           \n\t"
     "mov                    %0, %%"REG_a"       \n\t"
     "1:                                         \n\t"
@@ -1843,110 +1621,184 @@ static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1,
     assert(src1 == src2);
 }
 
-static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
-{
-    int i;
-    for (i=0; i<width; i++)
-    {
-        int b=  ((uint32_t*)src)[i]&0xFF;
-        int g= (((uint32_t*)src)[i]>>8)&0xFF;
-        int r= (((uint32_t*)src)[i]>>16)&0xFF;
-
-        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
-    }
+#define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
+static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)\
+{\
+    int i;\
+    for (i=0; i<width; i++)\
+    {\
+        int b= (((type*)src)[i]>>shb)&maskb;\
+        int g= (((type*)src)[i]>>shg)&maskg;\
+        int r= (((type*)src)[i]>>shr)&maskr; /* fields are not normalised: RY/GY/BY and S carry the scale */\
+        /* 33<<((S)-1) == 16.5 * 2^(S): the +16 luma offset plus 0.5 for rounding */\
+        dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
+    }\
 }
 
-static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
-{
-    int i;
-    assert(src1 == src2);
-    for (i=0; i<width; i++)
-    {
-        const int a= ((uint32_t*)src1)[2*i+0];
-        const int e= ((uint32_t*)src1)[2*i+1];
-        const int l= (a&0xFF00FF) + (e&0xFF00FF);
-        const int h= (a&0x00FF00) + (e&0x00FF00);
-        const int b=  l&0x3FF;
-        const int g=  h>>8;
-        const int r=  l>>16;
+BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
+BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
+BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY    , RGB2YUV_SHIFT+8)
+BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY    , RGB2YUV_SHIFT+7)
+BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY    , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
+BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY    , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
 
-        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
-        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
-    }
+/* BGR2UV(): defines name() (one U/V sample per input pixel) and name_half()
+ * (one U/V sample per summed pair of input pixels) for packed RGB words. */
+#define BGR2UV(type, name, shr, shg, shb, maska, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
+static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
+{\
+    int i;\
+    for (i=0; i<width; i++){\
+        int b= (((type*)src)[i]&maskb)>>shb;\
+        int g= (((type*)src)[i]&maskg)>>shg;\
+        int r= (((type*)src)[i]&maskr)>>shr;\
+        /* 257<<((S)-1) == 128.5 * 2^(S): the +128 chroma bias plus 0.5 for rounding */\
+        dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
+        dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
+    }\
+}\
+static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
+{\
+    int i;\
+    for (i=0; i<width; i++){\
+        unsigned pix0= ((type*)src)[2*i+0]; /* unsigned: pix0+pix1 must wrap, not overflow a signed int */\
+        unsigned pix1= ((type*)src)[2*i+1]; /* (32-bit formats carry alpha in the top bits) */\
+        int g= (pix0&(maskg|maska))+(pix1&(maskg|maska));\
+        int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
+        int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
+        g&= maskg|(2*maskg);\
+        /* subtracting g above left only the r and b field sums; g itself is masked last */\
+        g>>=shg;\
+        /* two pixels were summed, so bias and shift are doubled: 257<<(S) and >>((S)+1) */\
+        dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
+        dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
+    }\
 }
 
-static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
+BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF000000, 0xFF0000, 0xFF00,   0x00FF, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
+BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0xFF000000,   0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
+BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0,          0,   0x001F, 0x07E0,   0xF800, RU<<11, GU<<5, BU    , RV<<11, GV<<5, BV    , RGB2YUV_SHIFT+8)
+BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0,          0,   0x001F, 0x03E0,   0x7C00, RU<<10, GU<<5, BU    , RV<<10, GV<<5, BV    , RGB2YUV_SHIFT+7)
+BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0,          0,   0xF800, 0x07E0,   0x001F, RU    , GU<<5, BU<<11, RV    , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
+BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0,          0,   0x7C00, 0x03E0,   0x001F, RU    , GU<<5, BU<<10, RV    , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
+
+#if HAVE_MMX
+static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, uint8_t *src, long width, int srcFormat)
 {
-#ifdef HAVE_MMX
-    asm volatile(
-    "mov                        %2, %%"REG_a"   \n\t"
-    "movq  "MANGLE(ff_bgr2YCoeff)", %%mm6       \n\t"
-    "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
-    "pxor                    %%mm7, %%mm7       \n\t"
-    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
-    ASMALIGN(4)
-    "1:                                         \n\t"
-    PREFETCH" 64(%0, %%"REG_d")                 \n\t"
-    "movd          (%0, %%"REG_d"), %%mm0       \n\t"
-    "movd         3(%0, %%"REG_d"), %%mm1       \n\t"
-    "punpcklbw               %%mm7, %%mm0       \n\t"
-    "punpcklbw               %%mm7, %%mm1       \n\t"
-    "movd         6(%0, %%"REG_d"), %%mm2       \n\t"
-    "movd         9(%0, %%"REG_d"), %%mm3       \n\t"
-    "punpcklbw               %%mm7, %%mm2       \n\t"
-    "punpcklbw               %%mm7, %%mm3       \n\t"
-    "pmaddwd                 %%mm6, %%mm0       \n\t"
-    "pmaddwd                 %%mm6, %%mm1       \n\t"
-    "pmaddwd                 %%mm6, %%mm2       \n\t"
-    "pmaddwd                 %%mm6, %%mm3       \n\t"
-#ifndef FAST_BGR2YV12
-    "psrad                      $8, %%mm0       \n\t"
-    "psrad                      $8, %%mm1       \n\t"
-    "psrad                      $8, %%mm2       \n\t"
-    "psrad                      $8, %%mm3       \n\t"
-#endif
-    "packssdw                %%mm1, %%mm0       \n\t"
-    "packssdw                %%mm3, %%mm2       \n\t"
-    "pmaddwd                 %%mm5, %%mm0       \n\t"
-    "pmaddwd                 %%mm5, %%mm2       \n\t"
-    "packssdw                %%mm2, %%mm0       \n\t"
-    "psraw                      $7, %%mm0       \n\t"
 
-    "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
-    "movd        15(%0, %%"REG_d"), %%mm1       \n\t"
-    "punpcklbw               %%mm7, %%mm4       \n\t"
-    "punpcklbw               %%mm7, %%mm1       \n\t"
-    "movd        18(%0, %%"REG_d"), %%mm2       \n\t"
-    "movd        21(%0, %%"REG_d"), %%mm3       \n\t"
-    "punpcklbw               %%mm7, %%mm2       \n\t"
-    "punpcklbw               %%mm7, %%mm3       \n\t"
-    "pmaddwd                 %%mm6, %%mm4       \n\t"
-    "pmaddwd                 %%mm6, %%mm1       \n\t"
-    "pmaddwd                 %%mm6, %%mm2       \n\t"
-    "pmaddwd                 %%mm6, %%mm3       \n\t"
-#ifndef FAST_BGR2YV12
-    "psrad                      $8, %%mm4       \n\t"
-    "psrad                      $8, %%mm1       \n\t"
-    "psrad                      $8, %%mm2       \n\t"
-    "psrad                      $8, %%mm3       \n\t"
-#endif
-    "packssdw                %%mm1, %%mm4       \n\t"
-    "packssdw                %%mm3, %%mm2       \n\t"
-    "pmaddwd                 %%mm5, %%mm4       \n\t"
-    "pmaddwd                 %%mm5, %%mm2       \n\t"
-    "add                       $24, %%"REG_d"   \n\t"
-    "packssdw                %%mm2, %%mm4       \n\t"
-    "psraw                      $7, %%mm4       \n\t"
+    if(srcFormat == PIX_FMT_BGR24){
+        __asm__ volatile(
+            "movq  "MANGLE(ff_bgr24toY1Coeff)", %%mm5       \n\t"
+            "movq  "MANGLE(ff_bgr24toY2Coeff)", %%mm6       \n\t"
+            :
+        );
+    }else{
+        __asm__ volatile(
+            "movq  "MANGLE(ff_rgb24toY1Coeff)", %%mm5       \n\t"
+            "movq  "MANGLE(ff_rgb24toY2Coeff)", %%mm6       \n\t"
+            :
+        );
+    }
 
-    "packuswb                %%mm4, %%mm0       \n\t"
-    "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0    \n\t"
-
-    "movq                    %%mm0, (%1, %%"REG_a") \n\t"
-    "add                        $8, %%"REG_a"   \n\t"
-    " js                        1b              \n\t"
-    : : "r" (src+width*3), "r" (dst+width), "g" (-width)
-    : "%"REG_a, "%"REG_d
+    __asm__ volatile(
+        "movq  "MANGLE(ff_bgr24toYOffset)", %%mm4   \n\t"
+        "mov                        %2, %%"REG_a"   \n\t"
+        "pxor                    %%mm7, %%mm7       \n\t"
+        "1:                                         \n\t"
+        PREFETCH"               64(%0)              \n\t"
+        "movd                     (%0), %%mm0       \n\t"
+        "movd                    2(%0), %%mm1       \n\t"
+        "movd                    6(%0), %%mm2       \n\t"
+        "movd                    8(%0), %%mm3       \n\t"
+        "add                       $12, %0          \n\t"
+        "punpcklbw               %%mm7, %%mm0       \n\t"
+        "punpcklbw               %%mm7, %%mm1       \n\t"
+        "punpcklbw               %%mm7, %%mm2       \n\t"
+        "punpcklbw               %%mm7, %%mm3       \n\t"
+        "pmaddwd                 %%mm5, %%mm0       \n\t"
+        "pmaddwd                 %%mm6, %%mm1       \n\t"
+        "pmaddwd                 %%mm5, %%mm2       \n\t"
+        "pmaddwd                 %%mm6, %%mm3       \n\t"
+        "paddd                   %%mm1, %%mm0       \n\t"
+        "paddd                   %%mm3, %%mm2       \n\t"
+        "paddd                   %%mm4, %%mm0       \n\t"
+        "paddd                   %%mm4, %%mm2       \n\t"
+        "psrad                     $15, %%mm0       \n\t"
+        "psrad                     $15, %%mm2       \n\t"
+        "packssdw                %%mm2, %%mm0       \n\t"
+        "packuswb                %%mm0, %%mm0       \n\t"
+        "movd                %%mm0, (%1, %%"REG_a") \n\t"
+        "add                        $4, %%"REG_a"   \n\t"
+        " js                        1b              \n\t"
+    : "+r" (src)
+    : "r" (dst+width), "g" (-width)
+    : "%"REG_a
     );
+}
+
+static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat)
+{
+    __asm__ volatile(
+        "movq                    24+%4, %%mm6       \n\t"
+        "mov                        %3, %%"REG_a"   \n\t"
+        "pxor                    %%mm7, %%mm7       \n\t"
+        "1:                                         \n\t"
+        PREFETCH"               64(%0)              \n\t"
+        "movd                     (%0), %%mm0       \n\t"
+        "movd                    2(%0), %%mm1       \n\t"
+        "punpcklbw               %%mm7, %%mm0       \n\t"
+        "punpcklbw               %%mm7, %%mm1       \n\t"
+        "movq                    %%mm0, %%mm2       \n\t"
+        "movq                    %%mm1, %%mm3       \n\t"
+        "pmaddwd                    %4, %%mm0       \n\t"
+        "pmaddwd                  8+%4, %%mm1       \n\t"
+        "pmaddwd                 16+%4, %%mm2       \n\t"
+        "pmaddwd                 %%mm6, %%mm3       \n\t"
+        "paddd                   %%mm1, %%mm0       \n\t"
+        "paddd                   %%mm3, %%mm2       \n\t"
+
+        "movd                    6(%0), %%mm1       \n\t"
+        "movd                    8(%0), %%mm3       \n\t"
+        "add                       $12, %0          \n\t"
+        "punpcklbw               %%mm7, %%mm1       \n\t"
+        "punpcklbw               %%mm7, %%mm3       \n\t"
+        "movq                    %%mm1, %%mm4       \n\t"
+        "movq                    %%mm3, %%mm5       \n\t"
+        "pmaddwd                    %4, %%mm1       \n\t"
+        "pmaddwd                  8+%4, %%mm3       \n\t"
+        "pmaddwd                 16+%4, %%mm4       \n\t"
+        "pmaddwd                 %%mm6, %%mm5       \n\t"
+        "paddd                   %%mm3, %%mm1       \n\t"
+        "paddd                   %%mm5, %%mm4       \n\t"
+
+        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3       \n\t"
+        "paddd                   %%mm3, %%mm0       \n\t"
+        "paddd                   %%mm3, %%mm2       \n\t"
+        "paddd                   %%mm3, %%mm1       \n\t"
+        "paddd                   %%mm3, %%mm4       \n\t"
+        "psrad                     $15, %%mm0       \n\t"
+        "psrad                     $15, %%mm2       \n\t"
+        "psrad                     $15, %%mm1       \n\t"
+        "psrad                     $15, %%mm4       \n\t"
+        "packssdw                %%mm1, %%mm0       \n\t"
+        "packssdw                %%mm4, %%mm2       \n\t"
+        "packuswb                %%mm0, %%mm0       \n\t"
+        "packuswb                %%mm2, %%mm2       \n\t"
+        "movd                %%mm0, (%1, %%"REG_a") \n\t"
+        "movd                %%mm2, (%2, %%"REG_a") \n\t"
+        "add                        $4, %%"REG_a"   \n\t"
+        " js                        1b              \n\t"
+    : "+r" (src)
+    : "r" (dstU+width), "r" (dstV+width), "g" (-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
+    : "%"REG_a
+    );
+}
+#endif
+
+static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
+{
+#if HAVE_MMX
+    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
 #else
     int i;
     for (i=0; i<width; i++)
@@ -1960,126 +1812,27 @@ static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
 #endif /* HAVE_MMX */
 }
 
-static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
+static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
 {
-#ifdef HAVE_MMX
-    asm volatile(
-    "mov                        %3, %%"REG_a"   \n\t"
-    "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
-    "movq  "MANGLE(ff_bgr2UCoeff)", %%mm6       \n\t"
-    "pxor                    %%mm7, %%mm7       \n\t"
-    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
-    "add                 %%"REG_d", %%"REG_d"   \n\t"
-    ASMALIGN(4)
-    "1:                                         \n\t"
-    PREFETCH" 64(%0, %%"REG_d")                 \n\t"
-#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
-    "movq          (%0, %%"REG_d"), %%mm0       \n\t"
-    "movq         6(%0, %%"REG_d"), %%mm2       \n\t"
-    "movq                    %%mm0, %%mm1       \n\t"
-    "movq                    %%mm2, %%mm3       \n\t"
-    "psrlq                     $24, %%mm0       \n\t"
-    "psrlq                     $24, %%mm2       \n\t"
-    PAVGB(%%mm1, %%mm0)
-    PAVGB(%%mm3, %%mm2)
-    "punpcklbw               %%mm7, %%mm0       \n\t"
-    "punpcklbw               %%mm7, %%mm2       \n\t"
+#if HAVE_MMX
+    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
 #else
-    "movd          (%0, %%"REG_d"), %%mm0       \n\t"
-    "movd         3(%0, %%"REG_d"), %%mm2       \n\t"
-    "punpcklbw               %%mm7, %%mm0       \n\t"
-    "punpcklbw               %%mm7, %%mm2       \n\t"
-    "paddw                   %%mm2, %%mm0       \n\t"
-    "movd         6(%0, %%"REG_d"), %%mm4       \n\t"
-    "movd         9(%0, %%"REG_d"), %%mm2       \n\t"
-    "punpcklbw               %%mm7, %%mm4       \n\t"
-    "punpcklbw               %%mm7, %%mm2       \n\t"
-    "paddw                   %%mm4, %%mm2       \n\t"
-    "psrlw                      $1, %%mm0       \n\t"
-    "psrlw                      $1, %%mm2       \n\t"
-#endif
-    "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
-    "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"
+    int i;
+    for (i=0; i<width; i++)
+    {
+        int b= src1[3*i + 0];
+        int g= src1[3*i + 1];
+        int r= src1[3*i + 2];
 
-    "pmaddwd                 %%mm0, %%mm1       \n\t"
-    "pmaddwd                 %%mm2, %%mm3       \n\t"
-    "pmaddwd                 %%mm6, %%mm0       \n\t"
-    "pmaddwd                 %%mm6, %%mm2       \n\t"
-#ifndef FAST_BGR2YV12
-    "psrad                      $8, %%mm0       \n\t"
-    "psrad                      $8, %%mm1       \n\t"
-    "psrad                      $8, %%mm2       \n\t"
-    "psrad                      $8, %%mm3       \n\t"
-#endif
-    "packssdw                %%mm2, %%mm0       \n\t"
-    "packssdw                %%mm3, %%mm1       \n\t"
-    "pmaddwd                 %%mm5, %%mm0       \n\t"
-    "pmaddwd                 %%mm5, %%mm1       \n\t"
-    "packssdw                %%mm1, %%mm0       \n\t" // V1 V0 U1 U0
-    "psraw                      $7, %%mm0       \n\t"
+        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
+        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
+    }
+#endif /* HAVE_MMX */
+    assert(src1 == src2);
+}
 
-#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
-    "movq       12(%0, %%"REG_d"), %%mm4       \n\t"
-    "movq       18(%0, %%"REG_d"), %%mm2       \n\t"
-    "movq                   %%mm4, %%mm1       \n\t"
-    "movq                   %%mm2, %%mm3       \n\t"
-    "psrlq                    $24, %%mm4       \n\t"
-    "psrlq                    $24, %%mm2       \n\t"
-    PAVGB(%%mm1, %%mm4)
-    PAVGB(%%mm3, %%mm2)
-    "punpcklbw              %%mm7, %%mm4       \n\t"
-    "punpcklbw              %%mm7, %%mm2       \n\t"
-#else
-    "movd       12(%0, %%"REG_d"), %%mm4       \n\t"
-    "movd       15(%0, %%"REG_d"), %%mm2       \n\t"
-    "punpcklbw              %%mm7, %%mm4       \n\t"
-    "punpcklbw              %%mm7, %%mm2       \n\t"
-    "paddw                  %%mm2, %%mm4       \n\t"
-    "movd       18(%0, %%"REG_d"), %%mm5       \n\t"
-    "movd       21(%0, %%"REG_d"), %%mm2       \n\t"
-    "punpcklbw              %%mm7, %%mm5       \n\t"
-    "punpcklbw              %%mm7, %%mm2       \n\t"
-    "paddw                  %%mm5, %%mm2       \n\t"
-    "movq      "MANGLE(ff_w1111)", %%mm5       \n\t"
-    "psrlw                     $2, %%mm4       \n\t"
-    "psrlw                     $2, %%mm2       \n\t"
-#endif
-    "movq "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
-    "movq "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"
-
-    "pmaddwd                %%mm4, %%mm1       \n\t"
-    "pmaddwd                %%mm2, %%mm3       \n\t"
-    "pmaddwd                %%mm6, %%mm4       \n\t"
-    "pmaddwd                %%mm6, %%mm2       \n\t"
-#ifndef FAST_BGR2YV12
-    "psrad                     $8, %%mm4       \n\t"
-    "psrad                     $8, %%mm1       \n\t"
-    "psrad                     $8, %%mm2       \n\t"
-    "psrad                     $8, %%mm3       \n\t"
-#endif
-    "packssdw               %%mm2, %%mm4       \n\t"
-    "packssdw               %%mm3, %%mm1       \n\t"
-    "pmaddwd                %%mm5, %%mm4       \n\t"
-    "pmaddwd                %%mm5, %%mm1       \n\t"
-    "add                      $24, %%"REG_d"   \n\t"
-    "packssdw               %%mm1, %%mm4       \n\t" // V3 V2 U3 U2
-    "psraw                     $7, %%mm4       \n\t"
-
-    "movq                   %%mm0, %%mm1       \n\t"
-    "punpckldq              %%mm4, %%mm0       \n\t"
-    "punpckhdq              %%mm4, %%mm1       \n\t"
-    "packsswb               %%mm1, %%mm0       \n\t"
-    "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0    \n\t"
-
-    "movd                   %%mm0, (%1, %%"REG_a")  \n\t"
-    "punpckhdq              %%mm0, %%mm0            \n\t"
-    "movd                   %%mm0, (%2, %%"REG_a")  \n\t"
-    "add                       $4, %%"REG_a"        \n\t"
-    " js                       1b                   \n\t"
-    : : "r" (src1+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
-    : "%"REG_a, "%"REG_d
-    );
-#else
+static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
+{
     int i;
     for (i=0; i<width; i++)
     {
@@ -2087,120 +1840,17 @@ static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1
         int g= src1[6*i + 1] + src1[6*i + 4];
         int r= src1[6*i + 2] + src1[6*i + 5];
 
-        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
-        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
+        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
+        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
     }
-#endif /* HAVE_MMX */
     assert(src1 == src2);
 }
 
-static inline void RENAME(rgb16ToY)(uint8_t *dst, uint8_t *src, int width)
-{
-    int i;
-    for (i=0; i<width; i++)
-    {
-        int d= ((uint16_t*)src)[i];
-        int b= d&0x1F;
-        int g= (d>>5)&0x3F;
-        int r= (d>>11)&0x1F;
-
-        dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
-    }
-}
-
-static inline void RENAME(rgb16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
-{
-    int i;
-    assert(src1==src2);
-    for (i=0; i<width; i++)
-    {
-        int d0= ((uint32_t*)src1)[i];
-
-        int dl= (d0&0x07E0F81F);
-        int dh= ((d0>>5)&0x07C0F83F);
-
-        int dh2= (dh>>11) + (dh<<21);
-        int d= dh2 + dl;
-
-        int b= d&0x7F;
-        int r= (d>>11)&0x7F;
-        int g= d>>21;
-        dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
-        dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
-    }
-}
-
-static inline void RENAME(rgb15ToY)(uint8_t *dst, uint8_t *src, int width)
-{
-    int i;
-    for (i=0; i<width; i++)
-    {
-        int d= ((uint16_t*)src)[i];
-        int b= d&0x1F;
-        int g= (d>>5)&0x1F;
-        int r= (d>>10)&0x1F;
-
-        dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
-    }
-}
-
-static inline void RENAME(rgb15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
-{
-    int i;
-    assert(src1==src2);
-    for (i=0; i<width; i++)
-    {
-        int d0= ((uint32_t*)src1)[i];
-
-        int dl= (d0&0x03E07C1F);
-        int dh= ((d0>>5)&0x03E0F81F);
-
-        int dh2= (dh>>11) + (dh<<21);
-        int d= dh2 + dl;
-
-        int b= d&0x7F;
-        int r= (d>>10)&0x7F;
-        int g= d>>21;
-        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
-        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
-    }
-}
-
-
-static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
-{
-    int i;
-    for (i=0; i<width; i++)
-    {
-        int r=  ((uint32_t*)src)[i]&0xFF;
-        int g= (((uint32_t*)src)[i]>>8)&0xFF;
-        int b= (((uint32_t*)src)[i]>>16)&0xFF;
-
-        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
-    }
-}
-
-static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
-{
-    int i;
-    assert(src1==src2);
-    for (i=0; i<width; i++)
-    {
-        const int a= ((uint32_t*)src1)[2*i+0];
-        const int e= ((uint32_t*)src1)[2*i+1];
-        const int l= (a&0xFF00FF) + (e&0xFF00FF);
-        const int h= (a&0x00FF00) + (e&0x00FF00);
-        const int r=  l&0x3FF;
-        const int g=  h>>8;
-        const int b=  l>>16;
-
-        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
-        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
-    }
-}
-
-static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
+static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
 {
+#if HAVE_MMX
+    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
+#else
     int i;
     for (i=0; i<width; i++)
     {
@@ -2210,9 +1860,30 @@ static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
 
         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
     }
+#endif
 }
 
-static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
+static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
+{
+#if HAVE_MMX
+    assert(src1==src2);
+    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
+#else
+    int i;
+    assert(src1==src2);
+    for (i=0; i<width; i++)
+    {
+        int r= src1[3*i + 0];
+        int g= src1[3*i + 1];
+        int b= src1[3*i + 2];
+
+        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
+        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
+    }
+#endif
+}
+
+static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
 {
     int i;
     assert(src1==src2);
@@ -2222,78 +1893,13 @@ static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1
         int g= src1[6*i + 1] + src1[6*i + 4];
         int b= src1[6*i + 2] + src1[6*i + 5];
 
-        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
-        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
+        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
+        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
     }
 }
 
-static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
-{
-    int i;
-    for (i=0; i<width; i++)
-    {
-        int d= ((uint16_t*)src)[i];
-        int r= d&0x1F;
-        int g= (d>>5)&0x3F;
-        int b= (d>>11)&0x1F;
 
-        dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
-    }
-}
-
-static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
-{
-    int i;
-    assert(src1 == src2);
-    for (i=0; i<width; i++)
-    {
-        int d0= ((uint32_t*)src1)[i];
-
-        int dl= (d0&0x07E0F81F);
-        int d= dl + (((d0>>16) + (d0<<16))&0x07E0F81F);
-
-        int r= d&0x3F;
-        int b= (d>>11)&0x3F;
-        int g= d>>21;
-        dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
-        dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
-    }
-}
-
-static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
-{
-    int i;
-    for (i=0; i<width; i++)
-    {
-        int d= ((uint16_t*)src)[i];
-        int r= d&0x1F;
-        int g= (d>>5)&0x1F;
-        int b= (d>>10)&0x1F;
-
-        dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
-    }
-}
-
-static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
-{
-    int i;
-    assert(src1 == src2);
-    for (i=0; i<width; i++)
-    {
-        int d0= ((uint32_t*)src1)[i];
-
-        int dl= (d0&0x03E07C1F);
-        int d= dl + (((d0>>16) + (d0<<16))&0x03E07C1F);
-
-        int r= d&0x3F;
-        int b= (d>>10)&0x3F;
-        int g= d>>21;
-        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
-        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
-    }
-}
-
-static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, int width, uint32_t *pal)
+static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *pal)
 {
     int i;
     for (i=0; i<width; i++)
@@ -2304,7 +1910,7 @@ static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, int width, uint32_
     }
 }
 
-static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width, uint32_t *pal)
+static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *pal)
 {
     int i;
     assert(src1 == src2);
@@ -2317,11 +1923,31 @@ static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1,
     }
 }
 
+/* Unpack 1 bpp input (MSB = first pixel of each byte) to 8-bit luma; the bits
+ * are inverted first, so a set source bit gives 0 and a clear bit gives 255. */
+static inline void RENAME(monowhite2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
+{
+    long i;
+    /* per pixel, not per byte: the trailing width%8 pixels get converted too */
+    for (i=0; i<width; i++)
+        dst[i]= ((~src[i>>3]>>(7-(i&7)))&1)*255;
+}
+
+/* Unpack 1 bpp input (MSB = first pixel of each byte) to 8-bit luma: a set
+ * source bit gives 255 and a clear bit gives 0. */
+static inline void RENAME(monoblack2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
+{
+    long i;
+    /* per pixel, not per byte: the trailing width%8 pixels get converted too */
+    for (i=0; i<width; i++)
+        dst[i]= ((src[i>>3]>>(7-(i&7)))&1)*255;
+}
+
 // bilinear / bicubic scaling
 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
                                   int16_t *filter, int16_t *filterPos, long filterSize)
 {
-#ifdef HAVE_MMX
+#if HAVE_MMX
     assert(filterSize % 4 == 0 && filterSize>0);
     if (filterSize==4) // Always true for upscaling, sometimes for down, too.
     {
@@ -2329,12 +1955,11 @@ static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW
         filter-= counter*2;
         filterPos-= counter/2;
         dst-= counter/2;
-        asm volatile(
+        __asm__ volatile(
 #if defined(PIC)
         "push            %%"REG_b"              \n\t"
 #endif
         "pxor                %%mm7, %%mm7       \n\t"
-        "movq        "MANGLE(w02)", %%mm6       \n\t"
         "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
         "mov             %%"REG_a", %%"REG_BP"  \n\t"
         ASMALIGN(4)
@@ -2349,10 +1974,11 @@ static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW
         "punpcklbw           %%mm7, %%mm2       \n\t"
         "pmaddwd             %%mm1, %%mm0       \n\t"
         "pmaddwd             %%mm2, %%mm3       \n\t"
-        "psrad                  $8, %%mm0       \n\t"
-        "psrad                  $8, %%mm3       \n\t"
-        "packssdw            %%mm3, %%mm0       \n\t"
-        "pmaddwd             %%mm6, %%mm0       \n\t"
+        "movq                %%mm0, %%mm4       \n\t"
+        "punpckldq           %%mm3, %%mm0       \n\t"
+        "punpckhdq           %%mm3, %%mm4       \n\t"
+        "paddd               %%mm4, %%mm0       \n\t"
+        "psrad                  $7, %%mm0       \n\t"
         "packssdw            %%mm0, %%mm0       \n\t"
         "movd                %%mm0, (%4, %%"REG_BP")    \n\t"
         "add                    $4, %%"REG_BP"  \n\t"
@@ -2375,12 +2001,11 @@ static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW
         filter-= counter*4;
         filterPos-= counter/2;
         dst-= counter/2;
-        asm volatile(
+        __asm__ volatile(
 #if defined(PIC)
         "push             %%"REG_b"             \n\t"
 #endif
         "pxor                 %%mm7, %%mm7      \n\t"
-        "movq         "MANGLE(w02)", %%mm6      \n\t"
         "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
         "mov              %%"REG_a", %%"REG_BP" \n\t"
         ASMALIGN(4)
@@ -2406,11 +2031,11 @@ static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW
         "pmaddwd              %%mm2, %%mm5      \n\t"
         "paddd                %%mm4, %%mm0      \n\t"
         "paddd                %%mm5, %%mm3      \n\t"
-
-        "psrad                   $8, %%mm0      \n\t"
-        "psrad                   $8, %%mm3      \n\t"
-        "packssdw             %%mm3, %%mm0      \n\t"
-        "pmaddwd              %%mm6, %%mm0      \n\t"
+        "movq                 %%mm0, %%mm4      \n\t"
+        "punpckldq            %%mm3, %%mm0      \n\t"
+        "punpckhdq            %%mm3, %%mm4      \n\t"
+        "paddd                %%mm4, %%mm0      \n\t"
+        "psrad                   $7, %%mm0      \n\t"
         "packssdw             %%mm0, %%mm0      \n\t"
         "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
         "add                     $4, %%"REG_BP" \n\t"
@@ -2434,9 +2059,8 @@ static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW
         //filter-= counter*filterSize/2;
         filterPos-= counter/2;
         dst-= counter/2;
-        asm volatile(
+        __asm__ volatile(
         "pxor                  %%mm7, %%mm7     \n\t"
-        "movq          "MANGLE(w02)", %%mm6     \n\t"
         ASMALIGN(4)
         "1:                                     \n\t"
         "mov                      %2, %%"REG_c" \n\t"
@@ -2461,10 +2085,11 @@ static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW
         "cmp                      %4, %%"REG_c" \n\t"
         " jb                      2b            \n\t"
         "add                      %6, %1        \n\t"
-        "psrad                    $8, %%mm4     \n\t"
-        "psrad                    $8, %%mm5     \n\t"
-        "packssdw              %%mm5, %%mm4     \n\t"
-        "pmaddwd               %%mm6, %%mm4     \n\t"
+        "movq                  %%mm4, %%mm0     \n\t"
+        "punpckldq             %%mm5, %%mm4     \n\t"
+        "punpckhdq             %%mm5, %%mm0     \n\t"
+        "paddd                 %%mm0, %%mm4     \n\t"
+        "psrad                    $7, %%mm4     \n\t"
         "packssdw              %%mm4, %%mm4     \n\t"
         "mov                      %3, %%"REG_a" \n\t"
         "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
@@ -2478,7 +2103,7 @@ static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW
         );
     }
 #else
-#ifdef HAVE_ALTIVEC
+#if HAVE_ALTIVEC
     hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
 #else
     int i;
@@ -2494,76 +2119,96 @@ static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW
             val += ((int)src[srcPos + j])*filter[filterSize*i + j];
         }
         //filter += hFilterSize;
-        dst[i] = av_clip(val>>7, 0, (1<<15)-1); // the cubic equation does overflow ...
+        dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
         //dst[i] = val>>7;
     }
 #endif /* HAVE_ALTIVEC */
 #endif /* HAVE_MMX */
 }
       // *** horizontal scale Y line to temp buffer
-static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
+static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
                                    int flags, int canMMX2BeUsed, int16_t *hLumFilter,
                                    int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
                                    int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
-                                   int32_t *mmx2FilterPos, uint8_t *pal)
+                                   int32_t *mmx2FilterPos, uint32_t *pal)
 {
     if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
     {
-        RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
+        RENAME(yuy2ToY)(formatConvBuffer, src, srcW, pal);
         src= formatConvBuffer;
     }
     else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
     {
-        RENAME(uyvyToY)(formatConvBuffer, src, srcW);
+        RENAME(uyvyToY)(formatConvBuffer, src, srcW, pal);
         src= formatConvBuffer;
     }
     else if (srcFormat==PIX_FMT_RGB32)
     {
-        RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
+        RENAME(bgr32ToY)(formatConvBuffer, src, srcW, pal);
+        src= formatConvBuffer;
+    }
+    else if (srcFormat==PIX_FMT_RGB32_1)
+    {
+        RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
         src= formatConvBuffer;
     }
     else if (srcFormat==PIX_FMT_BGR24)
     {
-        RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
+        RENAME(bgr24ToY)(formatConvBuffer, src, srcW, pal);
         src= formatConvBuffer;
     }
     else if (srcFormat==PIX_FMT_BGR565)
     {
-        RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
+        RENAME(bgr16ToY)(formatConvBuffer, src, srcW, pal);
         src= formatConvBuffer;
     }
     else if (srcFormat==PIX_FMT_BGR555)
     {
-        RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
+        RENAME(bgr15ToY)(formatConvBuffer, src, srcW, pal);
         src= formatConvBuffer;
     }
     else if (srcFormat==PIX_FMT_BGR32)
     {
-        RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
+        RENAME(rgb32ToY)(formatConvBuffer, src, srcW, pal);
+        src= formatConvBuffer;
+    }
+    else if (srcFormat==PIX_FMT_BGR32_1)
+    {
+        RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
         src= formatConvBuffer;
     }
     else if (srcFormat==PIX_FMT_RGB24)
     {
-        RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
+        RENAME(rgb24ToY)(formatConvBuffer, src, srcW, pal);
         src= formatConvBuffer;
     }
     else if (srcFormat==PIX_FMT_RGB565)
     {
-        RENAME(rgb16ToY)(formatConvBuffer, src, srcW);
+        RENAME(rgb16ToY)(formatConvBuffer, src, srcW, pal);
         src= formatConvBuffer;
     }
     else if (srcFormat==PIX_FMT_RGB555)
     {
-        RENAME(rgb15ToY)(formatConvBuffer, src, srcW);
+        RENAME(rgb15ToY)(formatConvBuffer, src, srcW, pal);
         src= formatConvBuffer;
     }
     else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE  || srcFormat==PIX_FMT_RGB4_BYTE)
     {
-        RENAME(palToY)(formatConvBuffer, src, srcW, (uint32_t*)pal);
+        RENAME(palToY)(formatConvBuffer, src, srcW, pal);
+        src= formatConvBuffer;
+    }
+    else if (srcFormat==PIX_FMT_MONOBLACK)
+    {
+        RENAME(monoblack2Y)(formatConvBuffer, src, srcW, pal);
+        src= formatConvBuffer;
+    }
+    else if (srcFormat==PIX_FMT_MONOWHITE)
+    {
+        RENAME(monowhite2Y)(formatConvBuffer, src, srcW, pal);
         src= formatConvBuffer;
     }
 
-#ifdef HAVE_MMX
+#if HAVE_MMX
     // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
     if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
 #else
@@ -2574,15 +2219,15 @@ static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, i
     }
     else // fast bilinear upscale / crap downscale
     {
-#if defined(ARCH_X86)
-#ifdef HAVE_MMX2
+#if ARCH_X86
+#if HAVE_MMX2
         int i;
 #if defined(PIC)
         uint64_t ebxsave __attribute__((aligned(8)));
 #endif
         if (canMMX2BeUsed)
         {
-            asm volatile(
+            __asm__ volatile(
 #if defined(PIC)
             "mov               %%"REG_b", %5        \n\t"
 #endif
@@ -2596,7 +2241,7 @@ static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, i
             PREFETCH"      32(%%"REG_c")            \n\t"
             PREFETCH"      64(%%"REG_c")            \n\t"
 
-#ifdef ARCH_X86_64
+#if ARCH_X86_64
 
 #define FUNNY_Y_CODE \
             "movl            (%%"REG_b"), %%esi     \n\t"\
@@ -2647,7 +2292,7 @@ FUNNY_Y_CODE
         long xInc_shr16 = xInc >> 16;
         uint16_t xInc_mask = xInc & 0xffff;
         //NO MMX just normal asm ...
-        asm volatile(
+        __asm__ volatile(
         "xor %%"REG_a", %%"REG_a"            \n\t" // i
         "xor %%"REG_d", %%"REG_d"            \n\t" // xx
         "xorl    %%ecx, %%ecx                \n\t" // 2*xalpha
@@ -2686,7 +2331,7 @@ FUNNY_Y_CODE
         :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
         : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
         );
-#ifdef HAVE_MMX2
+#if HAVE_MMX2
         } //if MMX2 can't be used
 #endif
 #else
@@ -2699,88 +2344,143 @@ FUNNY_Y_CODE
             dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
             xpos+=xInc;
         }
-#endif /* defined(ARCH_X86) */
+#endif /* ARCH_X86 */
+    }
+
+    if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
+        int i;
+        //FIXME all pal and rgb srcFormats could do this conversion as well
+        //FIXME all scalers more complex than bilinear could do half of this transform
+        if(c->srcRange){
+            for (i=0; i<dstWidth; i++)
+                dst[i]= (dst[i]*14071 + 33561947)>>14;
+        }else{
+            for (i=0; i<dstWidth; i++)
+                dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
+        }
     }
 }
 
-inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
+inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
                                    int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
                                    int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
                                    int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
-                                   int32_t *mmx2FilterPos, uint8_t *pal)
+                                   int32_t *mmx2FilterPos, uint32_t *pal)
 {
     if (srcFormat==PIX_FMT_YUYV422)
     {
-        RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
+        RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
         src1= formatConvBuffer;
         src2= formatConvBuffer+VOFW;
     }
     else if (srcFormat==PIX_FMT_UYVY422)
     {
-        RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
+        RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
         src1= formatConvBuffer;
         src2= formatConvBuffer+VOFW;
     }
     else if (srcFormat==PIX_FMT_RGB32)
     {
-        RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
+        if(c->chrSrcHSubSample)
+            RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
+        else
+            RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
+        src1= formatConvBuffer;
+        src2= formatConvBuffer+VOFW;
+    }
+    else if (srcFormat==PIX_FMT_RGB32_1)
+    {
+        if(c->chrSrcHSubSample)
+            RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
+        else
+            RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
         src1= formatConvBuffer;
         src2= formatConvBuffer+VOFW;
     }
     else if (srcFormat==PIX_FMT_BGR24)
     {
-        RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
+        if(c->chrSrcHSubSample)
+            RENAME(bgr24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
+        else
+            RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
         src1= formatConvBuffer;
         src2= formatConvBuffer+VOFW;
     }
     else if (srcFormat==PIX_FMT_BGR565)
     {
-        RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
+        if(c->chrSrcHSubSample)
+            RENAME(bgr16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
+        else
+            RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
         src1= formatConvBuffer;
         src2= formatConvBuffer+VOFW;
     }
     else if (srcFormat==PIX_FMT_BGR555)
     {
-        RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
+        if(c->chrSrcHSubSample)
+            RENAME(bgr15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
+        else
+            RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
         src1= formatConvBuffer;
         src2= formatConvBuffer+VOFW;
     }
     else if (srcFormat==PIX_FMT_BGR32)
     {
-        RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
+        if(c->chrSrcHSubSample)
+            RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
+        else
+            RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
+        src1= formatConvBuffer;
+        src2= formatConvBuffer+VOFW;
+    }
+    else if (srcFormat==PIX_FMT_BGR32_1)
+    {
+        if(c->chrSrcHSubSample)
+            RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
+        else
+            RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
         src1= formatConvBuffer;
         src2= formatConvBuffer+VOFW;
     }
     else if (srcFormat==PIX_FMT_RGB24)
     {
-        RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
+        if(c->chrSrcHSubSample)
+            RENAME(rgb24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
+        else
+            RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
         src1= formatConvBuffer;
         src2= formatConvBuffer+VOFW;
     }
     else if (srcFormat==PIX_FMT_RGB565)
     {
-        RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
+        if(c->chrSrcHSubSample)
+            RENAME(rgb16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
+        else
+            RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
         src1= formatConvBuffer;
         src2= formatConvBuffer+VOFW;
     }
     else if (srcFormat==PIX_FMT_RGB555)
     {
-        RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
+        if(c->chrSrcHSubSample)
+            RENAME(rgb15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
+        else
+            RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
         src1= formatConvBuffer;
         src2= formatConvBuffer+VOFW;
     }
-    else if (isGray(srcFormat))
+    else if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
     {
         return;
     }
     else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE  || srcFormat==PIX_FMT_RGB4_BYTE)
     {
-        RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, (uint32_t*)pal);
+        RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
         src1= formatConvBuffer;
         src2= formatConvBuffer+VOFW;
     }
 
-#ifdef HAVE_MMX
+#if HAVE_MMX
     // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
     if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
 #else
@@ -2792,15 +2492,15 @@ inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1,
     }
     else // fast bilinear upscale / crap downscale
     {
-#if defined(ARCH_X86)
-#ifdef HAVE_MMX2
+#if ARCH_X86
+#if HAVE_MMX2
         int i;
 #if defined(PIC)
         uint64_t ebxsave __attribute__((aligned(8)));
 #endif
         if (canMMX2BeUsed)
         {
-            asm volatile(
+            __asm__ volatile(
 #if defined(PIC)
             "mov          %%"REG_b", %6         \n\t"
 #endif
@@ -2814,7 +2514,7 @@ inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1,
             PREFETCH" 32(%%"REG_c")             \n\t"
             PREFETCH" 64(%%"REG_c")             \n\t"
 
-#ifdef ARCH_X86_64
+#if ARCH_X86_64
 
 #define FUNNY_UV_CODE \
             "movl       (%%"REG_b"), %%esi      \n\t"\
@@ -2877,7 +2577,7 @@ FUNNY_UV_CODE
 #endif /* HAVE_MMX2 */
             long xInc_shr16 = (long) (xInc >> 16);
             uint16_t xInc_mask = xInc & 0xffff;
-            asm volatile(
+            __asm__ volatile(
             "xor %%"REG_a", %%"REG_a"               \n\t" // i
             "xor %%"REG_d", %%"REG_d"               \n\t" // xx
             "xorl    %%ecx, %%ecx                   \n\t" // 2*xalpha
@@ -2912,7 +2612,7 @@ FUNNY_UV_CODE
 
 /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
    which is needed to support GCC 4.0. */
-#if defined(ARCH_X86_64) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
+#if ARCH_X86_64 && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
             :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
 #else
             :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
@@ -2920,7 +2620,7 @@ FUNNY_UV_CODE
             "r" (src2)
             : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
             );
-#ifdef HAVE_MMX2
+#if HAVE_MMX2
         } //if MMX2 can't be used
 #endif
 #else
@@ -2938,7 +2638,23 @@ FUNNY_UV_CODE
             */
             xpos+=xInc;
         }
-#endif /* defined(ARCH_X86) */
+#endif /* ARCH_X86 */
+    }
+    if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
+        int i;
+        //FIXME all pal and rgb srcFormats could do this conversion as well
+        //FIXME all scalers more complex than bilinear could do half of this transform
+        if(c->srcRange){
+            for (i=0; i<dstWidth; i++){
+                dst[i     ]= (dst[i     ]*1799 + 4081085)>>11; //1469
+                dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
+            }
+        }else{
+            for (i=0; i<dstWidth; i++){
+                dst[i     ]= (FFMIN(dst[i     ],30775)*4663 - 9289992)>>12; //-264
+                dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
+            }
+        }
     }
 }
 
@@ -2981,7 +2697,7 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
     const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
     const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
     int lastDstY;
-    uint8_t *pal=NULL;
+    uint32_t *pal=c->pal_yuv;
 
     /* vars which will change and which we need to store back in the context */
     int dstY= c->dstY;
@@ -2991,7 +2707,6 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
     int lastInChrBuf= c->lastInChrBuf;
 
     if (isPacked(c->srcFormat)){
-        pal= src[1];
         src[0]=
         src[1]=
         src[2]= src[0];
@@ -3020,12 +2735,12 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
 
     if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
     {
-        static int firstTime=1; //FIXME move this into the context perhaps
-        if (flags & SWS_PRINT_INFO && firstTime)
+        static int warnedAlready=0; //FIXME move this into the context perhaps
+        if (flags & SWS_PRINT_INFO && !warnedAlready)
         {
             av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
                    "         ->cannot do aligned memory accesses anymore\n");
-            firstTime=0;
+            warnedAlready=1;
         }
     }
 
@@ -3075,7 +2790,7 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
                 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
                 assert(lastInLumBuf + 1 - srcSliceY >= 0);
                 //printf("%d %d\n", lumBufIndex, vLumBufSize);
-                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
+                RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                 funnyYCode, c->srcFormat, formatConvBuffer,
                                 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
@@ -3092,7 +2807,7 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
                 //FIXME replace parameters through context struct (some at least)
 
                 if (!(isGray(srcFormat) || isGray(dstFormat)))
-                    RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
+                    RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                     flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                                     funnyUVCode, c->srcFormat, formatConvBuffer,
                                     c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
@@ -3117,7 +2832,7 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
                 assert(lumBufIndex < 2*vLumBufSize);
                 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
                 assert(lastInLumBuf + 1 - srcSliceY >= 0);
-                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
+                RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                 funnyYCode, c->srcFormat, formatConvBuffer,
                                 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
@@ -3133,7 +2848,7 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
                 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
 
                 if (!(isGray(srcFormat) || isGray(dstFormat)))
-                    RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
+                    RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                             flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                             funnyUVCode, c->srcFormat, formatConvBuffer,
                             c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
@@ -3145,31 +2860,34 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
             break; //we can't output a dstY line so let's try with the next slice
         }
 
-#ifdef HAVE_MMX
-        b5Dither= ff_dither8[dstY&1];
-        g6Dither= ff_dither4[dstY&1];
-        g5Dither= ff_dither8[dstY&1];
-        r5Dither= ff_dither8[(dstY+1)&1];
+#if HAVE_MMX
+        c->blueDither= ff_dither8[dstY&1];
+        if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
+            c->greenDither= ff_dither8[dstY&1];
+        else
+            c->greenDither= ff_dither4[dstY&1];
+        c->redDither= ff_dither8[(dstY+1)&1];
 #endif
         if (dstY < dstH-2)
         {
             int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
             int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
-#ifdef HAVE_MMX
+#if HAVE_MMX
             int i;
         if (flags & SWS_ACCURATE_RND){
+            int s= APCK_SIZE / 8;
             for (i=0; i<vLumFilterSize; i+=2){
-                lumMmxFilter[2*i+0]= (int32_t)lumSrcPtr[i  ];
-                lumMmxFilter[2*i+1]= (int32_t)lumSrcPtr[i+(vLumFilterSize>1)];
-                lumMmxFilter[2*i+2]=
-                lumMmxFilter[2*i+3]= vLumFilter[dstY*vLumFilterSize + i    ]
+                *(void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
+                *(void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
+                          lumMmxFilter[s*i+APCK_COEF/4  ]=
+                          lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
                     + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
             }
             for (i=0; i<vChrFilterSize; i+=2){
-                chrMmxFilter[2*i+0]= (int32_t)chrSrcPtr[i  ];
-                chrMmxFilter[2*i+1]= (int32_t)chrSrcPtr[i+(vChrFilterSize>1)];
-                chrMmxFilter[2*i+2]=
-                chrMmxFilter[2*i+3]= vChrFilter[chrDstY*vChrFilterSize + i    ]
+                *(void**)&chrMmxFilter[s*i              ]= chrSrcPtr[i  ];
+                *(void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrSrcPtr[i+(vChrFilterSize>1)];
+                          chrMmxFilter[s*i+APCK_COEF/4  ]=
+                          chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i    ]
                     + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
             }
         }else{
@@ -3199,7 +2917,7 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
                     vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                     dest, uDest, dstW, chrDstW, dstFormat);
             }
-            else if (isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
+            else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 like
             {
                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
@@ -3224,8 +2942,15 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
                 if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
                 {
                     int chrAlpha= vChrFilter[2*dstY+1];
-                    RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
-                        dest, dstW, chrAlpha, dstFormat, flags, dstY);
+                    if(flags & SWS_FULL_CHR_H_INT){
+                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
+                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
+                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
+                            dest, dstW, dstY);
+                    }else{
+                        RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
+                            dest, dstW, chrAlpha, dstFormat, flags, dstY);
+                    }
                 }
                 else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
                 {
@@ -3235,15 +2960,29 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
                     lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
                     chrMmxFilter[2]=
                     chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
-                    RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
-                        dest, dstW, lumAlpha, chrAlpha, dstY);
+                    if(flags & SWS_FULL_CHR_H_INT){
+                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
+                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
+                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
+                            dest, dstW, dstY);
+                    }else{
+                        RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
+                            dest, dstW, lumAlpha, chrAlpha, dstY);
+                    }
                 }
                 else //general RGB
                 {
-                    RENAME(yuv2packedX)(c,
-                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
-                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
-                        dest, dstW, dstY);
+                    if(flags & SWS_FULL_CHR_H_INT){
+                        yuv2rgbXinC_full(c,
+                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
+                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
+                            dest, dstW, dstY);
+                    }else{
+                        RENAME(yuv2packedX)(c,
+                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
+                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
+                            dest, dstW, dstY);
+                    }
                 }
             }
         }
@@ -3259,7 +2998,7 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
                     vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                     dest, uDest, dstW, chrDstW, dstFormat);
             }
-            else if (isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
+            else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12
             {
                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
@@ -3272,17 +3011,24 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s
             {
                 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
-                yuv2packedXinC(c,
-                    vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
-                    vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
-                    dest, dstW, dstY);
+                if(flags & SWS_FULL_CHR_H_INT){
+                    yuv2rgbXinC_full(c,
+                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
+                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
+                        dest, dstW, dstY);
+                }else{
+                    yuv2packedXinC(c,
+                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
+                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
+                        dest, dstW, dstY);
+                }
             }
         }
     }
 
-#ifdef HAVE_MMX
-    asm volatile(SFENCE:::"memory");
-    asm volatile(EMMS:::"memory");
+#if HAVE_MMX
+    __asm__ volatile(SFENCE:::"memory");
+    __asm__ volatile(EMMS:::"memory");
 #endif
     /* store changed local vars back in the context */
     c->dstY= dstY;
diff --git a/src/add-ons/media/plugins/avcodec/libswscale/yuv2rgb.c b/src/add-ons/media/plugins/avcodec/libswscale/yuv2rgb.c
index 83d65c5f2a..65af412c2c 100644
--- a/src/add-ons/media/plugins/avcodec/libswscale/yuv2rgb.c
+++ b/src/add-ons/media/plugins/avcodec/libswscale/yuv2rgb.c
@@ -1,32 +1,27 @@
 /*
- * yuv2rgb.c, Software YUV to RGB converter
+ * software YUV to RGB converter
  *
- *  Copyright (C) 1999, Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ * Copyright (C) 2009 Konstantin Shishkov
  *
- *  Functions broken out from display_x11.c and several new modes
- *  added by Håkan Hjort <d95hjort@dtek.chalmers.se>
+ * MMX/MMX2 template stuff (needed for fast movntq support),
+ * 1,4,8bpp support and context / deglobalize stuff
+ * by Michael Niedermayer (michaelni@gmx.at)
  *
- *  15 & 16 bpp support by Franck Sicard <Franck.Sicard@solsoft.fr>
+ * This file is part of FFmpeg.
  *
- *  MMX/MMX2 template stuff (needed for fast movntq support),
- *  1,4,8bpp support and context / deglobalize stuff
- *  by Michael Niedermayer (michaelni@gmx.at)
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
  *
- *  This file is part of mpeg2dec, a free MPEG-2 video decoder
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
  *
- *  mpeg2dec is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2, or (at your option)
- *  any later version.
- *
- *  mpeg2dec is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with mpeg2dec; if not, write to the Free Software
- *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include <stdio.h>
@@ -41,148 +36,36 @@
 
 #define DITHER1XBPP // only for MMX
 
-const uint8_t  __attribute__((aligned(8))) dither_2x2_4[2][8]={
-{  1,   3,   1,   3,   1,   3,   1,   3, },
-{  2,   0,   2,   0,   2,   0,   2,   0, },
-};
+extern const uint8_t dither_8x8_32[8][8];
+extern const uint8_t dither_8x8_73[8][8];
+extern const uint8_t dither_8x8_220[8][8];
 
-const uint8_t  __attribute__((aligned(8))) dither_2x2_8[2][8]={
-{  6,   2,   6,   2,   6,   2,   6,   2, },
-{  0,   4,   0,   4,   0,   4,   0,   4, },
-};
-
-const uint8_t  __attribute__((aligned(8))) dither_8x8_32[8][8]={
-{ 17,   9,  23,  15,  16,   8,  22,  14, },
-{  5,  29,   3,  27,   4,  28,   2,  26, },
-{ 21,  13,  19,  11,  20,  12,  18,  10, },
-{  0,  24,   6,  30,   1,  25,   7,  31, },
-{ 16,   8,  22,  14,  17,   9,  23,  15, },
-{  4,  28,   2,  26,   5,  29,   3,  27, },
-{ 20,  12,  18,  10,  21,  13,  19,  11, },
-{  1,  25,   7,  31,   0,  24,   6,  30, },
-};
-
-#if 0
-const uint8_t  __attribute__((aligned(8))) dither_8x8_64[8][8]={
-{  0,  48,  12,  60,   3,  51,  15,  63, },
-{ 32,  16,  44,  28,  35,  19,  47,  31, },
-{  8,  56,   4,  52,  11,  59,   7,  55, },
-{ 40,  24,  36,  20,  43,  27,  39,  23, },
-{  2,  50,  14,  62,   1,  49,  13,  61, },
-{ 34,  18,  46,  30,  33,  17,  45,  29, },
-{ 10,  58,   6,  54,   9,  57,   5,  53, },
-{ 42,  26,  38,  22,  41,  25,  37,  21, },
-};
-#endif
-
-const uint8_t  __attribute__((aligned(8))) dither_8x8_73[8][8]={
-{  0,  55,  14,  68,   3,  58,  17,  72, },
-{ 37,  18,  50,  32,  40,  22,  54,  35, },
-{  9,  64,   5,  59,  13,  67,   8,  63, },
-{ 46,  27,  41,  23,  49,  31,  44,  26, },
-{  2,  57,  16,  71,   1,  56,  15,  70, },
-{ 39,  21,  52,  34,  38,  19,  51,  33, },
-{ 11,  66,   7,  62,  10,  65,   6,  60, },
-{ 48,  30,  43,  25,  47,  29,  42,  24, },
-};
-
-#if 0
-const uint8_t  __attribute__((aligned(8))) dither_8x8_128[8][8]={
-{ 68,  36,  92,  60,  66,  34,  90,  58, },
-{ 20, 116,  12, 108,  18, 114,  10, 106, },
-{ 84,  52,  76,  44,  82,  50,  74,  42, },
-{  0,  96,  24, 120,   6, 102,  30, 126, },
-{ 64,  32,  88,  56,  70,  38,  94,  62, },
-{ 16, 112,   8, 104,  22, 118,  14, 110, },
-{ 80,  48,  72,  40,  86,  54,  78,  46, },
-{  4, 100,  28, 124,   2,  98,  26, 122, },
-};
-#endif
-
-#if 1
-const uint8_t  __attribute__((aligned(8))) dither_8x8_220[8][8]={
-{117,  62, 158, 103, 113,  58, 155, 100, },
-{ 34, 199,  21, 186,  31, 196,  17, 182, },
-{144,  89, 131,  76, 141,  86, 127,  72, },
-{  0, 165,  41, 206,  10, 175,  52, 217, },
-{110,  55, 151,  96, 120,  65, 162, 107, },
-{ 28, 193,  14, 179,  38, 203,  24, 189, },
-{138,  83, 124,  69, 148,  93, 134,  79, },
-{  7, 172,  48, 213,   3, 168,  45, 210, },
-};
-#elif 1
-// tries to correct a gamma of 1.5
-const uint8_t  __attribute__((aligned(8))) dither_8x8_220[8][8]={
-{  0, 143,  18, 200,   2, 156,  25, 215, },
-{ 78,  28, 125,  64,  89,  36, 138,  74, },
-{ 10, 180,   3, 161,  16, 195,   8, 175, },
-{109,  51,  93,  38, 121,  60, 105,  47, },
-{  1, 152,  23, 210,   0, 147,  20, 205, },
-{ 85,  33, 134,  71,  81,  30, 130,  67, },
-{ 14, 190,   6, 171,  12, 185,   5, 166, },
-{117,  57, 101,  44, 113,  54,  97,  41, },
-};
-#elif 1
-// tries to correct a gamma of 2.0
-const uint8_t  __attribute__((aligned(8))) dither_8x8_220[8][8]={
-{  0, 124,   8, 193,   0, 140,  12, 213, },
-{ 55,  14, 104,  42,  66,  19, 119,  52, },
-{  3, 168,   1, 145,   6, 187,   3, 162, },
-{ 86,  31,  70,  21,  99,  39,  82,  28, },
-{  0, 134,  11, 206,   0, 129,   9, 200, },
-{ 62,  17, 114,  48,  58,  16, 109,  45, },
-{  5, 181,   2, 157,   4, 175,   1, 151, },
-{ 95,  36,  78,  26,  90,  34,  74,  24, },
-};
-#else
-// tries to correct a gamma of 2.5
-const uint8_t  __attribute__((aligned(8))) dither_8x8_220[8][8]={
-{  0, 107,   3, 187,   0, 125,   6, 212, },
-{ 39,   7,  86,  28,  49,  11, 102,  36, },
-{  1, 158,   0, 131,   3, 180,   1, 151, },
-{ 68,  19,  52,  12,  81,  25,  64,  17, },
-{  0, 119,   5, 203,   0, 113,   4, 195, },
-{ 45,   9,  96,  33,  42,   8,  91,  30, },
-{  2, 172,   1, 144,   2, 165,   0, 137, },
-{ 77,  23,  60,  15,  72,  21,  56,  14, },
-};
-#endif
-
-#ifdef HAVE_MMX
+#if HAVE_MMX && CONFIG_GPL
 
 /* hope these constant values are cache line aligned */
 DECLARE_ASM_CONST(8, uint64_t, mmx_00ffw)   = 0x00ff00ff00ff00ffULL;
 DECLARE_ASM_CONST(8, uint64_t, mmx_redmask) = 0xf8f8f8f8f8f8f8f8ULL;
 DECLARE_ASM_CONST(8, uint64_t, mmx_grnmask) = 0xfcfcfcfcfcfcfcfcULL;
 
-// The volatile is required because gcc otherwise optimizes some writes away
-// not knowing that these are read in the ASM block.
-static volatile uint64_t attribute_used __attribute__((aligned(8))) b5Dither;
-static volatile uint64_t attribute_used __attribute__((aligned(8))) g5Dither;
-static volatile uint64_t attribute_used __attribute__((aligned(8))) g6Dither;
-static volatile uint64_t attribute_used __attribute__((aligned(8))) r5Dither;
-
-#undef HAVE_MMX
-
 //MMX versions
 #undef RENAME
-#define HAVE_MMX
 #undef HAVE_MMX2
-#undef HAVE_3DNOW
+#undef HAVE_AMD3DNOW
+#define HAVE_MMX2 0
+#define HAVE_AMD3DNOW 0
 #define RENAME(a) a ## _MMX
 #include "yuv2rgb_template.c"
 
 //MMX2 versions
 #undef RENAME
-#define HAVE_MMX
-#define HAVE_MMX2
-#undef HAVE_3DNOW
+#undef HAVE_MMX2
+#define HAVE_MMX2 1
 #define RENAME(a) a ## _MMX2
 #include "yuv2rgb_template.c"
 
-#endif /* HAVE_MMX */
+#endif /* HAVE_MMX && CONFIG_GPL */
 
-const int32_t Inverse_Table_6_9[8][4] = {
+const int32_t ff_yuv2rgb_coeffs[8][4] = {
     {117504, 138453, 13954, 34903}, /* no sequence_display_extension */
     {117504, 138453, 13954, 34903}, /* ITU-R Rec. 709 (1990) */
     {104597, 132201, 25675, 53279}, /* unspecified */
@@ -193,73 +76,55 @@ const int32_t Inverse_Table_6_9[8][4] = {
     {117579, 136230, 16907, 35559}  /* SMPTE 240M (1987) */
 };
 
-#define RGB(i)                                      \
+#define LOADCHROMA(i)                               \
     U = pu[i];                                      \
     V = pv[i];                                      \
     r = (void *)c->table_rV[V];                     \
     g = (void *)(c->table_gU[U] + c->table_gV[V]);  \
     b = (void *)c->table_bU[U];
 
-#define DST1(i)                         \
-    Y = py_1[2*i];                      \
-    dst_1[2*i] = r[Y] + g[Y] + b[Y];    \
-    Y = py_1[2*i+1];                    \
-    dst_1[2*i+1] = r[Y] + g[Y] + b[Y];
+#define PUTRGB(dst,src,i,o) /* o is unused; luma must be read in pixel order */ \
+    Y = src[2*i];                    \
+    dst[2*i  ] = r[Y] + g[Y] + b[Y]; \
+    Y = src[2*i+1];                  \
+    dst[2*i+1] = r[Y] + g[Y] + b[Y];
 
-#define DST2(i)                         \
-    Y = py_2[2*i];                      \
-    dst_2[2*i] = r[Y] + g[Y] + b[Y];    \
-    Y = py_2[2*i+1];                    \
-    dst_2[2*i+1] = r[Y] + g[Y] + b[Y];
+#define PUTRGB24(dst,src,i)                                  \
+    Y = src[2*i];                                            \
+    dst[6*i+0] = r[Y]; dst[6*i+1] = g[Y]; dst[6*i+2] = b[Y]; \
+    Y = src[2*i+1];                                          \
+    dst[6*i+3] = r[Y]; dst[6*i+4] = g[Y]; dst[6*i+5] = b[Y];
 
-#define DST1RGB(i)                                                \
-    Y = py_1[2*i];                                                \
-    dst_1[6*i] = r[Y]; dst_1[6*i+1] = g[Y]; dst_1[6*i+2] = b[Y];  \
-    Y = py_1[2*i+1];                                              \
-    dst_1[6*i+3] = r[Y]; dst_1[6*i+4] = g[Y]; dst_1[6*i+5] = b[Y];
+#define PUTBGR24(dst,src,i)                                  \
+    Y = src[2*i];                                            \
+    dst[6*i+0] = b[Y]; dst[6*i+1] = g[Y]; dst[6*i+2] = r[Y]; \
+    Y = src[2*i+1];                                          \
+    dst[6*i+3] = b[Y]; dst[6*i+4] = g[Y]; dst[6*i+5] = r[Y];
 
-#define DST2RGB(i)                                                \
-    Y = py_2[2*i];                                                \
-    dst_2[6*i] = r[Y]; dst_2[6*i+1] = g[Y]; dst_2[6*i+2] = b[Y];  \
-    Y = py_2[2*i+1];                                              \
-    dst_2[6*i+3] = r[Y]; dst_2[6*i+4] = g[Y]; dst_2[6*i+5] = b[Y];
-
-#define DST1BGR(i)                                                \
-    Y = py_1[2*i];                                                \
-    dst_1[6*i] = b[Y]; dst_1[6*i+1] = g[Y]; dst_1[6*i+2] = r[Y];  \
-    Y = py_1[2*i+1];                                              \
-    dst_1[6*i+3] = b[Y]; dst_1[6*i+4] = g[Y]; dst_1[6*i+5] = r[Y];
-
-#define DST2BGR(i)                                                \
-    Y = py_2[2*i];                                                \
-    dst_2[6*i] = b[Y]; dst_2[6*i+1] = g[Y]; dst_2[6*i+2] = r[Y];  \
-    Y = py_2[2*i+1];                                              \
-    dst_2[6*i+3] = b[Y]; dst_2[6*i+4] = g[Y]; dst_2[6*i+5] = r[Y];
-
-#define PROLOG(func_name, dst_type) \
+#define YUV2RGBFUNC(func_name, dst_type) \
 static int func_name(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, \
                      int srcSliceH, uint8_t* dst[], int dstStride[]){\
     int y;\
 \
-    if (c->srcFormat == PIX_FMT_YUV422P){\
+    if (c->srcFormat == PIX_FMT_YUV422P) {\
         srcStride[1] *= 2;\
         srcStride[2] *= 2;\
     }\
-    for (y=0; y<srcSliceH; y+=2){\
-        dst_type *dst_1= (dst_type*)(dst[0] + (y+srcSliceY  )*dstStride[0]);\
-        dst_type *dst_2= (dst_type*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);\
+    for (y=0; y<srcSliceH; y+=2) {\
+        dst_type *dst_1 = (dst_type*)(dst[0] + (y+srcSliceY  )*dstStride[0]);\
+        dst_type *dst_2 = (dst_type*)(dst[0] + (y+srcSliceY+1)*dstStride[0]);\
         dst_type av_unused *r, *b;\
         dst_type *g;\
-        uint8_t *py_1= src[0] + y*srcStride[0];\
-        uint8_t *py_2= py_1 + srcStride[0];\
-        uint8_t *pu= src[1] + (y>>1)*srcStride[1];\
-        uint8_t *pv= src[2] + (y>>1)*srcStride[2];\
-        unsigned int h_size= c->dstW>>3;\
+        uint8_t *py_1 = src[0] + y*srcStride[0];\
+        uint8_t *py_2 = py_1 + srcStride[0];\
+        uint8_t *pu = src[1] + (y>>1)*srcStride[1];\
+        uint8_t *pv = src[2] + (y>>1)*srcStride[2];\
+        unsigned int h_size = c->dstW>>3;\
         while (h_size--) {\
             int av_unused U, V;\
             int Y;\
 
-#define EPILOG1(dst_delta)\
+#define ENDYUV2RGBLINE(dst_delta)\
             pu += 4;\
             pv += 4;\
             py_1 += 8;\
@@ -270,392 +135,354 @@ static int func_name(SwsContext *c, uint8_t* src[], int srcStride[], int srcSlic
         if (c->dstW & 4) {\
             int av_unused Y, U, V;\
 
-#define EPILOG2()\
+#define ENDYUV2RGBFUNC()\
         }\
     }\
     return srcSliceH;\
 }
 
-#define EPILOG(dst_delta)\
-    EPILOG1(dst_delta)\
-    EPILOG2()
+#define CLOSEYUV2RGBFUNC(dst_delta)\
+    ENDYUV2RGBLINE(dst_delta)\
+    ENDYUV2RGBFUNC()
 
-PROLOG(yuv2rgb_c_32, uint32_t)
-    RGB(0);
-    DST1(0);
-    DST2(0);
+YUV2RGBFUNC(yuv2rgb_c_32, uint32_t)
+    LOADCHROMA(0);
+    PUTRGB(dst_1,py_1,0,0);
+    PUTRGB(dst_2,py_2,0,1);
 
-    RGB(1);
-    DST2(1);
-    DST1(1);
+    LOADCHROMA(1);
+    PUTRGB(dst_2,py_2,1,1);
+    PUTRGB(dst_1,py_1,1,0);
+    LOADCHROMA(1);
+    PUTRGB(dst_2,py_2,1,1);
+    PUTRGB(dst_1,py_1,1,0);
 
-    RGB(2);
-    DST1(2);
-    DST2(2);
+    LOADCHROMA(2);
+    PUTRGB(dst_1,py_1,2,0);
+    PUTRGB(dst_2,py_2,2,1);
 
-    RGB(3);
-    DST2(3);
-    DST1(3);
-EPILOG1(8)
-    RGB(0);
-    DST1(0);
-    DST2(0);
+    LOADCHROMA(3);
+    PUTRGB(dst_2,py_2,3,1);
+    PUTRGB(dst_1,py_1,3,0);
+ENDYUV2RGBLINE(8)
+    LOADCHROMA(0);
+    PUTRGB(dst_1,py_1,0,0);
+    PUTRGB(dst_2,py_2,0,1);
 
-    RGB(1);
-    DST2(1);
-    DST1(1);
-EPILOG2()
+    LOADCHROMA(1);
+    PUTRGB(dst_2,py_2,1,1);
+    PUTRGB(dst_1,py_1,1,0);
+ENDYUV2RGBFUNC()
 
-PROLOG(yuv2rgb_c_24_rgb, uint8_t)
-    RGB(0);
-    DST1RGB(0);
-    DST2RGB(0);
+YUV2RGBFUNC(yuv2rgb_c_24_rgb, uint8_t)
+    LOADCHROMA(0);
+    PUTRGB24(dst_1,py_1,0);
+    PUTRGB24(dst_2,py_2,0);
 
-    RGB(1);
-    DST2RGB(1);
-    DST1RGB(1);
+    LOADCHROMA(1);
+    PUTRGB24(dst_2,py_2,1);
+    PUTRGB24(dst_1,py_1,1);
 
-    RGB(2);
-    DST1RGB(2);
-    DST2RGB(2);
+    LOADCHROMA(2);
+    PUTRGB24(dst_1,py_1,2);
+    PUTRGB24(dst_2,py_2,2);
 
-    RGB(3);
-    DST2RGB(3);
-    DST1RGB(3);
-EPILOG1(24)
-    RGB(0);
-    DST1RGB(0);
-    DST2RGB(0);
+    LOADCHROMA(3);
+    PUTRGB24(dst_2,py_2,3);
+    PUTRGB24(dst_1,py_1,3);
+ENDYUV2RGBLINE(24)
+    LOADCHROMA(0);
+    PUTRGB24(dst_1,py_1,0);
+    PUTRGB24(dst_2,py_2,0);
 
-    RGB(1);
-    DST2RGB(1);
-    DST1RGB(1);
-EPILOG2()
+    LOADCHROMA(1);
+    PUTRGB24(dst_2,py_2,1);
+    PUTRGB24(dst_1,py_1,1);
+ENDYUV2RGBFUNC()
 
 // only trivial mods from yuv2rgb_c_24_rgb
-PROLOG(yuv2rgb_c_24_bgr, uint8_t)
-    RGB(0);
-    DST1BGR(0);
-    DST2BGR(0);
+YUV2RGBFUNC(yuv2rgb_c_24_bgr, uint8_t)
+    LOADCHROMA(0);
+    PUTBGR24(dst_1,py_1,0);
+    PUTBGR24(dst_2,py_2,0);
 
-    RGB(1);
-    DST2BGR(1);
-    DST1BGR(1);
+    LOADCHROMA(1);
+    PUTBGR24(dst_2,py_2,1);
+    PUTBGR24(dst_1,py_1,1);
 
-    RGB(2);
-    DST1BGR(2);
-    DST2BGR(2);
+    LOADCHROMA(2);
+    PUTBGR24(dst_1,py_1,2);
+    PUTBGR24(dst_2,py_2,2);
 
-    RGB(3);
-    DST2BGR(3);
-    DST1BGR(3);
-EPILOG1(24)
-    RGB(0);
-    DST1BGR(0);
-    DST2BGR(0);
+    LOADCHROMA(3);
+    PUTBGR24(dst_2,py_2,3);
+    PUTBGR24(dst_1,py_1,3);
+ENDYUV2RGBLINE(24)
+    LOADCHROMA(0);
+    PUTBGR24(dst_1,py_1,0);
+    PUTBGR24(dst_2,py_2,0);
 
-    RGB(1);
-    DST2BGR(1);
-    DST1BGR(1);
-EPILOG2()
+    LOADCHROMA(1);
+    PUTBGR24(dst_2,py_2,1);
+    PUTBGR24(dst_1,py_1,1);
+ENDYUV2RGBFUNC()
 
 // This is exactly the same code as yuv2rgb_c_32 except for the types of
 // r, g, b, dst_1, dst_2
-PROLOG(yuv2rgb_c_16, uint16_t)
-    RGB(0);
-    DST1(0);
-    DST2(0);
+YUV2RGBFUNC(yuv2rgb_c_16, uint16_t)
+    LOADCHROMA(0);
+    PUTRGB(dst_1,py_1,0,0);
+    PUTRGB(dst_2,py_2,0,1);
 
-    RGB(1);
-    DST2(1);
-    DST1(1);
+    LOADCHROMA(1);
+    PUTRGB(dst_2,py_2,1,1);
+    PUTRGB(dst_1,py_1,1,0);
 
-    RGB(2);
-    DST1(2);
-    DST2(2);
+    LOADCHROMA(2);
+    PUTRGB(dst_1,py_1,2,0);
+    PUTRGB(dst_2,py_2,2,1);
 
-    RGB(3);
-    DST2(3);
-    DST1(3);
-EPILOG(8)
+    LOADCHROMA(3);
+    PUTRGB(dst_2,py_2,3,1);
+    PUTRGB(dst_1,py_1,3,0);
+CLOSEYUV2RGBFUNC(8)
 
 // This is exactly the same code as yuv2rgb_c_32 except for the types of
 // r, g, b, dst_1, dst_2
-PROLOG(yuv2rgb_c_8, uint8_t)
-    RGB(0);
-    DST1(0);
-    DST2(0);
+YUV2RGBFUNC(yuv2rgb_c_8, uint8_t)
+    LOADCHROMA(0);
+    PUTRGB(dst_1,py_1,0,0);
+    PUTRGB(dst_2,py_2,0,1);
 
-    RGB(1);
-    DST2(1);
-    DST1(1);
+    LOADCHROMA(1);
+    PUTRGB(dst_2,py_2,1,1);
+    PUTRGB(dst_1,py_1,1,0);
 
-    RGB(2);
-    DST1(2);
-    DST2(2);
+    LOADCHROMA(2);
+    PUTRGB(dst_1,py_1,2,0);
+    PUTRGB(dst_2,py_2,2,1);
 
-    RGB(3);
-    DST2(3);
-    DST1(3);
-EPILOG(8)
+    LOADCHROMA(3);
+    PUTRGB(dst_2,py_2,3,1);
+    PUTRGB(dst_1,py_1,3,0);
+CLOSEYUV2RGBFUNC(8)
 
 // r, g, b, dst_1, dst_2
-PROLOG(yuv2rgb_c_8_ordered_dither, uint8_t)
-    const uint8_t *d32= dither_8x8_32[y&7];
-    const uint8_t *d64= dither_8x8_73[y&7];
-#define DST1bpp8(i,o)                                               \
-    Y = py_1[2*i];                                                  \
-    dst_1[2*i]   = r[Y+d32[0+o]] + g[Y+d32[0+o]] + b[Y+d64[0+o]];   \
-    Y = py_1[2*i+1];                                                \
-    dst_1[2*i+1] = r[Y+d32[1+o]] + g[Y+d32[1+o]] + b[Y+d64[1+o]];
+YUV2RGBFUNC(yuv2rgb_c_8_ordered_dither, uint8_t)
+    const uint8_t *d32 = dither_8x8_32[y&7];
+    const uint8_t *d64 = dither_8x8_73[y&7];
+#define PUTRGB8(dst,src,i,o)                                    \
+    Y = src[2*i];                                               \
+    dst[2*i]   = r[Y+d32[0+o]] + g[Y+d32[0+o]] + b[Y+d64[0+o]]; \
+    Y = src[2*i+1];                                             \
+    dst[2*i+1] = r[Y+d32[1+o]] + g[Y+d32[1+o]] + b[Y+d64[1+o]];
 
-#define DST2bpp8(i,o)                                               \
-    Y = py_2[2*i];                                                  \
-    dst_2[2*i]   =  r[Y+d32[8+o]] + g[Y+d32[8+o]] + b[Y+d64[8+o]];  \
-    Y = py_2[2*i+1];                                                \
-    dst_2[2*i+1] =  r[Y+d32[9+o]] + g[Y+d32[9+o]] + b[Y+d64[9+o]];
+    LOADCHROMA(0);
+    PUTRGB8(dst_1,py_1,0,0);
+    PUTRGB8(dst_2,py_2,0,0+8);
 
+    LOADCHROMA(1);
+    PUTRGB8(dst_2,py_2,1,2+8);
+    PUTRGB8(dst_1,py_1,1,2);
 
-    RGB(0);
-    DST1bpp8(0,0);
-    DST2bpp8(0,0);
+    LOADCHROMA(2);
+    PUTRGB8(dst_1,py_1,2,4);
+    PUTRGB8(dst_2,py_2,2,4+8);
 
-    RGB(1);
-    DST2bpp8(1,2);
-    DST1bpp8(1,2);
-
-    RGB(2);
-    DST1bpp8(2,4);
-    DST2bpp8(2,4);
-
-    RGB(3);
-    DST2bpp8(3,6);
-    DST1bpp8(3,6);
-EPILOG(8)
+    LOADCHROMA(3);
+    PUTRGB8(dst_2,py_2,3,6+8);
+    PUTRGB8(dst_1,py_1,3,6);
+CLOSEYUV2RGBFUNC(8)
 
 
 // This is exactly the same code as yuv2rgb_c_32 except for the types of
 // r, g, b, dst_1, dst_2
-PROLOG(yuv2rgb_c_4, uint8_t)
+YUV2RGBFUNC(yuv2rgb_c_4, uint8_t)
     int acc;
-#define DST1_4(i)                   \
-    Y = py_1[2*i];                  \
+#define PUTRGB4(dst,src,i)          \
+    Y = src[2*i];                   \
     acc = r[Y] + g[Y] + b[Y];       \
-    Y = py_1[2*i+1];                \
+    Y = src[2*i+1];                 \
     acc |= (r[Y] + g[Y] + b[Y])<<4; \
-    dst_1[i] = acc;
+    dst[i] = acc;
 
-#define DST2_4(i)                   \
-    Y = py_2[2*i];                  \
-    acc = r[Y] + g[Y] + b[Y];       \
-    Y = py_2[2*i+1];                \
-    acc |= (r[Y] + g[Y] + b[Y])<<4; \
-    dst_2[i] = acc;
+    LOADCHROMA(0);
+    PUTRGB4(dst_1,py_1,0);
+    PUTRGB4(dst_2,py_2,0);
 
-    RGB(0);
-    DST1_4(0);
-    DST2_4(0);
+    LOADCHROMA(1);
+    PUTRGB4(dst_2,py_2,1);
+    PUTRGB4(dst_1,py_1,1);
 
-    RGB(1);
-    DST2_4(1);
-    DST1_4(1);
+    LOADCHROMA(2);
+    PUTRGB4(dst_1,py_1,2);
+    PUTRGB4(dst_2,py_2,2);
 
-    RGB(2);
-    DST1_4(2);
-    DST2_4(2);
+    LOADCHROMA(3);
+    PUTRGB4(dst_2,py_2,3);
+    PUTRGB4(dst_1,py_1,3);
+CLOSEYUV2RGBFUNC(4)
 
-    RGB(3);
-    DST2_4(3);
-    DST1_4(3);
-EPILOG(4)
-
-PROLOG(yuv2rgb_c_4_ordered_dither, uint8_t)
-    const uint8_t *d64= dither_8x8_73[y&7];
-    const uint8_t *d128=dither_8x8_220[y&7];
+YUV2RGBFUNC(yuv2rgb_c_4_ordered_dither, uint8_t)
+    const uint8_t *d64 =  dither_8x8_73[y&7];
+    const uint8_t *d128 = dither_8x8_220[y&7];
     int acc;
 
-#define DST1bpp4(i,o)                                             \
-    Y = py_1[2*i];                                                \
+#define PUTRGB4D(dst,src,i,o)                                     \
+    Y = src[2*i];                                                 \
     acc = r[Y+d128[0+o]] + g[Y+d64[0+o]] + b[Y+d128[0+o]];        \
-    Y = py_1[2*i+1];                                              \
+    Y = src[2*i+1];                                               \
     acc |= (r[Y+d128[1+o]] + g[Y+d64[1+o]] + b[Y+d128[1+o]])<<4;  \
-    dst_1[i]= acc;
+    dst[i]= acc;
 
-#define DST2bpp4(i,o)                                             \
-    Y = py_2[2*i];                                                \
-    acc =  r[Y+d128[8+o]] + g[Y+d64[8+o]] + b[Y+d128[8+o]];       \
-    Y = py_2[2*i+1];                                              \
-    acc |=  (r[Y+d128[9+o]] + g[Y+d64[9+o]] + b[Y+d128[9+o]])<<4; \
-    dst_2[i]= acc;
+    LOADCHROMA(0);
+    PUTRGB4D(dst_1,py_1,0,0);
+    PUTRGB4D(dst_2,py_2,0,0+8);
 
+    LOADCHROMA(1);
+    PUTRGB4D(dst_2,py_2,1,2+8);
+    PUTRGB4D(dst_1,py_1,1,2);
 
-    RGB(0);
-    DST1bpp4(0,0);
-    DST2bpp4(0,0);
+    LOADCHROMA(2);
+    PUTRGB4D(dst_1,py_1,2,4);
+    PUTRGB4D(dst_2,py_2,2,4+8);
 
-    RGB(1);
-    DST2bpp4(1,2);
-    DST1bpp4(1,2);
-
-    RGB(2);
-    DST1bpp4(2,4);
-    DST2bpp4(2,4);
-
-    RGB(3);
-    DST2bpp4(3,6);
-    DST1bpp4(3,6);
-EPILOG(4)
+    LOADCHROMA(3);
+    PUTRGB4D(dst_2,py_2,3,6+8);
+    PUTRGB4D(dst_1,py_1,3,6);
+CLOSEYUV2RGBFUNC(4)
 
 // This is exactly the same code as yuv2rgb_c_32 except for the types of
 // r, g, b, dst_1, dst_2
-PROLOG(yuv2rgb_c_4b, uint8_t)
-    RGB(0);
-    DST1(0);
-    DST2(0);
+YUV2RGBFUNC(yuv2rgb_c_4b, uint8_t)
+    LOADCHROMA(0);
+    PUTRGB(dst_1,py_1,0,0);
+    PUTRGB(dst_2,py_2,0,1);
 
-    RGB(1);
-    DST2(1);
-    DST1(1);
+    LOADCHROMA(1);
+    PUTRGB(dst_2,py_2,1,1);
+    PUTRGB(dst_1,py_1,1,0);
 
-    RGB(2);
-    DST1(2);
-    DST2(2);
+    LOADCHROMA(2);
+    PUTRGB(dst_1,py_1,2,0);
+    PUTRGB(dst_2,py_2,2,1);
 
-    RGB(3);
-    DST2(3);
-    DST1(3);
-EPILOG(8)
+    LOADCHROMA(3);
+    PUTRGB(dst_2,py_2,3,1);
+    PUTRGB(dst_1,py_1,3,0);
+CLOSEYUV2RGBFUNC(8)
 
-PROLOG(yuv2rgb_c_4b_ordered_dither, uint8_t)
-    const uint8_t *d64= dither_8x8_73[y&7];
-    const uint8_t *d128=dither_8x8_220[y&7];
+YUV2RGBFUNC(yuv2rgb_c_4b_ordered_dither, uint8_t)
+    const uint8_t *d64 =  dither_8x8_73[y&7];
+    const uint8_t *d128 = dither_8x8_220[y&7];
 
-#define DST1bpp4b(i,o)                                                \
-    Y = py_1[2*i];                                                    \
-    dst_1[2*i]   = r[Y+d128[0+o]] + g[Y+d64[0+o]] + b[Y+d128[0+o]];   \
-    Y = py_1[2*i+1];                                                  \
-    dst_1[2*i+1] = r[Y+d128[1+o]] + g[Y+d64[1+o]] + b[Y+d128[1+o]];
+#define PUTRGB4DB(dst,src,i,o)                                    \
+    Y = src[2*i];                                                 \
+    dst[2*i]   = r[Y+d128[0+o]] + g[Y+d64[0+o]] + b[Y+d128[0+o]]; \
+    Y = src[2*i+1];                                               \
+    dst[2*i+1] = r[Y+d128[1+o]] + g[Y+d64[1+o]] + b[Y+d128[1+o]];
 
-#define DST2bpp4b(i,o)                                                \
-    Y = py_2[2*i];                                                    \
-    dst_2[2*i]   =  r[Y+d128[8+o]] + g[Y+d64[8+o]] + b[Y+d128[8+o]];  \
-    Y = py_2[2*i+1];                                                  \
-    dst_2[2*i+1] =  r[Y+d128[9+o]] + g[Y+d64[9+o]] + b[Y+d128[9+o]];
+    LOADCHROMA(0);
+    PUTRGB4DB(dst_1,py_1,0,0);
+    PUTRGB4DB(dst_2,py_2,0,0+8);
 
+    LOADCHROMA(1);
+    PUTRGB4DB(dst_2,py_2,1,2+8);
+    PUTRGB4DB(dst_1,py_1,1,2);
 
-    RGB(0);
-    DST1bpp4b(0,0);
-    DST2bpp4b(0,0);
+    LOADCHROMA(2);
+    PUTRGB4DB(dst_1,py_1,2,4);
+    PUTRGB4DB(dst_2,py_2,2,4+8);
 
-    RGB(1);
-    DST2bpp4b(1,2);
-    DST1bpp4b(1,2);
+    LOADCHROMA(3);
+    PUTRGB4DB(dst_2,py_2,3,6+8);
+    PUTRGB4DB(dst_1,py_1,3,6);
+CLOSEYUV2RGBFUNC(8)
 
-    RGB(2);
-    DST1bpp4b(2,4);
-    DST2bpp4b(2,4);
-
-    RGB(3);
-    DST2bpp4b(3,6);
-    DST1bpp4b(3,6);
-EPILOG(8)
-
-PROLOG(yuv2rgb_c_1_ordered_dither, uint8_t)
-        const uint8_t *d128=dither_8x8_220[y&7];
-        char out_1=0, out_2=0;
+YUV2RGBFUNC(yuv2rgb_c_1_ordered_dither, uint8_t)
+        const uint8_t *d128 = dither_8x8_220[y&7];
+        char out_1 = 0, out_2 = 0;
         g= c->table_gU[128] + c->table_gV[128];
 
-#define DST1bpp1(i,o)               \
-    Y = py_1[2*i];                  \
-    out_1+= out_1 + g[Y+d128[0+o]]; \
-    Y = py_1[2*i+1];                \
-    out_1+= out_1 + g[Y+d128[1+o]];
+#define PUTRGB1(out,src,i,o)    \
+    Y = src[2*i];               \
+    out+= out + g[Y+d128[0+o]]; \
+    Y = src[2*i+1];             \
+    out+= out + g[Y+d128[1+o]];
 
-#define DST2bpp1(i,o)               \
-    Y = py_2[2*i];                  \
-    out_2+= out_2 + g[Y+d128[8+o]]; \
-    Y = py_2[2*i+1];                \
-    out_2+= out_2 + g[Y+d128[9+o]];
+    PUTRGB1(out_1,py_1,0,0);
+    PUTRGB1(out_2,py_2,0,0+8);
 
-    DST1bpp1(0,0);
-    DST2bpp1(0,0);
+    PUTRGB1(out_2,py_2,1,2+8);
+    PUTRGB1(out_1,py_1,1,2);
 
-    DST2bpp1(1,2);
-    DST1bpp1(1,2);
+    PUTRGB1(out_1,py_1,2,4);
+    PUTRGB1(out_2,py_2,2,4+8);
 
-    DST1bpp1(2,4);
-    DST2bpp1(2,4);
-
-    DST2bpp1(3,6);
-    DST1bpp1(3,6);
+    PUTRGB1(out_2,py_2,3,6+8);
+    PUTRGB1(out_1,py_1,3,6);
 
     dst_1[0]= out_1;
     dst_2[0]= out_2;
-EPILOG(1)
+CLOSEYUV2RGBFUNC(1)
 
-SwsFunc yuv2rgb_get_func_ptr (SwsContext *c)
+SwsFunc sws_yuv2rgb_get_func_ptr(SwsContext *c)
 {
-#if defined(HAVE_MMX2) || defined(HAVE_MMX)
-    if (c->flags & SWS_CPU_CAPS_MMX2){
-        switch(c->dstFormat){
+    SwsFunc t = NULL;
+#if (HAVE_MMX2 || HAVE_MMX) && CONFIG_GPL
+    if (c->flags & SWS_CPU_CAPS_MMX2) {
+        switch (c->dstFormat) {
         case PIX_FMT_RGB32:  return yuv420_rgb32_MMX2;
         case PIX_FMT_BGR24:  return yuv420_rgb24_MMX2;
-        case PIX_FMT_BGR565: return yuv420_rgb16_MMX2;
-        case PIX_FMT_BGR555: return yuv420_rgb15_MMX2;
+        case PIX_FMT_RGB565: return yuv420_rgb16_MMX2;
+        case PIX_FMT_RGB555: return yuv420_rgb15_MMX2;
         }
     }
-    if (c->flags & SWS_CPU_CAPS_MMX){
-        switch(c->dstFormat){
+    if (c->flags & SWS_CPU_CAPS_MMX) {
+        switch (c->dstFormat) {
         case PIX_FMT_RGB32:  return yuv420_rgb32_MMX;
         case PIX_FMT_BGR24:  return yuv420_rgb24_MMX;
-        case PIX_FMT_BGR565: return yuv420_rgb16_MMX;
-        case PIX_FMT_BGR555: return yuv420_rgb15_MMX;
+        case PIX_FMT_RGB565: return yuv420_rgb16_MMX;
+        case PIX_FMT_RGB555: return yuv420_rgb15_MMX;
         }
     }
 #endif
-#ifdef HAVE_VIS
-    {
-        SwsFunc t= yuv2rgb_init_vis(c);
-        if (t) return t;
-    }
+#if HAVE_VIS
+    t = sws_yuv2rgb_init_vis(c);
 #endif
-#ifdef CONFIG_MLIB
-    {
-        SwsFunc t= yuv2rgb_init_mlib(c);
-        if (t) return t;
-    }
+#if CONFIG_MLIB
+    if (!t) t = sws_yuv2rgb_init_mlib(c); /* keep a converter found by an earlier backend */
 #endif
-#ifdef HAVE_ALTIVEC
+#if HAVE_ALTIVEC && CONFIG_GPL
     if (c->flags & SWS_CPU_CAPS_ALTIVEC)
-    {
-        SwsFunc t = yuv2rgb_init_altivec(c);
-        if (t) return t;
-    }
+        if (!t) t = sws_yuv2rgb_init_altivec(c); /* first non-NULL result wins */
 #endif
 
-#ifdef ARCH_BFIN
+#if ARCH_BFIN
     if (c->flags & SWS_CPU_CAPS_BFIN)
-    {
-        SwsFunc t = ff_bfin_yuv2rgb_get_func_ptr (c);
-        if (t) return t;
-    }
+        if (!t) t = sws_ff_bfin_yuv2rgb_get_func_ptr(c); /* first non-NULL result wins */
 #endif
 
+    if (t)
+        return t;
+
     av_log(c, AV_LOG_WARNING, "No accelerated colorspace conversion found.\n");
 
-    switch(c->dstFormat){
+    switch (c->dstFormat) {
+    case PIX_FMT_BGR32_1:
+    case PIX_FMT_RGB32_1:
     case PIX_FMT_BGR32:
-    case PIX_FMT_RGB32: return yuv2rgb_c_32;
-    case PIX_FMT_RGB24: return yuv2rgb_c_24_rgb;
-    case PIX_FMT_BGR24: return yuv2rgb_c_24_bgr;
+    case PIX_FMT_RGB32:      return yuv2rgb_c_32;
+    case PIX_FMT_RGB24:      return yuv2rgb_c_24_rgb;
+    case PIX_FMT_BGR24:      return yuv2rgb_c_24_bgr;
     case PIX_FMT_RGB565:
     case PIX_FMT_BGR565:
     case PIX_FMT_RGB555:
-    case PIX_FMT_BGR555: return yuv2rgb_c_16;
+    case PIX_FMT_BGR555:     return yuv2rgb_c_16;
     case PIX_FMT_RGB8:
-    case PIX_FMT_BGR8:  return yuv2rgb_c_8_ordered_dither;
+    case PIX_FMT_BGR8:       return yuv2rgb_c_8_ordered_dither;
     case PIX_FMT_RGB4:
-    case PIX_FMT_BGR4:  return yuv2rgb_c_4_ordered_dither;
+    case PIX_FMT_BGR4:       return yuv2rgb_c_4_ordered_dither;
     case PIX_FMT_RGB4_BYTE:
     case PIX_FMT_BGR4_BYTE:  return yuv2rgb_c_4b_ordered_dither;
     case PIX_FMT_MONOBLACK:  return yuv2rgb_c_1_ordered_dither;
@@ -665,29 +492,49 @@ SwsFunc yuv2rgb_get_func_ptr (SwsContext *c)
     return NULL;
 }
 
-static int div_round (int dividend, int divisor)
+static void fill_table(uint8_t* table[256], const int elemsize, const int inc, uint8_t *y_table)
 {
-    if (dividend > 0)
-        return (dividend + (divisor>>1)) / divisor;
-    else
-        return -((-dividend + (divisor>>1)) / divisor);
+    int i;
+    int64_t cb = 0;
+
+    y_table -= elemsize * (inc >> 9);
+
+    for (i = 0; i < 256; i++) {
+        table[i] = y_table + elemsize * (cb >> 16);
+        cb += inc;
+    }
 }
 
-int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation)
+static void fill_gv_table(int table[256], const int elemsize, const int inc)
 {
-    const int isRgb = isBGR(c->dstFormat);
-    const int bpp = fmt_depth(c->dstFormat);
     int i;
-    uint8_t table_Y[1024];
-    uint32_t *table_32 = 0;
-    uint16_t *table_16 = 0;
-    uint8_t *table_8 = 0;
-    uint8_t *table_332 = 0;
-    uint8_t *table_121 = 0;
-    uint8_t *table_1 = 0;
-    int entry_size = 0;
-    void *table_r = 0, *table_g = 0, *table_b = 0;
-    void *table_start;
+    int64_t cb = 0;
+    int off = -(inc >> 9);
+
+    for (i = 0; i < 256; i++) {
+        table[i] = elemsize * (off + (cb >> 16));
+        cb += inc;
+    }
+}
+
+av_cold int sws_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4], int fullRange,
+                                      int brightness, int contrast, int saturation)
+{
+    const int isRgb =      c->dstFormat==PIX_FMT_RGB32
+                        || c->dstFormat==PIX_FMT_RGB32_1
+                        || c->dstFormat==PIX_FMT_BGR24
+                        || c->dstFormat==PIX_FMT_RGB565
+                        || c->dstFormat==PIX_FMT_RGB555
+                        || c->dstFormat==PIX_FMT_RGB8
+                        || c->dstFormat==PIX_FMT_RGB4
+                        || c->dstFormat==PIX_FMT_RGB4_BYTE
+                        || c->dstFormat==PIX_FMT_MONOBLACK;
+    const int bpp = fmt_depth(c->dstFormat);
+    uint8_t *y_table;
+    uint16_t *y_table16;
+    uint32_t *y_table32;
+    int i, base, rbase, gbase, bbase, abase;
+    const int yoffs = fullRange ? 384 : 326;
 
     int64_t crv =  inv_table[0];
     int64_t cbu =  inv_table[1];
@@ -696,186 +543,142 @@ int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange,
     int64_t cy  = 1<<16;
     int64_t oy  = 0;
 
-//printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv);
-    if (!fullRange){
-        cy= (cy*255) / 219;
-        oy= 16<<16;
-    }else{
-        crv= (crv*224) / 255;
-        cbu= (cbu*224) / 255;
-        cgu= (cgu*224) / 255;
-        cgv= (cgv*224) / 255;
+    int64_t yb = 0;
+
+    if (!fullRange) {
+        cy = (cy*255) / 219;
+        oy = 16<<16;
+    } else {
+        crv = (crv*224) / 255;
+        cbu = (cbu*224) / 255;
+        cgu = (cgu*224) / 255;
+        cgv = (cgv*224) / 255;
     }
 
-    cy = (cy *contrast             )>>16;
-    crv= (crv*contrast * saturation)>>32;
-    cbu= (cbu*contrast * saturation)>>32;
-    cgu= (cgu*contrast * saturation)>>32;
-    cgv= (cgv*contrast * saturation)>>32;
-//printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv);
+    cy  = (cy *contrast             ) >> 16;
+    crv = (crv*contrast * saturation) >> 32;
+    cbu = (cbu*contrast * saturation) >> 32;
+    cgu = (cgu*contrast * saturation) >> 32;
+    cgv = (cgv*contrast * saturation) >> 32;
     oy -= 256*brightness;
 
-    for (i = 0; i < 1024; i++) {
-        int j;
+    //divide the chroma coefficients by cy (numerator pre-shifted left by 16 bits)
+    crv = ((crv << 16) + 0x8000) / cy;
+    cbu = ((cbu << 16) + 0x8000) / cy;
+    cgu = ((cgu << 16) + 0x8000) / cy;
+    cgv = ((cgv << 16) + 0x8000) / cy;
 
-        j= (cy*(((i - 384)<<16) - oy) + (1<<31))>>32;
-        j = (j < 0) ? 0 : ((j > 255) ? 255 : j);
-        table_Y[i] = j;
-    }
+    av_free(c->yuvTable);
 
     switch (bpp) {
-    case 32:
-        table_start= table_32 = av_malloc ((197 + 2*682 + 256 + 132) * sizeof (uint32_t));
-
-        entry_size = sizeof (uint32_t);
-        table_r = table_32 + 197;
-        table_b = table_32 + 197 + 685;
-        table_g = table_32 + 197 + 2*682;
-
-        for (i = -197; i < 256+197; i++)
-            ((uint32_t *)table_r)[i] = table_Y[i+384] << (isRgb ? 16 : 0);
-        for (i = -132; i < 256+132; i++)
-            ((uint32_t *)table_g)[i] = table_Y[i+384] << 8;
-        for (i = -232; i < 256+232; i++)
-            ((uint32_t *)table_b)[i] = table_Y[i+384] << (isRgb ? 0 : 16);
-        break;
-
-    case 24:
-        table_start= table_8 = av_malloc ((256 + 2*232) * sizeof (uint8_t));
-
-        entry_size = sizeof (uint8_t);
-        table_r = table_g = table_b = table_8 + 232;
-
-        for (i = -232; i < 256+232; i++)
-            ((uint8_t * )table_b)[i] = table_Y[i+384];
-        break;
-
-    case 15:
-    case 16:
-        table_start= table_16 = av_malloc ((197 + 2*682 + 256 + 132) * sizeof (uint16_t));
-
-        entry_size = sizeof (uint16_t);
-        table_r = table_16 + 197;
-        table_b = table_16 + 197 + 685;
-        table_g = table_16 + 197 + 2*682;
-
-        for (i = -197; i < 256+197; i++) {
-            int j = table_Y[i+384] >> 3;
-
-            if (isRgb)
-                j <<= ((bpp==16) ? 11 : 10);
-
-            ((uint16_t *)table_r)[i] = j;
-        }
-        for (i = -132; i < 256+132; i++) {
-            int j = table_Y[i+384] >> ((bpp==16) ? 2 : 3);
-
-            ((uint16_t *)table_g)[i] = j << 5;
-        }
-        for (i = -232; i < 256+232; i++) {
-            int j = table_Y[i+384] >> 3;
-
-            if (!isRgb)
-                j <<= ((bpp==16) ? 11 : 10);
-
-            ((uint16_t *)table_b)[i] = j;
-        }
-        break;
-
-    case 8:
-        table_start= table_332 = av_malloc ((197 + 2*682 + 256 + 132) * sizeof (uint8_t));
-
-        entry_size = sizeof (uint8_t);
-        table_r = table_332 + 197;
-        table_b = table_332 + 197 + 685;
-        table_g = table_332 + 197 + 2*682;
-
-        for (i = -197; i < 256+197; i++) {
-            int j = (table_Y[i+384 - 16] + 18)/36;
-
-            if (isRgb)
-                j <<= 5;
-
-            ((uint8_t *)table_r)[i] = j;
-        }
-        for (i = -132; i < 256+132; i++) {
-            int j = (table_Y[i+384 - 16] + 18)/36;
-
-            if (!isRgb)
-                j <<= 1;
-
-            ((uint8_t *)table_g)[i] = j << 2;
-        }
-        for (i = -232; i < 256+232; i++) {
-            int j = (table_Y[i+384 - 37] + 43)/85;
-
-            if (!isRgb)
-                j <<= 6;
-
-            ((uint8_t *)table_b)[i] = j;
+    case 1:
+        c->yuvTable = av_malloc(1024);
+        y_table = c->yuvTable;
+        yb = -(384<<16) - oy;
+        for (i = 0; i < 1024-110; i++) {
+            y_table[i+110] = av_clip_uint8((yb + 0x8000) >> 16) >> 7;
+            yb += cy;
         }
+        fill_table(c->table_gU, 1, cgu, y_table + yoffs);
+        fill_gv_table(c->table_gV, 1, cgv);
         break;
     case 4:
     case 4|128:
-        table_start= table_121 = av_malloc ((197 + 2*682 + 256 + 132) * sizeof (uint8_t));
-
-        entry_size = sizeof (uint8_t);
-        table_r = table_121 + 197;
-        table_b = table_121 + 197 + 685;
-        table_g = table_121 + 197 + 2*682;
-
-        for (i = -197; i < 256+197; i++) {
-            int j = table_Y[i+384 - 110] >> 7;
-
-            if (isRgb)
-                j <<= 3;
-
-            ((uint8_t *)table_r)[i] = j;
-        }
-        for (i = -132; i < 256+132; i++) {
-            int j = (table_Y[i+384 - 37]+ 43)/85;
-
-            ((uint8_t *)table_g)[i] = j << 1;
-        }
-        for (i = -232; i < 256+232; i++) {
-            int j =table_Y[i+384 - 110] >> 7;
-
-            if (!isRgb)
-                j <<= 3;
-
-            ((uint8_t *)table_b)[i] = j;
+        rbase = isRgb ? 3 : 0;
+        gbase = 1;
+        bbase = isRgb ? 0 : 3;
+        c->yuvTable = av_malloc(1024*3);
+        y_table = c->yuvTable;
+        yb = -(384<<16) - oy;
+        for (i = 0; i < 1024-110; i++) {
+            int yval = av_clip_uint8((yb + 0x8000) >> 16);
+            y_table[i+110     ] =  (yval >> 7)       << rbase;
+            y_table[i+ 37+1024] = ((yval + 43) / 85) << gbase;
+            y_table[i+110+2048] =  (yval >> 7)       << bbase;
+            yb += cy;
         }
+        fill_table(c->table_rV, 1, crv, y_table + yoffs);
+        fill_table(c->table_gU, 1, cgu, y_table + yoffs + 1024);
+        fill_table(c->table_bU, 1, cbu, y_table + yoffs + 2048);
+        fill_gv_table(c->table_gV, 1, cgv);
         break;
-
-    case 1:
-        table_start= table_1 = av_malloc (256*2 * sizeof (uint8_t));
-
-        entry_size = sizeof (uint8_t);
-        table_g = table_1;
-        table_r = table_b = NULL;
-
-        for (i = 0; i < 256+256; i++) {
-            int j = table_Y[i + 384 - 110]>>7;
-
-            ((uint8_t *)table_g)[i] = j;
+    case 8:
+        rbase = isRgb ? 5 : 0;
+        gbase = isRgb ? 2 : 3;
+        bbase = isRgb ? 0 : 6;
+        c->yuvTable = av_malloc(1024*3);
+        y_table = c->yuvTable;
+        yb = -(384<<16) - oy;
+        for (i = 0; i < 1024-38; i++) {
+            int yval = av_clip_uint8((yb + 0x8000) >> 16);
+            y_table[i+16     ] = ((yval + 18) / 36) << rbase;
+            y_table[i+16+1024] = ((yval + 18) / 36) << gbase;
+            y_table[i+37+2048] = ((yval + 43) / 85) << bbase;
+            yb += cy;
         }
+        fill_table(c->table_rV, 1, crv, y_table + yoffs);
+        fill_table(c->table_gU, 1, cgu, y_table + yoffs + 1024);
+        fill_table(c->table_bU, 1, cbu, y_table + yoffs + 2048);
+        fill_gv_table(c->table_gV, 1, cgv);
+        break;
+    case 15:
+    case 16:
+        rbase = isRgb ? bpp - 5 : 0;
+        gbase = 5;
+        bbase = isRgb ? 0 : (bpp - 5);
+        c->yuvTable = av_malloc(1024*3*2);
+        y_table16 = c->yuvTable;
+        yb = -(384<<16) - oy;
+        for (i = 0; i < 1024; i++) {
+            uint8_t yval = av_clip_uint8((yb + 0x8000) >> 16);
+            y_table16[i     ] = (yval >> 3)          << rbase;
+            y_table16[i+1024] = (yval >> (18 - bpp)) << gbase;
+            y_table16[i+2048] = (yval >> 3)          << bbase;
+            yb += cy;
+        }
+        fill_table(c->table_rV, 2, crv, y_table16 + yoffs);
+        fill_table(c->table_gU, 2, cgu, y_table16 + yoffs + 1024);
+        fill_table(c->table_bU, 2, cbu, y_table16 + yoffs + 2048);
+        fill_gv_table(c->table_gV, 2, cgv);
+        break;
+    case 24:
+        c->yuvTable = av_malloc(1024);
+        y_table = c->yuvTable;
+        yb = -(384<<16) - oy;
+        for (i = 0; i < 1024; i++) {
+            y_table[i] = av_clip_uint8((yb + 0x8000) >> 16);
+            yb += cy;
+        }
+        fill_table(c->table_rV, 1, crv, y_table + yoffs);
+        fill_table(c->table_gU, 1, cgu, y_table + yoffs);
+        fill_table(c->table_bU, 1, cbu, y_table + yoffs);
+        fill_gv_table(c->table_gV, 1, cgv);
+        break;
+    case 32:
+        base = (c->dstFormat == PIX_FMT_RGB32_1 || c->dstFormat == PIX_FMT_BGR32_1) ? 8 : 0;
+        rbase = base + (isRgb ? 16 : 0);
+        gbase = base + 8;
+        bbase = base + (isRgb ? 0 : 16);
+        abase = (base + 24) & 31;
+        c->yuvTable = av_malloc(1024*3*4);
+        y_table32 = c->yuvTable;
+        yb = -(384<<16) - oy;
+        for (i = 0; i < 1024; i++) { /* shifts are done unsigned: counts reach 24, so bit 31 may be set */
+            uint8_t yval = av_clip_uint8((yb + 0x8000) >> 16);
+            y_table32[i     ] = ((uint32_t)yval << rbase) + (255u << abase);
+            y_table32[i+1024] = (uint32_t)yval << gbase;
+            y_table32[i+2048] = (uint32_t)yval << bbase;
+            yb += cy;
+        }
+        fill_table(c->table_rV, 4, crv, y_table32 + yoffs);
+        fill_table(c->table_gU, 4, cgu, y_table32 + yoffs + 1024);
+        fill_table(c->table_bU, 4, cbu, y_table32 + yoffs + 2048);
+        fill_gv_table(c->table_gV, 4, cgv);
         break;
-
     default:
-        table_start= NULL;
+        c->yuvTable = NULL;
         av_log(c, AV_LOG_ERROR, "%ibpp not supported by yuv2rgb\n", bpp);
-        //free mem?
         return -1;
     }
-
-    for (i = 0; i < 256; i++) {
-        c->table_rV[i] = (uint8_t *)table_r + entry_size * div_round (crv * (i-128), 76309);
-        c->table_gU[i] = (uint8_t *)table_g + entry_size * div_round (cgu * (i-128), 76309);
-        c->table_gV[i] = entry_size * div_round (cgv * (i-128), 76309);
-        c->table_bU[i] = (uint8_t *)table_b + entry_size * div_round (cbu * (i-128), 76309);
-    }
-
-    av_free(c->yuvTable);
-    c->yuvTable= table_start;
     return 0;
 }
diff --git a/src/add-ons/media/plugins/avcodec/libswscale/yuv2rgb_altivec.c b/src/add-ons/media/plugins/avcodec/libswscale/yuv2rgb_altivec.c
index 43d224edfd..b3a87a0360 100644
--- a/src/add-ons/media/plugins/avcodec/libswscale/yuv2rgb_altivec.c
+++ b/src/add-ons/media/plugins/avcodec/libswscale/yuv2rgb_altivec.c
@@ -5,18 +5,18 @@
  *
  * This file is part of FFmpeg.
  *
- * FFmpeg is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with FFmpeg; if not, write to the Free Software
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -91,9 +91,6 @@ adjustment.
 #include <inttypes.h>
 #include <assert.h>
 #include "config.h"
-#ifdef HAVE_MALLOC_H
-#include <malloc.h>
-#endif
 #include "rgb2rgb.h"
 #include "swscale.h"
 #include "swscale_internal.h"
@@ -154,7 +151,7 @@ const vector unsigned char
 
 #define vec_merge3(x2,x1,x0,y0,y1,y2)       \
 do {                                        \
-    typeof(x0) o0,o2,o3;                    \
+    __typeof__(x0) o0,o2,o3;                \
         o0 = vec_mergeh (x0,x1);            \
         y0 = vec_perm (o0, x2, perm_rgb_0); \
         o2 = vec_perm (o0, x2, perm_rgb_1); \
@@ -165,7 +162,7 @@ do {                                        \
 
 #define vec_mstbgr24(x0,x1,x2,ptr)      \
 do {                                    \
-    typeof(x0) _0,_1,_2;                \
+    __typeof__(x0) _0,_1,_2;            \
     vec_merge3 (x0,x1,x2,_0,_1,_2);     \
     vec_st (_0, 0, ptr++);              \
     vec_st (_1, 0, ptr++);              \
@@ -174,7 +171,7 @@ do {                                    \
 
 #define vec_mstrgb24(x0,x1,x2,ptr)      \
 do {                                    \
-    typeof(x0) _0,_1,_2;                \
+    __typeof__(x0) _0,_1,_2;            \
     vec_merge3 (x2,x1,x0,_0,_1,_2);     \
     vec_st (_0, 0, ptr++);              \
     vec_st (_1, 0, ptr++);              \
@@ -222,12 +219,12 @@ do {                                                                          \
 
 #define vec_unh(x) \
     (vector signed short) \
-        vec_perm(x,(typeof(x)){0}, \
+        vec_perm(x,(__typeof__(x)){0}, \
                  ((vector unsigned char){0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
                                          0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07}))
 #define vec_unl(x) \
     (vector signed short) \
-        vec_perm(x,(typeof(x)){0}, \
+        vec_perm(x,(__typeof__(x)){0}, \
                  ((vector unsigned char){0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
                                          0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F}))
 
@@ -240,7 +237,7 @@ do {                                                                          \
         ((vector unsigned short)vec_max (x,((vector signed short) {0})), \
          (vector unsigned short)vec_max (y,((vector signed short) {0})))
 
-//#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a)){0}),a,a,a,ptr)
+//#define out_pixels(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,a,a,ptr)
 
 
 static inline void cvtyuvtoRGB (SwsContext *c,
@@ -441,10 +438,10 @@ static int altivec_##name (SwsContext *c,                               \
 }
 
 
-#define out_abgr(a,b,c,ptr)  vec_mstrgb32(typeof(a),((typeof (a)){0}),c,b,a,ptr)
-#define out_bgra(a,b,c,ptr)  vec_mstrgb32(typeof(a),c,b,a,((typeof (a)){0}),ptr)
-#define out_rgba(a,b,c,ptr)  vec_mstrgb32(typeof(a),a,b,c,((typeof (a)){0}),ptr)
-#define out_argb(a,b,c,ptr)  vec_mstrgb32(typeof(a),((typeof (a)){0}),a,b,c,ptr)
+#define out_abgr(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),c,b,a,ptr)
+#define out_bgra(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),c,b,a,((__typeof__ (a)){255}),ptr)
+#define out_rgba(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),a,b,c,((__typeof__ (a)){255}),ptr)
+#define out_argb(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,b,c,ptr)
 #define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
 #define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr)
 
@@ -693,7 +690,7 @@ static int altivec_uyvy_rgb32 (SwsContext *c,
 
    So we just fall back to the C codes for this.
 */
-SwsFunc yuv2rgb_init_altivec (SwsContext *c)
+SwsFunc sws_yuv2rgb_init_altivec (SwsContext *c)
 {
     if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
         return NULL;
@@ -753,7 +750,7 @@ SwsFunc yuv2rgb_init_altivec (SwsContext *c)
     return NULL;
 }
 
-void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation)
+void sws_yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation)
 {
     union {
         signed short tmp[8] __attribute__ ((aligned(16)));
diff --git a/src/add-ons/media/plugins/avcodec/libswscale/yuv2rgb_bfin.c b/src/add-ons/media/plugins/avcodec/libswscale/yuv2rgb_bfin.c
index 1500a96b25..58cc5b6a35 100644
--- a/src/add-ons/media/plugins/avcodec/libswscale/yuv2rgb_bfin.c
+++ b/src/add-ons/media/plugins/avcodec/libswscale/yuv2rgb_bfin.c
@@ -27,9 +27,6 @@
 #include <inttypes.h>
 #include <assert.h>
 #include "config.h"
-#ifdef HAVE_MALLOC_H
-#include <malloc.h>
-#endif
 #include <unistd.h>
 #include "rgb2rgb.h"
 #include "swscale.h"
@@ -41,17 +38,17 @@
 #define L1CODE
 #endif
 
-extern void ff_bfin_yuv2rgb555_line (uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
-                                     int w, uint32_t *coeffs) L1CODE;
+void ff_bfin_yuv2rgb555_line (uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
+                              int w, uint32_t *coeffs) L1CODE;
 
-extern void ff_bfin_yuv2rgb565_line (uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
-                                     int w, uint32_t *coeffs) L1CODE;
+void ff_bfin_yuv2rgb565_line (uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
+                              int w, uint32_t *coeffs) L1CODE;
 
-extern void ff_bfin_yuv2rgb24_line (uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
-                                    int w, uint32_t *coeffs) L1CODE;
+void ff_bfin_yuv2rgb24_line (uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
+                             int w, uint32_t *coeffs) L1CODE;
 
-typedef void (* ltransform_t)(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
-                              int w, uint32_t *coeffs);
+typedef void (* ltransform)(uint8_t *Y, uint8_t *U, uint8_t *V, uint8_t *out,
+                            int w, uint32_t *coeffs);
 
 
 static void bfin_prepare_coefficients (SwsContext *c, int rgb, int masks)
@@ -95,7 +92,7 @@ static int core_yuv420_rgb (SwsContext *c,
                             uint8_t **in, int *instrides,
                             int srcSliceY, int srcSliceH,
                             uint8_t **oplanes, int *outstrides,
-                            ltransform_t lcscf, int rgb, int masks)
+                            ltransform lcscf, int rgb, int masks)
 {
     uint8_t *py,*pu,*pv,*op;
     int w  = instrides[0];
diff --git a/src/add-ons/media/plugins/avcodec/libswscale/yuv2rgb_mlib.c b/src/add-ons/media/plugins/avcodec/libswscale/yuv2rgb_mlib.c
index ff2e50a2b0..68247914e7 100644
--- a/src/add-ons/media/plugins/avcodec/libswscale/yuv2rgb_mlib.c
+++ b/src/add-ons/media/plugins/avcodec/libswscale/yuv2rgb_mlib.c
@@ -73,7 +73,7 @@ static int mlib_YUV2RGB420_24(SwsContext *c, uint8_t* src[], int srcStride[], in
 }
 
 
-SwsFunc yuv2rgb_init_mlib(SwsContext *c)
+SwsFunc sws_yuv2rgb_init_mlib(SwsContext *c)
 {
     switch(c->dstFormat){
     case PIX_FMT_RGB24: return mlib_YUV2RGB420_24;
diff --git a/src/add-ons/media/plugins/avcodec/libswscale/yuv2rgb_template.c b/src/add-ons/media/plugins/avcodec/libswscale/yuv2rgb_template.c
index 1f8e225baa..f55568b0ab 100644
--- a/src/add-ons/media/plugins/avcodec/libswscale/yuv2rgb_template.c
+++ b/src/add-ons/media/plugins/avcodec/libswscale/yuv2rgb_template.c
@@ -1,7 +1,7 @@
 /*
  * yuv2rgb_mmx.c, software YUV to RGB converter with Intel MMX "technology"
  *
- * Copyright (C) 2000, Silicon Integrated System Corp.
+ * Copyright (C) 2000, Silicon Integrated System Corp
  *
  * Author: Olie Lho <ollie@sis.com.tw>
  *
@@ -30,14 +30,14 @@
 #undef EMMS
 #undef SFENCE
 
-#ifdef HAVE_3DNOW
-/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
+#if HAVE_AMD3DNOW
+/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
 #define EMMS     "femms"
 #else
 #define EMMS     "emms"
 #endif
 
-#ifdef HAVE_MMX2
+#if HAVE_MMX2
 #define MOVNTQ "movntq"
 #define SFENCE "sfence"
 #else
@@ -121,53 +121,73 @@
     "punpcklbw %%mm5, %%mm2;" /* G7 G6 G5 G4 G3 G2 G1 G0 */\
 
 
+#define YUV422_UNSHIFT                   \
+    if(c->srcFormat == PIX_FMT_YUV422P){ \
+        srcStride[1] *= 2;               \
+        srcStride[2] *= 2;               \
+    }                                    \
+
+#define YUV2RGB_LOOP(depth)                                   \
+    h_size= (c->dstW+7)&~7;                                   \
+    if(h_size*depth > FFABS(dstStride[0])) h_size-=8;         \
+\
+    __asm__ volatile ("pxor %mm4, %mm4;" /* zero mm4 */ );    \
+    for (y= 0; y<srcSliceH; y++ ) {                           \
+        uint8_t *image = dst[0] + (y+srcSliceY)*dstStride[0]; \
+        uint8_t *py = src[0] + y*srcStride[0];                \
+        uint8_t *pu = src[1] + (y>>1)*srcStride[1];           \
+        uint8_t *pv = src[2] + (y>>1)*srcStride[2];           \
+        long index= -h_size/2;                                \
+
+#define YUV2RGB_INIT                                                       \
+        /* This MMX assembly code deals with a SINGLE scan line at a time, \
+         * it converts 8 pixels in each iteration. */                      \
+        __asm__ volatile (                                                 \
+        /* load data for start of next scan line */                        \
+        "movd    (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ \
+        "movd    (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ \
+        "movq (%5, %0, 2), %%mm6;" /* Load 8  Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
+        /*                                                                 \
+        ".balign 16     \n\t"                                              \
+        */                                                                 \
+        "1:             \n\t"                                              \
+        /* No speed difference on my p3@500 with prefetch,                 \
+         * if it is faster for anyone with -benchmark then tell me.        \
+        PREFETCH" 64(%0) \n\t"                                             \
+        PREFETCH" 64(%1) \n\t"                                             \
+        PREFETCH" 64(%2) \n\t"                                             \
+        */                                                                 \
+
+#define YUV2RGB_ENDLOOP(depth) \
+        "add $"AV_STRINGIFY(depth*8)", %1    \n\t" \
+        "add                       $4, %0    \n\t" \
+        " js                       1b        \n\t" \
+\
+        : "+r" (index), "+r" (image) \
+        : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), "r" (py - 2*index) \
+        ); \
+    } \
+    __asm__ volatile (EMMS); \
+    return srcSliceH; \
+
 static inline int RENAME(yuv420_rgb16)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                                        int srcSliceH, uint8_t* dst[], int dstStride[]){
     int y, h_size;
 
-    if(c->srcFormat == PIX_FMT_YUV422P){
-        srcStride[1] *= 2;
-        srcStride[2] *= 2;
-    }
+    YUV422_UNSHIFT
+    YUV2RGB_LOOP(2)
 
-    h_size= (c->dstW+7)&~7;
-    if(h_size*2 > FFABS(dstStride[0])) h_size-=8;
+        c->blueDither= ff_dither8[y&1];
+        c->greenDither= ff_dither4[y&1];
+        c->redDither= ff_dither8[(y+1)&1];
 
-    asm volatile ("pxor %mm4, %mm4;" /* zero mm4 */ );
-    //printf("%X %X %X %X %X %X %X %X %X %X\n", (int)&c->redDither, (int)&b5Dither, (int)src[0], (int)src[1], (int)src[2], (int)dst[0],
-    //srcStride[0],srcStride[1],srcStride[2],dstStride[0]);
-    for (y= 0; y<srcSliceH; y++ ) {
-        uint8_t *image = dst[0] + (y+srcSliceY)*dstStride[0];
-        uint8_t *py = src[0] + y*srcStride[0];
-        uint8_t *pu = src[1] + (y>>1)*srcStride[1];
-        uint8_t *pv = src[2] + (y>>1)*srcStride[2];
-        long index= -h_size/2;
-
-        b5Dither= ff_dither8[y&1];
-        g6Dither= ff_dither4[y&1];
-        g5Dither= ff_dither8[y&1];
-        r5Dither= ff_dither8[(y+1)&1];
-        /* This MMX assembly code deals with a SINGLE scan line at a time,
-         * it converts 8 pixels in each iteration. */
-        asm volatile (
-        /* load data for start of next scan line */
-        "movd    (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
-        "movd    (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
-        "movq (%5, %0, 2), %%mm6;" /* Load 8  Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
-        //".balign 16     \n\t"
-        "1:             \n\t"
-        /* No speed difference on my p3@500 with prefetch,
-         * if it is faster for anyone with -benchmark then tell me.
-        PREFETCH" 64(%0) \n\t"
-        PREFETCH" 64(%1) \n\t"
-        PREFETCH" 64(%2) \n\t"
-        */
-YUV2RGB
+        YUV2RGB_INIT
+        YUV2RGB
 
 #ifdef DITHER1XBPP
-        "paddusb "MANGLE(b5Dither)", %%mm0;"
-        "paddusb "MANGLE(g6Dither)", %%mm2;"
-        "paddusb "MANGLE(r5Dither)", %%mm1;"
+        "paddusb "BLUE_DITHER"(%4), %%mm0;"
+        "paddusb "GREEN_DITHER"(%4), %%mm2;"
+        "paddusb "RED_DITHER"(%4), %%mm1;"
 #endif
         /* mask unneeded bits off */
         "pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */
@@ -202,61 +222,27 @@ YUV2RGB
 
         MOVNTQ "   %%mm5, 8 (%1);" /* store pixel 4-7 */
 
-        "add $16, %1    \n\t"
-        "add  $4, %0    \n\t"
-        " js  1b        \n\t"
-
-        : "+r" (index), "+r" (image)
-        : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), "r" (py - 2*index)
-        );
-    }
-
-    asm volatile (EMMS);
-
-    return srcSliceH;
+    YUV2RGB_ENDLOOP(2)
 }
 
 static inline int RENAME(yuv420_rgb15)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                                        int srcSliceH, uint8_t* dst[], int dstStride[]){
     int y, h_size;
 
-    if(c->srcFormat == PIX_FMT_YUV422P){
-        srcStride[1] *= 2;
-        srcStride[2] *= 2;
-    }
+    YUV422_UNSHIFT
+    YUV2RGB_LOOP(2)
 
-    h_size= (c->dstW+7)&~7;
-    if(h_size*2 > FFABS(dstStride[0])) h_size-=8;
+        c->blueDither= ff_dither8[y&1];
+        c->greenDither= ff_dither8[y&1];
+        c->redDither= ff_dither8[(y+1)&1];
 
-    asm volatile ("pxor %mm4, %mm4;" /* zero mm4 */ );
-    //printf("%X %X %X %X %X %X %X %X %X %X\n", (int)&c->redDither, (int)&b5Dither, (int)src[0], (int)src[1], (int)src[2], (int)dst[0],
-    //srcStride[0],srcStride[1],srcStride[2],dstStride[0]);
-    for (y= 0; y<srcSliceH; y++ ) {
-        uint8_t *image = dst[0] + (y+srcSliceY)*dstStride[0];
-        uint8_t *py = src[0] + y*srcStride[0];
-        uint8_t *pu = src[1] + (y>>1)*srcStride[1];
-        uint8_t *pv = src[2] + (y>>1)*srcStride[2];
-        long index= -h_size/2;
-
-        b5Dither= ff_dither8[y&1];
-        g6Dither= ff_dither4[y&1];
-        g5Dither= ff_dither8[y&1];
-        r5Dither= ff_dither8[(y+1)&1];
-        /* This MMX assembly code deals with a SINGLE scan line at a time,
-         * it converts 8 pixels in each iteration. */
-        asm volatile (
-        /* load data for start of next scan line */
-        "movd    (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
-        "movd    (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
-        "movq (%5, %0, 2), %%mm6;" /* Load 8  Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
-        //".balign 16     \n\t"
-        "1:             \n\t"
-YUV2RGB
+        YUV2RGB_INIT
+        YUV2RGB
 
 #ifdef DITHER1XBPP
-        "paddusb "MANGLE(b5Dither)", %%mm0  \n\t"
-        "paddusb "MANGLE(g5Dither)", %%mm2  \n\t"
-        "paddusb "MANGLE(r5Dither)", %%mm1  \n\t"
+        "paddusb "BLUE_DITHER"(%4), %%mm0  \n\t"
+        "paddusb "GREEN_DITHER"(%4), %%mm2  \n\t"
+        "paddusb "RED_DITHER"(%4), %%mm1  \n\t"
 #endif
 
         /* mask unneeded bits off */
@@ -293,51 +279,20 @@ YUV2RGB
 
         MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */
 
-        "add $16, %1            \n\t"
-        "add $4, %0             \n\t"
-        " js 1b                 \n\t"
-        : "+r" (index), "+r" (image)
-        : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), "r" (py - 2*index)
-        );
-    }
-
-    asm volatile (EMMS);
-    return srcSliceH;
+    YUV2RGB_ENDLOOP(2)
 }
 
 static inline int RENAME(yuv420_rgb24)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                                        int srcSliceH, uint8_t* dst[], int dstStride[]){
     int y, h_size;
 
-    if(c->srcFormat == PIX_FMT_YUV422P){
-        srcStride[1] *= 2;
-        srcStride[2] *= 2;
-    }
+    YUV422_UNSHIFT
+    YUV2RGB_LOOP(3)
 
-    h_size= (c->dstW+7)&~7;
-    if(h_size*3 > FFABS(dstStride[0])) h_size-=8;
-
-    asm volatile ("pxor %mm4, %mm4;" /* zero mm4 */ );
-
-    for (y= 0; y<srcSliceH; y++ ) {
-        uint8_t *image = dst[0] + (y+srcSliceY)*dstStride[0];
-        uint8_t *py = src[0] + y*srcStride[0];
-        uint8_t *pu = src[1] + (y>>1)*srcStride[1];
-        uint8_t *pv = src[2] + (y>>1)*srcStride[2];
-        long index= -h_size/2;
-
-        /* This MMX assembly code deals with a SINGLE scan line at a time,
-         * it converts 8 pixels in each iteration. */
-        asm volatile (
-        /* load data for start of next scan line */
-        "movd    (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
-        "movd    (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
-        "movq (%5, %0, 2), %%mm6;" /* Load 8  Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
-        //".balign 16     \n\t"
-        "1:             \n\t"
-YUV2RGB
+        YUV2RGB_INIT
+        YUV2RGB
         /* mm0=B, %%mm2=G, %%mm1=R */
-#ifdef HAVE_MMX2
+#if HAVE_MMX2
         "movq "MANGLE(ff_M24A)", %%mm4     \n\t"
         "movq "MANGLE(ff_M24C)", %%mm7     \n\t"
         "pshufw $0x50, %%mm0, %%mm5     \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */
@@ -438,101 +393,61 @@ YUV2RGB
         "pxor      %%mm4, %%mm4     \n\t"
 #endif
 
-        "add $24, %1    \n\t"
-        "add  $4, %0    \n\t"
-        " js  1b        \n\t"
-
-        : "+r" (index), "+r" (image)
-        : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), "r" (py - 2*index)
-        );
-    }
-
-    asm volatile (EMMS);
-    return srcSliceH;
+    YUV2RGB_ENDLOOP(3)
 }
 
+#define RGB_PLANAR2PACKED32                                             \
+    /* convert RGB plane to RGB packed format,                          \
+       mm0 ->  B, mm1 -> R, mm2 -> G, mm3 -> A,                         \
+       mm4 -> GB, mm5 -> AR pixel 4-7,                                  \
+       mm6 -> GB, mm7 -> AR pixel 0-3 */                                \
+    "movq      %%mm0, %%mm6;"   /* B7 B6 B5 B4 B3 B2 B1 B0 */           \
+    "movq      %%mm1, %%mm7;"   /* R7 R6 R5 R4 R3 R2 R1 R0 */           \
+\
+    "movq      %%mm0, %%mm4;"   /* B7 B6 B5 B4 B3 B2 B1 B0 */           \
+    "movq      %%mm1, %%mm5;"   /* R7 R6 R5 R4 R3 R2 R1 R0 */           \
+\
+    "punpcklbw %%mm2, %%mm6;"   /* G3 B3 G2 B2 G1 B1 G0 B0 */           \
+    "punpcklbw %%mm3, %%mm7;"   /* A3 R3 A2 R2 A1 R1 A0 R0 */           \
+\
+    "punpcklwd %%mm7, %%mm6;"   /* A1 R1 G1 B1 A0 R0 G0 B0 */           \
+    MOVNTQ "   %%mm6, (%1);"    /* Store ARGB1 ARGB0 */                 \
+\
+    "movq      %%mm0, %%mm6;"   /* B7 B6 B5 B4 B3 B2 B1 B0 */           \
+    "punpcklbw %%mm2, %%mm6;"   /* G3 B3 G2 B2 G1 B1 G0 B0 */           \
+\
+    "punpckhwd %%mm7, %%mm6;"   /* A3 R3 G3 B3 A2 R2 G2 B2 */           \
+    MOVNTQ "   %%mm6, 8 (%1);"  /* Store ARGB3 ARGB2 */                 \
+\
+    "punpckhbw %%mm2, %%mm4;"   /* G7 B7 G6 B6 G5 B5 G4 B4 */           \
+    "punpckhbw %%mm3, %%mm5;"   /* A7 R7 A6 R6 A5 R5 A4 R4 */           \
+\
+    "punpcklwd %%mm5, %%mm4;"   /* A5 R5 G5 B5 A4 R4 G4 B4 */           \
+    MOVNTQ "   %%mm4, 16 (%1);" /* Store ARGB5 ARGB4 */                 \
+\
+    "movq      %%mm0, %%mm4;"   /* B7 B6 B5 B4 B3 B2 B1 B0 */           \
+    "punpckhbw %%mm2, %%mm4;"   /* G7 B7 G6 B6 G5 B5 G4 B4 */           \
+\
+    "punpckhwd %%mm5, %%mm4;"   /* A7 R7 G7 B7 A6 R6 G6 B6 */           \
+    MOVNTQ "   %%mm4, 24 (%1);" /* Store ARGB7 ARGB6 */                 \
+\
+    "movd 4 (%2, %0), %%mm0;"   /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ \
+    "movd 4 (%3, %0), %%mm1;"   /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ \
+\
+    "pxor         %%mm4, %%mm4;" /* zero mm4 */                         \
+    "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
+
 static inline int RENAME(yuv420_rgb32)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                                        int srcSliceH, uint8_t* dst[], int dstStride[]){
     int y, h_size;
 
-    if(c->srcFormat == PIX_FMT_YUV422P){
-        srcStride[1] *= 2;
-        srcStride[2] *= 2;
-    }
+    YUV422_UNSHIFT
+    YUV2RGB_LOOP(4)
 
-    h_size= (c->dstW+7)&~7;
-    if(h_size*4 > FFABS(dstStride[0])) h_size-=8;
+        YUV2RGB_INIT
+        YUV2RGB
+        "pcmpeqd   %%mm3, %%mm3;"   /* set mm3 to all ones: alpha = 0xFF */
+        RGB_PLANAR2PACKED32
 
-    asm volatile ("pxor %mm4, %mm4;" /* zero mm4 */ );
-
-    for (y= 0; y<srcSliceH; y++ ) {
-        uint8_t *image = dst[0] + (y+srcSliceY)*dstStride[0];
-        uint8_t *py = src[0] + y*srcStride[0];
-        uint8_t *pu = src[1] + (y>>1)*srcStride[1];
-        uint8_t *pv = src[2] + (y>>1)*srcStride[2];
-        long index= -h_size/2;
-
-        /* This MMX assembly code deals with a SINGLE scan line at a time,
-         * it converts 8 pixels in each iteration. */
-        asm volatile (
-        /* load data for start of next scan line */
-        "movd    (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
-        "movd    (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
-        "movq (%5, %0, 2), %%mm6;" /* Load 8  Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
-        //".balign 16     \n\t"
-        "1:             \n\t"
-YUV2RGB
-        /* convert RGB plane to RGB packed format,
-           mm0 ->  B, mm1 -> R, mm2 -> G, mm3 -> 0,
-           mm4 -> GB, mm5 -> AR pixel 4-7,
-           mm6 -> GB, mm7 -> AR pixel 0-3 */
-        "pxor      %%mm3, %%mm3;"   /* zero mm3 */
-
-        "movq      %%mm0, %%mm6;"   /* B7 B6 B5 B4 B3 B2 B1 B0 */
-        "movq      %%mm1, %%mm7;"   /* R7 R6 R5 R4 R3 R2 R1 R0 */
-
-        "movq      %%mm0, %%mm4;"   /* B7 B6 B5 B4 B3 B2 B1 B0 */
-        "movq      %%mm1, %%mm5;"   /* R7 R6 R5 R4 R3 R2 R1 R0 */
-
-        "punpcklbw %%mm2, %%mm6;"   /* G3 B3 G2 B2 G1 B1 G0 B0 */
-        "punpcklbw %%mm3, %%mm7;"   /* 00 R3 00 R2 00 R1 00 R0 */
-
-        "punpcklwd %%mm7, %%mm6;"   /* 00 R1 B1 G1 00 R0 B0 G0 */
-        MOVNTQ "   %%mm6, (%1);"    /* Store ARGB1 ARGB0 */
-
-        "movq      %%mm0, %%mm6;"   /* B7 B6 B5 B4 B3 B2 B1 B0 */
-        "punpcklbw %%mm2, %%mm6;"   /* G3 B3 G2 B2 G1 B1 G0 B0 */
-
-        "punpckhwd %%mm7, %%mm6;"   /* 00 R3 G3 B3 00 R2 B3 G2 */
-        MOVNTQ "   %%mm6, 8 (%1);"  /* Store ARGB3 ARGB2 */
-
-        "punpckhbw %%mm2, %%mm4;"   /* G7 B7 G6 B6 G5 B5 G4 B4 */
-        "punpckhbw %%mm3, %%mm5;"   /* 00 R7 00 R6 00 R5 00 R4 */
-
-        "punpcklwd %%mm5, %%mm4;"   /* 00 R5 B5 G5 00 R4 B4 G4 */
-        MOVNTQ "   %%mm4, 16 (%1);" /* Store ARGB5 ARGB4 */
-
-        "movq      %%mm0, %%mm4;"   /* B7 B6 B5 B4 B3 B2 B1 B0 */
-        "punpckhbw %%mm2, %%mm4;"   /* G7 B7 G6 B6 G5 B5 G4 B4 */
-
-        "punpckhwd %%mm5, %%mm4;"   /* 00 R7 G7 B7 00 R6 B6 G6 */
-        MOVNTQ "   %%mm4, 24 (%1);" /* Store ARGB7 ARGB6 */
-
-        "movd 4 (%2, %0), %%mm0;"   /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
-        "movd 4 (%3, %0), %%mm1;"   /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */
-
-        "pxor         %%mm4, %%mm4;" /* zero mm4 */
-        "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
-
-        "add $32, %1    \n\t"
-        "add  $4, %0    \n\t"
-        " js  1b        \n\t"
-
-        : "+r" (index), "+r" (image)
-        : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), "r" (py - 2*index)
-        );
-    }
-
-    asm volatile (EMMS);
-    return srcSliceH;
+    YUV2RGB_ENDLOOP(4)
 }
diff --git a/src/add-ons/media/plugins/avcodec/libswscale/yuv2rgb_vis.c b/src/add-ons/media/plugins/avcodec/libswscale/yuv2rgb_vis.c
index 120fa56c71..2e2737aa9f 100644
--- a/src/add-ons/media/plugins/avcodec/libswscale/yuv2rgb_vis.c
+++ b/src/add-ons/media/plugins/avcodec/libswscale/yuv2rgb_vis.c
@@ -80,12 +80,13 @@
 
 
 
+// FIXME: must be changed to set alpha to 255 instead of 0
 static int vis_420P_ARGB32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                            int srcSliceH, uint8_t* dst[], int dstStride[]){
   int y, out1, out2, out3, out4, out5, out6;
 
   for(y=0;y < srcSliceH;++y) {
-      asm volatile (
+      __asm__ volatile (
           YUV2RGB_INIT
           "wr %%g0, 0xd2, %%asi        \n\t" /* ASI_FL16_P */
           "1:                          \n\t"
@@ -131,12 +132,13 @@ static int vis_420P_ARGB32(SwsContext *c, uint8_t* src[], int srcStride[], int s
   return srcSliceH;
 }
 
+// FIXME: must be changed to set alpha to 255 instead of 0
 static int vis_422P_ARGB32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                            int srcSliceH, uint8_t* dst[], int dstStride[]){
   int y, out1, out2, out3, out4, out5, out6;
 
   for(y=0;y < srcSliceH;++y) {
-      asm volatile (
+      __asm__ volatile (
           YUV2RGB_INIT
           "wr %%g0, 0xd2, %%asi        \n\t" /* ASI_FL16_P */
           "1:                          \n\t"
@@ -182,7 +184,7 @@ static int vis_422P_ARGB32(SwsContext *c, uint8_t* src[], int srcStride[], int s
   return srcSliceH;
 }
 
-SwsFunc yuv2rgb_init_vis(SwsContext *c) {
+SwsFunc sws_yuv2rgb_init_vis(SwsContext *c) {
     c->sparc_coeffs[5]=c->yCoeff;
     c->sparc_coeffs[6]=c->vgCoeff;
     c->sparc_coeffs[7]=c->vrCoeff;
@@ -196,11 +198,11 @@ SwsFunc yuv2rgb_init_vis(SwsContext *c) {
     c->sparc_coeffs[4]=(((int16_t)c->vOffset*(int16_t)c->vrCoeff>>11) & 0xffff) * 0x0001000100010001ULL;
 
     if (c->dstFormat == PIX_FMT_RGB32 && c->srcFormat == PIX_FMT_YUV422P && (c->dstW & 7)==0) {
-        av_log(c, AV_LOG_INFO, "SPARC VIS accelerated YUV422P -> RGB32\n");
+        av_log(c, AV_LOG_INFO, "SPARC VIS accelerated YUV422P -> RGB32 (WARNING: alpha value is wrong)\n");
         return vis_422P_ARGB32;
     }
     else if (c->dstFormat == PIX_FMT_RGB32 && c->srcFormat == PIX_FMT_YUV420P && (c->dstW & 7)==0) {
-        av_log(c, AV_LOG_INFO, "SPARC VIS accelerated YUV420P -> RGB32\n");
+        av_log(c, AV_LOG_INFO, "SPARC VIS accelerated YUV420P -> RGB32 (WARNING: alpha value is wrong)\n");
         return vis_420P_ARGB32;
     }
     return NULL;