Added SSE2 yuv to rgb conversion code. This code needs to move into the media kit though

git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@33200 a95241bf-73f2-0310-859d-f6bbb57e9c96
This commit is contained in:
David McPaul 2009-09-19 23:34:23 +00:00
parent 6a400ee80f
commit 6bfbea62cd
8 changed files with 593 additions and 70 deletions

View File

@ -215,7 +215,6 @@ AVCodecDecoder::Setup(media_format* ioEncodedFormat, const void* infoBuffer,
fBlockAlign
= ioEncodedFormat->u.encoded_audio.output.buffer_size;
}
printf("XXX extra data size %ld\n", infoSize);
if (extraData != NULL && fExtraDataSize > 0) {
TRACE("AVCodecDecoder: extra data size %ld\n", infoSize);
fExtraData = new(std::nothrow) char[fExtraDataSize];
@ -731,13 +730,13 @@ AVCodecDecoder::_DecodeVideo(void* outBuffer, int64* outFrameCount,
profileCounter++;
if (!(fFrame % 10)) {
if (info) {
TRACE("[v] profile: d1 = %lld, d2 = %lld (%Ld) required "
printf("[v] profile: d1 = %lld, d2 = %lld (%Ld) required "
"%Ld\n",
decodingTime / profileCounter,
conversionTime / profileCounter,
fFrame, info->time_to_decode);
} else {
TRACE("[v] profile: d1 = %lld, d2 = %lld (%Ld) required "
printf("[v] profile: d1 = %lld, d2 = %lld (%Ld) required "
"%Ld\n",
decodingTime / profileCounter,
conversionTime / profileCounter,

View File

@ -0,0 +1,146 @@
/*
* Copyright (C) 2009 David McPaul
*
* includes code from sysinfo.c which is
* Copyright 2004-2008, Axel Dörfler, axeld@pinc-software.de.
* Copyright (c) 2002, Carlos Hasan, for Haiku.
*
* All rights reserved. Distributed under the terms of the MIT License.
*/
#include <string.h>
#include <cpu_type.h>
#include "CpuCapabilities.h"
// Nothing to release: the class only holds a plain integer capability level.
CPUCapabilities::~CPUCapabilities() {}
// Detects the SIMD capabilities of the CPU and prints them to stdout.
//
// The capability level starts at 0 ("no extensions") so that it is well
// defined on builds where __INTEL__ is not set and on every path where the
// detection returns early. The Has*() queries and PrintCapabilities() all
// read this member.
CPUCapabilities::CPUCapabilities()
	:
	capabilities(0)
{
#ifdef __INTEL__
	setIntelCapabilities();
#endif
	PrintCapabilities();
}
// Queries CPUID and stores the detected SIMD level in `capabilities`.
//
// Each supported feature bit overwrites the level with its CAPABILITY_*
// value, in ascending order, so the member ends up holding the highest level
// whose bit is set. When CPUID is unavailable, or standard function 1 is not
// supported, the level is left at 0.
void
CPUCapabilities::setIntelCapabilities()
{
	// Start from a known state so every early exit below leaves a defined
	// (empty) capability level behind.
	capabilities = 0;

	cpuid_info baseInfo;
	if (get_cpuid(&baseInfo, 0L, 0L) != B_OK) {
		// this CPU doesn't support cpuid
		return;
	}

	int32 maxStandardFunction = baseInfo.eax_0.max_eax;
	if (maxStandardFunction >= 500) {
		// old Pentium sample chips have the cpu signature here
		maxStandardFunction = 0;
	}

	if (maxStandardFunction < 1) {
		// feature flags live in standard function 1
		return;
	}

	cpuid_info cpuInfo;
	if (get_cpuid(&cpuInfo, 1L, 0L) != B_OK) {
		// do not interpret an unfilled cpuid_info as feature flags
		return;
	}

	/* Standard features */
	if (cpuInfo.eax_1.features & (1UL << 23))
		capabilities = CAPABILITY_MMX;
	if (cpuInfo.eax_1.features & (1UL << 25))
		capabilities = CAPABILITY_SSE1;
	if (cpuInfo.eax_1.features & (1UL << 26))
		capabilities = CAPABILITY_SSE2;

	/* Extended features */
	if (cpuInfo.eax_1.extended_features & (1UL << 0))
		capabilities = CAPABILITY_SSE3;
	if (cpuInfo.eax_1.extended_features & (1UL << 9))
		capabilities = CAPABILITY_SSSE3;
	if (cpuInfo.eax_1.extended_features & (1UL << 19))
		capabilities = CAPABILITY_SSE41;
	if (cpuInfo.eax_1.extended_features & (1UL << 20))
		capabilities = CAPABILITY_SSE42;
}
// True when the detected capability level is CAPABILITY_MMX or higher.
bool
CPUCapabilities::HasMMX()
{
	const bool reachesLevel = (CAPABILITY_MMX <= capabilities);
	return reachesLevel;
}
// True when the detected capability level is CAPABILITY_SSE1 or higher.
bool
CPUCapabilities::HasSSE1()
{
	const bool reachesLevel = (CAPABILITY_SSE1 <= capabilities);
	return reachesLevel;
}
// True when the detected capability level is CAPABILITY_SSE2 or higher.
bool
CPUCapabilities::HasSSE2()
{
	const bool reachesLevel = (CAPABILITY_SSE2 <= capabilities);
	return reachesLevel;
}
// True when the detected capability level is CAPABILITY_SSE3 or higher.
bool
CPUCapabilities::HasSSE3()
{
	const bool reachesLevel = (CAPABILITY_SSE3 <= capabilities);
	return reachesLevel;
}
// True when the detected capability level is CAPABILITY_SSSE3 or higher.
bool
CPUCapabilities::HasSSSE3()
{
	const bool reachesLevel = (CAPABILITY_SSSE3 <= capabilities);
	return reachesLevel;
}
// True when the detected capability level is CAPABILITY_SSE41 or higher.
bool
CPUCapabilities::HasSSE41()
{
	const bool reachesLevel = (CAPABILITY_SSE41 <= capabilities);
	return reachesLevel;
}
// True when the detected capability level is CAPABILITY_SSE42 or higher.
bool
CPUCapabilities::HasSSE42()
{
	const bool reachesLevel = (CAPABILITY_SSE42 <= capabilities);
	return reachesLevel;
}
void
CPUCapabilities::PrintCapabilities()
{
static const char *CapArray[8] = {
"", "MMX", "SSE1", "SSE2", "SSE3", "SSSE3", "SSE4.1", "SSE4.2"
};
printf("CPU is capable of running ");
if (capabilities) {
for (uint32 i=1;i<=capabilities;i++) {
printf("%s ",CapArray[i]);
}
} else {
printf("no extensions");
}
printf("\n");
}

View File

@ -0,0 +1,40 @@
/*
* Copyright (C) 2009 David McPaul
*
* All rights reserved. Distributed under the terms of the MIT License.
*/
#ifndef __CPU_CAPABILITIES__
#define __CPU_CAPABILITIES__
// Capability levels stored in CPUCapabilities::capabilities.
// The values are ordered: the Has*() queries compare with >=, so a higher
// level is treated as including every lower one. 0 means nothing detected.
#define CAPABILITY_MMX 1
#define CAPABILITY_SSE1 2
#define CAPABILITY_SSE2 3
#define CAPABILITY_SSE3 4
#define CAPABILITY_SSSE3 5
#define CAPABILITY_SSE41 6
#define CAPABILITY_SSE42 7
// Detects which SIMD instruction set extensions the CPU offers.
// Detection runs once in the constructor, which also prints the result.
class CPUCapabilities {
public:
	CPUCapabilities();
	~CPUCapabilities();

	// Each query returns true when the detected level is at least the
	// named CAPABILITY_* level.
	bool HasMMX();
	bool HasSSE1();
	bool HasSSE2();
	bool HasSSE3();
	bool HasSSSE3();
	bool HasSSE41();
	bool HasSSE42();

	// Writes the list of detected extensions to stdout.
	void PrintCapabilities();

private:
	// Fills in `capabilities` from the CPUID feature flags.
	void setIntelCapabilities();

	// One of the CAPABILITY_* values, as set by setIntelCapabilities().
	uint32 capabilities;
};
#endif //__CPU_CAPABILITIES__

View File

@ -19,11 +19,13 @@ Addon ffmpeg :
EncoderTable.cpp
FFmpegPlugin.cpp
MuxerTable.cpp
CpuCapabilities.cpp
gfx_conv_c.cpp
gfx_conv_c_lookup.cpp
# gfx_conv_mmx.cpp
gfx_conv_mmx.cpp
gfx_util.cpp
yuvrgb.nasm
:
libavformat.a
libavcodec.a

View File

@ -0,0 +1,84 @@
#include "gfx_conv_mmx.h"
#include "gfx_conv_c.h"
// Row converters implemented in assembly (yuvrgb.nasm). Each call converts
// one row, processing width / 8 groups of 8 pixels.
// NOTE(review): the assembly writes with movntdq, and the packed YUV422
// routine reads with movdqa; both instructions require 16 byte aligned
// addresses - confirm that callers guarantee this.

// Planar input: separate Y, U and V row pointers.
extern "C" void _Convert_YUV420P_RGBA32_SSE2(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width);
// Packed input: a single row pointer holding interleaved y u y v bytes.
extern "C" void _Convert_YUV422_RGBA32_SSE2(void *fromYPtr, void *toPtr, int width);
// "Null" conversion: copies plane 0 of the input frame to plane 0 of the
// output frame unchanged. The number of bytes copied is derived from the
// input line size; width is not used.
void gfx_conv_null_mmx(AVFrame *in, AVFrame *out, int width, int height)
{
	const int planeBytes = height * in->linesize[0];
	memcpy(out->data[0], in->data[0], planeBytes);
}
// Placeholder for a YUV410P -> YUV422P conversion.
// The img_convert() call is commented out, so this function currently
// returns without writing anything to `out`.
void gfx_conv_yuv410p_ycbcr422_mmx(AVFrame *in, AVFrame *out, int width, int height)
{
// img_convert((AVPicture *)out,PIX_FMT_YUV422P,(const AVPicture *)in,PIX_FMT_YUV410P,width,height);
}
// Placeholder for a YUV411P -> YUV422P conversion.
// The img_convert() call is commented out, so this function currently
// returns without writing anything to `out`.
void gfx_conv_yuv411p_ycbcr422_mmx(AVFrame *in, AVFrame *out, int width, int height)
{
// img_convert((AVPicture *)out,PIX_FMT_YUV422P,(const AVPicture *)in,PIX_FMT_YUV411P,width,height);
}
// Placeholder for a YUV420P -> YUV422P conversion.
// The img_convert() call is commented out, so this function currently
// returns without writing anything to `out`.
void gfx_conv_yuv420p_ycbcr422_mmx(AVFrame *in, AVFrame *out, int width, int height)
{
// img_convert((AVPicture *)out,PIX_FMT_YUV422P,(const AVPicture *)in,PIX_FMT_YUV420P,width,height);
}
// Placeholder for a YUV410P -> RGB32 conversion.
// The img_convert() call is commented out, so this function currently
// returns without writing anything to `out`.
void gfx_conv_yuv410p_rgb32_mmx(AVFrame *in, AVFrame *out, int width, int height)
{
// img_convert((AVPicture *)out,PIX_FMT_RGB32,(const AVPicture *)in,PIX_FMT_YUV410P,width,height);
}
// Placeholder for a YUV411P -> RGB32 conversion.
// The img_convert() call is commented out, so this function currently
// returns without writing anything to `out`.
void gfx_conv_yuv411p_rgb32_mmx(AVFrame *in, AVFrame *out, int width, int height)
{
// img_convert((AVPicture *)out,PIX_FMT_RGB32,(const AVPicture *)in,PIX_FMT_YUV411P,width,height);
}
// Placeholder for a YUV420P -> RGB32 conversion.
// The img_convert() call is commented out, so this function currently
// returns without writing anything to `out`.
void gfx_conv_yuv420p_rgb32_mmx(AVFrame *in, AVFrame *out, int width, int height)
{
// img_convert((AVPicture *)out,PIX_FMT_RGB32,(const AVPicture *)in,PIX_FMT_YUV420P,width,height);
}
// Planar YUV420 -> 32 bit RGB using the SSE2 row converter.
//
// Rows are handled in pairs because two consecutive luma rows share one row
// of U and one row of V samples. Frames whose width is not a multiple of 8
// or whose height is odd are handed to the plain C converter instead.
void gfx_conv_yuv420p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
{
	const bool sseFriendly = (width % 8 == 0) && (height % 2 == 0);
	if (!sseFriendly) {
		gfx_conv_YCbCr420p_RGB32_c(in, out, width, height);
		return;
	}

	uint8 *lumaRow = (uint8 *)in->data[0];
	uint8 *uRow = (uint8 *)in->data[1];
	uint8 *vRow = (uint8 *)in->data[2];
	uint8 *destRow = (uint8 *)out->data[0];

	for (int pairsLeft = height / 2; pairsLeft > 0; pairsLeft--) {
		// upper luma row of the pair
		_Convert_YUV420P_RGBA32_SSE2(lumaRow, uRow, vRow, destRow, width);
		lumaRow += in->linesize[0];
		destRow += out->linesize[0];

		// lower luma row of the pair, same chroma rows
		_Convert_YUV420P_RGBA32_SSE2(lumaRow, uRow, vRow, destRow, width);
		lumaRow += in->linesize[0];
		destRow += out->linesize[0];

		// chroma advances once per pair
		uRow += in->linesize[1];
		vRow += in->linesize[2];
	}
}
// Packed YUV422 -> 32 bit RGB using the SSE2 row converter.
//
// Converts exactly `height` rows, one call of the assembly routine per row.
// Frames whose width is not a multiple of 8 are handed to the plain C
// converter instead.
//
// NOTE(review): only in->data[0] is read, i.e. the input is treated as
// packed (interleaved) data even though the function name says "422p" -
// confirm that callers never pass planar 4:2:2 frames here.
// NOTE(review): the assembly loads with movdqa and stores with movntdq, both
// of which need 16 byte aligned rows - confirm for data[0] and linesize[0]
// of both frames.
void gfx_conv_yuv422p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
{
	// width must be divisible by 8
	if (width % 8 == 0) {
		uint8 *ybase = (uint8 *)in->data[0];
		uint8 *rgbbase = (uint8 *)out->data[0];

		// Strictly less than height: the previous "<=" converted one row too
		// many and ran past the end of both the source and the destination.
		for (int i = 0; i < height; i++) {
			_Convert_YUV422_RGBA32_SSE2(ybase, rgbbase, width);
			ybase += in->linesize[0];
			rgbbase += out->linesize[0];
		}
	} else {
		gfx_conv_YCbCr422_RGB32_c(in, out, width, height);
	}
}

View File

@ -5,8 +5,6 @@
#include <GraphicsDefs.h>
#include "libavcodec/avcodec.h"
bool IsMmxCpu();
void gfx_conv_null_mmx(AVFrame *in, AVFrame *out, int width, int height);
void gfx_conv_yuv410p_ycbcr422_mmx(AVFrame *in, AVFrame *out, int width, int height);
@ -15,6 +13,7 @@ void gfx_conv_yuv420p_ycbcr422_mmx(AVFrame *in, AVFrame *out, int width, int hei
void gfx_conv_yuv410p_rgb32_mmx(AVFrame *in, AVFrame *out, int width, int height);
void gfx_conv_yuv411p_rgb32_mmx(AVFrame *in, AVFrame *out, int width, int height);
void gfx_conv_yuv420p_rgb32_mmx(AVFrame *in, AVFrame *out, int width, int height);
void gfx_conv_yuv420p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height);
void gfx_conv_yuv422p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height);
#endif

View File

@ -3,6 +3,7 @@
#include "gfx_util.h"
#include "gfx_conv_c.h"
#include "gfx_conv_mmx.h"
#include "CpuCapabilities.h"
/*
* ref docs
@ -15,61 +16,51 @@
#define TRACE(a...)
#endif
//#define INCLUDE_MMX defined(__INTEL__)
#define INCLUDE_MMX 0
// this function will try to find the best colorspaces for both the ff-codec and
// the Media Kit sides.
gfx_convert_func resolve_colorspace(color_space colorSpace, PixelFormat pixelFormat)
{
#if INCLUDE_MMX
bool mmx = IsMmxCpu();
#endif
CPUCapabilities cpu;
switch (colorSpace)
{
case B_RGB32:
if (pixelFormat == PIX_FMT_YUV410P) {
#if INCLUDE_MMX
if (mmx) {
TRACE("resolve_colorspace: gfx_conv_yuv410p_rgb32_mmx\n");
return gfx_conv_yuv410p_rgb32_mmx;
} else
#endif
{
// if (cpu.HasMMX()) {
// TRACE("resolve_colorspace: gfx_conv_yuv410p_rgb32_mmx\n");
// return gfx_conv_yuv410p_rgb32_mmx;
// } else {
TRACE("resolve_colorspace: gfx_conv_yuv410p_rgb32_c\n");
return gfx_conv_yuv410p_rgb32_c;
}
// }
}
if (pixelFormat == PIX_FMT_YUV411P) {
#if INCLUDE_MMX
if (mmx) {
TRACE("resolve_colorspace: gfx_conv_yuv411p_rgb32_mmx\n");
return gfx_conv_yuv411p_rgb32_mmx;
} else
#endif
{
// if (cpu.HasMMX()) {
// TRACE("resolve_colorspace: gfx_conv_yuv411p_rgb32_mmx\n");
// return gfx_conv_yuv411p_rgb32_mmx;
// } else {
TRACE("resolve_colorspace: gfx_conv_yuv411p_rgb32_c\n");
return gfx_conv_yuv411p_rgb32_c;
}
// }
}
if (pixelFormat == PIX_FMT_YUV420P || pixelFormat == PIX_FMT_YUVJ420P) {
#if INCLUDE_MMX
if (mmx) {
TRACE("resolve_colorspace: gfx_conv_yuv420p_rgb32_mmx\n");
return gfx_conv_yuv420p_rgb32_mmx;
} else
#endif
{
TRACE("resolve_colorspace: gfx_conv_yuv420p_rgb32_c\n");
if (cpu.HasSSE2()) {
TRACE("resolve_colorspace: gfx_conv_yuv420p_rgba32_sse2\n");
return gfx_conv_yuv420p_rgba32_sse2;
} else {
TRACE("resolve_colorspace: gfx_conv_YCbCr420p_RGB32_c\n");
return gfx_conv_YCbCr420p_RGB32_c;
}
}
if (pixelFormat == PIX_FMT_YUV422P || pixelFormat == PIX_FMT_YUVJ422P) {
return gfx_conv_YCbCr422_RGB32_c;
if (cpu.HasSSE2()) {
return gfx_conv_yuv422p_rgba32_sse2;
} else {
return gfx_conv_YCbCr422_RGB32_c;
}
}
TRACE("resolve_colorspace: %s => B_RGB32: NULL\n", pixfmt_to_string(pixelFormat));
@ -86,55 +77,43 @@ gfx_convert_func resolve_colorspace(color_space colorSpace, PixelFormat pixelFor
case B_YCbCr422:
if (pixelFormat == PIX_FMT_YUV410P) {
#if INCLUDE_MMX
if (mmx) {
TRACE("resolve_colorspace: gfx_conv_yuv410p_ycbcr422_mmx\n");
return gfx_conv_yuv410p_ycbcr422_mmx;
} else
#endif
{
// if (cpu.HasMMX()) {
// TRACE("resolve_colorspace: gfx_conv_yuv410p_ycbcr422_mmx\n");
// return gfx_conv_yuv410p_ycbcr422_mmx;
// } else {
TRACE("resolve_colorspace: gfx_conv_yuv410p_ycbcr422_c\n");
return gfx_conv_yuv410p_ycbcr422_c;
}
// }
}
if (pixelFormat == PIX_FMT_YUV411P) {
#if INCLUDE_MMX
if (mmx) {
TRACE("resolve_colorspace: gfx_conv_yuv411p_ycbcr422_mmx\n");
return gfx_conv_yuv411p_ycbcr422_mmx;
} else
#endif
{
// if (cpu.HasMMX()) {
// TRACE("resolve_colorspace: gfx_conv_yuv411p_ycbcr422_mmx\n");
// return gfx_conv_yuv411p_ycbcr422_mmx;
// } else {
TRACE("resolve_colorspace: gfx_conv_yuv411p_ycbcr422_c\n");
return gfx_conv_yuv411p_ycbcr422_c;
}
// }
}
if (pixelFormat == PIX_FMT_YUV420P || pixelFormat == PIX_FMT_YUVJ420P) {
#if INCLUDE_MMX
if (mmx) {
TRACE("resolve_colorspace: gfx_conv_yuv420p_ycbcr422_mmx\n");
return gfx_conv_yuv420p_ycbcr422_mmx;
} else
#endif
{
// if (cpu.HasMMX()) {
// TRACE("resolve_colorspace: gfx_conv_yuv420p_ycbcr422_mmx\n");
// return gfx_conv_yuv420p_ycbcr422_mmx;
// } else {
TRACE("resolve_colorspace: gfx_conv_yuv420p_ycbcr422_c\n");
return gfx_conv_yuv420p_ycbcr422_c;
}
// }
}
if (pixelFormat == PIX_FMT_YUYV422) {
#if INCLUDE_MMX
if (mmx) {
TRACE("resolve_colorspace: PIX_FMT_YUV422 => B_YCbCr422: gfx_conv_null_mmx\n");
return gfx_conv_null_mmx;
} else
#endif
{
// if (cpu.HasMMX()) {
// TRACE("resolve_colorspace: PIX_FMT_YUV422 => B_YCbCr422: gfx_conv_null_mmx\n");
// return gfx_conv_null_mmx;
// } else {
TRACE("resolve_colorspace: PIX_FMT_YUV422 => B_YCbCr422: gfx_conv_null_c\n");
return gfx_conv_null_c;
}
// }
}
TRACE("resolve_colorspace: %s => B_YCbCr422: NULL\n", pixfmt_to_string(pixelFormat));

View File

@ -0,0 +1,274 @@
;
; Copyright (C) 2009 David McPaul
;
; All rights reserved. Distributed under the terms of the MIT License.
;
; A rather unoptimised set of yuv to rgb converters
; does 8 pixels at a time
; inputer:
; reads 128bits of yuv 8 bit data and puts
; the y values converted to 16 bit in xmm0
; the u values converted to 16 bit and duplicated into xmm1
; the v values converted to 16 bit and duplicated into xmm2
; conversion:
; does the yuv to rgb conversion using 16 bit fixed point and the
; results are placed into the following registers as 8 bit clamped values
; r values in xmm3
; g values in xmm4
; b values in xmm5
; outputer:
; writes out the rgba pixels as 8 bit values with 0 for alpha
; xmm6 used for scratch
; xmm7 used for scratch
; cglobal name
; Starts an exported routine: declares the symbol _name global, makes every
; later use of name in this file expand to _name, aligns to 16 bytes and
; emits the label. The C++ side declares the routines with the same leading
; underscore.
%macro cglobal 1
global _%1
%define %1 _%1
align 16
%1:
%endmacro
; conversion code
; yuv2rgb: converts 8 pixels held as 16 bit words from YUV to RGB.
;   in:       xmm0 = y, xmm1 = u, xmm2 = v (one word per pixel)
;   out:      xmm3 = r, xmm4 = g, xmm5 = b as signed 16 bit words; packing
;             down to 8 bit is done afterwards by rgba32output
;   clobbers: xmm0, xmm1, xmm2, xmm6, xmm7
; The coefficients are approximated by sums of arithmetic right shifts and
; all additions/subtractions saturate (paddsw/psubsw).
%macro yuv2rgb 0
; u = u - 128
; v = v - 128
; r = y + v + v >> 2 + v >> 3 + v >> 5
; g = y - (u >> 2 + u >> 4 + u >> 5) - (v >> 1 + v >> 3 + v >> 4 + v >> 5)
; b = y + u + u >> 1 + u >> 2 + u >> 6
; subtract 16 from y
movdqa xmm7, [Const16] ; loads a constant using data cache
psubsw xmm0,xmm7 ; y = y - 16
; subtract 128 from u and v
; mov eax,128*10001H ; load a constant using instruction cache
; movd xmm7,eax ; but requires eax to be saved
; pshufd xmm7,xmm7,0 ; and uses more instructions
movdqa xmm7, [Const128] ; loads a constant using data cache (slower on first fetch but then cached)
psubsw xmm1,xmm7 ; u = u - 128
psubsw xmm2,xmm7 ; v = v - 128
; load r,g,b with y
movdqa xmm3,xmm0 ; r = y
pshufd xmm4,xmm0, 0xE4 ; g = y (0xE4 is the identity shuffle, so this is a plain copy)
movdqa xmm5,xmm0 ; b = y
; r = r + v + v >> 2 + v >> 3 + v >> 5
paddsw xmm3, xmm2 ; add v to r
movdqa xmm6, xmm2 ; move v to scratch
psraw xmm6,2 ; divide by 4 (scratch = v >> 2)
paddsw xmm3, xmm6 ; and add to r
psraw xmm6,1 ; divide by 2 (scratch = v >> 3)
paddsw xmm3, xmm6 ; and add to r
psraw xmm6,2 ; divide by 4 (scratch = v >> 5)
paddsw xmm3, xmm6 ; and add to r
; g = y - u >> 2 - u >> 4 - u >> 5 - v >> 1 - v >> 3 - v >> 4 - v >> 5
movdqa xmm6,xmm1 ; move u to scratch
psraw xmm6,2 ; divide by 4 (scratch = u >> 2)
psubsw xmm4,xmm6 ; subtract from g
psraw xmm6,2 ; divide by 4 (scratch = u >> 4)
psubsw xmm4,xmm6 ; subtract from g
psraw xmm6,1 ; divide by 2 (scratch = u >> 5)
psubsw xmm4,xmm6 ; subtract from g
movdqa xmm6,xmm2 ; move v to scratch
psraw xmm6,1 ; divide by 2 (scratch = v >> 1)
psubsw xmm4,xmm6 ; subtract from g
psraw xmm6,2 ; divide by 4 (scratch = v >> 3)
psubsw xmm4,xmm6 ; subtract from g
psraw xmm6,1 ; divide by 2 (scratch = v >> 4)
psubsw xmm4,xmm6 ; subtract from g
psraw xmm6,1 ; divide by 2 (scratch = v >> 5)
psubsw xmm4,xmm6 ; subtract from g
; b = y + u + u >> 1 + u >> 2 + u >> 6
paddsw xmm5, xmm1 ; add u to b
movdqa xmm6, xmm1 ; move u to scratch
psraw xmm6,1 ; divide by 2 (scratch = u >> 1)
paddsw xmm5, xmm6 ; and add to b
psraw xmm6,1 ; divide by 2 (scratch = u >> 2)
paddsw xmm5, xmm6 ; and add to b
psraw xmm6,4 ; divide by 16 (scratch = u >> 6)
paddsw xmm5, xmm6 ; and add to b
%endmacro
; outputer
; rgba32output: clamps the 16 bit r/g/b words to 0..255 and stores 8 pixels
; (32 bytes) at [edi] with the byte order b, g, r, 0.
;   in:       xmm3 = r, xmm4 = g, xmm5 = b (16 bit words), edi = destination
;   clobbers: xmm0, xmm3, xmm4, xmm5, xmm7
; movntdq requires [edi] to be 16 byte aligned. edi itself is not advanced
; here; the calling loop does that.
%macro rgba32output 0
; clamp values
pxor xmm7,xmm7
packuswb xmm3,xmm7 ; clamp to 0,255 and pack R to 8 bit per pixel
packuswb xmm4,xmm7 ; clamp to 0,255 and pack G to 8 bit per pixel
packuswb xmm5,xmm7 ; clamp to 0,255 and pack B to 8 bit per pixel
; convert to bgra32 packed
punpcklbw xmm3,xmm7 ; r0r0r0r0r0r0r0r0
punpcklbw xmm5,xmm4 ; bgbgbgbgbgbgbgbg
movdqa xmm0, xmm5 ; save bg values for the upper 4 pixels
punpcklwd xmm5,xmm3 ; lower half bgr0bgr0bgr0bgr0
punpckhwd xmm0,xmm3 ; upper half bgr0bgr0bgr0bgr0
; write to output ptr
movntdq [edi], xmm5 ; output first 4 pixels bypassing cache
movntdq [edi+16], xmm0 ; output second 4 pixels bypassing cache
%endmacro
; Constant vectors loaded with movdqa by the yuv2rgb macro. Each one is
; eight 16 bit words (16 bytes) and the section is 16 byte aligned.
SECTION .data align=16
Const16:	times 8 dw 16		; subtracted from y
Const128:	times 8 dw 128		; subtracted from u and v
; Stack offsets of the C arguments relative to ebp. Both routines start with
; "push ebp / mov ebp, esp", so [ebp] holds the saved ebp, [ebp+4] the return
; address and the first argument is at [ebp+8], with each following argument
; 4 bytes higher.

; void Convert_YUV422_RGBA32_SSE2(void *fromPtr, void *toPtr, int width)
width equ ebp+16
toPtr equ ebp+12
fromPtr equ ebp+8
; void Convert_YUV420P_RGBA32_SSE2(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
width1 equ ebp+24
toPtr1 equ ebp+20
fromVPtr equ ebp+16
fromUPtr equ ebp+12
fromYPtr equ ebp+8
SECTION .text align=16
; void Convert_YUV422_RGBA32_SSE2(void *fromPtr, void *toPtr, int width)
; Converts one row of packed YUV422 (bytes y u y v ...) to 32 bit b,g,r,0
; pixels, 8 pixels (16 input bytes, 32 output bytes) per loop iteration.
; The loop runs width >> 3 times, so any remainder of width / 8 is ignored.
; fromPtr is read with movdqa and toPtr is written with movntdq, so both
; must be 16 byte aligned.
; NOTE(review): width is shifted with shr (unsigned), so a negative width
; would turn into a very large loop count - callers must pass width >= 0.
cglobal Convert_YUV422_RGBA32_SSE2
; reserve variables
push ebp
mov ebp, esp
push edi
push esi
push ecx
mov esi, [fromPtr]
mov edi, [toPtr]
mov ecx, [width]
prefetchnta [esi] ; hint that we will be loading our data outside of cache
; loop width / 8 times
shr ecx,3
test ecx,ecx
jng ENDLOOP ; nothing to do when fewer than 8 pixels were requested
REPEATLOOP: ; loop over width / 8
; push ecx ; preserve loop counter
; YUV422 packed input stage
movdqa xmm0, [esi] ; should have yuyv yuyv yuyv yuyv
pshufd xmm1, xmm0, 0xE4 ; copy to xmm1 (0xE4 is the identity shuffle)
movdqa xmm2, xmm0 ; copy to xmm2
; extract y
pxor xmm7,xmm7 ; 00000000000000000000000000000000
pcmpeqd xmm6,xmm6 ; ffffffffffffffffffffffffffffffff
punpcklbw xmm6,xmm7 ; interleave xmm7 into xmm6 ff00ff00ff00ff00ff00ff00ff00ff00
pand xmm0, xmm6 ; clear all but y values leaving y0y0 etc
; extract u and duplicate so each u in yuyv becomes u0u0
psrld xmm6,8 ; 00ff0000 00ff0000 00ff0000 00ff0000
pand xmm1, xmm6 ; clear all yv values leaving 0u00 etc
psrld xmm1,8 ; shift u down to get u000
pshuflw xmm1,xmm1, 0xA0 ; copy u values (words 0,0,2,2)
pshufhw xmm1,xmm1, 0xA0 ; to get u0u0
; extract v
pslld xmm6,16 ; 000000ff000000ff 000000ff000000ff
pand xmm2, xmm6 ; clear all yu values leaving 000v etc
psrld xmm2,8 ; shift v down to get 00v0
pshuflw xmm2,xmm2, 0xF5 ; copy v values (words 1,1,3,3)
pshufhw xmm2,xmm2, 0xF5 ; to get v0v0
yuv2rgb
rgba32output
; endloop
add edi,32 ; 8 pixels * 4 bytes written
add esi,16 ; 8 pixels * 2 bytes read
; pop ecx
sub ecx, 1 ; apparently sub is better than dec
jnz REPEATLOOP
ENDLOOP:
; Cleanup
pop ecx
pop esi
pop edi
mov esp, ebp
pop ebp
ret
; void Convert_YUV420P_RGBA32_SSE2(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
; Converts one row of planar YUV to 32 bit b,g,r,0 pixels. Each loop
; iteration reads 8 y bytes, 4 u bytes and 4 v bytes and writes 8 pixels
; (32 bytes); every u/v sample is used for two neighbouring pixels.
; The loop runs width >> 3 times, so any remainder of width / 8 is ignored.
; The input is fetched through MMX registers with movq/movd; the output is
; written with movntdq, so toPtr must be 16 byte aligned.
; NOTE(review): width is shifted with shr (unsigned), so a negative width
; would turn into a very large loop count - callers must pass width >= 0.
cglobal Convert_YUV420P_RGBA32_SSE2
; reserve variables
push ebp
mov ebp, esp
push edi
push esi
push ecx
push eax
push ebx
mov esi, [fromYPtr]
mov eax, [fromUPtr]
mov ebx, [fromVPtr]
mov edi, [toPtr1]
mov ecx, [width1]
; loop width / 8 times
shr ecx,3
test ecx,ecx
jng ENDLOOP1 ; nothing to do when fewer than 8 pixels were requested
REPEATLOOP1: ; loop over width / 8
; push ecx ; preserve loop counter
; YUV420 Planar input stage
movq mm0, [esi] ; fetch 8 y values (8 bit) (direct unaligned sse2 loads might be better)
movd mm1, [eax] ; fetch 4 u values
movd mm2, [ebx] ; fetch 4 v values
movq2dq xmm0, mm0 ; copy y to sse register yyyyyyyy00000000
movq2dq xmm1, mm1 ; copy u to sse register uuuu000000000000
movq2dq xmm2, mm2 ; copy v to sse register vvvv000000000000
; extract y
pxor xmm7,xmm7 ; 00000000000000000000000000000000
punpcklbw xmm0,xmm7 ; interleave xmm7 into xmm0 y0y0y0y0y0y0y0y0
; extract u and duplicate so each becomes u0u0
punpcklbw xmm1,xmm7 ; interleave xmm7 into xmm1 u0u0u0u000000000
punpcklwd xmm1,xmm7 ; interleave again u000u000u000u000
pshuflw xmm1,xmm1, 0xA0 ; copy u values (words 0,0,2,2)
pshufhw xmm1,xmm1, 0xA0 ; to get u0u0
; extract v
punpcklbw xmm2,xmm7 ; interleave xmm7 into xmm2 v0v0v0v000000000
punpcklwd xmm2,xmm7 ; interleave again v000v000v000v000
pshuflw xmm2,xmm2, 0xA0 ; copy v values (words 0,0,2,2)
pshufhw xmm2,xmm2, 0xA0 ; to get v0v0
yuv2rgb
rgba32output
; endloop
add edi,32 ; 8 pixels * 4 bytes written
add esi,8 ; 8 y bytes read
add eax,4 ; 4 u bytes read
add ebx,4 ; 4 v bytes read
; pop ecx
sub ecx, 1 ; apparently sub is better than dec
jnz REPEATLOOP1
ENDLOOP1:
; Cleanup
emms ; mm0-mm2 were used above; release the MMX state
pop ebx
pop eax
pop ecx
pop esi
pop edi
mov esp, ebp
pop ebp
ret
SECTION .note.GNU-stack noalloc noexec nowrite progbits