diff --git a/src/add-ons/media/plugins/ffmpeg/AVCodecDecoder.cpp b/src/add-ons/media/plugins/ffmpeg/AVCodecDecoder.cpp index 8b1b8e0114..74f26c4324 100644 --- a/src/add-ons/media/plugins/ffmpeg/AVCodecDecoder.cpp +++ b/src/add-ons/media/plugins/ffmpeg/AVCodecDecoder.cpp @@ -215,7 +215,6 @@ AVCodecDecoder::Setup(media_format* ioEncodedFormat, const void* infoBuffer, fBlockAlign = ioEncodedFormat->u.encoded_audio.output.buffer_size; } -printf("XXX extra data size %ld\n", infoSize); if (extraData != NULL && fExtraDataSize > 0) { TRACE("AVCodecDecoder: extra data size %ld\n", infoSize); fExtraData = new(std::nothrow) char[fExtraDataSize]; @@ -731,13 +730,13 @@ AVCodecDecoder::_DecodeVideo(void* outBuffer, int64* outFrameCount, profileCounter++; if (!(fFrame % 10)) { if (info) { - TRACE("[v] profile: d1 = %lld, d2 = %lld (%Ld) required " + printf("[v] profile: d1 = %lld, d2 = %lld (%Ld) required " "%Ld\n", decodingTime / profileCounter, conversionTime / profileCounter, fFrame, info->time_to_decode); } else { - TRACE("[v] profile: d1 = %lld, d2 = %lld (%Ld) required " + printf("[v] profile: d1 = %lld, d2 = %lld (%Ld) required " "%Ld\n", decodingTime / profileCounter, conversionTime / profileCounter, diff --git a/src/add-ons/media/plugins/ffmpeg/CpuCapabilities.cpp b/src/add-ons/media/plugins/ffmpeg/CpuCapabilities.cpp new file mode 100644 index 0000000000..2a6d84dfdd --- /dev/null +++ b/src/add-ons/media/plugins/ffmpeg/CpuCapabilities.cpp @@ -0,0 +1,146 @@ +/* + * Copyright (C) 2009 David McPaul + * + * includes code from sysinfo.c which is + * Copyright 2004-2008, Axel Dörfler, axeld@pinc-software.de. + * Copyright (c) 2002, Carlos Hasan, for Haiku. + * + * All rights reserved. Distributed under the terms of the MIT License. 
+ */
+
+#include
+#include
+
+#include "CpuCapabilities.h"
+
+CPUCapabilities::~CPUCapabilities()
+{
+}
+
+CPUCapabilities::CPUCapabilities()
+	: capabilities(0)	// "no extensions" until cpuid reports otherwise; also covers builds without __INTEL__
+{
+	#ifdef __INTEL__
+		setIntelCapabilities();
+	#endif
+	PrintCapabilities();
+}
+
+void
+CPUCapabilities::setIntelCapabilities()
+{
+	cpuid_info baseInfo;
+	cpuid_info cpuInfo;
+	int32 maxStandardFunction, maxExtendedFunction = 0;
+	// Stores the highest CAPABILITY_* level whose cpuid feature bit is set; returns early, changing nothing, if cpuid fails.
+	if (get_cpuid(&baseInfo, 0L, 0L) != B_OK) {
+		// this CPU doesn't support cpuid
+		return;
+	}
+
+	maxStandardFunction = baseInfo.eax_0.max_eax;
+	if (maxStandardFunction >= 500) {
+		maxStandardFunction = 0; /* old Pentium sample chips has cpu signature here */
+	}
+
+	/* Extended cpuid */
+
+	get_cpuid(&cpuInfo, 0x80000000, 0L);
+
+	// extended cpuid is only supported if max_eax is greater than the service id
+	if (cpuInfo.eax_0.max_eax > 0x80000000) {
+		maxExtendedFunction = cpuInfo.eax_0.max_eax & 0xff;	// NOTE(review): assigned but not read anywhere in this function
+	}
+
+	if (maxStandardFunction > 0) {
+		// Tests run from the lowest level to the highest; each hit overwrites the previous value.
+		get_cpuid(&cpuInfo, 1L, 0L);
+		if (cpuInfo.eax_1.features & (1UL << 23)) {
+			capabilities = CAPABILITY_MMX;
+		}
+
+		if (cpuInfo.eax_1.features & (1UL << 25)) {
+			capabilities = CAPABILITY_SSE1;
+		}
+
+		if (cpuInfo.eax_1.features & (1UL << 26)) {
+			capabilities = CAPABILITY_SSE2;
+		}
+
+		if (maxStandardFunction >= 1) {	// always true inside the enclosing "> 0" test
+			/* Extended features */
+			if (cpuInfo.eax_1.extended_features & (1UL << 0)) {
+				capabilities = CAPABILITY_SSE3;
+			}
+			if (cpuInfo.eax_1.extended_features & (1UL << 9)) {
+				capabilities = CAPABILITY_SSSE3;
+			}
+			if (cpuInfo.eax_1.extended_features & (1UL << 19)) {
+				capabilities = CAPABILITY_SSE41;
+			}
+			if (cpuInfo.eax_1.extended_features & (1UL << 20)) {
+				capabilities = CAPABILITY_SSE42;
+			}
+		}
+	}
+}
+
+bool
+CPUCapabilities::HasMMX()
+{
+	return capabilities >= CAPABILITY_MMX;	// levels are ordered, so >= means "this level or any higher one"
+}
+
+bool
+CPUCapabilities::HasSSE1()
+{
+	return capabilities >= CAPABILITY_SSE1;
+}
+
+bool
+CPUCapabilities::HasSSE2()
+{
+	return capabilities >= CAPABILITY_SSE2;
+}
+
+bool
+CPUCapabilities::HasSSE3()
+{
+	return capabilities >= CAPABILITY_SSE3;
+}
+
+bool
+CPUCapabilities::HasSSSE3()
+{
+	return capabilities >= CAPABILITY_SSSE3;
+}
+
+bool
+CPUCapabilities::HasSSE41()
+{
+	return capabilities >= CAPABILITY_SSE41;
+}
+
+bool
+CPUCapabilities::HasSSE42()
+{
+	return capabilities >= CAPABILITY_SSE42;
+}
+
+void
+CPUCapabilities::PrintCapabilities()
+{
+	static const char *CapArray[8] = {
+		"", "MMX", "SSE1", "SSE2", "SSE3", "SSSE3", "SSE4.1", "SSE4.2"
+	};
+	// Prints every level name from 1 up to the stored level, or "no extensions" when the level is 0.
+	printf("CPU is capable of running ");
+	if (capabilities) {
+		for (uint32 i = 1; i <= capabilities && i < sizeof(CapArray) / sizeof(CapArray[0]); i++) {	// never index past the name table
+			printf("%s ",CapArray[i]);
+		}
+	} else {
+		printf("no extensions");
+	}
+	printf("\n");
+}
diff --git a/src/add-ons/media/plugins/ffmpeg/CpuCapabilities.h b/src/add-ons/media/plugins/ffmpeg/CpuCapabilities.h
new file mode 100644
index 0000000000..c3678584a3
--- /dev/null
+++ b/src/add-ons/media/plugins/ffmpeg/CpuCapabilities.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2009 David McPaul
+ *
+ * All rights reserved. Distributed under the terms of the MIT License.
+ */
+
+#ifndef __CPU_CAPABILITIES__
+#define __CPU_CAPABILITIES__
+// Capability levels, lowest to highest. The Has*() tests compare with >=, so a level implies every lower one.
+#define CAPABILITY_MMX 1
+#define CAPABILITY_SSE1 2
+#define CAPABILITY_SSE2 3
+#define CAPABILITY_SSE3 4
+#define CAPABILITY_SSSE3 5
+#define CAPABILITY_SSE41 6
+#define CAPABILITY_SSE42 7
+
+// Holds one capability level for the host CPU; the constructor probes cpuid only when __INTEL__ is defined, then prints the result.
+class CPUCapabilities {
+	public:
+		CPUCapabilities();
+		~CPUCapabilities();
+
+		bool HasMMX();
+		bool HasSSE1();
+		bool HasSSE2();
+		bool HasSSE3();
+		bool HasSSSE3();
+		bool HasSSE41();
+		bool HasSSE42();
+
+		void PrintCapabilities();
+
+	private:
+		uint32 capabilities;	// one of the CAPABILITY_* values; 0 is reported as "no extensions"
+
+		void setIntelCapabilities();
+};
+
+#endif //__CPU_CAPABILITIES__
diff --git a/src/add-ons/media/plugins/ffmpeg/Jamfile b/src/add-ons/media/plugins/ffmpeg/Jamfile index 7546dab63d..a433dbb42a 100644 --- a/src/add-ons/media/plugins/ffmpeg/Jamfile +++ b/src/add-ons/media/plugins/ffmpeg/Jamfile @@ -19,11 +19,13 @@ Addon ffmpeg : EncoderTable.cpp FFmpegPlugin.cpp MuxerTable.cpp + CpuCapabilities.cpp gfx_conv_c.cpp gfx_conv_c_lookup.cpp -# gfx_conv_mmx.cpp +
gfx_conv_mmx.cpp gfx_util.cpp + yuvrgb.nasm : libavformat.a libavcodec.a
diff --git a/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.cpp b/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.cpp
new file mode 100644
index 0000000000..b6c6ee07be
--- /dev/null
+++ b/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.cpp
@@ -0,0 +1,84 @@
+#include "gfx_conv_mmx.h"
+#include "gfx_conv_c.h"
+
+extern "C" void _Convert_YUV420P_RGBA32_SSE2(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width);
+extern "C" void _Convert_YUV422_RGBA32_SSE2(void *fromYPtr, void *toPtr, int width);
+
+void gfx_conv_null_mmx(AVFrame *in, AVFrame *out, int width, int height) {
+	memcpy(out->data[0], in->data[0], height * in->linesize[0]);
+}
+
+void gfx_conv_yuv410p_ycbcr422_mmx(AVFrame *in, AVFrame *out, int width, int height)
+{
+//	img_convert((AVPicture *)out,PIX_FMT_YUV422P,(const AVPicture *)in,PIX_FMT_YUV410P,width,height);
+}
+
+void gfx_conv_yuv411p_ycbcr422_mmx(AVFrame *in, AVFrame *out, int width, int height)
+{
+//	img_convert((AVPicture *)out,PIX_FMT_YUV422P,(const AVPicture *)in,PIX_FMT_YUV411P,width,height);
+}
+
+void gfx_conv_yuv420p_ycbcr422_mmx(AVFrame *in, AVFrame *out, int width, int height)
+{
+//	img_convert((AVPicture *)out,PIX_FMT_YUV422P,(const AVPicture *)in,PIX_FMT_YUV420P,width,height);
+}
+
+void gfx_conv_yuv410p_rgb32_mmx(AVFrame *in, AVFrame *out, int width, int height)
+{
+//	img_convert((AVPicture *)out,PIX_FMT_RGB32,(const AVPicture *)in,PIX_FMT_YUV410P,width,height);
+}
+
+void gfx_conv_yuv411p_rgb32_mmx(AVFrame *in, AVFrame *out, int width, int height)
+{
+//	img_convert((AVPicture *)out,PIX_FMT_RGB32,(const AVPicture *)in,PIX_FMT_YUV411P,width,height);
+}
+
+void gfx_conv_yuv420p_rgb32_mmx(AVFrame *in, AVFrame *out, int width, int height)
+{
+//	img_convert((AVPicture *)out,PIX_FMT_RGB32,(const AVPicture *)in,PIX_FMT_YUV420P,width,height);
+}
+
+// Planar YUV420 to 32-bit RGBA, one assembly call per output row; other sizes go to the C converter
+void gfx_conv_yuv420p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
+{
+	// width must be divisible by 8 (the row routine works in groups of 8 pixels) and height by 2 (rows are taken in pairs)
+	if (width % 8 == 0 && height % 2 == 0) {
+
+		uint8 *ybase = (uint8 *)in->data[0];
+		uint8 *ubase = (uint8 *)in->data[1];
+		uint8 *vbase = (uint8 *)in->data[2];
+		uint8 *rgbbase = (uint8 *)out->data[0];
+
+		for (int i = 0; i < height; i += 2) {
+			_Convert_YUV420P_RGBA32_SSE2(ybase, ubase, vbase, rgbbase, width);	// First Y row
+			ybase += in->linesize[0];
+			rgbbase += out->linesize[0];
+
+			_Convert_YUV420P_RGBA32_SSE2(ybase, ubase, vbase, rgbbase, width);	// Second Y row but same u and v row
+			ybase += in->linesize[0];
+			ubase += in->linesize[1];
+			vbase += in->linesize[2];
+			rgbbase += out->linesize[0];
+		}
+	} else {
+		gfx_conv_YCbCr420p_RGB32_c(in, out, width, height);
+	}
+}
+
+// Packed YUV422 -- NOTE(review): the name says "422p" and resolve_colorspace picks this for PIX_FMT_YUV422P, yet only in->data[0] is read; confirm the intended layout
+void gfx_conv_yuv422p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
+{
+	// width must be divisible by 8 (the row routine works in groups of 8 pixels)
+	if (width % 8 == 0) {
+		uint8 *ybase = (uint8 *)in->data[0];
+		uint8 *rgbbase = (uint8 *)out->data[0];
+
+		for (int i = 0; i < height; i++) {	// rows 0..height-1 only; "<=" converted one row past the end of both buffers
+			_Convert_YUV422_RGBA32_SSE2(ybase, rgbbase, width);
+			ybase += in->linesize[0];
+			rgbbase += out->linesize[0];
+		}
+	} else {
+		gfx_conv_YCbCr422_RGB32_c(in, out, width, height);
+	}
+}
diff --git a/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.h b/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.h index 79456e4a61..7d82d08c30 100644 --- a/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.h +++ b/src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.h @@ -5,8 +5,6 @@ #include #include "libavcodec/avcodec.h" -bool IsMmxCpu(); - void gfx_conv_null_mmx(AVFrame *in, AVFrame *out, int width, int height); void gfx_conv_yuv410p_ycbcr422_mmx(AVFrame *in, AVFrame *out, int width, int height); @@ -15,6 +13,7 @@ void gfx_conv_yuv420p_ycbcr422_mmx(AVFrame *in, AVFrame *out, int width, int hei void gfx_conv_yuv410p_rgb32_mmx(AVFrame *in, AVFrame *out, int width, int height); void gfx_conv_yuv411p_rgb32_mmx(AVFrame *in, AVFrame *out, int width, int height); -void gfx_conv_yuv420p_rgb32_mmx(AVFrame *in, AVFrame *out, int width, int height); +void gfx_conv_yuv420p_rgba32_sse2(AVFrame *in, AVFrame
*out, int width, int height); +void gfx_conv_yuv422p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height); #endif diff --git a/src/add-ons/media/plugins/ffmpeg/gfx_util.cpp b/src/add-ons/media/plugins/ffmpeg/gfx_util.cpp index a8b05a5d3b..d6ed18c573 100644 --- a/src/add-ons/media/plugins/ffmpeg/gfx_util.cpp +++ b/src/add-ons/media/plugins/ffmpeg/gfx_util.cpp @@ -3,6 +3,7 @@ #include "gfx_util.h" #include "gfx_conv_c.h" #include "gfx_conv_mmx.h" +#include "CpuCapabilities.h" /* * ref docs @@ -15,61 +16,51 @@ #define TRACE(a...) #endif -//#define INCLUDE_MMX defined(__INTEL__) -#define INCLUDE_MMX 0 - // this function will try to find the best colorspaces for both the ff-codec and // the Media Kit sides. gfx_convert_func resolve_colorspace(color_space colorSpace, PixelFormat pixelFormat) { -#if INCLUDE_MMX - bool mmx = IsMmxCpu(); -#endif +CPUCapabilities cpu; switch (colorSpace) { case B_RGB32: if (pixelFormat == PIX_FMT_YUV410P) { - #if INCLUDE_MMX - if (mmx) { - TRACE("resolve_colorspace: gfx_conv_yuv410p_rgb32_mmx\n"); - return gfx_conv_yuv410p_rgb32_mmx; - } else - #endif - { +// if (cpu.HasMMX()) { +// TRACE("resolve_colorspace: gfx_conv_yuv410p_rgb32_mmx\n"); +// return gfx_conv_yuv410p_rgb32_mmx; +// } else { TRACE("resolve_colorspace: gfx_conv_yuv410p_rgb32_c\n"); return gfx_conv_yuv410p_rgb32_c; - } +// } } if (pixelFormat == PIX_FMT_YUV411P) { - #if INCLUDE_MMX - if (mmx) { - TRACE("resolve_colorspace: gfx_conv_yuv411p_rgb32_mmx\n"); - return gfx_conv_yuv411p_rgb32_mmx; - } else - #endif - { +// if (cpu.HasMMX()) { +// TRACE("resolve_colorspace: gfx_conv_yuv411p_rgb32_mmx\n"); +// return gfx_conv_yuv411p_rgb32_mmx; +// } else { TRACE("resolve_colorspace: gfx_conv_yuv411p_rgb32_c\n"); return gfx_conv_yuv411p_rgb32_c; - } +// } } if (pixelFormat == PIX_FMT_YUV420P || pixelFormat == PIX_FMT_YUVJ420P) { - #if INCLUDE_MMX - if (mmx) { - TRACE("resolve_colorspace: gfx_conv_yuv420p_rgb32_mmx\n"); - return gfx_conv_yuv420p_rgb32_mmx; - } else - #endif - { 
- TRACE("resolve_colorspace: gfx_conv_yuv420p_rgb32_c\n"); + if (cpu.HasSSE2()) { + TRACE("resolve_colorspace: gfx_conv_yuv420p_rgba32_sse2\n"); + return gfx_conv_yuv420p_rgba32_sse2; + } else { + TRACE("resolve_colorspace: gfx_conv_YCbCr420p_RGB32_c\n"); return gfx_conv_YCbCr420p_RGB32_c; } } if (pixelFormat == PIX_FMT_YUV422P || pixelFormat == PIX_FMT_YUVJ422P) { - return gfx_conv_YCbCr422_RGB32_c; + if (cpu.HasSSE2()) { + return gfx_conv_yuv422p_rgba32_sse2; + } else { + return gfx_conv_YCbCr422_RGB32_c; + } } TRACE("resolve_colorspace: %s => B_RGB32: NULL\n", pixfmt_to_string(pixelFormat)); @@ -86,55 +77,43 @@ gfx_convert_func resolve_colorspace(color_space colorSpace, PixelFormat pixelFor case B_YCbCr422: if (pixelFormat == PIX_FMT_YUV410P) { - #if INCLUDE_MMX - if (mmx) { - TRACE("resolve_colorspace: gfx_conv_yuv410p_ycbcr422_mmx\n"); - return gfx_conv_yuv410p_ycbcr422_mmx; - } else - #endif - { +// if (cpu.HasMMX()) { +// TRACE("resolve_colorspace: gfx_conv_yuv410p_ycbcr422_mmx\n"); +// return gfx_conv_yuv410p_ycbcr422_mmx; +// } else { TRACE("resolve_colorspace: gfx_conv_yuv410p_ycbcr422_c\n"); return gfx_conv_yuv410p_ycbcr422_c; - } +// } } if (pixelFormat == PIX_FMT_YUV411P) { - #if INCLUDE_MMX - if (mmx) { - TRACE("resolve_colorspace: gfx_conv_yuv411p_ycbcr422_mmx\n"); - return gfx_conv_yuv411p_ycbcr422_mmx; - } else - #endif - { +// if (cpu.HasMMX()) { +// TRACE("resolve_colorspace: gfx_conv_yuv411p_ycbcr422_mmx\n"); +// return gfx_conv_yuv411p_ycbcr422_mmx; +// } else { TRACE("resolve_colorspace: gfx_conv_yuv411p_ycbcr422_c\n"); return gfx_conv_yuv411p_ycbcr422_c; - } +// } } if (pixelFormat == PIX_FMT_YUV420P || pixelFormat == PIX_FMT_YUVJ420P) { - #if INCLUDE_MMX - if (mmx) { - TRACE("resolve_colorspace: gfx_conv_yuv420p_ycbcr422_mmx\n"); - return gfx_conv_yuv420p_ycbcr422_mmx; - } else - #endif - { +// if (cpu.HasMMX()) { +// TRACE("resolve_colorspace: gfx_conv_yuv420p_ycbcr422_mmx\n"); +// return gfx_conv_yuv420p_ycbcr422_mmx; +// } else { 
TRACE("resolve_colorspace: gfx_conv_yuv420p_ycbcr422_c\n"); return gfx_conv_yuv420p_ycbcr422_c; - } +// } } if (pixelFormat == PIX_FMT_YUYV422) { - #if INCLUDE_MMX - if (mmx) { - TRACE("resolve_colorspace: PIX_FMT_YUV422 => B_YCbCr422: gfx_conv_null_mmx\n"); - return gfx_conv_null_mmx; - } else - #endif - { +// if (cpu.HasMMX()) { +// TRACE("resolve_colorspace: PIX_FMT_YUV422 => B_YCbCr422: gfx_conv_null_mmx\n"); +// return gfx_conv_null_mmx; +// } else { TRACE("resolve_colorspace: PIX_FMT_YUV422 => B_YCbCr422: gfx_conv_null_c\n"); return gfx_conv_null_c; - } +// } } TRACE("resolve_colorspace: %s => B_YCbCr422: NULL\n", pixfmt_to_string(pixelFormat));
diff --git a/src/add-ons/media/plugins/ffmpeg/yuvrgb.nasm b/src/add-ons/media/plugins/ffmpeg/yuvrgb.nasm
new file mode 100644
index 0000000000..8103e2286c
--- /dev/null
+++ b/src/add-ons/media/plugins/ffmpeg/yuvrgb.nasm
@@ -0,0 +1,274 @@
+;
+; Copyright (C) 2009 David McPaul
+;
+; All rights reserved. Distributed under the terms of the MIT License.
+;
+
+; A rather unoptimised set of yuv to rgb converters
+; does 8 pixels at a time
+
+; inputer:
+; reads 128bits of yuv 8 bit data and puts
+; the y values converted to 16 bit in xmm0
+; the u values converted to 16 bit and duplicated into xmm1
+; the v values converted to 16 bit and duplicated into xmm2
+
+; conversion:
+; does the yuv to rgb conversion using 16 bit fixed point and the
+; results are placed into the following registers as 8 bit clamped values
+; r values in xmm3
+; g values in xmm4
+; b values in xmm5
+
+; outputer:
+; writes out the rgba pixels as 8 bit values with 0 for alpha
+
+; xmm6 used for scratch
+; xmm7 used for scratch
+
+%macro cglobal 1
+	global _%1
+	%define %1 _%1
+	align 16
+%1:
+%endmacro
+
+; conversion code
+%macro yuv2rgb 0
+; u = u - 128
+; v = v - 128
+; r = y + v + v >> 2 + v >> 3 + v >> 5
+; g = y - (u >> 2 + u >> 4 + u >> 5) - (v >> 1 + v >> 3 + v >> 4 + v >> 5)
+; b = y + u + u >> 1 + u >> 2 + u >> 6
+; subtract 16 from y
+	movdqa xmm7, [Const16]			; loads a constant using data cache
+	psubsw xmm0,xmm7				; y = y - 16
+; subtract 128 from u and v
+;	mov eax,128*10001H				; load a constant using instruction cache
+;	movd xmm7,eax					; but requires eax to be saved
+;	pshufd xmm7,xmm7,0				; and uses more instructions
+	movdqa xmm7, [Const128]			; loads a constant using data cache (slower on first fetch but then cached)
+	psubsw xmm1,xmm7				; u = u - 128
+	psubsw xmm2,xmm7				; v = v - 128
+; load r,g,b with y
+	movdqa xmm3,xmm0				; r = y
+	pshufd xmm4,xmm0, 0xE4			; g = y
+	movdqa xmm5,xmm0				; b = y
+; r = r + v + v >> 2 + v >> 3 + v >> 5
+	paddsw xmm3, xmm2				; add v to r
+	movdqa xmm6, xmm2				; move v to scratch
+	psraw xmm6,2					; divide by 4
+	paddsw xmm3, xmm6				; and add to r
+	psraw xmm6,1					; divide by 2
+	paddsw xmm3, xmm6				; and add to r
+	psraw xmm6,2					; divide by 4
+	paddsw xmm3, xmm6				; and add to r
+
+; g = y - u >> 2 - u >> 4 - u >> 5 - v >> 1 - v >> 3 - v >> 4 - v >> 5
+	movdqa xmm6,xmm1				; move u to scratch
+	psraw xmm6,2					; divide by 4
+	psubsw xmm4,xmm6				; subtract from g
+	psraw xmm6,2					; divide by 4
+	psubsw xmm4,xmm6				; subtract from g
+	psraw xmm6,1					; divide by 2
+	psubsw xmm4,xmm6				; subtract from g
+
+	movdqa xmm6,xmm2				; move v to scratch
+	psraw xmm6,1					; divide by 2
+	psubsw xmm4,xmm6				; subtract from g
+	psraw xmm6,2					; divide by 4
+	psubsw xmm4,xmm6				; subtract from g
+	psraw xmm6,1					; divide by 2
+	psubsw xmm4,xmm6				; subtract from g
+	psraw xmm6,1					; divide by 2
+	psubsw xmm4,xmm6				; subtract from g
+; b = y + u + u >> 1 + u >> 2 + u >> 6
+	paddsw xmm5, xmm1				; add u to b
+	movdqa xmm6, xmm1				; move u to scratch
+	psraw xmm6,1					; divide by 2
+	paddsw xmm5, xmm6				; and add to b
+	psraw xmm6,1					; divide by 2
+	paddsw xmm5, xmm6				; and add to b
+	psraw xmm6,4					; divide by 16 (scratch now holds u >> 6)
+	paddsw xmm5, xmm6				; and add to b
+%endmacro
+
+; outputer
+%macro rgba32output 0
+; clamp values
+	pxor xmm7,xmm7
+	packuswb xmm3,xmm7				; clamp to 0,255 and pack R to 8 bit per pixel
+	packuswb xmm4,xmm7				; clamp to 0,255 and pack G to 8 bit per pixel
+	packuswb xmm5,xmm7				; clamp to 0,255 and pack B to 8 bit per pixel
+; convert to bgra32 packed
+	punpcklbw xmm3,xmm7				; r0r0r0r0r0r0r0r0
+	punpcklbw xmm5,xmm4				; bgbgbgbgbgbgbgbg
+	movdqa xmm0, xmm5				; save gb values
+	punpcklwd xmm5,xmm3				; lower half bgr0bgr0bgr0bgr0
+	punpckhwd xmm0,xmm3				; upper half bgr0bgr0bgr0bgr0
+; write to output ptr
+	movntdq [edi], xmm5				; output first 4 pixels bypassing cache (movntdq needs edi 16-byte aligned)
+	movntdq [edi+16], xmm0			; output second 4 pixels bypassing cache
+%endmacro
+
+SECTION .data align=16
+
+Const16	dw 16
+	dw 16
+	dw 16
+	dw 16
+	dw 16
+	dw 16
+	dw 16
+	dw 16
+
+Const128	dw 128
+	dw 128
+	dw 128
+	dw 128
+	dw 128
+	dw 128
+	dw 128
+	dw 128
+
+; void Convert_YUV422_RGBA32_SSE2(void *fromPtr, void *toPtr, int width) -- converts width / 8 groups of 8 packed pixels
+width equ ebp+16
+toPtr equ ebp+12
+fromPtr equ ebp+8
+
+; void Convert_YUV420P_RGBA32_SSE2(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width) -- converts width / 8 groups of 8 pixels
+width1 equ ebp+24
+toPtr1 equ ebp+20
+fromVPtr equ ebp+16
+fromUPtr equ ebp+12
+fromYPtr equ ebp+8
+
+SECTION .text align=16
+
+cglobal Convert_YUV422_RGBA32_SSE2
+; reserve variables
+	push ebp
+	mov ebp, esp
+	push edi
+	push esi
+	push ecx
+
+	mov esi, [fromPtr]
+	mov edi, [toPtr]
+	mov ecx, [width]
+	prefetchnta [esi]				; hint that we will be loading our data outside of cache
+; loop width / 8 times
+	shr ecx,3
+	test ecx,ecx
+	jng ENDLOOP
+REPEATLOOP:							; loop over width / 8
+;	push ecx						; preserve loop counter
+; YUV422 packed inputer
+	movdqa xmm0, [esi]				; should have yuyv yuyv yuyv yuyv (movdqa needs esi 16-byte aligned)
+	pshufd xmm1, xmm0, 0xE4			; copy to xmm1
+	movdqa xmm2, xmm0				; copy to xmm2
+; extract y
+	pxor xmm7,xmm7					; 00000000000000000000000000000000
+	pcmpeqd xmm6,xmm6				; ffffffffffffffffffffffffffffffff
+	punpcklbw xmm6,xmm7				; interleave xmm7 into xmm6 ff00ff00ff00ff00ff00ff00ff00ff00
+	pand xmm0, xmm6					; clear all but y values leaving y0y0 etc
+; extract u and duplicate so each u in yuyv becomes 0u0u
+	psrld xmm6,8					; 00ff0000 00ff0000 00ff0000 00ff0000
+	pand xmm1, xmm6					; clear all yv values leaving 0u00 etc
+	psrld xmm1,8					; rotate u to get u000
+	pshuflw xmm1,xmm1, 0xA0			; copy u values
+	pshufhw xmm1,xmm1, 0xA0			; to get u0u0
+; extract v
+	pslld xmm6,16					; 000000ff000000ff 000000ff000000ff
+	pand xmm2, xmm6					; clear all yu values leaving 000v etc
+	psrld xmm2,8					; rotate v to get 00v0
+	pshuflw xmm2,xmm2, 0xF5			; copy v values
+	pshufhw xmm2,xmm2, 0xF5			; to get v0v0
+
+yuv2rgb
+
+rgba32output
+
+; endloop
+	add edi,32
+	add esi,16
+;	pop ecx
+	sub ecx, 1						; apparently sub is better than dec
+	jnz REPEATLOOP
+ENDLOOP:
+; Cleanup
+	pop ecx
+	pop esi
+	pop edi
+	mov esp, ebp
+	pop ebp
+	ret
+
+cglobal Convert_YUV420P_RGBA32_SSE2
+; reserve variables
+	push ebp
+	mov ebp, esp
+	push edi
+	push esi
+	push ecx
+	push eax
+	push ebx
+
+	mov esi, [fromYPtr]
+	mov eax, [fromUPtr]
+	mov ebx, [fromVPtr]
+	mov edi, [toPtr1]
+	mov ecx, [width1]
+; loop width / 8 times
+	shr ecx,3
+	test ecx,ecx
+	jng ENDLOOP1
+REPEATLOOP1:						; loop over width / 8
+;	push ecx						; preserve loop counter
+; YUV420 Planar inputer
+	movq mm0, [esi]					; fetch 8 y values (8 bit) (direct unaligned sse2 loads might be better)
+	movd mm1, [eax]					; fetch 4 u values
+	movd mm2, [ebx]					; fetch 4 v values
+
+	movq2dq xmm0, mm0				; copy y to sse register yyyyyyyy00000000
+	movq2dq xmm1, mm1				; copy u to sse register uuuu000000000000
+	movq2dq xmm2, mm2				; copy v to sse register vvvv000000000000
+; extract y
+	pxor xmm7,xmm7					; 00000000000000000000000000000000
+	punpcklbw xmm0,xmm7				; interleave xmm7 into xmm0 y0y0y0y0y0y0y0y0
+; extract u and duplicate so each becomes 0u0u
+	punpcklbw xmm1,xmm7				; interleave xmm7 into xmm1 u0u0u0u000000000
+	punpcklwd xmm1,xmm7				; interleave again u000u000u000u000
+	pshuflw xmm1,xmm1, 0xA0			; copy u values
+	pshufhw xmm1,xmm1, 0xA0			; to get u0u0
+; extract v
+	punpcklbw xmm2,xmm7				; interleave xmm7 into xmm2 v0v0v0v000000000
+	punpcklwd xmm2,xmm7				; interleave again v000v000v000v000
+	pshuflw xmm2,xmm2, 0xA0			; copy v values
+	pshufhw xmm2,xmm2, 0xA0			; to get v0v0
+
+yuv2rgb
+
+rgba32output
+
+; endloop
+	add edi,32
+	add esi,8
+	add eax,4
+	add ebx,4
+;	pop ecx
+	sub ecx, 1						; apparently sub is better than dec
+	jnz REPEATLOOP1
+ENDLOOP1:
+; Cleanup
+	emms
+	pop ebx
+	pop eax
+	pop ecx
+	pop esi
+	pop edi
+	mov esp, ebp
+	pop ebp
+	ret
+
+SECTION .note.GNU-stack noalloc noexec nowrite progbits