Added SSE2 yuv to rgb conversion code. This code needs to move into the media kit though
git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@33200 a95241bf-73f2-0310-859d-f6bbb57e9c96
This commit is contained in:
parent
6a400ee80f
commit
6bfbea62cd
@ -215,7 +215,6 @@ AVCodecDecoder::Setup(media_format* ioEncodedFormat, const void* infoBuffer,
|
||||
fBlockAlign
|
||||
= ioEncodedFormat->u.encoded_audio.output.buffer_size;
|
||||
}
|
||||
printf("XXX extra data size %ld\n", infoSize);
|
||||
if (extraData != NULL && fExtraDataSize > 0) {
|
||||
TRACE("AVCodecDecoder: extra data size %ld\n", infoSize);
|
||||
fExtraData = new(std::nothrow) char[fExtraDataSize];
|
||||
@ -731,13 +730,13 @@ AVCodecDecoder::_DecodeVideo(void* outBuffer, int64* outFrameCount,
|
||||
profileCounter++;
|
||||
if (!(fFrame % 10)) {
|
||||
if (info) {
|
||||
TRACE("[v] profile: d1 = %lld, d2 = %lld (%Ld) required "
|
||||
printf("[v] profile: d1 = %lld, d2 = %lld (%Ld) required "
|
||||
"%Ld\n",
|
||||
decodingTime / profileCounter,
|
||||
conversionTime / profileCounter,
|
||||
fFrame, info->time_to_decode);
|
||||
} else {
|
||||
TRACE("[v] profile: d1 = %lld, d2 = %lld (%Ld) required "
|
||||
printf("[v] profile: d1 = %lld, d2 = %lld (%Ld) required "
|
||||
"%Ld\n",
|
||||
decodingTime / profileCounter,
|
||||
conversionTime / profileCounter,
|
||||
|
146
src/add-ons/media/plugins/ffmpeg/CpuCapabilities.cpp
Normal file
146
src/add-ons/media/plugins/ffmpeg/CpuCapabilities.cpp
Normal file
@ -0,0 +1,146 @@
|
||||
/*
|
||||
* Copyright (C) 2009 David McPaul
|
||||
*
|
||||
* includes code from sysinfo.c which is
|
||||
* Copyright 2004-2008, Axel Dörfler, axeld@pinc-software.de.
|
||||
* Copyright (c) 2002, Carlos Hasan, for Haiku.
|
||||
*
|
||||
* All rights reserved. Distributed under the terms of the MIT License.
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
#include <cpu_type.h>
|
||||
|
||||
#include "CpuCapabilities.h"
|
||||
|
||||
// Nothing to release: the object only stores the detected capability level.
CPUCapabilities::~CPUCapabilities()
{
}
|
||||
|
||||
// Detects the SIMD capability level of the CPU and prints it to stdout.
//
// The level starts out as 0 ("no extensions") so that it has a defined value
// on builds without __INTEL__ and whenever detection bails out early;
// PrintCapabilities() and the HasXxx() accessors read it unconditionally.
CPUCapabilities::CPUCapabilities()
	:
	capabilities(0)
{
#ifdef __INTEL__
	setIntelCapabilities();
#endif

	PrintCapabilities();
}
|
||||
|
||||
void
|
||||
CPUCapabilities::setIntelCapabilities()
|
||||
{
|
||||
cpuid_info baseInfo;
|
||||
cpuid_info cpuInfo;
|
||||
int32 maxStandardFunction, maxExtendedFunction = 0;
|
||||
|
||||
if (get_cpuid(&baseInfo, 0L, 0L) != B_OK) {
|
||||
// this CPU doesn't support cpuid
|
||||
return;
|
||||
}
|
||||
|
||||
maxStandardFunction = baseInfo.eax_0.max_eax;
|
||||
if (maxStandardFunction >= 500) {
|
||||
maxStandardFunction = 0; /* old Pentium sample chips has cpu signature here */
|
||||
}
|
||||
|
||||
/* Extended cpuid */
|
||||
|
||||
get_cpuid(&cpuInfo, 0x80000000, 0L);
|
||||
|
||||
// extended cpuid is only supported if max_eax is greater than the service id
|
||||
if (cpuInfo.eax_0.max_eax > 0x80000000) {
|
||||
maxExtendedFunction = cpuInfo.eax_0.max_eax & 0xff;
|
||||
}
|
||||
|
||||
if (maxStandardFunction > 0) {
|
||||
|
||||
get_cpuid(&cpuInfo, 1L, 0L);
|
||||
if (cpuInfo.eax_1.features & (1UL << 23)) {
|
||||
capabilities = CAPABILITY_MMX;
|
||||
}
|
||||
|
||||
if (cpuInfo.eax_1.features & (1UL << 25)) {
|
||||
capabilities = CAPABILITY_SSE1;
|
||||
}
|
||||
|
||||
if (cpuInfo.eax_1.features & (1UL << 26)) {
|
||||
capabilities = CAPABILITY_SSE2;
|
||||
}
|
||||
|
||||
if (maxStandardFunction >= 1) {
|
||||
/* Extended features */
|
||||
if (cpuInfo.eax_1.extended_features & (1UL << 0)) {
|
||||
capabilities = CAPABILITY_SSE3;
|
||||
}
|
||||
if (cpuInfo.eax_1.extended_features & (1UL << 9)) {
|
||||
capabilities = CAPABILITY_SSSE3;
|
||||
}
|
||||
if (cpuInfo.eax_1.extended_features & (1UL << 19)) {
|
||||
capabilities = CAPABILITY_SSE41;
|
||||
}
|
||||
if (cpuInfo.eax_1.extended_features & (1UL << 20)) {
|
||||
capabilities = CAPABILITY_SSE42;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool
|
||||
CPUCapabilities::HasMMX()
|
||||
{
|
||||
return capabilities >= CAPABILITY_MMX;
|
||||
}
|
||||
|
||||
bool
|
||||
CPUCapabilities::HasSSE1()
|
||||
{
|
||||
return capabilities >= CAPABILITY_SSE1;
|
||||
}
|
||||
|
||||
bool
|
||||
CPUCapabilities::HasSSE2()
|
||||
{
|
||||
return capabilities >= CAPABILITY_SSE2;
|
||||
}
|
||||
|
||||
bool
|
||||
CPUCapabilities::HasSSE3()
|
||||
{
|
||||
return capabilities >= CAPABILITY_SSE3;
|
||||
}
|
||||
|
||||
bool
|
||||
CPUCapabilities::HasSSSE3()
|
||||
{
|
||||
return capabilities >= CAPABILITY_SSSE3;
|
||||
}
|
||||
|
||||
bool
|
||||
CPUCapabilities::HasSSE41()
|
||||
{
|
||||
return capabilities >= CAPABILITY_SSE41;
|
||||
}
|
||||
|
||||
bool
|
||||
CPUCapabilities::HasSSE42()
|
||||
{
|
||||
return capabilities >= CAPABILITY_SSE42;
|
||||
}
|
||||
|
||||
void
|
||||
CPUCapabilities::PrintCapabilities()
|
||||
{
|
||||
static const char *CapArray[8] = {
|
||||
"", "MMX", "SSE1", "SSE2", "SSE3", "SSSE3", "SSE4.1", "SSE4.2"
|
||||
};
|
||||
|
||||
printf("CPU is capable of running ");
|
||||
if (capabilities) {
|
||||
for (uint32 i=1;i<=capabilities;i++) {
|
||||
printf("%s ",CapArray[i]);
|
||||
}
|
||||
} else {
|
||||
printf("no extensions");
|
||||
}
|
||||
printf("\n");
|
||||
}
|
40
src/add-ons/media/plugins/ffmpeg/CpuCapabilities.h
Normal file
40
src/add-ons/media/plugins/ffmpeg/CpuCapabilities.h
Normal file
@ -0,0 +1,40 @@
|
||||
/*
|
||||
* Copyright (C) 2009 David McPaul
|
||||
*
|
||||
* All rights reserved. Distributed under the terms of the MIT License.
|
||||
*/
|
||||
|
||||
#ifndef __CPU_CAPABILITIES__
|
||||
#define __CPU_CAPABILITIES__
|
||||
|
||||
// Capability levels stored by CPUCapabilities. The numeric order matters:
// the HasXxx() accessors compare with >=, so each level implies all lower
// ones, and PrintCapabilities() uses the value as an index into its name table.
#define CAPABILITY_MMX		1
#define CAPABILITY_SSE1		2
#define CAPABILITY_SSE2		3
#define CAPABILITY_SSE3		4
#define CAPABILITY_SSSE3	5
#define CAPABILITY_SSE41	6
#define CAPABILITY_SSE42	7
|
||||
|
||||
|
||||
class CPUCapabilities {
|
||||
public:
|
||||
CPUCapabilities();
|
||||
~CPUCapabilities();
|
||||
|
||||
bool HasMMX();
|
||||
bool HasSSE1();
|
||||
bool HasSSE2();
|
||||
bool HasSSE3();
|
||||
bool HasSSSE3();
|
||||
bool HasSSE41();
|
||||
bool HasSSE42();
|
||||
|
||||
void PrintCapabilities();
|
||||
|
||||
private:
|
||||
uint32 capabilities;
|
||||
|
||||
void setIntelCapabilities();
|
||||
};
|
||||
|
||||
#endif //__CPU_CAPABILITIES__
|
@ -19,11 +19,13 @@ Addon ffmpeg :
|
||||
EncoderTable.cpp
|
||||
FFmpegPlugin.cpp
|
||||
MuxerTable.cpp
|
||||
CpuCapabilities.cpp
|
||||
|
||||
gfx_conv_c.cpp
|
||||
gfx_conv_c_lookup.cpp
|
||||
# gfx_conv_mmx.cpp
|
||||
gfx_conv_mmx.cpp
|
||||
gfx_util.cpp
|
||||
yuvrgb.nasm
|
||||
:
|
||||
libavformat.a
|
||||
libavcodec.a
|
||||
|
84
src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.cpp
Normal file
84
src/add-ons/media/plugins/ffmpeg/gfx_conv_mmx.cpp
Normal file
@ -0,0 +1,84 @@
|
||||
#include "gfx_conv_mmx.h"
|
||||
#include "gfx_conv_c.h"
|
||||
|
||||
extern "C" void _Convert_YUV420P_RGBA32_SSE2(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width);
|
||||
extern "C" void _Convert_YUV422_RGBA32_SSE2(void *fromYPtr, void *toPtr, int width);
|
||||
|
||||
// "Conversion" for frames that already have the target layout: copies
// height * in->linesize[0] bytes from plane 0 of `in` to plane 0 of `out`.
// `width` is unused, and out->linesize[0] is not consulted.
void gfx_conv_null_mmx(AVFrame *in, AVFrame *out, int width, int height)
{
	const int byteCount = height * in->linesize[0];
	memcpy(out->data[0], in->data[0], byteCount);
}
|
||||
|
||||
// Placeholder for a YUV410P -> YCbCr422 converter: it currently does nothing
// and leaves `out` untouched. The disabled call below is kept as a record of
// the img_convert() based conversion this stub once wrapped.
void gfx_conv_yuv410p_ycbcr422_mmx(AVFrame *in, AVFrame *out, int width, int height)
{
//	img_convert((AVPicture *)out, PIX_FMT_YUV422P, (const AVPicture *)in, PIX_FMT_YUV410P, width, height);
}
|
||||
|
||||
// Placeholder for a YUV411P -> YCbCr422 converter: it currently does nothing
// and leaves `out` untouched. The disabled call below is kept as a record of
// the img_convert() based conversion this stub once wrapped.
void gfx_conv_yuv411p_ycbcr422_mmx(AVFrame *in, AVFrame *out, int width, int height)
{
//	img_convert((AVPicture *)out, PIX_FMT_YUV422P, (const AVPicture *)in, PIX_FMT_YUV411P, width, height);
}
|
||||
|
||||
// Placeholder for a YUV420P -> YCbCr422 converter: it currently does nothing
// and leaves `out` untouched. The disabled call below is kept as a record of
// the img_convert() based conversion this stub once wrapped.
void gfx_conv_yuv420p_ycbcr422_mmx(AVFrame *in, AVFrame *out, int width, int height)
{
//	img_convert((AVPicture *)out, PIX_FMT_YUV422P, (const AVPicture *)in, PIX_FMT_YUV420P, width, height);
}
|
||||
|
||||
// Placeholder for a YUV410P -> RGB32 converter: it currently does nothing
// and leaves `out` untouched. The disabled call below is kept as a record of
// the img_convert() based conversion this stub once wrapped.
void gfx_conv_yuv410p_rgb32_mmx(AVFrame *in, AVFrame *out, int width, int height)
{
//	img_convert((AVPicture *)out, PIX_FMT_RGB32, (const AVPicture *)in, PIX_FMT_YUV410P, width, height);
}
|
||||
|
||||
// Placeholder for a YUV411P -> RGB32 converter: it currently does nothing
// and leaves `out` untouched. The disabled call below is kept as a record of
// the img_convert() based conversion this stub once wrapped.
void gfx_conv_yuv411p_rgb32_mmx(AVFrame *in, AVFrame *out, int width, int height)
{
//	img_convert((AVPicture *)out, PIX_FMT_RGB32, (const AVPicture *)in, PIX_FMT_YUV411P, width, height);
}
|
||||
|
||||
// Placeholder for a YUV420P -> RGB32 converter: it currently does nothing
// and leaves `out` untouched. The disabled call below is kept as a record of
// the img_convert() based conversion this stub once wrapped.
void gfx_conv_yuv420p_rgb32_mmx(AVFrame *in, AVFrame *out, int width, int height)
{
//	img_convert((AVPicture *)out, PIX_FMT_RGB32, (const AVPicture *)in, PIX_FMT_YUV420P, width, height);
}
|
||||
|
||||
// Planar YUV420
|
||||
// Converts a planar YUV420 frame (Y in data[0], U in data[1], V in data[2])
// to 32 bit RGB in out->data[0], one row at a time, using the SSE2 row routine.
//
// The row routine consumes 8 pixels per iteration and each chroma row is
// shared by two luma rows, so width must be divisible by 8 and height by 2.
// It writes its output with movntdq, which requires a 16 byte aligned
// destination, so the output base address and row stride must both be
// multiples of 16. Frames that do not meet these conditions are handed to the
// C implementation instead.
void gfx_conv_yuv420p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
{
	const bool sizeOk = width % 8 == 0 && height % 2 == 0;
	const bool outputAligned = (unsigned long)out->data[0] % 16 == 0
		&& out->linesize[0] % 16 == 0;

	if (!sizeOk || !outputAligned) {
		gfx_conv_YCbCr420p_RGB32_c(in, out, width, height);
		return;
	}

	uint8 *yRow = (uint8 *)in->data[0];
	uint8 *uRow = (uint8 *)in->data[1];
	uint8 *vRow = (uint8 *)in->data[2];
	uint8 *rgbRow = (uint8 *)out->data[0];

	for (int row = 0; row < height; row += 2) {
		// first luma row of the pair
		_Convert_YUV420P_RGBA32_SSE2(yRow, uRow, vRow, rgbRow, width);
		yRow += in->linesize[0];
		rgbRow += out->linesize[0];

		// second luma row reuses the same chroma row
		_Convert_YUV420P_RGBA32_SSE2(yRow, uRow, vRow, rgbRow, width);
		yRow += in->linesize[0];
		rgbRow += out->linesize[0];

		// chroma advances once per pair of luma rows
		uRow += in->linesize[1];
		vRow += in->linesize[2];
	}
}
|
||||
|
||||
// Packed YUV422
|
||||
// Converts a YUV422 frame read from in->data[0] to 32 bit RGB in out->data[0],
// one row at a time, using the SSE2 row routine.
//
// The row routine consumes 8 pixels per iteration, reads with movdqa and
// writes with movntdq; both instructions require 16 byte aligned addresses.
// Width must therefore be divisible by 8, and the base address and row stride
// of both buffers must be multiples of 16. Frames that do not meet these
// conditions are handed to the C implementation instead.
//
// NOTE(review): the row routine reads interleaved "yuyv" data from a single
// plane, while this function's name says 422p (planar) - confirm that callers
// only pass packed frames.
void gfx_conv_yuv422p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
{
	const bool inputAligned = (unsigned long)in->data[0] % 16 == 0
		&& in->linesize[0] % 16 == 0;
	const bool outputAligned = (unsigned long)out->data[0] % 16 == 0
		&& out->linesize[0] % 16 == 0;

	if (width % 8 != 0 || !inputAligned || !outputAligned) {
		gfx_conv_YCbCr422_RGB32_c(in, out, width, height);
		return;
	}

	uint8 *srcRow = (uint8 *)in->data[0];
	uint8 *rgbRow = (uint8 *)out->data[0];

	// Exactly `height` rows: the previous "<= height" bound converted one row
	// beyond the end of both buffers.
	for (int row = 0; row < height; row++) {
		_Convert_YUV422_RGBA32_SSE2(srcRow, rgbRow, width);
		srcRow += in->linesize[0];
		rgbRow += out->linesize[0];
	}
}
|
@ -5,8 +5,6 @@
|
||||
#include <GraphicsDefs.h>
|
||||
#include "libavcodec/avcodec.h"
|
||||
|
||||
bool IsMmxCpu();
|
||||
|
||||
void gfx_conv_null_mmx(AVFrame *in, AVFrame *out, int width, int height);
|
||||
|
||||
void gfx_conv_yuv410p_ycbcr422_mmx(AVFrame *in, AVFrame *out, int width, int height);
|
||||
@ -15,6 +13,7 @@ void gfx_conv_yuv420p_ycbcr422_mmx(AVFrame *in, AVFrame *out, int width, int hei
|
||||
|
||||
void gfx_conv_yuv410p_rgb32_mmx(AVFrame *in, AVFrame *out, int width, int height);
|
||||
void gfx_conv_yuv411p_rgb32_mmx(AVFrame *in, AVFrame *out, int width, int height);
|
||||
void gfx_conv_yuv420p_rgb32_mmx(AVFrame *in, AVFrame *out, int width, int height);
|
||||
void gfx_conv_yuv420p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height);
|
||||
void gfx_conv_yuv422p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height);
|
||||
|
||||
#endif
|
||||
|
@ -3,6 +3,7 @@
|
||||
#include "gfx_util.h"
|
||||
#include "gfx_conv_c.h"
|
||||
#include "gfx_conv_mmx.h"
|
||||
#include "CpuCapabilities.h"
|
||||
|
||||
/*
|
||||
* ref docs
|
||||
@ -15,61 +16,51 @@
|
||||
#define TRACE(a...)
|
||||
#endif
|
||||
|
||||
//#define INCLUDE_MMX defined(__INTEL__)
|
||||
#define INCLUDE_MMX 0
|
||||
|
||||
// this function will try to find the best colorspaces for both the ff-codec and
|
||||
// the Media Kit sides.
|
||||
gfx_convert_func resolve_colorspace(color_space colorSpace, PixelFormat pixelFormat)
|
||||
{
|
||||
#if INCLUDE_MMX
|
||||
bool mmx = IsMmxCpu();
|
||||
#endif
|
||||
CPUCapabilities cpu;
|
||||
|
||||
switch (colorSpace)
|
||||
{
|
||||
case B_RGB32:
|
||||
if (pixelFormat == PIX_FMT_YUV410P) {
|
||||
#if INCLUDE_MMX
|
||||
if (mmx) {
|
||||
TRACE("resolve_colorspace: gfx_conv_yuv410p_rgb32_mmx\n");
|
||||
return gfx_conv_yuv410p_rgb32_mmx;
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
// if (cpu.HasMMX()) {
|
||||
// TRACE("resolve_colorspace: gfx_conv_yuv410p_rgb32_mmx\n");
|
||||
// return gfx_conv_yuv410p_rgb32_mmx;
|
||||
// } else {
|
||||
TRACE("resolve_colorspace: gfx_conv_yuv410p_rgb32_c\n");
|
||||
return gfx_conv_yuv410p_rgb32_c;
|
||||
}
|
||||
// }
|
||||
}
|
||||
|
||||
if (pixelFormat == PIX_FMT_YUV411P) {
|
||||
#if INCLUDE_MMX
|
||||
if (mmx) {
|
||||
TRACE("resolve_colorspace: gfx_conv_yuv411p_rgb32_mmx\n");
|
||||
return gfx_conv_yuv411p_rgb32_mmx;
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
// if (cpu.HasMMX()) {
|
||||
// TRACE("resolve_colorspace: gfx_conv_yuv411p_rgb32_mmx\n");
|
||||
// return gfx_conv_yuv411p_rgb32_mmx;
|
||||
// } else {
|
||||
TRACE("resolve_colorspace: gfx_conv_yuv411p_rgb32_c\n");
|
||||
return gfx_conv_yuv411p_rgb32_c;
|
||||
}
|
||||
// }
|
||||
}
|
||||
|
||||
if (pixelFormat == PIX_FMT_YUV420P || pixelFormat == PIX_FMT_YUVJ420P) {
|
||||
#if INCLUDE_MMX
|
||||
if (mmx) {
|
||||
TRACE("resolve_colorspace: gfx_conv_yuv420p_rgb32_mmx\n");
|
||||
return gfx_conv_yuv420p_rgb32_mmx;
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
TRACE("resolve_colorspace: gfx_conv_yuv420p_rgb32_c\n");
|
||||
if (cpu.HasSSE2()) {
|
||||
TRACE("resolve_colorspace: gfx_conv_yuv420p_rgba32_sse2\n");
|
||||
return gfx_conv_yuv420p_rgba32_sse2;
|
||||
} else {
|
||||
TRACE("resolve_colorspace: gfx_conv_YCbCr420p_RGB32_c\n");
|
||||
return gfx_conv_YCbCr420p_RGB32_c;
|
||||
}
|
||||
}
|
||||
|
||||
if (pixelFormat == PIX_FMT_YUV422P || pixelFormat == PIX_FMT_YUVJ422P) {
|
||||
return gfx_conv_YCbCr422_RGB32_c;
|
||||
if (cpu.HasSSE2()) {
|
||||
return gfx_conv_yuv422p_rgba32_sse2;
|
||||
} else {
|
||||
return gfx_conv_YCbCr422_RGB32_c;
|
||||
}
|
||||
}
|
||||
|
||||
TRACE("resolve_colorspace: %s => B_RGB32: NULL\n", pixfmt_to_string(pixelFormat));
|
||||
@ -86,55 +77,43 @@ gfx_convert_func resolve_colorspace(color_space colorSpace, PixelFormat pixelFor
|
||||
case B_YCbCr422:
|
||||
|
||||
if (pixelFormat == PIX_FMT_YUV410P) {
|
||||
#if INCLUDE_MMX
|
||||
if (mmx) {
|
||||
TRACE("resolve_colorspace: gfx_conv_yuv410p_ycbcr422_mmx\n");
|
||||
return gfx_conv_yuv410p_ycbcr422_mmx;
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
// if (cpu.HasMMX()) {
|
||||
// TRACE("resolve_colorspace: gfx_conv_yuv410p_ycbcr422_mmx\n");
|
||||
// return gfx_conv_yuv410p_ycbcr422_mmx;
|
||||
// } else {
|
||||
TRACE("resolve_colorspace: gfx_conv_yuv410p_ycbcr422_c\n");
|
||||
return gfx_conv_yuv410p_ycbcr422_c;
|
||||
}
|
||||
// }
|
||||
}
|
||||
|
||||
if (pixelFormat == PIX_FMT_YUV411P) {
|
||||
#if INCLUDE_MMX
|
||||
if (mmx) {
|
||||
TRACE("resolve_colorspace: gfx_conv_yuv411p_ycbcr422_mmx\n");
|
||||
return gfx_conv_yuv411p_ycbcr422_mmx;
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
// if (cpu.HasMMX()) {
|
||||
// TRACE("resolve_colorspace: gfx_conv_yuv411p_ycbcr422_mmx\n");
|
||||
// return gfx_conv_yuv411p_ycbcr422_mmx;
|
||||
// } else {
|
||||
TRACE("resolve_colorspace: gfx_conv_yuv411p_ycbcr422_c\n");
|
||||
return gfx_conv_yuv411p_ycbcr422_c;
|
||||
}
|
||||
// }
|
||||
}
|
||||
|
||||
if (pixelFormat == PIX_FMT_YUV420P || pixelFormat == PIX_FMT_YUVJ420P) {
|
||||
#if INCLUDE_MMX
|
||||
if (mmx) {
|
||||
TRACE("resolve_colorspace: gfx_conv_yuv420p_ycbcr422_mmx\n");
|
||||
return gfx_conv_yuv420p_ycbcr422_mmx;
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
// if (cpu.HasMMX()) {
|
||||
// TRACE("resolve_colorspace: gfx_conv_yuv420p_ycbcr422_mmx\n");
|
||||
// return gfx_conv_yuv420p_ycbcr422_mmx;
|
||||
// } else {
|
||||
TRACE("resolve_colorspace: gfx_conv_yuv420p_ycbcr422_c\n");
|
||||
return gfx_conv_yuv420p_ycbcr422_c;
|
||||
}
|
||||
// }
|
||||
}
|
||||
|
||||
if (pixelFormat == PIX_FMT_YUYV422) {
|
||||
#if INCLUDE_MMX
|
||||
if (mmx) {
|
||||
TRACE("resolve_colorspace: PIX_FMT_YUV422 => B_YCbCr422: gfx_conv_null_mmx\n");
|
||||
return gfx_conv_null_mmx;
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
// if (cpu.HasMMX()) {
|
||||
// TRACE("resolve_colorspace: PIX_FMT_YUV422 => B_YCbCr422: gfx_conv_null_mmx\n");
|
||||
// return gfx_conv_null_mmx;
|
||||
// } else {
|
||||
TRACE("resolve_colorspace: PIX_FMT_YUV422 => B_YCbCr422: gfx_conv_null_c\n");
|
||||
return gfx_conv_null_c;
|
||||
}
|
||||
// }
|
||||
}
|
||||
|
||||
TRACE("resolve_colorspace: %s => B_YCbCr422: NULL\n", pixfmt_to_string(pixelFormat));
|
||||
|
274
src/add-ons/media/plugins/ffmpeg/yuvrgb.nasm
Normal file
274
src/add-ons/media/plugins/ffmpeg/yuvrgb.nasm
Normal file
@ -0,0 +1,274 @@
|
||||
;
; Copyright (C) 2009 David McPaul
;
; All rights reserved. Distributed under the terms of the MIT License.
;
|
||||
|
||||
; A rather unoptimised set of yuv to rgb converters
|
||||
; does 8 pixels at a time
|
||||
|
||||
; inputer:
|
||||
; reads 128bits of yuv 8 bit data and puts
|
||||
; the y values converted to 16 bit in xmm0
|
||||
; the u values converted to 16 bit and duplicated into xmm1
|
||||
; the v values converted to 16 bit and duplicated into xmm2
|
||||
|
||||
; conversion:
|
||||
; does the yuv to rgb conversion using 16 bit fixed point and the
|
||||
; results are placed into the following registers as 8 bit clamped values
|
||||
; r values in xmm3
|
||||
; g values in xmm4
|
||||
; b values in xmm5
|
||||
|
||||
; outputer:
|
||||
; writes out the rgba pixels as 8 bit values with 0 for alpha
|
||||
|
||||
; xmm6 used for scratch
|
||||
; xmm7 used for scratch
|
||||
|
||||
; cglobal NAME
; Starts an exported routine: declares the symbol _NAME global, makes NAME an
; alias for _NAME inside this file, aligns to 16 bytes and emits the label.
%macro cglobal 1
	global	_%1
	%define	%1 _%1
	align	16
%1:
%endmacro
|
||||
|
||||
; conversion code
|
||||
; yuv2rgb
; in:      xmm0 = y, xmm1 = u, xmm2 = v, each as eight 16 bit words
; out:     xmm3 = r, xmm4 = g, xmm5 = b, eight 16 bit words (not yet clamped)
; scratch: xmm6, xmm7
;
; The coefficients are built from sums of arithmetic right shifts:
;   y = y - 16, u = u - 128, v = v - 128
;   r = y + v + (v >> 2) + (v >> 3) + (v >> 5)
;   g = y - ((u >> 2) + (u >> 4) + (u >> 5))
;         - ((v >> 1) + (v >> 3) + (v >> 4) + (v >> 5))
;   b = y + u + (u >> 1) + (u >> 2) + (u >> 6)
; xmm6 is shifted in place, so each shift count below is relative to the value
; left by the previous step; the running total is given in the comment.
%macro yuv2rgb 0
	; remove the offsets; the constants are read from memory
	movdqa	xmm7, [Const16]
	psubsw	xmm0,xmm7		; y = y - 16
	; An immediate-built constant was considered instead:
	;   mov eax,128*10001H / movd xmm7,eax / pshufd xmm7,xmm7,0
	; but it needs eax to be saved and takes more instructions.
	movdqa	xmm7, [Const128]
	psubsw	xmm1,xmm7		; u = u - 128
	psubsw	xmm2,xmm7		; v = v - 128

	; seed r, g and b with y
	movdqa	xmm3,xmm0		; r = y
	pshufd	xmm4,xmm0, 0xE4		; g = y (0xE4 keeps the dword order)
	movdqa	xmm5,xmm0		; b = y

	; red
	paddsw	xmm3, xmm2		; r += v
	movdqa	xmm6, xmm2		; scratch = v
	psraw	xmm6,2			; scratch = v >> 2
	paddsw	xmm3, xmm6		; r += v >> 2
	psraw	xmm6,1			; scratch = v >> 3
	paddsw	xmm3, xmm6		; r += v >> 3
	psraw	xmm6,2			; scratch = v >> 5
	paddsw	xmm3, xmm6		; r += v >> 5

	; green, u part
	movdqa	xmm6,xmm1		; scratch = u
	psraw	xmm6,2			; scratch = u >> 2
	psubsw	xmm4,xmm6		; g -= u >> 2
	psraw	xmm6,2			; scratch = u >> 4
	psubsw	xmm4,xmm6		; g -= u >> 4
	psraw	xmm6,1			; scratch = u >> 5
	psubsw	xmm4,xmm6		; g -= u >> 5

	; green, v part
	movdqa	xmm6,xmm2		; scratch = v
	psraw	xmm6,1			; scratch = v >> 1
	psubsw	xmm4,xmm6		; g -= v >> 1
	psraw	xmm6,2			; scratch = v >> 3
	psubsw	xmm4,xmm6		; g -= v >> 3
	psraw	xmm6,1			; scratch = v >> 4
	psubsw	xmm4,xmm6		; g -= v >> 4
	psraw	xmm6,1			; scratch = v >> 5
	psubsw	xmm4,xmm6		; g -= v >> 5

	; blue
	paddsw	xmm5, xmm1		; b += u
	movdqa	xmm6, xmm1		; scratch = u
	psraw	xmm6,1			; scratch = u >> 1
	paddsw	xmm5, xmm6		; b += u >> 1
	psraw	xmm6,1			; scratch = u >> 2
	paddsw	xmm5, xmm6		; b += u >> 2
	psraw	xmm6,4			; scratch = u >> 6
	paddsw	xmm5, xmm6		; b += u >> 6
%endmacro
|
||||
|
||||
; outputer
|
||||
; rgba32output
; in:       xmm3 = r, xmm4 = g, xmm5 = b as eight 16 bit words each
; out:      eight 4 byte pixels, bytes in memory order b, g, r, 0, stored at
;           [edi] and [edi+16]
; clobbers: xmm0, xmm3, xmm4, xmm5, xmm7
; Both stores are movntdq, so edi has to be 16 byte aligned.
%macro rgba32output 0
	; saturate every channel to 0..255 and pack it into the low 8 bytes
	pxor		xmm7,xmm7
	packuswb	xmm3,xmm7		; r as bytes
	packuswb	xmm4,xmm7		; g as bytes
	packuswb	xmm5,xmm7		; b as bytes

	; interleave the channels into b g r 0
	punpcklbw	xmm3,xmm7		; r0r0r0r0r0r0r0r0
	punpcklbw	xmm5,xmm4		; bgbgbgbgbgbgbgbg
	movdqa		xmm0, xmm5		; keep a copy of the bg pairs
	punpcklwd	xmm5,xmm3		; pixels 0-3: bgr0bgr0bgr0bgr0
	punpckhwd	xmm0,xmm3		; pixels 4-7: bgr0bgr0bgr0bgr0

	; non-temporal stores, bypassing the cache
	movntdq		[edi], xmm5		; first 4 pixels
	movntdq		[edi+16], xmm0		; second 4 pixels
%endmacro
|
||||
|
||||
SECTION .data align=16

; Each constant is eight 16 bit words (16 bytes), sized to be loaded whole
; with movdqa by the yuv2rgb macro.

; luma offset subtracted from y
Const16:	times 8 dw 16

; chroma offset subtracted from u and v
Const128:	times 8 dw 128
|
||||
|
||||
; Stack argument offsets. They are relative to ebp as set up by the
; "push ebp / mov ebp, esp" prologue of each routine, which puts the first
; argument at ebp+8 and each following one 4 bytes higher.

; void Convert_YUV422_RGBA32_SSE2(void *fromPtr, void *toPtr, int width)
fromPtr		equ	ebp+8
toPtr		equ	ebp+12
width		equ	ebp+16

; void Convert_YUV420P_RGBA32_SSE2(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
fromYPtr	equ	ebp+8
fromUPtr	equ	ebp+12
fromVPtr	equ	ebp+16
toPtr1		equ	ebp+20
width1		equ	ebp+24
|
||||
|
||||
SECTION .text align=16
|
||||
|
||||
; void Convert_YUV422_RGBA32_SSE2(void *fromPtr, void *toPtr, int width)
; Converts one row of interleaved y u y v bytes to 4 byte b g r 0 pixels.
; Runs width / 8 iterations; each reads 16 source bytes (8 pixels) with movdqa
; and writes 32 bytes with movntdq, so both pointers must be 16 byte aligned.
; edi, esi and ecx are saved and restored.
cglobal Convert_YUV422_RGBA32_SSE2
	; prologue
	push	ebp
	mov	ebp, esp
	push	edi
	push	esi
	push	ecx

	mov	esi, [fromPtr]
	mov	edi, [toPtr]
	mov	ecx, [width]
	prefetchnta	[esi]		; hint: the source is read once, keep it out of the cache
	; iteration count = width / 8; skip the loop when it is zero
	shr	ecx,3
	test	ecx,ecx
	jng	ENDLOOP
REPEATLOOP:
	; load 8 pixels (yuyv yuyv yuyv yuyv) and make one copy per component
	movdqa	xmm0, [esi]
	pshufd	xmm1, xmm0, 0xE4	; xmm1 = copy for u
	movdqa	xmm2, xmm0		; xmm2 = copy for v
	; y: keep the even bytes, giving y0y0...
	pxor	xmm7,xmm7		; all zero
	pcmpeqd	xmm6,xmm6		; all ones
	punpcklbw	xmm6,xmm7	; byte mask ff00ff00...
	pand	xmm0, xmm6		; y0y0y0y0 y0y0y0y0
	; u: isolate byte 1 of each dword, then duplicate it into both words
	psrld	xmm6,8			; byte mask 00ff0000 per dword
	pand	xmm1, xmm6		; 0u00 per dword
	psrld	xmm1,8			; u000 per dword
	pshuflw	xmm1,xmm1, 0xA0		; duplicate the even words (low half)
	pshufhw	xmm1,xmm1, 0xA0		; duplicate the even words (high half): u0u0
	; v: isolate byte 3 of each dword, then duplicate it into both words
	pslld	xmm6,16			; byte mask 000000ff per dword
	pand	xmm2, xmm6		; 000v per dword
	psrld	xmm2,8			; 00v0 per dword
	pshuflw	xmm2,xmm2, 0xF5		; duplicate the odd words (low half)
	pshufhw	xmm2,xmm2, 0xF5		; duplicate the odd words (high half): v0v0

	yuv2rgb

	rgba32output

	; advance: 32 bytes written, 16 bytes read
	add	edi,32
	add	esi,16
	sub	ecx, 1			; sub rather than dec, as in the original
	jnz	REPEATLOOP
ENDLOOP:
	; epilogue
	pop	ecx
	pop	esi
	pop	edi
	mov	esp, ebp
	pop	ebp
	ret
|
||||
|
||||
; void Convert_YUV420P_RGBA32_SSE2(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
; Converts one row of planar data to 4 byte b g r 0 pixels.
; Runs width / 8 iterations; each reads 8 y, 4 u and 4 v bytes through MMX
; registers and writes 32 bytes with movntdq, so toPtr must be 16 byte
; aligned. edi, esi, ecx, eax and ebx are saved and restored; the MMX state is
; cleared with emms before returning.
cglobal Convert_YUV420P_RGBA32_SSE2
	; prologue
	push	ebp
	mov	ebp, esp
	push	edi
	push	esi
	push	ecx
	push	eax
	push	ebx

	mov	esi, [fromYPtr]
	mov	eax, [fromUPtr]
	mov	ebx, [fromVPtr]
	mov	edi, [toPtr1]
	mov	ecx, [width1]
	; iteration count = width / 8; skip the loop when it is zero
	shr	ecx,3
	test	ecx,ecx
	jng	ENDLOOP1
REPEATLOOP1:
	; fetch through MMX registers (the original notes that direct unaligned
	; SSE2 loads might be better)
	movq	mm0, [esi]		; 8 y bytes
	movd	mm1, [eax]		; 4 u bytes
	movd	mm2, [ebx]		; 4 v bytes

	movq2dq	xmm0, mm0		; yyyyyyyy00000000
	movq2dq	xmm1, mm1		; uuuu000000000000
	movq2dq	xmm2, mm2		; vvvv000000000000
	; y: widen bytes to words
	pxor	xmm7,xmm7		; all zero
	punpcklbw	xmm0,xmm7	; y0y0y0y0y0y0y0y0
	; u: widen to dwords, then duplicate into both words of each dword
	punpcklbw	xmm1,xmm7	; u0u0u0u000000000
	punpcklwd	xmm1,xmm7	; u000u000u000u000
	pshuflw	xmm1,xmm1, 0xA0		; duplicate the even words (low half)
	pshufhw	xmm1,xmm1, 0xA0		; duplicate the even words (high half): u0u0
	; v: same treatment, in xmm2
	punpcklbw	xmm2,xmm7	; v0v0v0v000000000
	punpcklwd	xmm2,xmm7	; v000v000v000v000
	pshuflw	xmm2,xmm2, 0xA0		; duplicate the even words (low half)
	pshufhw	xmm2,xmm2, 0xA0		; duplicate the even words (high half): v0v0

	yuv2rgb

	rgba32output

	; advance: 32 bytes written, 8 y and 4 u / 4 v bytes read
	add	edi,32
	add	esi,8
	add	eax,4
	add	ebx,4
	sub	ecx, 1			; sub rather than dec, as in the original
	jnz	REPEATLOOP1
ENDLOOP1:
	; epilogue
	emms
	pop	ebx
	pop	eax
	pop	ecx
	pop	esi
	pop	edi
	mov	esp, ebp
	pop	ebp
	ret
|
||||
|
||||
SECTION .note.GNU-stack noalloc noexec nowrite progbits
|
Loading…
Reference in New Issue
Block a user