correct yuv422 planar conversion. Separate sse, sse2 and ssse3 asm code. Add packed converter
git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@39913 a95241bf-73f2-0310-859d-f6bbb57e9c96
This commit is contained in:
parent
88cbc0dbc5
commit
51d9b4fe14
@ -27,7 +27,9 @@ Addon ffmpeg :
|
||||
gfx_conv_c_lookup.cpp
|
||||
gfx_conv_mmx.cpp
|
||||
gfx_util.cpp
|
||||
yuvrgb.nasm
|
||||
yuvrgb_sse.nasm
|
||||
yuvrgb_sse2.nasm
|
||||
yuvrgb_ssse3.nasm
|
||||
:
|
||||
libavformat.a
|
||||
libavcodec.a
|
||||
|
@ -1,22 +1,63 @@
|
||||
#include "gfx_conv_mmx.h"
|
||||
#include "gfx_conv_c.h"
|
||||
|
||||
|
||||
extern "C" void _Convert_YUV420P_RGBA32_SSE2(void *fromYPtr, void *fromUPtr,
|
||||
void *fromVPtr, void *toPtr, int width);
|
||||
extern "C" void _Convert_YUV422_RGBA32_SSE2(void *fromYPtr, void *toPtr,
|
||||
int width);
|
||||
extern "C" void _Convert_YUV420P_RGBA32_SSE(void *fromYPtr, void *fromUPtr,
|
||||
void *fromVPtr, void *toPtr, int width);
|
||||
// Packed
|
||||
extern "C" void _Convert_YUV422_RGBA32_SSE(void *fromYPtr, void *toPtr,
|
||||
int width);
|
||||
extern "C" void _Convert_YUV422_RGBA32_SSE2(void *fromYPtr, void *toPtr,
|
||||
int width);
|
||||
extern "C" void _Convert_YUV422_RGBA32_SSSE3(void *fromYPtr, void *toPtr,
|
||||
int width);
|
||||
|
||||
// Planar
|
||||
extern "C" void _Convert_YUV420P_RGBA32_SSE(void *fromYPtr, void *fromUPtr,
|
||||
void *fromVPtr, void *toPtr, int width);
|
||||
extern "C" void _Convert_YUV420P_RGBA32_SSE2(void *fromYPtr, void *fromUPtr,
|
||||
void *fromVPtr, void *toPtr, int width);
|
||||
extern "C" void _Convert_YUV420P_RGBA32_SSSE3(void *fromYPtr, void *fromUPtr,
|
||||
void *fromVPtr, void *toPtr, int width);
|
||||
|
||||
|
||||
// Planar YUV420 means 2 Y lines share a UV line
|
||||
void
|
||||
gfx_conv_yuv420p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height)
|
||||
{
|
||||
// in and out buffers must be aligned to 16 bytes,
|
||||
// in should be as ffmpeg allocates it
|
||||
if ((off_t)out->data[0] % 16 != 0) {
|
||||
gfx_conv_YCbCr420p_RGB32_c(in, out, width, height);
|
||||
return;
|
||||
}
|
||||
|
||||
uint8 *ybase = (uint8 *)in->data[0];
|
||||
uint8 *ubase = (uint8 *)in->data[1];
|
||||
uint8 *vbase = (uint8 *)in->data[2];
|
||||
uint8 *rgbbase = (uint8 *)out->data[0];
|
||||
|
||||
int yBaseInc = in->linesize[0];
|
||||
int uBaseInc = in->linesize[1];
|
||||
int vBaseInc = in->linesize[2];
|
||||
int rgbBaseInc = out->linesize[0];
|
||||
|
||||
for (int i=0;i<height;i+=2) {
|
||||
// First Y row
|
||||
_Convert_YUV420P_RGBA32_SSE(ybase, ubase, vbase, rgbbase, width);
|
||||
ybase += yBaseInc;
|
||||
rgbbase += rgbBaseInc;
|
||||
|
||||
// Second Y row but same u and v row
|
||||
_Convert_YUV420P_RGBA32_SSE(ybase, ubase, vbase, rgbbase, width);
|
||||
ybase += yBaseInc;
|
||||
ubase += uBaseInc;
|
||||
vbase += vBaseInc;
|
||||
rgbbase += rgbBaseInc;
|
||||
}
|
||||
}
|
||||
|
||||
// Planar YUV420 means 2 Y lines share a UV line
|
||||
void
|
||||
gfx_conv_yuv420p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
|
||||
{
|
||||
// Planar YUV420
|
||||
|
||||
{
|
||||
// in and out buffers must be aligned to 32 bytes,
|
||||
// in should be as ffmpeg allocates it
|
||||
if ((off_t)out->data[0] % 32 != 0) {
|
||||
@ -49,12 +90,157 @@ gfx_conv_yuv420p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
|
||||
}
|
||||
}
|
||||
|
||||
// Planar YUV420 means 2 Y lines share a UV line
|
||||
void
|
||||
gfx_conv_yuv420p_rgba32_ssse3(AVFrame *in, AVFrame *out, int width, int height)
|
||||
{
|
||||
// in and out buffers must be aligned to 32 bytes,
|
||||
// in should be as ffmpeg allocates it
|
||||
if ((off_t)out->data[0] % 32 != 0) {
|
||||
gfx_conv_YCbCr420p_RGB32_c(in, out, width, height);
|
||||
return;
|
||||
}
|
||||
|
||||
uint8 *ybase = (uint8 *)in->data[0];
|
||||
uint8 *ubase = (uint8 *)in->data[1];
|
||||
uint8 *vbase = (uint8 *)in->data[2];
|
||||
uint8 *rgbbase = (uint8 *)out->data[0];
|
||||
|
||||
int yBaseInc = in->linesize[0];
|
||||
int uBaseInc = in->linesize[1];
|
||||
int vBaseInc = in->linesize[2];
|
||||
int rgbBaseInc = out->linesize[0];
|
||||
|
||||
for (int i=0;i<height;i+=2) {
|
||||
// First Y row
|
||||
_Convert_YUV420P_RGBA32_SSSE3(ybase, ubase, vbase, rgbbase, width);
|
||||
ybase += yBaseInc;
|
||||
rgbbase += rgbBaseInc;
|
||||
|
||||
// Second Y row but same u and v row
|
||||
_Convert_YUV420P_RGBA32_SSSE3(ybase, ubase, vbase, rgbbase, width);
|
||||
ybase += yBaseInc;
|
||||
ubase += uBaseInc;
|
||||
vbase += vBaseInc;
|
||||
rgbbase += rgbBaseInc;
|
||||
}
|
||||
}
|
||||
|
||||
// Planar YUV422 means each Y line has it's own UV line
|
||||
void
|
||||
gfx_conv_yuv422p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height)
|
||||
{
|
||||
// in and out buffers must be aligned to 32 bytes,
|
||||
// in should be as ffmpeg allocates it
|
||||
if ((off_t)out->data[0] % 32 != 0) {
|
||||
gfx_conv_YCbCr422_RGB32_c(in, out, width, height);
|
||||
return;
|
||||
}
|
||||
|
||||
uint8 *ybase = (uint8 *)in->data[0];
|
||||
uint8 *ubase = (uint8 *)in->data[1];
|
||||
uint8 *vbase = (uint8 *)in->data[2];
|
||||
uint8 *rgbbase = (uint8 *)out->data[0];
|
||||
|
||||
int yBaseInc = in->linesize[0];
|
||||
int uBaseInc = in->linesize[1];
|
||||
int vBaseInc = in->linesize[2];
|
||||
int rgbBaseInc = out->linesize[0];
|
||||
|
||||
for (int i=0;i<height;i++) {
|
||||
_Convert_YUV420P_RGBA32_SSE(ybase, ubase, vbase, rgbbase, width);
|
||||
ybase += yBaseInc;
|
||||
ubase += uBaseInc;
|
||||
vbase += vBaseInc;
|
||||
rgbbase += rgbBaseInc;
|
||||
}
|
||||
}
|
||||
|
||||
// Planar YUV422 means each Y line has it's own UV line
|
||||
void
|
||||
gfx_conv_yuv422p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
|
||||
{
|
||||
// Packed YUV422
|
||||
// in and out buffers must be aligned to 32 bytes,
|
||||
// in should be as ffmpeg allocates it
|
||||
if ((off_t)out->data[0] % 32 != 0) {
|
||||
gfx_conv_YCbCr422_RGB32_c(in, out, width, height);
|
||||
return;
|
||||
}
|
||||
|
||||
uint8 *ybase = (uint8 *)in->data[0];
|
||||
uint8 *ubase = (uint8 *)in->data[1];
|
||||
uint8 *vbase = (uint8 *)in->data[2];
|
||||
uint8 *rgbbase = (uint8 *)out->data[0];
|
||||
|
||||
int yBaseInc = in->linesize[0];
|
||||
int uBaseInc = in->linesize[1];
|
||||
int vBaseInc = in->linesize[2];
|
||||
int rgbBaseInc = out->linesize[0];
|
||||
|
||||
for (int i=0;i<height;i++) {
|
||||
_Convert_YUV420P_RGBA32_SSE2(ybase, ubase, vbase, rgbbase, width);
|
||||
ybase += yBaseInc;
|
||||
ubase += uBaseInc;
|
||||
vbase += vBaseInc;
|
||||
rgbbase += rgbBaseInc;
|
||||
}
|
||||
}
|
||||
|
||||
// Planar YUV422 means each Y line has it's own UV line
|
||||
void
|
||||
gfx_conv_yuv422p_rgba32_ssse3(AVFrame *in, AVFrame *out, int width, int height)
|
||||
{
|
||||
// in and out buffers must be aligned to 32 bytes,
|
||||
// in should be as ffmpeg allocates it
|
||||
if ((off_t)out->data[0] % 32 != 0) {
|
||||
gfx_conv_YCbCr422_RGB32_c(in, out, width, height);
|
||||
return;
|
||||
}
|
||||
|
||||
uint8 *ybase = (uint8 *)in->data[0];
|
||||
uint8 *ubase = (uint8 *)in->data[1];
|
||||
uint8 *vbase = (uint8 *)in->data[2];
|
||||
uint8 *rgbbase = (uint8 *)out->data[0];
|
||||
|
||||
int yBaseInc = in->linesize[0];
|
||||
int uBaseInc = in->linesize[1];
|
||||
int vBaseInc = in->linesize[2];
|
||||
int rgbBaseInc = out->linesize[0];
|
||||
|
||||
for (int i=0;i<height;i++) {
|
||||
_Convert_YUV420P_RGBA32_SSSE3(ybase, ubase, vbase, rgbbase, width);
|
||||
ybase += yBaseInc;
|
||||
ubase += uBaseInc;
|
||||
vbase += vBaseInc;
|
||||
rgbbase += rgbBaseInc;
|
||||
}
|
||||
}
|
||||
|
||||
// Packed YUV422 (YUYV)
|
||||
void
|
||||
gfx_conv_yuv422_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height)
|
||||
{
|
||||
// in and out buffers must be aligned to 16 bytes,
|
||||
// in should be as ffmpeg allocates it
|
||||
if ((off_t)out->data[0] % 16 != 0) {
|
||||
gfx_conv_YCbCr422_RGB32_c(in, out, width, height);
|
||||
return;
|
||||
}
|
||||
|
||||
uint8 *ybase = (uint8 *)in->data[0];
|
||||
uint8 *rgbbase = (uint8 *)out->data[0];
|
||||
|
||||
for (int i = 0; i <= height; i++) {
|
||||
_Convert_YUV422_RGBA32_SSE(ybase, rgbbase, width);
|
||||
ybase += in->linesize[0];
|
||||
rgbbase += out->linesize[0];
|
||||
}
|
||||
}
|
||||
|
||||
// Packed YUV422 (YUYV)
|
||||
void
|
||||
gfx_conv_yuv422_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
|
||||
{
|
||||
// in and out buffers must be aligned to 32 bytes,
|
||||
// in should be as ffmpeg allocates it
|
||||
if ((off_t)out->data[0] % 32 != 0) {
|
||||
@ -72,53 +258,13 @@ gfx_conv_yuv422p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Packed YUV422 (YUYV)
|
||||
void
|
||||
gfx_conv_yuv420p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height)
|
||||
gfx_conv_yuv422_rgba32_ssse3(AVFrame *in, AVFrame *out, int width, int height)
|
||||
{
|
||||
// Planar YUV420
|
||||
|
||||
// in and out buffers must be aligned to 16 bytes,
|
||||
// in and out buffers must be aligned to 32 bytes,
|
||||
// in should be as ffmpeg allocates it
|
||||
if ((off_t)out->data[0] % 16 != 0) {
|
||||
gfx_conv_YCbCr420p_RGB32_c(in, out, width, height);
|
||||
return;
|
||||
}
|
||||
|
||||
uint8 *ybase = (uint8 *)in->data[0];
|
||||
uint8 *ubase = (uint8 *)in->data[1];
|
||||
uint8 *vbase = (uint8 *)in->data[2];
|
||||
uint8 *rgbbase = (uint8 *)out->data[0];
|
||||
|
||||
int yBaseInc = in->linesize[0];
|
||||
int uBaseInc = in->linesize[1];
|
||||
int vBaseInc = in->linesize[2];
|
||||
int rgbBaseInc = out->linesize[0];
|
||||
|
||||
for (int i=0;i<height;i+=2) {
|
||||
// First Y row
|
||||
_Convert_YUV420P_RGBA32_SSE(ybase, ubase, vbase, rgbbase, width);
|
||||
ybase += yBaseInc;
|
||||
rgbbase += rgbBaseInc;
|
||||
|
||||
// Second Y row but same u and v row
|
||||
_Convert_YUV420P_RGBA32_SSE(ybase, ubase, vbase, rgbbase, width);
|
||||
ybase += yBaseInc;
|
||||
ubase += uBaseInc;
|
||||
vbase += vBaseInc;
|
||||
rgbbase += rgbBaseInc;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
gfx_conv_yuv422p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height)
|
||||
{
|
||||
// Packed YUV422
|
||||
|
||||
// in and out buffers must be aligned to 16 bytes,
|
||||
// in should be as ffmpeg allocates it
|
||||
if ((off_t)out->data[0] % 16 != 0) {
|
||||
if ((off_t)out->data[0] % 32 != 0) {
|
||||
gfx_conv_YCbCr422_RGB32_c(in, out, width, height);
|
||||
return;
|
||||
}
|
||||
@ -127,7 +273,7 @@ gfx_conv_yuv422p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height)
|
||||
uint8 *rgbbase = (uint8 *)out->data[0];
|
||||
|
||||
for (int i = 0; i <= height; i++) {
|
||||
_Convert_YUV422_RGBA32_SSE(ybase, rgbbase, width);
|
||||
_Convert_YUV422_RGBA32_SSSE3(ybase, rgbbase, width);
|
||||
ybase += in->linesize[0];
|
||||
rgbbase += out->linesize[0];
|
||||
}
|
||||
|
@ -7,9 +7,17 @@
|
||||
|
||||
void gfx_conv_null_mmx(AVFrame *in, AVFrame *out, int width, int height);
|
||||
|
||||
void gfx_conv_yuv420p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height);
|
||||
void gfx_conv_yuv422p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height);
|
||||
// Planar
|
||||
void gfx_conv_yuv420p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height);
|
||||
void gfx_conv_yuv420p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height);
|
||||
void gfx_conv_yuv420p_rgba32_ssse3(AVFrame *in, AVFrame *out, int width, int height);
|
||||
void gfx_conv_yuv422p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height);
|
||||
void gfx_conv_yuv422p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height);
|
||||
void gfx_conv_yuv422p_rgba32_ssse3(AVFrame *in, AVFrame *out, int width, int height);
|
||||
|
||||
// Packed
|
||||
void gfx_conv_yuv422_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height);
|
||||
void gfx_conv_yuv422_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height);
|
||||
void gfx_conv_yuv422_rgba32_ssse3(AVFrame *in, AVFrame *out, int width, int height);
|
||||
|
||||
#endif
|
||||
|
@ -29,6 +29,7 @@ resolve_colorspace(color_space colorSpace, PixelFormat pixelFormat, int width,
|
||||
|
||||
switch (colorSpace) {
|
||||
case B_RGB32:
|
||||
// Planar Formats
|
||||
if (pixelFormat == PIX_FMT_YUV410P) {
|
||||
TRACE("resolve_colorspace: gfx_conv_yuv410p_rgb32_c\n");
|
||||
return gfx_conv_yuv410p_rgb32_c;
|
||||
@ -57,14 +58,32 @@ resolve_colorspace(color_space colorSpace, PixelFormat pixelFormat, int width,
|
||||
|
||||
if (pixelFormat == PIX_FMT_YUV422P
|
||||
|| pixelFormat == PIX_FMT_YUVJ422P) {
|
||||
if (cpu.HasSSE2() && width % 8 == 0)
|
||||
if (cpu.HasSSE2() && width % 8 == 0) {
|
||||
TRACE("resolve_colorspace: gfx_conv_yuv422p_RGB32_sse2\n");
|
||||
return gfx_conv_yuv422p_rgba32_sse2;
|
||||
else if (cpu.HasSSE1() && width % 4 == 0)
|
||||
} else if (cpu.HasSSE1() && width % 4 == 0) {
|
||||
TRACE("resolve_colorspace: gfx_conv_yuv422p_RGB32_sse\n");
|
||||
return gfx_conv_yuv422p_rgba32_sse;
|
||||
else
|
||||
} else {
|
||||
TRACE("resolve_colorspace: gfx_conv_YCbCr422p_RGB32_c\n");
|
||||
return gfx_conv_YCbCr422_RGB32_c;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Packed Formats
|
||||
if (pixelFormat == PIX_FMT_YUYV422) {
|
||||
if (cpu.HasSSSE3() && width % 8 == 0) {
|
||||
return gfx_conv_yuv422_rgba32_ssse3;
|
||||
} else if (cpu.HasSSE2() && width % 8 == 0) {
|
||||
return gfx_conv_yuv422_rgba32_sse2;
|
||||
} else if (cpu.HasSSE1() && width % 4 == 0
|
||||
&& height % 2 == 0) {
|
||||
return gfx_conv_yuv422_rgba32_sse;
|
||||
} else {
|
||||
return gfx_conv_YCbCr422_RGB32_c;
|
||||
}
|
||||
}
|
||||
|
||||
TRACE("resolve_colorspace: %s => B_RGB32: NULL\n",
|
||||
pixfmt_to_string(pixelFormat));
|
||||
return NULL;
|
||||
|
268
src/add-ons/media/plugins/ffmpeg/yuvrgb_sse.nasm
Normal file
268
src/add-ons/media/plugins/ffmpeg/yuvrgb_sse.nasm
Normal file
@ -0,0 +1,268 @@
|
||||
;
|
||||
; Copyright (C) 2009-2010 David McPaul
|
||||
;
|
||||
; All rights reserved. Distributed under the terms of the MIT License.
|
||||
;
|
||||
|
||||
; A rather unoptimised set of sse yuv to rgb converters
|
||||
; does 4 pixels per loop
|
||||
|
||||
; inputer:
|
||||
; reads 64 bits of yuv 8 bit data and puts
|
||||
; the y values converted to 16 bit in mm0
|
||||
; the u values converted to 16 bit and duplicated into mm1
|
||||
; the v values converted to 16 bit and duplicated into mm2
|
||||
|
||||
; conversion:
|
||||
; does the yuv to rgb conversion using 16 bit fixed point and the
|
||||
; results are placed into the following registers as 8 bit clamped values
|
||||
; r values in mm3
|
||||
; g values in mm4
|
||||
; b values in mm5
|
||||
|
||||
; outputer:
|
||||
; writes out the rgba pixels as 8 bit values with 0 for alpha
|
||||
|
||||
; mm6 used for scratch
|
||||
; mm7 used for scratch
|
||||
|
||||
; cglobal <name>
; Opens an exported entry point: the symbol is emitted with a leading
; underscore (_<name>), <name> becomes an alias for it, and the label is
; placed on a 16 byte boundary.
%macro cglobal 1
	%define %1 _%1			; <name> now expands to _<name>
	global _%1			; export the underscored symbol
	align 16
%1:					; expands to _<name>:
%endmacro
|
||||
|
||||
; conversion code
;
; yuv2rgbsse
; In:    mm0 = y, mm1 = u, mm2 = v    (four 16 bit lanes each)
; Out:   mm3 = r, mm4 = g, mm5 = b    (16 bit lanes, saturating arithmetic,
;                                      not yet packed to 8 bit)
; Clobb: mm0, mm1, mm2 (offsets removed in place), mm6, mm7
;
; With y = Y - 16, u = U - 128, v = V - 128 and >> an arithmetic shift:
;   r = y + v + (v >> 2) + (v >> 3) + (v >> 5)
;   g = y - ((u >> 2) + (u >> 4) + (u >> 5))
;         - ((v >> 1) + (v >> 3) + (v >> 4) + (v >> 5))
;   b = y + u + (u >> 1) + (u >> 2) + (u >> 6)
; Each term is produced by shifting the scratch copy a little further, so
; the shift counts below are increments, not the totals shown in comments.
%macro yuv2rgbsse 0
	; remove the offsets (constants come from memory)
	movq	mm7, [Const16]
	psubsw	mm0, mm7		; y = y - 16
	movq	mm7, [Const128]
	psubsw	mm1, mm7		; u = u - 128
	psubsw	mm2, mm7		; v = v - 128

	; seed every channel with y (pshufw 0xE4 is an identity shuffle,
	; i.e. a plain register copy)
	movq	mm3, mm0		; r = y
	pshufw	mm5, mm0, 0xE4		; b = y
	movq	mm4, mm0		; g = y

	; red
	paddsw	mm3, mm2		; r += v
	pshufw	mm6, mm2, 0xE4		; scratch = v
	psraw	mm6, 2			; scratch = v >> 2
	paddsw	mm3, mm6
	psraw	mm6, 1			; scratch = v >> 3
	paddsw	mm3, mm6
	psraw	mm6, 2			; scratch = v >> 5
	paddsw	mm3, mm6

	; blue
	movq	mm7, mm1		; scratch = u
	paddsw	mm5, mm1		; b += u
	psraw	mm7, 1			; scratch = u >> 1
	paddsw	mm5, mm7
	psraw	mm7, 1			; scratch = u >> 2
	paddsw	mm5, mm7
	psraw	mm7, 4			; scratch = u >> 6
	paddsw	mm5, mm7

	; green: u terms first, then v terms
	movq	mm7, mm2		; scratch7 = v
	pshufw	mm6, mm1, 0xE4		; scratch6 = u

	psraw	mm6, 2			; u >> 2
	psubsw	mm4, mm6
	psraw	mm6, 2			; u >> 4
	psubsw	mm4, mm6
	psraw	mm6, 1			; u >> 5
	psubsw	mm4, mm6

	psraw	mm7, 1			; v >> 1
	psubsw	mm4, mm7
	psraw	mm7, 2			; v >> 3
	psubsw	mm4, mm7
	psraw	mm7, 1			; v >> 4
	psubsw	mm4, mm7
	psraw	mm7, 1			; v >> 5
	psubsw	mm4, mm7
%endmacro
|
||||
|
||||
; outputer
;
; rgba32sseoutput
; In:    mm3 = r, mm4 = g, mm5 = b (four 16 bit lanes each), edi = destination
; Out:   16 bytes stored at [edi]: four pixels, each as bytes b, g, r, 0
; Clobb: mm0, mm3, mm4, mm5, mm7
%macro rgba32sseoutput 0
	pxor	mm7, mm7		; zero, used for packing and as the 4th byte

	; clamp each channel to 0..255 and narrow to one byte per pixel
	packuswb mm5, mm7		; b
	packuswb mm4, mm7		; g
	packuswb mm3, mm7		; r
	punpcklbw mm3, mm7		; r 0 r 0 r 0 r 0

	; interleave into b g r 0 per pixel
	punpcklbw mm5, mm4		; b g b g b g b g
	movq	mm0, mm5		; keep a copy for the upper two pixels
	punpcklwd mm5, mm3		; pixels 0 and 1: b g r 0  b g r 0
	punpckhwd mm0, mm3		; pixels 2 and 3: b g r 0  b g r 0

	; store
	movq	[edi], mm5		; first 2 pixels
	movq	[edi+8], mm0		; second 2 pixels
%endmacro
|
||||
|
||||
SECTION .data align=16

; 16 bit lane constants subtracted by the conversion macro.
; Eight words (16 bytes) each; the MMX code loads the first four with movq.
Const16:	times 8 dw 16		; luma offset
Const128:	times 8 dw 128		; chroma offset
|
||||
|
||||
; Packed Convert
|
||||
; void Convert_YUV422_RGBA32_SSE(void *fromPtr, void *toPtr, int width)
|
||||
width equ ebp+16
|
||||
toPtr equ ebp+12
|
||||
fromPtr equ ebp+8
|
||||
|
||||
; Planar Convert
|
||||
; void Convert_YUV420P_RGBA32_SSE(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
|
||||
width1 equ ebp+24
|
||||
toPtr1 equ ebp+20
|
||||
fromVPtr equ ebp+16
|
||||
fromUPtr equ ebp+12
|
||||
fromYPtr equ ebp+8
|
||||
|
||||
SECTION .text align=16
|
||||
|
||||
;-----------------------------------------------------------------------
; void Convert_YUV422_RGBA32_SSE(void *fromPtr, void *toPtr, int width)
; Packed YUV 4:2:2 (YUY2 FOURCC, bytes y0 u y1 v) to 32 bit pixels.
; Args:  read from the stack through ebp (fromPtr, toPtr, width equates)
; Regs:  esi = source, edi = destination, ecx = passes left
; Each pass consumes 8 source bytes and stores 16 destination bytes
; (4 pixels); width / 4 passes are made.
;-----------------------------------------------------------------------
cglobal Convert_YUV422_RGBA32_SSE
	push	ebp
	mov	ebp, esp
	push	edi
	push	esi
	push	ecx

	mov	esi, [fromPtr]
	mov	ecx, [width]
	mov	edi, [toPtr]

	shr	ecx, 2			; passes = width / 4
	test	ecx, ecx
	jng	.done			; nothing to convert

.convert:
	; load 4 pixels (y0 u0 y1 v0 y2 u1 y3 v1) and make three copies
	movq	mm0, [esi]
	pshufw	mm1, mm0, 0xE4		; identity shuffle = copy
	movq	mm2, mm0

	; y: keep the even bytes, giving one 16 bit lane per pixel
	pxor	mm7, mm7
	pcmpeqb	mm6, mm6		; all ones
	punpckhbw mm6, mm7		; mask bytes ff 00 ff 00 ...
	pand	mm0, mm6

	; u: byte 1 of each dword, moved down and duplicated per pixel pair
	psrld	mm6, 8			; mask bytes 00 ff 00 00 per dword
	pand	mm1, mm6
	psrld	mm1, 8			; u now in the low word of each dword
	pshufw	mm1, mm1, 0xA0		; lanes u0 u0 u1 u1 (SSE not MMX)

	; v: byte 3 of each dword, moved down and duplicated per pixel pair
	pslld	mm6, 16			; mask bytes 00 00 00 ff per dword
	pand	mm2, mm6
	psrld	mm2, 8			; v now in the high word of each dword
	pshufw	mm2, mm2, 0xF5		; lanes v0 v0 v1 v1 (SSE not MMX)

	yuv2rgbsse

	rgba32sseoutput

	add	edi, 16			; 4 pixels written
	add	esi, 8			; 4 pixels read
	sub	ecx, 1
	jnz	.convert

.done:
	emms				; hand the mmx registers back to the fpu
	pop	ecx
	pop	esi
	pop	edi
	mov	esp, ebp
	pop	ebp
	ret
|
||||
|
||||
;-----------------------------------------------------------------------
; void Convert_YUV420P_RGBA32_SSE(void *fromYPtr, void *fromUPtr,
;                                 void *fromVPtr, void *toPtr, int width)
; Converts one row of planar YUV (one chroma sample per two luma samples)
; to 32 bit pixels.
; Args:  read from the stack through ebp (fromYPtr, fromUPtr, fromVPtr,
;        toPtr1, width1 equates)
; Regs:  esi = y row, eax = u row, ebx = v row, edi = destination,
;        ecx = passes left
; Each pass consumes 4 y, 2 u and 2 v bytes and stores 16 destination
; bytes (4 pixels); width / 4 passes are made.
;-----------------------------------------------------------------------
cglobal Convert_YUV420P_RGBA32_SSE
; reserve variables
	push	ebp
	mov	ebp, esp
	push	edi
	push	esi
	push	ecx
	push	eax
	push	ebx

	mov	esi, [fromYPtr]
	mov	eax, [fromUPtr]
	mov	ebx, [fromVPtr]
	mov	edi, [toPtr1]
	mov	ecx, [width1]
; loop width / 4 times
	shr	ecx, 2
	test	ecx, ecx
	jng	ENDLOOP3
REPEATLOOP3:				; loop over width / 4
; YUV420 Planar inputer
	; Only 4 luma bytes are used per pass, so load exactly 4 (movd)
	; rather than 8 (movq); the upper half is zeroed either way once
	; unpacked, and this avoids reading past the bytes that are needed.
	movd	mm0, [esi]		; fetch 4 y values (8 bit) yyyy0000
	; NOTE(review): the two movd loads below still fetch 4 bytes each
	; although only 2 are used; the extra bytes are discarded by the
	; unpack with zero that follows.
	movd	mm1, [eax]		; fetch 2 u values (8 bit) uu000000
	movd	mm2, [ebx]		; fetch 2 v values (8 bit) vv000000

; extract y
	pxor	mm7, mm7		; 0000000000000000
	punpcklbw mm0, mm7		; interleave mm7 into mm0: y0y0y0y0
; extract u and duplicate so each becomes u0u0
	punpcklbw mm1, mm7		; interleave mm7 into mm1: u0u0....
	punpcklwd mm1, mm7		; interleave again: u000u000
	pshufw	mm1, mm1, 0xA0		; copy u values to get u0u0
; extract v
	punpcklbw mm2, mm7		; interleave mm7 into mm2: v0v0....
	punpcklwd mm2, mm7		; interleave again: v000v000
	pshufw	mm2, mm2, 0xA0		; copy v values to get v0v0

	yuv2rgbsse

	rgba32sseoutput

; endloop
	add	edi, 16			; 4 pixels written
	add	esi, 4			; 4 y bytes consumed
	add	eax, 2			; 2 u bytes consumed
	add	ebx, 2			; 2 v bytes consumed
	sub	ecx, 1			; apparently sub is better than dec
	jnz	REPEATLOOP3
ENDLOOP3:
; Cleanup
	emms				; reset mmx regs back to float
	pop	ebx
	pop	eax
	pop	ecx
	pop	esi
	pop	edi
	mov	esp, ebp
	pop	ebp
	ret
|
||||
|
||||
SECTION .note.GNU-stack noalloc noexec nowrite progbits
|
@ -4,11 +4,11 @@
|
||||
; All rights reserved. Distributed under the terms of the MIT License.
|
||||
;
|
||||
|
||||
; A rather unoptimised set of yuv to rgb converters
|
||||
; does 8 pixels at a time
|
||||
; A rather unoptimised set of sse2 yuv to rgb converters
|
||||
; does 8 pixels per loop
|
||||
|
||||
; inputer:
|
||||
; reads 128bits of yuv 8 bit data and puts
|
||||
; reads 128 bits of yuv 8 bit data and puts
|
||||
; the y values converted to 16 bit in xmm0
|
||||
; the u values converted to 16 bit and duplicated into xmm1
|
||||
; the v values converted to 16 bit and duplicated into xmm2
|
||||
@ -94,67 +94,6 @@
|
||||
psubsw xmm4,xmm7 ; subtract from g
|
||||
%endmacro
|
||||
|
||||
; conversion code
|
||||
%macro yuv2rgbsse 0
|
||||
; u = u - 128
|
||||
; v = v - 128
|
||||
; r = y + v + v >> 2 + v >> 3 + v >> 5
|
||||
; g = y - (u >> 2 + u >> 4 + u >> 5) - (v >> 1 + v >> 3 + v >> 4 + v >> 5)
|
||||
; b = y + u + u >> 1 + u >> 2 + u >> 6
|
||||
; subtract 16 from y
|
||||
movq mm7, [Const16] ; loads a constant using data cache (slower on first fetch but then cached)
|
||||
psubsw mm0,mm7 ; y = y - 16
|
||||
; subtract 128 from u and v
|
||||
movq mm7, [Const128] ; loads a constant using data cache (slower on first fetch but then cached)
|
||||
psubsw mm1,mm7 ; u = u - 128
|
||||
psubsw mm2,mm7 ; v = v - 128
|
||||
; load r,g,b with y
|
||||
movq mm3,mm0 ; r = y
|
||||
pshufw mm5,mm0, 0xE4 ; b = y
|
||||
|
||||
; r = r + v + v >> 2 + v >> 3 + v >> 5
|
||||
paddsw mm3, mm2 ; add v to r
|
||||
movq mm7, mm1 ; move u to scratch
|
||||
pshufw mm6, mm2, 0xE4 ; move v to scratch
|
||||
|
||||
psraw mm6,2 ; divide v by 4
|
||||
paddsw mm3, mm6 ; and add to r
|
||||
psraw mm6,1 ; divide v by 2
|
||||
paddsw mm3, mm6 ; and add to r
|
||||
psraw mm6,2 ; divide v by 4
|
||||
paddsw mm3, mm6 ; and add to r
|
||||
|
||||
; b = y + u + u >> 1 + u >> 2 + u >> 6
|
||||
paddsw mm5, mm1 ; add u to b
|
||||
psraw mm7,1 ; divide u by 2
|
||||
paddsw mm5, mm7 ; and add to b
|
||||
psraw mm7,1 ; divide u by 2
|
||||
paddsw mm5, mm7 ; and add to b
|
||||
psraw mm7,4 ; divide u by 32
|
||||
paddsw mm5, mm7 ; and add to b
|
||||
|
||||
; g = y - u >> 2 - u >> 4 - u >> 5 - v >> 1 - v >> 3 - v >> 4 - v >> 5
|
||||
movq mm7,mm2 ; move v to scratch
|
||||
pshufw mm6,mm1, 0xE4 ; move u to scratch
|
||||
movq mm4,mm0 ; g = y
|
||||
|
||||
psraw mm6,2 ; divide u by 4
|
||||
psubsw mm4,mm6 ; subtract from g
|
||||
psraw mm6,2 ; divide u by 4
|
||||
psubsw mm4,mm6 ; subtract from g
|
||||
psraw mm6,1 ; divide u by 2
|
||||
psubsw mm4,mm6 ; subtract from g
|
||||
|
||||
psraw mm7,1 ; divide v by 2
|
||||
psubsw mm4,mm7 ; subtract from g
|
||||
psraw mm7,2 ; divide v by 4
|
||||
psubsw mm4,mm7 ; subtract from g
|
||||
psraw mm7,1 ; divide v by 2
|
||||
psubsw mm4,mm7 ; subtract from g
|
||||
psraw mm7,1 ; divide v by 2
|
||||
psubsw mm4,mm7 ; subtract from g
|
||||
%endmacro
|
||||
|
||||
; outputer
|
||||
%macro rgba32sse2output 0
|
||||
; clamp values
|
||||
@ -173,24 +112,6 @@
|
||||
movntdq [edi+16], xmm0 ; output second 4 pixels bypassing cache
|
||||
%endmacro
|
||||
|
||||
; outputer
|
||||
%macro rgba32sseoutput 0
|
||||
; clamp values
|
||||
pxor mm7,mm7
|
||||
packuswb mm3,mm7 ; clamp to 0,255 and pack R to 8 bit per pixel
|
||||
packuswb mm4,mm7 ; clamp to 0,255 and pack G to 8 bit per pixel
|
||||
packuswb mm5,mm7 ; clamp to 0,255 and pack B to 8 bit per pixel
|
||||
; convert to bgra32 packed
|
||||
punpcklbw mm5,mm4 ; bgbgbgbgbgbgbgbg
|
||||
movq mm0, mm5 ; save bg values
|
||||
punpcklbw mm3,mm7 ; r0r0r0r0
|
||||
punpcklwd mm5,mm3 ; lower half bgr0bgr0
|
||||
punpckhwd mm0,mm3 ; upper half bgr0bgr0
|
||||
; write to output ptr
|
||||
movq [edi], mm5 ; output first 2 pixels
|
||||
movq [edi+8], mm0 ; output second 2 pixels
|
||||
%endmacro
|
||||
|
||||
SECTION .data align=16
|
||||
|
||||
Const16 dw 16
|
||||
@ -342,120 +263,4 @@ ENDLOOP1:
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
cglobal Convert_YUV422_RGBA32_SSE
|
||||
; reserve variables
|
||||
push ebp
|
||||
mov ebp, esp
|
||||
push edi
|
||||
push esi
|
||||
push ecx
|
||||
|
||||
mov esi, [fromPtr]
|
||||
mov ecx, [width]
|
||||
mov edi, [toPtr]
|
||||
; loop width / 4 times
|
||||
shr ecx,2
|
||||
test ecx,ecx
|
||||
jng ENDLOOP2
|
||||
REPEATLOOP2: ; loop over width / 4
|
||||
|
||||
; YUV422 packed inputer
|
||||
movq mm0, [esi] ; should have yuyv yuyv
|
||||
pshufw mm1, mm0, 0xE4 ; copy to mm1
|
||||
movq mm2, mm0 ; copy to mm2
|
||||
; extract y
|
||||
pxor mm7,mm7 ; 0000000000000000
|
||||
pcmpeqb mm6,mm6 ; ffffffffffffffff
|
||||
punpckhbw mm6,mm7 ; interleave mm7 into mm6 ff00ff00ff00ff00
|
||||
pand mm0, mm6 ; clear all but y values leaving y0y0 etc
|
||||
; extract u and duplicate so each u in yuyv becomes 0u0u
|
||||
psrld mm6,8 ; 00ff0000 00ff0000
|
||||
pand mm1, mm6 ; clear all yv values leaving 0u00 etc
|
||||
psrld mm1,8 ; rotate u to get u000
|
||||
pshufw mm1,mm1, 0xA0 ; copy u values to get u0u0 (SSE not MMX)
|
||||
; extract v
|
||||
pslld mm6,16 ; 000000ff000000ff
|
||||
pand mm2, mm6 ; clear all yu values leaving 000v etc
|
||||
psrld mm2,8 ; rotate v to get 00v0
|
||||
pshufw mm2,mm2, 0xF5 ; copy v values to get v0v0 (SSE not MMX)
|
||||
|
||||
yuv2rgbsse
|
||||
|
||||
rgba32sseoutput
|
||||
|
||||
; endloop
|
||||
add edi,16
|
||||
add esi,8
|
||||
sub ecx, 1 ; apparently sub is better than dec
|
||||
jnz REPEATLOOP2
|
||||
ENDLOOP2:
|
||||
; Cleanup
|
||||
emms ; reset mmx regs back to float
|
||||
pop ecx
|
||||
pop esi
|
||||
pop edi
|
||||
mov esp, ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
cglobal Convert_YUV420P_RGBA32_SSE
|
||||
; reserve variables
|
||||
push ebp
|
||||
mov ebp, esp
|
||||
push edi
|
||||
push esi
|
||||
push ecx
|
||||
push eax
|
||||
push ebx
|
||||
|
||||
mov esi, [fromYPtr]
|
||||
mov eax, [fromUPtr]
|
||||
mov ebx, [fromVPtr]
|
||||
mov edi, [toPtr1]
|
||||
mov ecx, [width1]
|
||||
; loop width / 4 times
|
||||
shr ecx,2
|
||||
test ecx,ecx
|
||||
jng ENDLOOP3
|
||||
REPEATLOOP3: ; loop over width / 4
|
||||
; YUV420 Planar inputer
|
||||
movq mm0, [esi] ; fetch 4 y values (8 bit) yyyy0000
|
||||
movd mm1, [eax] ; fetch 2 u values (8 bit) uu000000
|
||||
movd mm2, [ebx] ; fetch 2 v values (8 bit) vv000000
|
||||
|
||||
; extract y
|
||||
pxor mm7,mm7 ; 0000000000000000
|
||||
punpcklbw mm0,mm7 ; interleave xmm7 into xmm0 y0y0y0y
|
||||
; extract u and duplicate so each becomes 0u0u
|
||||
punpcklbw mm1,mm7 ; interleave xmm7 into xmm1 u0u00000
|
||||
punpcklwd mm1,mm7 ; interleave again u000u000
|
||||
pshufw mm1,mm1, 0xA0 ; copy u values to get u0u0
|
||||
; extract v
|
||||
punpcklbw mm2,mm7 ; interleave xmm7 into xmm1 v0v00000
|
||||
punpcklwd mm2,mm7 ; interleave again v000v000
|
||||
pshufw mm2,mm2, 0xA0 ; copy v values to get v0v0
|
||||
|
||||
yuv2rgbsse
|
||||
|
||||
rgba32sseoutput
|
||||
|
||||
; endloop
|
||||
add edi,16
|
||||
add esi,4
|
||||
add eax,2
|
||||
add ebx,2
|
||||
sub ecx, 1 ; apparently sub is better than dec
|
||||
jnz REPEATLOOP3
|
||||
ENDLOOP3:
|
||||
; Cleanup
|
||||
emms
|
||||
pop ebx
|
||||
pop eax
|
||||
pop ecx
|
||||
pop esi
|
||||
pop edi
|
||||
mov esp, ebp
|
||||
pop ebp
|
||||
ret
|
||||
|
||||
SECTION .note.GNU-stack noalloc noexec nowrite progbits
|
307
src/add-ons/media/plugins/ffmpeg/yuvrgb_ssse3.nasm
Normal file
307
src/add-ons/media/plugins/ffmpeg/yuvrgb_ssse3.nasm
Normal file
@ -0,0 +1,307 @@
|
||||
;
|
||||
; Copyright (C) 2009-2010 David McPaul
|
||||
;
|
||||
; All rights reserved. Distributed under the terms of the MIT License.
|
||||
;
|
||||
|
||||
; A rather unoptimised set of ssse3 yuv to rgb converters
|
||||
; does 8 pixels per loop
|
||||
|
||||
; inputer:
|
||||
; reads 128 bits of yuv 8 bit data and puts
|
||||
; the y values converted to 16 bit in xmm0
|
||||
; the u values converted to 16 bit and duplicated into xmm1
|
||||
; the v values converted to 16 bit and duplicated into xmm2
|
||||
|
||||
; conversion:
|
||||
; does the yuv to rgb conversion using 16 bit fixed point and the
|
||||
; results are placed into the following registers as 8 bit clamped values
|
||||
; r values in xmm3
|
||||
; g values in xmm4
|
||||
; b values in xmm5
|
||||
|
||||
; outputer:
|
||||
; writes out the rgba pixels as 8 bit values with 0 for alpha
|
||||
|
||||
; xmm6 used for scratch
|
||||
; xmm7 used for scratch
|
||||
|
||||
; cglobal <name>
; Starts an exported, 16 byte aligned function.
; The symbol is exported with a leading underscore (_<name>), and <name>
; is defined as an alias for _<name> for the rest of the file.
%macro cglobal 1
    global _%1                      ; export the underscore-prefixed symbol
    %define %1 _%1                  ; later uses of the plain name map to it
    align 16                        ; start the function on a 16 byte boundary
%1:                                 ; expands to the label _<name>
%endmacro
|
||||
|
||||
; conversion code
|
||||
; yuv2rgbsse2
; Fixed point yuv -> rgb conversion on 8 pixels held as 16 bit words.
;
; In:    xmm0 = y, xmm1 = u, xmm2 = v
; Out:   xmm3 = r, xmm4 = g, xmm5 = b (16 bit words, not yet clamped to 8 bit)
;        xmm0 = y - 16, xmm1 = u - 128, xmm2 = v - 128
; Clobb: xmm6, xmm7 (scratch)
;
; All adds and subtracts saturate, so the order of the terms is significant.
;   r = y + v + v >> 2 + v >> 3 + v >> 5
;   g = y - (u >> 2 + u >> 4 + u >> 5) - (v >> 1 + v >> 3 + v >> 4 + v >> 5)
;   b = y + u + u >> 1 + u >> 2 + u >> 6
%macro yuv2rgbsse2 0
; remove the offsets: y -= 16, u -= 128, v -= 128
    psubsw  xmm0, [Const16]         ; y = y - 16 (constant read straight from memory)
    movdqa  xmm7, [Const128]        ; one load of 128 shared by u and v
    psubsw  xmm1, xmm7              ; u = u - 128
    psubsw  xmm2, xmm7              ; v = v - 128

; red: r = y + v + v >> 2 + v >> 3 + v >> 5
    movdqa  xmm3, xmm0              ; r = y
    paddsw  xmm3, xmm2              ; r += v
    movdqa  xmm6, xmm2              ; scratch = v
    psraw   xmm6, 2                 ; scratch = v >> 2
    paddsw  xmm3, xmm6              ; r += v >> 2
    psraw   xmm6, 1                 ; scratch = v >> 3
    paddsw  xmm3, xmm6              ; r += v >> 3
    psraw   xmm6, 2                 ; scratch = v >> 5
    paddsw  xmm3, xmm6              ; r += v >> 5

; blue: b = y + u + u >> 1 + u >> 2 + u >> 6
    movdqa  xmm5, xmm0              ; b = y
    paddsw  xmm5, xmm1              ; b += u
    movdqa  xmm7, xmm1              ; scratch = u
    psraw   xmm7, 1                 ; scratch = u >> 1
    paddsw  xmm5, xmm7              ; b += u >> 1
    psraw   xmm7, 1                 ; scratch = u >> 2
    paddsw  xmm5, xmm7              ; b += u >> 2
    psraw   xmm7, 4                 ; scratch = u >> 6 (a further shift by 4)
    paddsw  xmm5, xmm7              ; b += u >> 6

; green: g = y - u >> 2 - u >> 4 - u >> 5 - v >> 1 - v >> 3 - v >> 4 - v >> 5
    movdqa  xmm4, xmm0              ; g = y
    movdqa  xmm6, xmm1              ; scratch = u
    psraw   xmm6, 2                 ; scratch = u >> 2
    psubsw  xmm4, xmm6              ; g -= u >> 2
    psraw   xmm6, 2                 ; scratch = u >> 4
    psubsw  xmm4, xmm6              ; g -= u >> 4
    psraw   xmm6, 1                 ; scratch = u >> 5
    psubsw  xmm4, xmm6              ; g -= u >> 5

    movdqa  xmm7, xmm2              ; scratch = v
    psraw   xmm7, 1                 ; scratch = v >> 1
    psubsw  xmm4, xmm7              ; g -= v >> 1
    psraw   xmm7, 2                 ; scratch = v >> 3
    psubsw  xmm4, xmm7              ; g -= v >> 3
    psraw   xmm7, 1                 ; scratch = v >> 4
    psubsw  xmm4, xmm7              ; g -= v >> 4
    psraw   xmm7, 1                 ; scratch = v >> 5
    psubsw  xmm4, xmm7              ; g -= v >> 5
%endmacro
|
||||
|
||||
; outputer
|
||||
; rgba32sse2output
; Clamps 8 pixels of 16 bit r/g/b to 8 bit and stores them as 32 bytes at
; [edi] in b, g, r, 0 byte order, using stores that bypass the cache.
;
; In:    xmm3 = r, xmm4 = g, xmm5 = b (16 bit words), edi = output pointer
; Clobb: xmm0, xmm3, xmm4, xmm5, xmm7
; Note:  movntdq faults unless edi is 16 byte aligned.
%macro rgba32sse2output 0
    pxor    xmm7, xmm7              ; zero source for packing and for the 4th byte

; clamp to 0,255 and pack each channel to 8 bit per pixel (low 8 bytes)
    packuswb xmm5, xmm7             ; B
    packuswb xmm4, xmm7             ; G
    packuswb xmm3, xmm7             ; R

; build byte pairs
    punpcklbw xmm3, xmm7            ; r0r0r0r0r0r0r0r0
    punpcklbw xmm5, xmm4            ; bgbgbgbgbgbgbgbg
    movdqa  xmm0, xmm5              ; second copy of bg for the upper 4 pixels

; merge pairs into bgr0 pixels
    punpcklwd xmm5, xmm3            ; pixels 0-3  bgr0bgr0bgr0bgr0
    punpckhwd xmm0, xmm3            ; pixels 4-7  bgr0bgr0bgr0bgr0

; write to output ptr
    movntdq [edi], xmm5             ; first 4 pixels, bypassing cache
    movntdq [edi+16], xmm0          ; second 4 pixels, bypassing cache
%endmacro
|
||||
|
||||
SECTION .data align=16

; Every table below is exactly 16 bytes, so each one stays 16 byte aligned
; as required by movdqa and by pshufb with a memory operand.

; 8 words of 16, subtracted from the y samples
Const16:    times 8 dw 16

; 8 words of 128, subtracted from the u and v samples
Const128:   times 8 dw 128

; pshufb masks for packed yuyv input (byte 0 = y, 1 = u, 2 = y, 3 = v, ...).
; Each output word takes its low byte from the listed source byte; the
; 0x80 entries make pshufb write zero into the high byte.

; u bytes 1, 5, 9, 13, each one used for two consecutive words
UMask:      db 0x01, 0x80, 0x01, 0x80, 0x05, 0x80, 0x05, 0x80
            db 0x09, 0x80, 0x09, 0x80, 0x0d, 0x80, 0x0d, 0x80

; v bytes 3, 7, 11, 15, each one used for two consecutive words
VMask:      db 0x03, 0x80, 0x03, 0x80, 0x07, 0x80, 0x07, 0x80
            db 0x0b, 0x80, 0x0b, 0x80, 0x0f, 0x80, 0x0f, 0x80

; y bytes 0, 2, 4, ... 14, one per word
YMask:      db 0x00, 0x80, 0x02, 0x80, 0x04, 0x80, 0x06, 0x80
            db 0x08, 0x80, 0x0a, 0x80, 0x0c, 0x80, 0x0e, 0x80
|
||||
|
||||
|
||||
; Stack argument locations, relative to ebp once the function has run
; "push ebp / mov ebp, esp": [ebp] holds the saved ebp, [ebp+4] the return
; address, and the arguments follow from [ebp+8] in declaration order.

; void Convert_YUV422_RGBA32_SSSE3(void *fromPtr, void *toPtr, int width)
fromPtr     equ ebp+8
toPtr       equ ebp+12
width       equ ebp+16

; void Convert_YUV420P_RGBA32_SSSE3(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
fromYPtr    equ ebp+8
fromUPtr    equ ebp+12
fromVPtr    equ ebp+16
toPtr1      equ ebp+20
width1      equ ebp+24
|
||||
|
||||
SECTION .text align=16
|
||||
|
||||
;-----------------------------------------------------------------------
; void Convert_YUV422_RGBA32_SSSE3(void *fromPtr, void *toPtr, int width)
;
; Converts packed YUV 4:2:2 (yuyv byte order) to 32 bit pixels, 8 pixels
; per loop iteration.
;
; In:    [fromPtr] packed input, read 16 bytes at a time with movdqa,
;                  so it must be 16 byte aligned
;        [toPtr]   output, written 32 bytes at a time with movntdq,
;                  so it must be 16 byte aligned
;        [width]   width in pixels, rounded down to a multiple of 8;
;                  a width below 8 (including a negative one) converts
;                  nothing
; Clobb: xmm0-xmm7, flags. edi, esi and ecx are saved and restored.
;-----------------------------------------------------------------------
cglobal Convert_YUV422_RGBA32_SSSE3
; reserve variables
    push    ebp
    mov     ebp, esp
    push    edi
    push    esi
    push    ecx

    mov     esi, [fromPtr]
    mov     edi, [toPtr]
    mov     ecx, [width]
; loop width / 8 times
    sar     ecx, 3                  ; arithmetic shift keeps the sign, so the
                                    ; jng below also rejects a negative width
                                    ; (shr turned it into a huge loop count)
    test    ecx, ecx
    jng     ENDLOOP
REPEATLOOP:                         ; loop over width / 8
; YUV422 packed inputer
    movdqa  xmm0, [esi]             ; should have yuyv yuyv yuyv yuyv
    pshufd  xmm1, xmm0, 0xE4        ; copy to xmm1 (0xE4 is the identity shuffle)
    movdqa  xmm2, xmm0              ; copy to xmm2
; extract both y giving y0y0
    pshufb  xmm0, [YMask]
; extract u and duplicate so each u in yuyv becomes u0u0
    pshufb  xmm1, [UMask]
; extract v and duplicate so each v in yuyv becomes v0v0
    pshufb  xmm2, [VMask]

    yuv2rgbsse2                     ; xmm0/xmm1/xmm2 -> r, g, b in xmm3/xmm4/xmm5

    rgba32sse2output                ; clamp and store 8 pixels at [edi]

; endloop
    add     edi, 32                 ; 8 output pixels of 4 bytes each
    add     esi, 16                 ; 8 input pixels of 2 bytes each
    sub     ecx, 1                  ; apparently sub is better than dec
    jnz     REPEATLOOP
ENDLOOP:
; Cleanup
    pop     ecx
    pop     esi
    pop     edi
    mov     esp, ebp
    pop     ebp
    ret
|
||||
|
||||
;-----------------------------------------------------------------------
; void Convert_YUV420P_RGBA32_SSSE3(void *fromYPtr, void *fromUPtr,
;                                   void *fromVPtr, void *toPtr, int width)
;
; Converts planar YUV 4:2:0 samples to 32 bit pixels, 8 pixels per loop
; iteration.
;
; In:    [fromYPtr] y samples, 8 bytes read per iteration
;        [fromUPtr] u samples, 4 bytes read per iteration
;        [fromVPtr] v samples, 4 bytes read per iteration
;        [toPtr1]   output, written 32 bytes at a time with movntdq,
;                   so it must be 16 byte aligned
;        [width1]   width in pixels, rounded down to a multiple of 8;
;                   a width below 8 (including a negative one) converts
;                   nothing
; Clobb: xmm0-xmm7, flags. edi, esi, ecx, eax and ebx are saved and
;        restored.
;-----------------------------------------------------------------------
cglobal Convert_YUV420P_RGBA32_SSSE3
; reserve variables
    push    ebp
    mov     ebp, esp
    push    edi
    push    esi
    push    ecx
    push    eax
    push    ebx

    mov     esi, [fromYPtr]
    mov     eax, [fromUPtr]
    mov     ebx, [fromVPtr]
    mov     edi, [toPtr1]
    mov     ecx, [width1]
; loop width / 8 times
    sar     ecx, 3                  ; arithmetic shift keeps the sign, so the
                                    ; jng below also rejects a negative width
                                    ; (shr turned it into a huge loop count)
    test    ecx, ecx
    jng     ENDLOOP1
REPEATLOOP1:                        ; loop over width / 8
; YUV420 Planar inputer
    movq    xmm0, [esi]             ; fetch 8 y values (8 bit) yyyyyyyy00000000
    movd    xmm1, [eax]             ; fetch 4 u values (8 bit) uuuu000000000000
    movd    xmm2, [ebx]             ; fetch 4 v values (8 bit) vvvv000000000000

; extract y
    pxor    xmm7, xmm7              ; 00000000000000000000000000000000
    punpcklbw xmm0, xmm7            ; interleave xmm7 into xmm0 y0y0y0y0y0y0y0y0
; extract u and duplicate so each becomes 0u0u
    punpcklbw xmm1, xmm7            ; interleave xmm7 into xmm1 u0u0u0u000000000
    punpcklwd xmm1, xmm7            ; interleave again          u000u000u000u000
    pshuflw xmm1, xmm1, 0xA0        ; copy u values
    pshufhw xmm1, xmm1, 0xA0        ; to get u0u0
; extract v
    punpcklbw xmm2, xmm7            ; interleave xmm7 into xmm2 v0v0v0v000000000
    punpcklwd xmm2, xmm7            ; interleave again          v000v000v000v000
    pshuflw xmm2, xmm2, 0xA0        ; copy v values
    pshufhw xmm2, xmm2, 0xA0        ; to get v0v0

    yuv2rgbsse2                     ; xmm0/xmm1/xmm2 -> r, g, b in xmm3/xmm4/xmm5

    rgba32sse2output                ; clamp and store 8 pixels at [edi]

; endloop
    add     edi, 32                 ; 8 output pixels of 4 bytes each
    add     esi, 8                  ; 8 y samples consumed
    add     eax, 4                  ; 4 u samples consumed
    add     ebx, 4                  ; 4 v samples consumed
    sub     ecx, 1                  ; apparently sub is better than dec
    jnz     REPEATLOOP1
ENDLOOP1:
; Cleanup
    pop     ebx
    pop     eax
    pop     ecx
    pop     esi
    pop     edi
    mov     esp, ebp
    pop     ebp
    ret
|
||||
|
||||
SECTION .note.GNU-stack noalloc noexec nowrite progbits
|
Loading…
x
Reference in New Issue
Block a user