Correct yuv422 planar conversion. Separate sse, sse2 and ssse3 asm code. Add packed converter

git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@39913 a95241bf-73f2-0310-859d-f6bbb57e9c96
David McPaul 2010-12-21 07:25:40 +00:00
parent 88cbc0dbc5
commit 51d9b4fe14
7 changed files with 816 additions and 261 deletions

Jamfile

@@ -27,7 +27,9 @@ Addon ffmpeg :
gfx_conv_c_lookup.cpp
gfx_conv_mmx.cpp
gfx_util.cpp
-yuvrgb.nasm
yuvrgb_sse.nasm
yuvrgb_sse2.nasm
yuvrgb_ssse3.nasm
:
libavformat.a
libavcodec.a

gfx_conv_mmx.cpp

@@ -1,22 +1,63 @@
#include "gfx_conv_mmx.h"
#include "gfx_conv_c.h"
extern "C" void _Convert_YUV420P_RGBA32_SSE2(void *fromYPtr, void *fromUPtr,
void *fromVPtr, void *toPtr, int width);
extern "C" void _Convert_YUV422_RGBA32_SSE2(void *fromYPtr, void *toPtr,
int width);
extern "C" void _Convert_YUV420P_RGBA32_SSE(void *fromYPtr, void *fromUPtr,
void *fromVPtr, void *toPtr, int width);
// Packed
extern "C" void _Convert_YUV422_RGBA32_SSE(void *fromYPtr, void *toPtr,
int width);
extern "C" void _Convert_YUV422_RGBA32_SSE2(void *fromYPtr, void *toPtr,
int width);
extern "C" void _Convert_YUV422_RGBA32_SSSE3(void *fromYPtr, void *toPtr,
int width);
// Planar
extern "C" void _Convert_YUV420P_RGBA32_SSE(void *fromYPtr, void *fromUPtr,
void *fromVPtr, void *toPtr, int width);
extern "C" void _Convert_YUV420P_RGBA32_SSE2(void *fromYPtr, void *fromUPtr,
void *fromVPtr, void *toPtr, int width);
extern "C" void _Convert_YUV420P_RGBA32_SSSE3(void *fromYPtr, void *fromUPtr,
void *fromVPtr, void *toPtr, int width);
// Planar YUV420 means 2 Y lines share a UV line
void
gfx_conv_yuv420p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height)
{
// in and out buffers must be aligned to 16 bytes,
// in should be as ffmpeg allocates it
if ((off_t)out->data[0] % 16 != 0) {
gfx_conv_YCbCr420p_RGB32_c(in, out, width, height);
return;
}
uint8 *ybase = (uint8 *)in->data[0];
uint8 *ubase = (uint8 *)in->data[1];
uint8 *vbase = (uint8 *)in->data[2];
uint8 *rgbbase = (uint8 *)out->data[0];
int yBaseInc = in->linesize[0];
int uBaseInc = in->linesize[1];
int vBaseInc = in->linesize[2];
int rgbBaseInc = out->linesize[0];
for (int i=0;i<height;i+=2) {
// First Y row
_Convert_YUV420P_RGBA32_SSE(ybase, ubase, vbase, rgbbase, width);
ybase += yBaseInc;
rgbbase += rgbBaseInc;
// Second Y row but same u and v row
_Convert_YUV420P_RGBA32_SSE(ybase, ubase, vbase, rgbbase, width);
ybase += yBaseInc;
ubase += uBaseInc;
vbase += vBaseInc;
rgbbase += rgbBaseInc;
}
}
// Planar YUV420 means 2 Y lines share a UV line
void
gfx_conv_yuv420p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
{
-// Planar YUV420
// in and out buffers must be aligned to 32 bytes,
// in should be as ffmpeg allocates it
if ((off_t)out->data[0] % 32 != 0) {
@@ -49,12 +90,157 @@ gfx_conv_yuv420p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
}
}
// Planar YUV420 means 2 Y lines share a UV line
void
gfx_conv_yuv420p_rgba32_ssse3(AVFrame *in, AVFrame *out, int width, int height)
{
// in and out buffers must be aligned to 32 bytes,
// in should be as ffmpeg allocates it
if ((off_t)out->data[0] % 32 != 0) {
gfx_conv_YCbCr420p_RGB32_c(in, out, width, height);
return;
}
uint8 *ybase = (uint8 *)in->data[0];
uint8 *ubase = (uint8 *)in->data[1];
uint8 *vbase = (uint8 *)in->data[2];
uint8 *rgbbase = (uint8 *)out->data[0];
int yBaseInc = in->linesize[0];
int uBaseInc = in->linesize[1];
int vBaseInc = in->linesize[2];
int rgbBaseInc = out->linesize[0];
for (int i=0;i<height;i+=2) {
// First Y row
_Convert_YUV420P_RGBA32_SSSE3(ybase, ubase, vbase, rgbbase, width);
ybase += yBaseInc;
rgbbase += rgbBaseInc;
// Second Y row but same u and v row
_Convert_YUV420P_RGBA32_SSSE3(ybase, ubase, vbase, rgbbase, width);
ybase += yBaseInc;
ubase += uBaseInc;
vbase += vBaseInc;
rgbbase += rgbBaseInc;
}
}
// Planar YUV422 means each Y line has its own UV line
void
gfx_conv_yuv422p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height)
{
// in and out buffers must be aligned to 32 bytes,
// in should be as ffmpeg allocates it
if ((off_t)out->data[0] % 32 != 0) {
gfx_conv_YCbCr422_RGB32_c(in, out, width, height);
return;
}
uint8 *ybase = (uint8 *)in->data[0];
uint8 *ubase = (uint8 *)in->data[1];
uint8 *vbase = (uint8 *)in->data[2];
uint8 *rgbbase = (uint8 *)out->data[0];
int yBaseInc = in->linesize[0];
int uBaseInc = in->linesize[1];
int vBaseInc = in->linesize[2];
int rgbBaseInc = out->linesize[0];
for (int i=0;i<height;i++) {
_Convert_YUV420P_RGBA32_SSE(ybase, ubase, vbase, rgbbase, width);
ybase += yBaseInc;
ubase += uBaseInc;
vbase += vBaseInc;
rgbbase += rgbBaseInc;
}
}
// Planar YUV422 means each Y line has its own UV line
void
gfx_conv_yuv422p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
{
-// Packed YUV422
// in and out buffers must be aligned to 32 bytes,
// in should be as ffmpeg allocates it
if ((off_t)out->data[0] % 32 != 0) {
gfx_conv_YCbCr422_RGB32_c(in, out, width, height);
return;
}
uint8 *ybase = (uint8 *)in->data[0];
uint8 *ubase = (uint8 *)in->data[1];
uint8 *vbase = (uint8 *)in->data[2];
uint8 *rgbbase = (uint8 *)out->data[0];
int yBaseInc = in->linesize[0];
int uBaseInc = in->linesize[1];
int vBaseInc = in->linesize[2];
int rgbBaseInc = out->linesize[0];
for (int i=0;i<height;i++) {
_Convert_YUV420P_RGBA32_SSE2(ybase, ubase, vbase, rgbbase, width);
ybase += yBaseInc;
ubase += uBaseInc;
vbase += vBaseInc;
rgbbase += rgbBaseInc;
}
}
// Planar YUV422 means each Y line has its own UV line
void
gfx_conv_yuv422p_rgba32_ssse3(AVFrame *in, AVFrame *out, int width, int height)
{
// in and out buffers must be aligned to 32 bytes,
// in should be as ffmpeg allocates it
if ((off_t)out->data[0] % 32 != 0) {
gfx_conv_YCbCr422_RGB32_c(in, out, width, height);
return;
}
uint8 *ybase = (uint8 *)in->data[0];
uint8 *ubase = (uint8 *)in->data[1];
uint8 *vbase = (uint8 *)in->data[2];
uint8 *rgbbase = (uint8 *)out->data[0];
int yBaseInc = in->linesize[0];
int uBaseInc = in->linesize[1];
int vBaseInc = in->linesize[2];
int rgbBaseInc = out->linesize[0];
for (int i=0;i<height;i++) {
_Convert_YUV420P_RGBA32_SSSE3(ybase, ubase, vbase, rgbbase, width);
ybase += yBaseInc;
ubase += uBaseInc;
vbase += vBaseInc;
rgbbase += rgbBaseInc;
}
}
// Packed YUV422 (YUYV)
void
gfx_conv_yuv422_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height)
{
// in and out buffers must be aligned to 16 bytes,
// in should be as ffmpeg allocates it
if ((off_t)out->data[0] % 16 != 0) {
gfx_conv_YCbCr422_RGB32_c(in, out, width, height);
return;
}
uint8 *ybase = (uint8 *)in->data[0];
uint8 *rgbbase = (uint8 *)out->data[0];
for (int i = 0; i < height; i++) {
_Convert_YUV422_RGBA32_SSE(ybase, rgbbase, width);
ybase += in->linesize[0];
rgbbase += out->linesize[0];
}
}
// Packed YUV422 (YUYV)
void
gfx_conv_yuv422_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
{
// in and out buffers must be aligned to 32 bytes,
// in should be as ffmpeg allocates it
if ((off_t)out->data[0] % 32 != 0) {
@@ -72,53 +258,13 @@ gfx_conv_yuv422p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height)
}
}
// Packed YUV422 (YUYV)
void
-gfx_conv_yuv420p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height)
gfx_conv_yuv422_rgba32_ssse3(AVFrame *in, AVFrame *out, int width, int height)
{
-// Planar YUV420
-// in and out buffers must be aligned to 16 bytes,
// in and out buffers must be aligned to 32 bytes,
// in should be as ffmpeg allocates it
-if ((off_t)out->data[0] % 16 != 0) {
-gfx_conv_YCbCr420p_RGB32_c(in, out, width, height);
-return;
-}
-uint8 *ybase = (uint8 *)in->data[0];
-uint8 *ubase = (uint8 *)in->data[1];
-uint8 *vbase = (uint8 *)in->data[2];
-uint8 *rgbbase = (uint8 *)out->data[0];
-int yBaseInc = in->linesize[0];
-int uBaseInc = in->linesize[1];
-int vBaseInc = in->linesize[2];
-int rgbBaseInc = out->linesize[0];
-for (int i=0;i<height;i+=2) {
-// First Y row
-_Convert_YUV420P_RGBA32_SSE(ybase, ubase, vbase, rgbbase, width);
-ybase += yBaseInc;
-rgbbase += rgbBaseInc;
-// Second Y row but same u and v row
-_Convert_YUV420P_RGBA32_SSE(ybase, ubase, vbase, rgbbase, width);
-ybase += yBaseInc;
-ubase += uBaseInc;
-vbase += vBaseInc;
-rgbbase += rgbBaseInc;
-}
-}
-void
-gfx_conv_yuv422p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height)
-{
-// Packed YUV422
-// in and out buffers must be aligned to 16 bytes,
-// in should be as ffmpeg allocates it
-if ((off_t)out->data[0] % 16 != 0) {
if ((off_t)out->data[0] % 32 != 0) {
gfx_conv_YCbCr422_RGB32_c(in, out, width, height);
return;
}
@@ -127,7 +273,7 @@ gfx_conv_yuv422p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height)
uint8 *rgbbase = (uint8 *)out->data[0];
for (int i = 0; i < height; i++) {
-_Convert_YUV422_RGBA32_SSE(ybase, rgbbase, width);
_Convert_YUV422_RGBA32_SSSE3(ybase, rgbbase, width);
ybase += in->linesize[0];
rgbbase += out->linesize[0];
}
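For reference, the planar YUV420 walk in the gfx_conv_yuv420p_rgba32_* functions above can be written as scalar C++. The sketch below is added for this writeup and is not code from the commit: the names are invented, it uses the same shift-based coefficients as the .nasm converters that follow, and it writes the little-endian b,g,r,0 byte order of B_RGB32.

#include <cstdint>

static inline uint8_t Clamp(int v)
{
	return v < 0 ? 0 : (v > 255 ? 255 : v);
}

// Scalar YUV420P to RGBA32: two consecutive y rows share one u/v row,
// and each u/v sample covers two pixels horizontally.
void ConvertYUV420PScalar(const uint8_t* y, const uint8_t* u, const uint8_t* v,
	int yStride, int uvStride, uint8_t* rgb, int rgbStride,
	int width, int height)
{
	for (int row = 0; row < height; row++) {
		const uint8_t* uRow = u + (row / 2) * uvStride;
		const uint8_t* vRow = v + (row / 2) * uvStride;
		for (int col = 0; col < width; col++) {
			int Y = y[col] - 16;
			int U = uRow[col / 2] - 128;
			int V = vRow[col / 2] - 128;
			// Same fixed-point approximation as the yuv2rgbsse macro;
			// >> on these negative ints is an arithmetic shift in
			// practice, matching psraw.
			int r = Y + V + (V >> 2) + (V >> 3) + (V >> 5);
			int g = Y - ((U >> 2) + (U >> 4) + (U >> 5))
				- ((V >> 1) + (V >> 3) + (V >> 4) + (V >> 5));
			int b = Y + U + (U >> 1) + (U >> 2) + (U >> 6);
			rgb[col * 4 + 0] = Clamp(b); // B_RGB32 is b,g,r,0 in memory
			rgb[col * 4 + 1] = Clamp(g);
			rgb[col * 4 + 2] = Clamp(r);
			rgb[col * 4 + 3] = 0;
		}
		y += yStride;
		rgb += rgbStride;
	}
}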

gfx_conv_mmx.h

@@ -7,9 +7,17 @@
void gfx_conv_null_mmx(AVFrame *in, AVFrame *out, int width, int height);
-void gfx_conv_yuv420p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height);
-void gfx_conv_yuv422p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height);
// Planar
void gfx_conv_yuv420p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height);
void gfx_conv_yuv420p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height);
void gfx_conv_yuv420p_rgba32_ssse3(AVFrame *in, AVFrame *out, int width, int height);
void gfx_conv_yuv422p_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height);
void gfx_conv_yuv422p_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height);
void gfx_conv_yuv422p_rgba32_ssse3(AVFrame *in, AVFrame *out, int width, int height);
// Packed
void gfx_conv_yuv422_rgba32_sse(AVFrame *in, AVFrame *out, int width, int height);
void gfx_conv_yuv422_rgba32_sse2(AVFrame *in, AVFrame *out, int width, int height);
void gfx_conv_yuv422_rgba32_ssse3(AVFrame *in, AVFrame *out, int width, int height);
#endif

gfx_util.cpp

@@ -29,6 +29,7 @@ resolve_colorspace(color_space colorSpace, PixelFormat pixelFormat, int width,
switch (colorSpace) {
case B_RGB32:
// Planar Formats
if (pixelFormat == PIX_FMT_YUV410P) {
TRACE("resolve_colorspace: gfx_conv_yuv410p_rgb32_c\n");
return gfx_conv_yuv410p_rgb32_c;
@@ -57,14 +58,32 @@ resolve_colorspace(color_space colorSpace, PixelFormat pixelFormat, int width,
if (pixelFormat == PIX_FMT_YUV422P
|| pixelFormat == PIX_FMT_YUVJ422P) {
-if (cpu.HasSSE2() && width % 8 == 0)
if (cpu.HasSSE2() && width % 8 == 0) {
TRACE("resolve_colorspace: gfx_conv_yuv422p_RGB32_sse2\n");
return gfx_conv_yuv422p_rgba32_sse2;
-else if (cpu.HasSSE1() && width % 4 == 0)
} else if (cpu.HasSSE1() && width % 4 == 0) {
TRACE("resolve_colorspace: gfx_conv_yuv422p_RGB32_sse\n");
return gfx_conv_yuv422p_rgba32_sse;
-else
} else {
TRACE("resolve_colorspace: gfx_conv_YCbCr422p_RGB32_c\n");
return gfx_conv_YCbCr422_RGB32_c;
}
}
// Packed Formats
if (pixelFormat == PIX_FMT_YUYV422) {
if (cpu.HasSSSE3() && width % 8 == 0) {
return gfx_conv_yuv422_rgba32_ssse3;
} else if (cpu.HasSSE2() && width % 8 == 0) {
return gfx_conv_yuv422_rgba32_sse2;
} else if (cpu.HasSSE1() && width % 4 == 0
&& height % 2 == 0) {
return gfx_conv_yuv422_rgba32_sse;
} else {
return gfx_conv_YCbCr422_RGB32_c;
}
}
TRACE("resolve_colorspace: %s => B_RGB32: NULL\n",
pixfmt_to_string(pixelFormat));
return NULL;
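A note on the width tests above, with a small sketch of my own (not from the commit): the asm loops compute width >> 2 (SSE, 4 pixels per iteration) or width >> 3 (SSE2/SSSE3, 8 per iteration) and never touch the remainder, so a width that is not a multiple would leave the tail of every row unconverted. That is why those cases fall back to the C converters.

// Sketch only, not from the commit: how many pixels a SIMD loop converts
// per row. shift is 2 for the SSE paths and 3 for the SSE2/SSSE3 paths.
static int PixelsConverted(int width, int shift)
{
	return (width >> shift) << shift; // PixelsConverted(722, 3) == 720
}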

yuvrgb_sse.nasm

@@ -0,0 +1,268 @@
;
; Copyright (C) 2009-2010 David McPaul
;
; All rights reserved. Distributed under the terms of the MIT License.
;
; A rather unoptimised set of sse yuv to rgb converters
; does 4 pixels per loop
; inputer:
; reads 64 bits of yuv 8 bit data and puts
; the y values converted to 16 bit in mm0
; the u values converted to 16 bit and duplicated into mm1
; the v values converted to 16 bit and duplicated into mm2
; conversion:
; does the yuv to rgb conversion using 16 bit fixed point and the
; results are placed into the following registers as 8 bit clamped values
; r values in mm3
; g values in mm4
; b values in mm5
; outputer:
; writes out the rgba pixels as 8 bit values with 0 for alpha
; mm6 used for scratch
; mm7 used for scratch
%macro cglobal 1
global _%1
%define %1 _%1
align 16
%1:
%endmacro
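; Note (not part of the commit): this macro exports each symbol with a
; leading underscore to match C symbol decoration, which is why the C++
; side declares extern "C" _Convert_YUV422_RGBA32_SSE and friends.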
; conversion code
%macro yuv2rgbsse 0
; u = u - 128
; v = v - 128
; r = y + v + v >> 2 + v >> 3 + v >> 5
; g = y - (u >> 2 + u >> 4 + u >> 5) - (v >> 1 + v >> 3 + v >> 4 + v >> 5)
; b = y + u + u >> 1 + u >> 2 + u >> 6
; subtract 16 from y
movq mm7, [Const16] ; loads a constant using data cache (slower on first fetch but then cached)
psubsw mm0,mm7 ; y = y - 16
; subtract 128 from u and v
movq mm7, [Const128] ; loads a constant using data cache (slower on first fetch but then cached)
psubsw mm1,mm7 ; u = u - 128
psubsw mm2,mm7 ; v = v - 128
; load r and b with y (g is loaded below)
movq mm3,mm0 ; r = y
pshufw mm5,mm0, 0xE4 ; b = y
; r = r + v + v >> 2 + v >> 3 + v >> 5
paddsw mm3, mm2 ; add v to r
movq mm7, mm1 ; move u to scratch
pshufw mm6, mm2, 0xE4 ; move v to scratch
psraw mm6,2 ; divide v by 4
paddsw mm3, mm6 ; and add to r
psraw mm6,1 ; divide v by 2
paddsw mm3, mm6 ; and add to r
psraw mm6,2 ; divide v by 4
paddsw mm3, mm6 ; and add to r
; b = y + u + u >> 1 + u >> 2 + u >> 6
paddsw mm5, mm1 ; add u to b
psraw mm7,1 ; divide u by 2
paddsw mm5, mm7 ; and add to b
psraw mm7,1 ; divide u by 2
paddsw mm5, mm7 ; and add to b
psraw mm7,4 ; divide u by 16 (total now u >> 6)
paddsw mm5, mm7 ; and add to b
; g = y - u >> 2 - u >> 4 - u >> 5 - v >> 1 - v >> 3 - v >> 4 - v >> 5
movq mm7,mm2 ; move v to scratch
pshufw mm6,mm1, 0xE4 ; move u to scratch
movq mm4,mm0 ; g = y
psraw mm6,2 ; divide u by 4
psubsw mm4,mm6 ; subtract from g
psraw mm6,2 ; divide u by 4
psubsw mm4,mm6 ; subtract from g
psraw mm6,1 ; divide u by 2
psubsw mm4,mm6 ; subtract from g
psraw mm7,1 ; divide v by 2
psubsw mm4,mm7 ; subtract from g
psraw mm7,2 ; divide v by 4
psubsw mm4,mm7 ; subtract from g
psraw mm7,1 ; divide v by 2
psubsw mm4,mm7 ; subtract from g
psraw mm7,1 ; divide v by 2
psubsw mm4,mm7 ; subtract from g
%endmacro
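; Note (not part of the commit): the shift sums above approximate the usual
; BT.601 YCbCr to RGB coefficients:
; v + v>>2 + v>>3 + v>>5 = 1.40625 vs 1.402 for r
; u>>2 + u>>4 + u>>5 = 0.34375 vs 0.344 for g (u term)
; v>>1 + v>>3 + v>>4 + v>>5 = 0.71875 vs 0.714 for g (v term)
; u + u>>1 + u>>2 + u>>6 = 1.765625 vs 1.772 for b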
; outputer
%macro rgba32sseoutput 0
; clamp values
pxor mm7,mm7
packuswb mm3,mm7 ; clamp to 0,255 and pack R to 8 bit per pixel
packuswb mm4,mm7 ; clamp to 0,255 and pack G to 8 bit per pixel
packuswb mm5,mm7 ; clamp to 0,255 and pack B to 8 bit per pixel
; convert to bgra32 packed
punpcklbw mm5,mm4 ; bgbgbgbgbgbgbgbg
movq mm0, mm5 ; save bg values
punpcklbw mm3,mm7 ; r0r0r0r0
punpcklwd mm5,mm3 ; lower half bgr0bgr0
punpckhwd mm0,mm3 ; upper half bgr0bgr0
; write to output ptr
movq [edi], mm5 ; output first 2 pixels
movq [edi+8], mm0 ; output second 2 pixels
%endmacro
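; Note (not part of the commit): interleaving b with g and then the bg words
; with the r0 words leaves the bytes in memory as b,g,r,0 per pixel, which
; is the little-endian B_RGB32 layout with alpha written as 0.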
SECTION .data align=16
Const16 dw 16
dw 16
dw 16
dw 16
dw 16
dw 16
dw 16
dw 16
Const128 dw 128
dw 128
dw 128
dw 128
dw 128
dw 128
dw 128
dw 128
; Packed Convert
; void Convert_YUV422_RGBA32_SSE(void *fromPtr, void *toPtr, int width)
width equ ebp+16
toPtr equ ebp+12
fromPtr equ ebp+8
; Planar Convert
; void Convert_YUV420P_RGBA32_SSE(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
width1 equ ebp+24
toPtr1 equ ebp+20
fromVPtr equ ebp+16
fromUPtr equ ebp+12
fromYPtr equ ebp+8
SECTION .text align=16
; YUY2 FOURCC
cglobal Convert_YUV422_RGBA32_SSE
; reserve variables
push ebp
mov ebp, esp
push edi
push esi
push ecx
mov esi, [fromPtr]
mov ecx, [width]
mov edi, [toPtr]
; loop width / 4 times
shr ecx,2
test ecx,ecx
jng ENDLOOP2
REPEATLOOP2: ; loop over width / 4
; YUV422 packed inputer
movq mm0, [esi] ; should have yuyv yuyv
pshufw mm1, mm0, 0xE4 ; copy to mm1
movq mm2, mm0 ; copy to mm2
; extract y
pxor mm7,mm7 ; 0000000000000000
pcmpeqb mm6,mm6 ; ffffffffffffffff
punpckhbw mm6,mm7 ; interleave mm7 into mm6 ff00ff00ff00ff00
pand mm0, mm6 ; clear all but y values leaving y0y0 etc
; extract u and duplicate so each u in yuyv becomes 0u0u
psrld mm6,8 ; 00ff0000 00ff0000
pand mm1, mm6 ; clear all yv values leaving 0u00 etc
psrld mm1,8 ; shift u right to get u000
pshufw mm1,mm1, 0xA0 ; copy u values to get u0u0 (SSE not MMX)
; extract v
pslld mm6,16 ; 000000ff000000ff
pand mm2, mm6 ; clear all yu values leaving 000v etc
psrld mm2,8 ; shift v right to get 00v0
pshufw mm2,mm2, 0xF5 ; copy v values to get v0v0 (SSE not MMX)
yuv2rgbsse
rgba32sseoutput
; endloop
add edi,16
add esi,8
sub ecx, 1 ; apparently sub is better than dec
jnz REPEATLOOP2
ENDLOOP2:
; Cleanup
emms ; reset mmx regs back to float
pop ecx
pop esi
pop edi
mov esp, ebp
pop ebp
ret
cglobal Convert_YUV420P_RGBA32_SSE
; reserve variables
push ebp
mov ebp, esp
push edi
push esi
push ecx
push eax
push ebx
mov esi, [fromYPtr]
mov eax, [fromUPtr]
mov ebx, [fromVPtr]
mov edi, [toPtr1]
mov ecx, [width1]
; loop width / 4 times
shr ecx,2
test ecx,ecx
jng ENDLOOP3
REPEATLOOP3: ; loop over width / 4
; YUV420 Planar inputer
movq mm0, [esi] ; fetch 4 y values (8 bit) yyyy0000
movd mm1, [eax] ; fetch 2 u values (8 bit) uu000000
movd mm2, [ebx] ; fetch 2 v values (8 bit) vv000000
; extract y
pxor mm7,mm7 ; 0000000000000000
punpcklbw mm0,mm7 ; interleave mm7 into mm0 y0y0y0y0
; extract u and duplicate so each becomes 0u0u
punpcklbw mm1,mm7 ; interleave mm7 into mm1 u0u00000
punpcklwd mm1,mm7 ; interleave again u000u000
pshufw mm1,mm1, 0xA0 ; copy u values to get u0u0
; extract v
punpcklbw mm2,mm7 ; interleave mm7 into mm2 v0v00000
punpcklwd mm2,mm7 ; interleave again v000v000
pshufw mm2,mm2, 0xA0 ; copy v values to get v0v0
yuv2rgbsse
rgba32sseoutput
; endloop
add edi,16
add esi,4
add eax,2
add ebx,2
sub ecx, 1 ; apparently sub is better than dec
jnz REPEATLOOP3
ENDLOOP3:
; Cleanup
emms
pop ebx
pop eax
pop ecx
pop esi
pop edi
mov esp, ebp
pop ebp
ret
SECTION .note.GNU-stack noalloc noexec nowrite progbits
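To make the packed input stage above easier to follow, here is the same unpacking as scalar C++. This is a sketch for this writeup, not code from the commit, and UnpackYUY2Pair is an invented name.

#include <cstdint>

// YUY2 stores two pixels in four bytes as y0 u y1 v; u and v are shared by
// the pixel pair. The mask/shift/pshufw sequence above rebuilds exactly this
// into the y0y0, u0u0 and v0v0 word lanes.
void UnpackYUY2Pair(const uint8_t yuyv[4], int y[2], int u[2], int v[2])
{
	y[0] = yuyv[0];        // pand with the ff00 word mask keeps the y bytes
	u[0] = u[1] = yuyv[1]; // pand + psrld + pshufw duplicates u
	y[1] = yuyv[2];
	v[0] = v[1] = yuyv[3]; // pand + pslld/psrld + pshufw duplicates v
}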

yuvrgb_sse2.nasm

@@ -4,11 +4,11 @@
; All rights reserved. Distributed under the terms of the MIT License.
;
-; A rather unoptimised set of yuv to rgb converters
-; does 8 pixels at a time
; A rather unoptimised set of sse2 yuv to rgb converters
; does 8 pixels per loop
; inputer:
-; reads 128bits of yuv 8 bit data and puts
; reads 128 bits of yuv 8 bit data and puts
; the y values converted to 16 bit in xmm0
; the u values converted to 16 bit and duplicated into xmm1
; the v values converted to 16 bit and duplicated into xmm2
@@ -94,67 +94,6 @@
psubsw xmm4,xmm7 ; subtract from g
%endmacro
-; conversion code
-%macro yuv2rgbsse 0
-; u = u - 128
-; v = v - 128
-; r = y + v + v >> 2 + v >> 3 + v >> 5
-; g = y - (u >> 2 + u >> 4 + u >> 5) - (v >> 1 + v >> 3 + v >> 4 + v >> 5)
-; b = y + u + u >> 1 + u >> 2 + u >> 6
-; subtract 16 from y
-movq mm7, [Const16] ; loads a constant using data cache (slower on first fetch but then cached)
-psubsw mm0,mm7 ; y = y - 16
-; subtract 128 from u and v
-movq mm7, [Const128] ; loads a constant using data cache (slower on first fetch but then cached)
-psubsw mm1,mm7 ; u = u - 128
-psubsw mm2,mm7 ; v = v - 128
-; load r,g,b with y
-movq mm3,mm0 ; r = y
-pshufw mm5,mm0, 0xE4 ; b = y
-; r = r + v + v >> 2 + v >> 3 + v >> 5
-paddsw mm3, mm2 ; add v to r
-movq mm7, mm1 ; move u to scratch
-pshufw mm6, mm2, 0xE4 ; move v to scratch
-psraw mm6,2 ; divide v by 4
-paddsw mm3, mm6 ; and add to r
-psraw mm6,1 ; divide v by 2
-paddsw mm3, mm6 ; and add to r
-psraw mm6,2 ; divide v by 4
-paddsw mm3, mm6 ; and add to r
-; b = y + u + u >> 1 + u >> 2 + u >> 6
-paddsw mm5, mm1 ; add u to b
-psraw mm7,1 ; divide u by 2
-paddsw mm5, mm7 ; and add to b
-psraw mm7,1 ; divide u by 2
-paddsw mm5, mm7 ; and add to b
-psraw mm7,4 ; divide u by 32
-paddsw mm5, mm7 ; and add to b
-; g = y - u >> 2 - u >> 4 - u >> 5 - v >> 1 - v >> 3 - v >> 4 - v >> 5
-movq mm7,mm2 ; move v to scratch
-pshufw mm6,mm1, 0xE4 ; move u to scratch
-movq mm4,mm0 ; g = y
-psraw mm6,2 ; divide u by 4
-psubsw mm4,mm6 ; subtract from g
-psraw mm6,2 ; divide u by 4
-psubsw mm4,mm6 ; subtract from g
-psraw mm6,1 ; divide u by 2
-psubsw mm4,mm6 ; subtract from g
-psraw mm7,1 ; divide v by 2
-psubsw mm4,mm7 ; subtract from g
-psraw mm7,2 ; divide v by 4
-psubsw mm4,mm7 ; subtract from g
-psraw mm7,1 ; divide v by 2
-psubsw mm4,mm7 ; subtract from g
-psraw mm7,1 ; divide v by 2
-psubsw mm4,mm7 ; subtract from g
-%endmacro
; outputer
%macro rgba32sse2output 0
; clamp values
@@ -173,24 +112,6 @@
movntdq [edi+16], xmm0 ; output second 4 pixels bypassing cache
%endmacro
-; outputer
-%macro rgba32sseoutput 0
-; clamp values
-pxor mm7,mm7
-packuswb mm3,mm7 ; clamp to 0,255 and pack R to 8 bit per pixel
-packuswb mm4,mm7 ; clamp to 0,255 and pack G to 8 bit per pixel
-packuswb mm5,mm7 ; clamp to 0,255 and pack B to 8 bit per pixel
-; convert to bgra32 packed
-punpcklbw mm5,mm4 ; bgbgbgbgbgbgbgbg
-movq mm0, mm5 ; save bg values
-punpcklbw mm3,mm7 ; r0r0r0r0
-punpcklwd mm5,mm3 ; lower half bgr0bgr0
-punpckhwd mm0,mm3 ; upper half bgr0bgr0
-; write to output ptr
-movq [edi], mm5 ; output first 2 pixels
-movq [edi+8], mm0 ; output second 2 pixels
-%endmacro
SECTION .data align=16
Const16 dw 16
@@ -342,120 +263,4 @@ ENDLOOP1:
pop ebp
ret
-cglobal Convert_YUV422_RGBA32_SSE
-; reserve variables
-push ebp
-mov ebp, esp
-push edi
-push esi
-push ecx
-mov esi, [fromPtr]
-mov ecx, [width]
-mov edi, [toPtr]
-; loop width / 4 times
-shr ecx,2
-test ecx,ecx
-jng ENDLOOP2
-REPEATLOOP2: ; loop over width / 4
-; YUV422 packed inputer
-movq mm0, [esi] ; should have yuyv yuyv
-pshufw mm1, mm0, 0xE4 ; copy to mm1
-movq mm2, mm0 ; copy to mm2
-; extract y
-pxor mm7,mm7 ; 0000000000000000
-pcmpeqb mm6,mm6 ; ffffffffffffffff
-punpckhbw mm6,mm7 ; interleave mm7 into mm6 ff00ff00ff00ff00
-pand mm0, mm6 ; clear all but y values leaving y0y0 etc
-; extract u and duplicate so each u in yuyv becomes 0u0u
-psrld mm6,8 ; 00ff0000 00ff0000
-pand mm1, mm6 ; clear all yv values leaving 0u00 etc
-psrld mm1,8 ; rotate u to get u000
-pshufw mm1,mm1, 0xA0 ; copy u values to get u0u0 (SSE not MMX)
-; extract v
-pslld mm6,16 ; 000000ff000000ff
-pand mm2, mm6 ; clear all yu values leaving 000v etc
-psrld mm2,8 ; rotate v to get 00v0
-pshufw mm2,mm2, 0xF5 ; copy v values to get v0v0 (SSE not MMX)
-yuv2rgbsse
-rgba32sseoutput
-; endloop
-add edi,16
-add esi,8
-sub ecx, 1 ; apparently sub is better than dec
-jnz REPEATLOOP2
-ENDLOOP2:
-; Cleanup
-emms ; reset mmx regs back to float
-pop ecx
-pop esi
-pop edi
-mov esp, ebp
-pop ebp
-ret
-cglobal Convert_YUV420P_RGBA32_SSE
-; reserve variables
-push ebp
-mov ebp, esp
-push edi
-push esi
-push ecx
-push eax
-push ebx
-mov esi, [fromYPtr]
-mov eax, [fromUPtr]
-mov ebx, [fromVPtr]
-mov edi, [toPtr1]
-mov ecx, [width1]
-; loop width / 4 times
-shr ecx,2
-test ecx,ecx
-jng ENDLOOP3
-REPEATLOOP3: ; loop over width / 4
-; YUV420 Planar inputer
-movq mm0, [esi] ; fetch 4 y values (8 bit) yyyy0000
-movd mm1, [eax] ; fetch 2 u values (8 bit) uu000000
-movd mm2, [ebx] ; fetch 2 v values (8 bit) vv000000
-; extract y
-pxor mm7,mm7 ; 0000000000000000
-punpcklbw mm0,mm7 ; interleave xmm7 into xmm0 y0y0y0y
-; extract u and duplicate so each becomes 0u0u
-punpcklbw mm1,mm7 ; interleave xmm7 into xmm1 u0u00000
-punpcklwd mm1,mm7 ; interleave again u000u000
-pshufw mm1,mm1, 0xA0 ; copy u values to get u0u0
-; extract v
-punpcklbw mm2,mm7 ; interleave xmm7 into xmm1 v0v00000
-punpcklwd mm2,mm7 ; interleave again v000v000
-pshufw mm2,mm2, 0xA0 ; copy v values to get v0v0
-yuv2rgbsse
-rgba32sseoutput
-; endloop
-add edi,16
-add esi,4
-add eax,2
-add ebx,2
-sub ecx, 1 ; apparently sub is better than dec
-jnz REPEATLOOP3
-ENDLOOP3:
-; Cleanup
-emms
-pop ebx
-pop eax
-pop ecx
-pop esi
-pop edi
-mov esp, ebp
-pop ebp
-ret
SECTION .note.GNU-stack noalloc noexec nowrite progbits

yuvrgb_ssse3.nasm

@@ -0,0 +1,307 @@
;
; Copyright (C) 2009-2010 David McPaul
;
; All rights reserved. Distributed under the terms of the MIT License.
;
; A rather unoptimised set of ssse3 yuv to rgb converters
; does 8 pixels per loop
; inputer:
; reads 128 bits of yuv 8 bit data and puts
; the y values converted to 16 bit in xmm0
; the u values converted to 16 bit and duplicated into xmm1
; the v values converted to 16 bit and duplicated into xmm2
; conversion:
; does the yuv to rgb conversion using 16 bit fixed point and the
; results are placed into the following registers as 8 bit clamped values
; r values in xmm3
; g values in xmm4
; b values in xmm5
; outputer:
; writes out the rgba pixels as 8 bit values with 0 for alpha
; xmm6 used for scratch
; xmm7 used for scratch
%macro cglobal 1
global _%1
%define %1 _%1
align 16
%1:
%endmacro
; conversion code
%macro yuv2rgbsse2 0
; u = u - 128
; v = v - 128
; r = y + v + v >> 2 + v >> 3 + v >> 5
; g = y - (u >> 2 + u >> 4 + u >> 5) - (v >> 1 + v >> 3 + v >> 4 + v >> 5)
; b = y + u + u >> 1 + u >> 2 + u >> 6
; subtract 16 from y
movdqa xmm7, [Const16] ; loads a constant using data cache (slower on first fetch but then cached)
psubsw xmm0,xmm7 ; y = y - 16
; subtract 128 from u and v
movdqa xmm7, [Const128] ; loads a constant using data cache (slower on first fetch but then cached)
psubsw xmm1,xmm7 ; u = u - 128
psubsw xmm2,xmm7 ; v = v - 128
; load r,b with y
movdqa xmm3,xmm0 ; r = y
pshufd xmm5,xmm0, 0xE4 ; b = y
; r = y + v + v >> 2 + v >> 3 + v >> 5
paddsw xmm3, xmm2 ; add v to r
movdqa xmm7, xmm1 ; move u to scratch
pshufd xmm6, xmm2, 0xE4 ; move v to scratch
psraw xmm6,2 ; divide v by 4
paddsw xmm3, xmm6 ; and add to r
psraw xmm6,1 ; divide v by 2
paddsw xmm3, xmm6 ; and add to r
psraw xmm6,2 ; divide v by 4
paddsw xmm3, xmm6 ; and add to r
; b = y + u + u >> 1 + u >> 2 + u >> 6
paddsw xmm5, xmm1 ; add u to b
psraw xmm7,1 ; divide u by 2
paddsw xmm5, xmm7 ; and add to b
psraw xmm7,1 ; divide u by 2
paddsw xmm5, xmm7 ; and add to b
psraw xmm7,4 ; divide u by 16 (total now u >> 6)
paddsw xmm5, xmm7 ; and add to b
; g = y - u >> 2 - u >> 4 - u >> 5 - v >> 1 - v >> 3 - v >> 4 - v >> 5
movdqa xmm7,xmm2 ; move v to scratch
pshufd xmm6,xmm1, 0xE4 ; move u to scratch
movdqa xmm4,xmm0 ; g = y
psraw xmm6,2 ; divide u by 4
psubsw xmm4,xmm6 ; subtract from g
psraw xmm6,2 ; divide u by 4
psubsw xmm4,xmm6 ; subtract from g
psraw xmm6,1 ; divide u by 2
psubsw xmm4,xmm6 ; subtract from g
psraw xmm7,1 ; divide v by 2
psubsw xmm4,xmm7 ; subtract from g
psraw xmm7,2 ; divide v by 4
psubsw xmm4,xmm7 ; subtract from g
psraw xmm7,1 ; divide v by 2
psubsw xmm4,xmm7 ; subtract from g
psraw xmm7,1 ; divide v by 2
psubsw xmm4,xmm7 ; subtract from g
%endmacro
; outputer
%macro rgba32sse2output 0
; clamp values
pxor xmm7,xmm7
packuswb xmm3,xmm7 ; clamp to 0,255 and pack R to 8 bit per pixel
packuswb xmm4,xmm7 ; clamp to 0,255 and pack G to 8 bit per pixel
packuswb xmm5,xmm7 ; clamp to 0,255 and pack B to 8 bit per pixel
; convert to bgra32 packed
punpcklbw xmm5,xmm4 ; bgbgbgbgbgbgbgbg
movdqa xmm0, xmm5 ; save bg values
punpcklbw xmm3,xmm7 ; r0r0r0r0r0r0r0r0
punpcklwd xmm5,xmm3 ; lower half bgr0bgr0bgr0bgr0
punpckhwd xmm0,xmm3 ; upper half bgr0bgr0bgr0bgr0
; write to output ptr
movntdq [edi], xmm5 ; output first 4 pixels bypassing cache
movntdq [edi+16], xmm0 ; output second 4 pixels bypassing cache
%endmacro
SECTION .data align=16
Const16 dw 16
dw 16
dw 16
dw 16
dw 16
dw 16
dw 16
dw 16
Const128 dw 128
dw 128
dw 128
dw 128
dw 128
dw 128
dw 128
dw 128
UMask db 0x01
db 0x80
db 0x01
db 0x80
db 0x05
db 0x80
db 0x05
db 0x80
db 0x09
db 0x80
db 0x09
db 0x80
db 0x0d
db 0x80
db 0x0d
db 0x80
VMask db 0x03
db 0x80
db 0x03
db 0x80
db 0x07
db 0x80
db 0x07
db 0x80
db 0x0b
db 0x80
db 0x0b
db 0x80
db 0x0f
db 0x80
db 0x0f
db 0x80
YMask db 0x00
db 0x80
db 0x02
db 0x80
db 0x04
db 0x80
db 0x06
db 0x80
db 0x08
db 0x80
db 0x0a
db 0x80
db 0x0c
db 0x80
db 0x0e
db 0x80
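; Note (not part of the commit): for pshufb, each mask byte selects a source
; byte by its index, and a mask byte with the high bit set (0x80) zeroes that
; destination byte, so a single pshufb both extracts the samples and widens
; them into 16 bit lanes.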
; void Convert_YUV422_RGBA32_SSSE3(void *fromPtr, void *toPtr, int width)
width equ ebp+16
toPtr equ ebp+12
fromPtr equ ebp+8
; void Convert_YUV420P_RGBA32_SSSE3(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
width1 equ ebp+24
toPtr1 equ ebp+20
fromVPtr equ ebp+16
fromUPtr equ ebp+12
fromYPtr equ ebp+8
SECTION .text align=16
cglobal Convert_YUV422_RGBA32_SSSE3
; reserve variables
push ebp
mov ebp, esp
push edi
push esi
push ecx
mov esi, [fromPtr]
mov edi, [toPtr]
mov ecx, [width]
; loop width / 8 times
shr ecx,3
test ecx,ecx
jng ENDLOOP
REPEATLOOP: ; loop over width / 8
; YUV422 packed inputer
movdqa xmm0, [esi] ; should have yuyv yuyv yuyv yuyv
pshufd xmm1, xmm0, 0xE4 ; copy to xmm1
movdqa xmm2, xmm0 ; copy to xmm2
; extract both y values of each pair giving y0y0
pshufb xmm0, [YMask]
; extract u and duplicate so each u in yuyv becomes u0u0
pshufb xmm1, [UMask]
; extract v and duplicate so each v in yuyv becomes v0v0
pshufb xmm2, [VMask]
yuv2rgbsse2
rgba32sse2output
; endloop
add edi,32
add esi,16
sub ecx, 1 ; apparently sub is better than dec
jnz REPEATLOOP
ENDLOOP:
; Cleanup
pop ecx
pop esi
pop edi
mov esp, ebp
pop ebp
ret
cglobal Convert_YUV420P_RGBA32_SSSE3
; reserve variables
push ebp
mov ebp, esp
push edi
push esi
push ecx
push eax
push ebx
mov esi, [fromYPtr]
mov eax, [fromUPtr]
mov ebx, [fromVPtr]
mov edi, [toPtr1]
mov ecx, [width1]
; loop width / 8 times
shr ecx,3
test ecx,ecx
jng ENDLOOP1
REPEATLOOP1: ; loop over width / 8
; YUV420 Planar inputer
movq xmm0, [esi] ; fetch 8 y values (8 bit) yyyyyyyy00000000
movd xmm1, [eax] ; fetch 4 u values (8 bit) uuuu000000000000
movd xmm2, [ebx] ; fetch 4 v values (8 bit) vvvv000000000000
; extract y
pxor xmm7,xmm7 ; 00000000000000000000000000000000
punpcklbw xmm0,xmm7 ; interleave xmm7 into xmm0 y0y0y0y0y0y0y0y0
; extract u and duplicate so each becomes 0u0u
punpcklbw xmm1,xmm7 ; interleave xmm7 into xmm1 u0u0u0u000000000
punpcklwd xmm1,xmm7 ; interleave again u000u000u000u000
pshuflw xmm1,xmm1, 0xA0 ; copy u values
pshufhw xmm1,xmm1, 0xA0 ; to get u0u0
; extract v
punpcklbw xmm2,xmm7 ; interleave xmm7 into xmm2 v0v0v0v000000000
punpcklwd xmm2,xmm7 ; interleave again v000v000v000v000
pshuflw xmm2,xmm2, 0xA0 ; copy v values
pshufhw xmm2,xmm2, 0xA0 ; to get v0v0
yuv2rgbsse2
rgba32sse2output
; endloop
add edi,32
add esi,8
add eax,4
add ebx,4
sub ecx, 1 ; apparently sub is better than dec
jnz REPEATLOOP1
ENDLOOP1:
; Cleanup
pop ebx
pop eax
pop ecx
pop esi
pop edi
mov esp, ebp
pop ebp
ret
SECTION .note.GNU-stack noalloc noexec nowrite progbits
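As a closing illustration, the pshufb behaviour that the UMask/VMask/YMask tables rely on can be emulated in a few lines of C++. This sketch is mine, not the commit's; PShufB is an invented name.

#include <cstdint>

// Emulates pshufb: a mask byte with the high bit set clears the destination
// byte, otherwise its low four bits index into the source.
void PShufB(const uint8_t src[16], const uint8_t mask[16], uint8_t dst[16])
{
	for (int i = 0; i < 16; i++)
		dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0f];
}
// With UMask = 01 80 01 80 05 80 ..., the u byte of each yuyv quad lands in
// the low byte of a 16 bit lane with the high byte cleared, giving u0u0.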