Use multiplication instead of shifts and adds. Include SSSE3 routine

git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@40768 a95241bf-73f2-0310-859d-f6bbb57e9c96
David McPaul 2011-03-01 21:32:01 +00:00
parent 146d274d81
commit 0df942b65c
4 changed files with 345 additions and 219 deletions
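
This commit replaces the shift-and-add approximations in the YUV to RGB conversion macros with pmaddwd multiplies against 4.12 fixed-point constants (RConst 5743/4096 ~ 1.402, GConst -1409/4096 ~ -0.344 and -2925/4096 ~ -0.714, BConst 7258/4096 ~ 1.772, i.e. the usual BT.601-style coefficients) and adds SSSE3 variants of the converters. As a hedged reference only (not part of the commit; names are illustrative), the scalar math the new SIMD code implements looks like the sketch below; the y - 16 bias is omitted because the commit comments that step out of the assembly.

#include <cstdint>
#include <algorithm>

static inline uint8_t
clamp255(int32_t v)
{
	return (uint8_t)std::min(std::max(v, 0), 255);
}

static void
yuv_to_bgra_scalar(int32_t y, int32_t u, int32_t v, uint8_t* bgra)
{
	u -= 128;
	v -= 128;
	// >> 12 undoes the 4.12 scaling; psrad in the asm is the same arithmetic shift
	int32_t r = y + ((    0 * u + 5743 * v) >> 12);
	int32_t g = y + ((-1409 * u - 2925 * v) >> 12);
	int32_t b = y + (( 7258 * u +    0 * v) >> 12);
	bgra[0] = clamp255(b);
	bgra[1] = clamp255(g);
	bgra[2] = clamp255(r);
	bgra[3] = 0;	// the asm packing also leaves the alpha byte 0
}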

@@ -42,9 +42,11 @@ resolve_colorspace(color_space colorSpace, PixelFormat pixelFormat, int width,
if (pixelFormat == PIX_FMT_YUV420P
|| pixelFormat == PIX_FMT_YUVJ420P) {
if (cpu.HasSSE2() && width % 8 == 0 && height % 2 == 0) {
TRACE("resolve_colorspace: "
"gfx_conv_yuv420p_rgba32_sse2\n");
if (cpu.HasSSSE3() && width % 8 == 0 && height % 2 == 0) {
TRACE("resolve_colorspace: gfx_conv_yuv420p_rgba32_ssse3\n");
return gfx_conv_yuv420p_rgba32_ssse3;
} else if (cpu.HasSSE2() && width % 8 == 0 && height % 2 == 0) {
TRACE("resolve_colorspace: gfx_conv_yuv420p_rgba32_sse2\n");
return gfx_conv_yuv420p_rgba32_sse2;
} else if (cpu.HasSSE1() && width % 4 == 0
&& height % 2 == 0) {
@@ -58,7 +60,10 @@ resolve_colorspace(color_space colorSpace, PixelFormat pixelFormat, int width,
if (pixelFormat == PIX_FMT_YUV422P
|| pixelFormat == PIX_FMT_YUVJ422P) {
if (cpu.HasSSE2() && width % 8 == 0) {
if (cpu.HasSSSE3() && width % 8 == 0) {
TRACE("resolve_colorspace: gfx_conv_yuv422p_RGB32_ssse3\n");
return gfx_conv_yuv422p_rgba32_ssse3;
} else if (cpu.HasSSE2() && width % 8 == 0) {
TRACE("resolve_colorspace: gfx_conv_yuv422p_RGB32_sse2\n");
return gfx_conv_yuv422p_rgba32_sse2;
} else if (cpu.HasSSE1() && width % 4 == 0) {

@@ -42,7 +42,7 @@
; b = y + u + u >> 1 + u >> 2 + u >> 6
; subtract 16 from y
movq mm7, [Const16] ; loads a constant using data cache (slower on first fetch but then cached)
psubsw mm0,mm7 ; y = y - 16
; psubsw mm0,mm7 ; y = y - 16
; subtract 128 from u and v
movq mm7, [Const128] ; loads a constant using data cache (slower on first fetch but then cached)
psubsw mm1,mm7 ; u = u - 128

@@ -33,85 +33,6 @@
%1:
%endmacro
; conversion code
%macro yuv2rgbsse2 0
; u = u - 128
; v = v - 128
; r = y + v + v >> 2 + v >> 3 + v >> 5
; g = y - (u >> 2 + u >> 4 + u >> 5) - (v >> 1 + v >> 3 + v >> 4 + v >> 5)
; b = y + u + u >> 1 + u >> 2 + u >> 6
; subtract 16 from y
movdqa xmm7, [Const16] ; loads a constant using data cache (slower on first fetch but then cached)
psubsw xmm0,xmm7 ; y = y - 16
; subtract 128 from u and v
movdqa xmm7, [Const128] ; loads a constant using data cache (slower on first fetch but then cached)
psubsw xmm1,xmm7 ; u = u - 128
psubsw xmm2,xmm7 ; v = v - 128
; load r,b with y
movdqa xmm3,xmm0 ; r = y
pshufd xmm5,xmm0, 0xE4 ; b = y
; r = y + v + v >> 2 + v >> 3 + v >> 5
paddsw xmm3, xmm2 ; add v to r
movdqa xmm7, xmm1 ; move u to scratch
pshufd xmm6, xmm2, 0xE4 ; move v to scratch
psraw xmm6,2 ; divide v by 4
paddsw xmm3, xmm6 ; and add to r
psraw xmm6,1 ; divide v by 2
paddsw xmm3, xmm6 ; and add to r
psraw xmm6,2 ; divide v by 4
paddsw xmm3, xmm6 ; and add to r
; b = y + u + u >> 1 + u >> 2 + u >> 6
paddsw xmm5, xmm1 ; add u to b
psraw xmm7,1 ; divide u by 2
paddsw xmm5, xmm7 ; and add to b
psraw xmm7,1 ; divide u by 2
paddsw xmm5, xmm7 ; and add to b
psraw xmm7,4 ; divide u by 32
paddsw xmm5, xmm7 ; and add to b
; g = y - u >> 2 - u >> 4 - u >> 5 - v >> 1 - v >> 3 - v >> 4 - v >> 5
movdqa xmm7,xmm2 ; move v to scratch
pshufd xmm6,xmm1, 0xE4 ; move u to scratch
movdqa xmm4,xmm0 ; g = y
psraw xmm6,2 ; divide u by 4
psubsw xmm4,xmm6 ; subtract from g
psraw xmm6,2 ; divide u by 4
psubsw xmm4,xmm6 ; subtract from g
psraw xmm6,1 ; divide u by 2
psubsw xmm4,xmm6 ; subtract from g
psraw xmm7,1 ; divide v by 2
psubsw xmm4,xmm7 ; subtract from g
psraw xmm7,2 ; divide v by 4
psubsw xmm4,xmm7 ; subtract from g
psraw xmm7,1 ; divide v by 2
psubsw xmm4,xmm7 ; subtract from g
psraw xmm7,1 ; divide v by 2
psubsw xmm4,xmm7 ; subtract from g
%endmacro
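For reference, the shift chains in the removed macro above approximate the conversion coefficients as:
r ~ y + v * (1 + 1/4 + 1/8 + 1/32) = y + 1.40625 * v
g ~ y - u * (1/4 + 1/16 + 1/32) - v * (1/2 + 1/8 + 1/16 + 1/32) = y - 0.34375 * u - 0.71875 * v
b ~ y + u * (1 + 1/2 + 1/4 + 1/64) = y + 1.765625 * u
The replacement macro further down computes each channel with a single pmaddwd against a 4.12 fixed-point constant pair instead of these running shifts.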
; outputer
%macro rgba32sse2output 0
; clamp values
pxor xmm7,xmm7
packuswb xmm3,xmm7 ; clamp to 0,255 and pack R to 8 bit per pixel
packuswb xmm4,xmm7 ; clamp to 0,255 and pack G to 8 bit per pixel
packuswb xmm5,xmm7 ; clamp to 0,255 and pack B to 8 bit per pixel
; convert to bgra32 packed
punpcklbw xmm5,xmm4 ; bgbgbgbgbgbgbgbg
movdqa xmm0, xmm5 ; save bg values
punpcklbw xmm3,xmm7 ; r0r0r0r0r0r0r0r0
punpcklwd xmm5,xmm3 ; lower half bgr0bgr0bgr0bgr0
punpckhwd xmm0,xmm3 ; upper half bgr0bgr0bgr0bgr0
; write to output ptr
movntdq [edi], xmm5 ; output first 4 pixels bypassing cache
movntdq [edi+16], xmm0 ; output second 4 pixels bypassing cache
%endmacro
SECTION .data align=16
Const16 dw 16
@@ -131,6 +52,137 @@ Const128 dw 128
dw 128
dw 128
dw 128
RConst dw 0
dw 5743
dw 0
dw 5743
dw 0
dw 5743
dw 0
dw 5743
GConst dw -1409
dw -2925
dw -1409
dw -2925
dw -1409
dw -2925
dw -1409
dw -2925
BConst dw 7258
dw 0
dw 7258
dw 0
dw 7258
dw 0
dw 7258
dw 0
shuffconst db 0x0
db 0x01
db 0x00
db 0x01
db 0x04
db 0x05
db 0x04
db 0x05
db 0x08
db 0x09
db 0x08
db 0x09
db 0x0c
db 0x0d
db 0x0c
db 0x0d
YMask db 0x00
db 0x80
db 0x02
db 0x80
db 0x04
db 0x80
db 0x06
db 0x80
db 0x08
db 0x80
db 0x0a
db 0x80
db 0x0c
db 0x80
db 0x0e
db 0x80
UVMask db 0x01
db 0x80
db 0x03
db 0x80
db 0x05
db 0x80
db 0x07
db 0x80
db 0x09
db 0x80
db 0x0b
db 0x80
db 0x0d
db 0x80
db 0x0f
db 0x80
; conversion code
%macro yuv2rgbsse2 0
; u = u - 128
; v = v - 128
; r = y + 0 * u + 1.402 * v
; g = y + -0.344 * u + -0.714 * v
; b = y + 1.772 * u + 0 * v
; subtract 16 from y
; psubsw xmm0, [Const16] ; y = y - 16
; subtract 128 from u and v
psubsw xmm3, [Const128] ; u = u - 128, v = v -128
movdqa xmm4, xmm3 ; duplicate
pshufd xmm5, xmm3, 0xE4 ; duplicate
pmaddwd xmm3, [RConst] ; multiply and add
pmaddwd xmm4, [GConst] ; to get RGB offsets to Y
pmaddwd xmm5, [BConst] ;
psrad xmm3, 12 ; Scale back to original range
psrad xmm4, 12 ;
psrad xmm5, 12 ;
pshuflw xmm3, xmm3, 0xa0 ; duplicate results
pshufhw xmm3, xmm3, 0xa0
pshuflw xmm4, xmm4, 0xa0
pshufhw xmm4, xmm4, 0xa0
pshuflw xmm5, xmm5, 0xa0
pshufhw xmm5, xmm5, 0xa0
paddsw xmm3, xmm0 ; add to y
paddsw xmm4, xmm0 ;
paddsw xmm5, xmm0 ;
%endmacro
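A hedged intrinsics sketch (not part of the commit; names are illustrative) of the per-channel step this macro performs: uv8 holds the eight u,v words with 128 already subtracted, coeff is one of RConst/GConst/BConst, and y8 holds the eight luma words.

#include <emmintrin.h>

static inline __m128i
channel_from_yuv(__m128i y8, __m128i uv8, __m128i coeff)
{
	__m128i dot = _mm_madd_epi16(uv8, coeff);	// cu * u + cv * v, four 32-bit results
	dot = _mm_srai_epi32(dot, 12);			// scale the 4.12 fixed-point products back
	dot = _mm_shufflelo_epi16(dot, 0xA0);		// copy each result into both 16-bit lanes
	dot = _mm_shufflehi_epi16(dot, 0xA0);		// of the pixel pair sharing that chroma sample
	return _mm_adds_epi16(dot, y8);			// add luma with saturation
}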
; outputer
%macro rgba32sse2output 0
; clamp values
pxor xmm7,xmm7
packuswb xmm3,xmm7 ; clamp to 0,255 and pack R to 8 bit per pixel
packuswb xmm4,xmm7 ; clamp to 0,255 and pack G to 8 bit per pixel
packuswb xmm5,xmm7 ; clamp to 0,255 and pack B to 8 bit per pixel
; convert to bgra32 packed
punpcklbw xmm5,xmm4 ; bgbgbgbgbgbgbgbg
movdqa xmm0, xmm5 ; save bg values
punpcklbw xmm3,xmm7 ; r0r0r0r0r0r0r0r0
punpcklwd xmm5,xmm3 ; lower half bgr0bgr0bgr0bgr0
punpckhwd xmm0,xmm3 ; upper half bgr0bgr0bgr0bgr0
; write to output ptr
movntdq [edi], xmm5 ; output first 4 pixels bypassing cache
movntdq [edi+16], xmm0 ; output second 4 pixels bypassing cache
%endmacro
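Likewise, a hedged sketch of rgba32sse2output (illustrative only): packuswb clamps the 16-bit channels to bytes, the unpacks interleave them into B,G,R,0 order, and the streaming stores bypass the cache, so dst must be 16-byte aligned just as it must be for movntdq.

#include <cstdint>
#include <emmintrin.h>

static inline void
store_bgra32(__m128i r16, __m128i g16, __m128i b16, uint8_t* dst)
{
	const __m128i zero = _mm_setzero_si128();
	__m128i r = _mm_packus_epi16(r16, zero);	// clamp to 0..255, 8 bytes per channel
	__m128i g = _mm_packus_epi16(g16, zero);
	__m128i b = _mm_packus_epi16(b16, zero);
	__m128i bg = _mm_unpacklo_epi8(b, g);		// b g b g ...
	__m128i r0 = _mm_unpacklo_epi8(r, zero);	// r 0 r 0 ...
	_mm_stream_si128((__m128i*)dst, _mm_unpacklo_epi16(bg, r0));		// pixels 0..3
	_mm_stream_si128((__m128i*)(dst + 16), _mm_unpackhi_epi16(bg, r0));	// pixels 4..7
}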
; void Convert_YUV422_RGBA32_SSE2(void *fromPtr, void *toPtr, int width)
width equ ebp+16
@@ -162,28 +214,20 @@ cglobal Convert_YUV422_RGBA32_SSE2
test ecx,ecx
jng ENDLOOP
REPEATLOOP: ; loop over width / 8
prefetchnta [esi+256]
; YUV422 packed inputer
movdqa xmm0, [esi] ; should have yuyv yuyv yuyv yuyv
pshufd xmm1, xmm0, 0xE4 ; copy to xmm1
movdqa xmm2, xmm0 ; copy to xmm2
pshufd xmm3, xmm0, 0xE4 ; copy to xmm3
; extract y
pxor xmm7,xmm7 ; 00000000000000000000000000000000
pcmpeqd xmm6,xmm6 ; ffffffffffffffffffffffffffffffff
punpcklbw xmm6,xmm7 ; interleave xmm7 into xmm6 ff00ff00ff00ff00ff00ff00ff00ff00
pxor xmm7, xmm7 ; 00000000000000000000000000000000
pcmpeqd xmm6, xmm6 ; ffffffffffffffffffffffffffffffff
punpcklbw xmm6, xmm7 ; interleave xmm7 into xmm6 ff00ff00ff00ff00ff00ff00ff00ff00
pand xmm0, xmm6 ; clear all but y values leaving y0y0 etc
; extract u and duplicate so each u in yuyv becomes 0u0u
psrld xmm6,8 ; 00ff0000 00ff0000 00ff0000 00ff0000
pand xmm1, xmm6 ; clear all yv values leaving 0u00 etc
psrld xmm1,8 ; rotate u to get u000
pshuflw xmm1,xmm1, 0xA0 ; copy u values
pshufhw xmm1,xmm1, 0xA0 ; to get u0u0
; extract v
pslld xmm6,16 ; 000000ff000000ff 000000ff000000ff
pand xmm2, xmm6 ; clear all yu values leaving 000v etc
psrld xmm2,8 ; rotate v to get 00v0
pshuflw xmm2,xmm2, 0xF5 ; copy v values
pshufhw xmm2,xmm2, 0xF5 ; to get v0v0
; extract u and v
psllw xmm6, 8 ; 00ff00ff00ff00ff00ff00ff00ff00ff
pand xmm3, xmm6 ; extract uv values 0u0v0u0v0u0v0u0v
psrlw xmm3, 8 ; convert to 16bit u0v0u0v0u0v0u0v0
yuv2rgbsse2
rgba32sse2output
@@ -224,22 +268,16 @@ cglobal Convert_YUV420P_RGBA32_SSE2
REPEATLOOP1: ; loop over width / 8
; YUV420 Planar inputer
movq xmm0, [esi] ; fetch 8 y values (8 bit) yyyyyyyy00000000
movd xmm1, [eax] ; fetch 4 u values (8 bit) uuuu000000000000
movd xmm2, [ebx] ; fetch 4 v values (8 bit) vvvv000000000000
movd xmm3, [eax] ; fetch 4 u values (8 bit) uuuu000000000000
movd xmm1, [ebx] ; fetch 4 v values (8 bit) vvvv000000000000
; extract y
pxor xmm7,xmm7 ; 00000000000000000000000000000000
punpcklbw xmm0,xmm7 ; interleave xmm7 into xmm0 y0y0y0y0y0y0y0y0
; extract u and duplicate so each becomes 0u0u
punpcklbw xmm1,xmm7 ; interleave xmm7 into xmm1 u0u0u0u000000000
punpcklwd xmm1,xmm7 ; interleave again u000u000u000u000
pshuflw xmm1,xmm1, 0xA0 ; copy u values
pshufhw xmm1,xmm1, 0xA0 ; to get u0u0
; extract v
punpcklbw xmm2,xmm7 ; interleave xmm7 into xmm1 v0v0v0v000000000
punpcklwd xmm2,xmm7 ; interleave again v000v000v000v000
pshuflw xmm2,xmm2, 0xA0 ; copy v values
pshufhw xmm2,xmm2, 0xA0 ; to get v0v0
pxor xmm7, xmm7 ; 00000000000000000000000000000000
punpcklbw xmm0, xmm7 ; interleave xmm7 into xmm0 y0y0y0y0y0y0y0y0
; combine u and v
punpcklbw xmm3, xmm1 ; uvuvuvuv00000000
punpcklbw xmm3, xmm7 ; u0v0u0v0u0v0u0v0
yuv2rgbsse2
@@ -263,4 +301,58 @@ ENDLOOP1:
pop ebp
ret
cglobal Test_SSE2
; reserve variables
push ebp
mov ebp, esp
push edi
push esi
push ecx
push eax
push ebx
mov esi, [fromPtr]
mov edi, [toPtr]
movdqa xmm0, [esi] ; should have yuyv yuyv yuyv yuyv
pshufd xmm1, xmm0, 0xE4 ; copy to xmm1
movdqa xmm3, xmm0 ; copy to xmm3
; extract y
pxor xmm7,xmm7 ; 00000000000000000000000000000000
pcmpeqd xmm6,xmm6 ; ffffffffffffffffffffffffffffffff
punpcklbw xmm6,xmm7 ; interleave xmm7 into xmm6 ff00ff00ff00ff00ff00ff00ff00ff00
pand xmm0, xmm6 ; clear all but y values leaving y0y0 etc
; extract u and duplicate so each u in yuyv becomes 0u0u
psrld xmm6,8 ; 00ff0000 00ff0000 00ff0000 00ff0000
pand xmm1, xmm6 ; clear all yv values leaving 0u00 etc
psrld xmm1,8 ; rotate u to get u000
; extract v
pslld xmm6,16 ; 000000ff000000ff 000000ff000000ff
pand xmm3, xmm6 ; clear all yu values leaving 000v etc
psrld xmm3,8 ; rotate v to get 00v0
por xmm3, xmm1
psubsw xmm3, [Const128] ; u = u - 128, v = v -128
pmaddwd xmm3, [RConst] ; multiply and add
psrad xmm3, 12 ; Scale back to original range
pshufb xmm3, [shuffconst] ; duplicate results
; paddsw xmm3, xmm0 ; add to y
; pxor xmm7,xmm7
; packuswb xmm3,xmm7 ; clamp to 0,255 and pack R to 8 bit per pixel
movntdq [edi], xmm3 ; output first 4 pixels bypassing cache
; Cleanup
pop ebx
pop eax
pop ecx
pop esi
pop edi
mov esp, ebp
pop ebp
ret
SECTION .note.GNU-stack noalloc noexec nowrite progbits

@@ -33,85 +33,6 @@
%1:
%endmacro
; conversion code
%macro yuv2rgbsse2 0
; u = u - 128
; v = v - 128
; r = y + v + v >> 2 + v >> 3 + v >> 5
; g = y - (u >> 2 + u >> 4 + u >> 5) - (v >> 1 + v >> 3 + v >> 4 + v >> 5)
; b = y + u + u >> 1 + u >> 2 + u >> 6
; subtract 16 from y
movdqa xmm7, [Const16] ; loads a constant using data cache (slower on first fetch but then cached)
psubsw xmm0,xmm7 ; y = y - 16
; subtract 128 from u and v
movdqa xmm7, [Const128] ; loads a constant using data cache (slower on first fetch but then cached)
psubsw xmm1,xmm7 ; u = u - 128
psubsw xmm2,xmm7 ; v = v - 128
; load r,b with y
movdqa xmm3,xmm0 ; r = y
pshufd xmm5,xmm0, 0xE4 ; b = y
; r = y + v + v >> 2 + v >> 3 + v >> 5
paddsw xmm3, xmm2 ; add v to r
movdqa xmm7, xmm1 ; move u to scratch
pshufd xmm6, xmm2, 0xE4 ; move v to scratch
psraw xmm6,2 ; divide v by 4
paddsw xmm3, xmm6 ; and add to r
psraw xmm6,1 ; divide v by 2
paddsw xmm3, xmm6 ; and add to r
psraw xmm6,2 ; divide v by 4
paddsw xmm3, xmm6 ; and add to r
; b = y + u + u >> 1 + u >> 2 + u >> 6
paddsw xmm5, xmm1 ; add u to b
psraw xmm7,1 ; divide u by 2
paddsw xmm5, xmm7 ; and add to b
psraw xmm7,1 ; divide u by 2
paddsw xmm5, xmm7 ; and add to b
psraw xmm7,4 ; divide u by 32
paddsw xmm5, xmm7 ; and add to b
; g = y - u >> 2 - u >> 4 - u >> 5 - v >> 1 - v >> 3 - v >> 4 - v >> 5
movdqa xmm7,xmm2 ; move v to scratch
pshufd xmm6,xmm1, 0xE4 ; move u to scratch
movdqa xmm4,xmm0 ; g = y
psraw xmm6,2 ; divide u by 4
psubsw xmm4,xmm6 ; subtract from g
psraw xmm6,2 ; divide u by 4
psubsw xmm4,xmm6 ; subtract from g
psraw xmm6,1 ; divide u by 2
psubsw xmm4,xmm6 ; subtract from g
psraw xmm7,1 ; divide v by 2
psubsw xmm4,xmm7 ; subtract from g
psraw xmm7,2 ; divide v by 4
psubsw xmm4,xmm7 ; subtract from g
psraw xmm7,1 ; divide v by 2
psubsw xmm4,xmm7 ; subtract from g
psraw xmm7,1 ; divide v by 2
psubsw xmm4,xmm7 ; subtract from g
%endmacro
; outputer
%macro rgba32sse2output 0
; clamp values
pxor xmm7,xmm7
packuswb xmm3,xmm7 ; clamp to 0,255 and pack R to 8 bit per pixel
packuswb xmm4,xmm7 ; clamp to 0,255 and pack G to 8 bit per pixel
packuswb xmm5,xmm7 ; clamp to 0,255 and pack B to 8 bit per pixel
; convert to bgra32 packed
punpcklbw xmm5,xmm4 ; bgbgbgbgbgbgbgbg
movdqa xmm0, xmm5 ; save bg values
punpcklbw xmm3,xmm7 ; r0r0r0r0r0r0r0r0
punpcklwd xmm5,xmm3 ; lower half bgr0bgr0bgr0bgr0
punpckhwd xmm0,xmm3 ; upper half bgr0bgr0bgr0bgr0
; write to output ptr
movntdq [edi], xmm5 ; output first 4 pixels bypassing cache
movntdq [edi+16], xmm0 ; output second 4 pixels bypassing cache
%endmacro
SECTION .data align=16
Const16 dw 16
@@ -183,6 +104,118 @@ YMask db 0x00
db 0x0e
db 0x80
UVMask db 0x01
db 0x80
db 0x03
db 0x80
db 0x05
db 0x80
db 0x07
db 0x80
db 0x09
db 0x80
db 0x0b
db 0x80
db 0x0d
db 0x80
db 0x0f
db 0x80
shuffconst db 0x0
db 0x01
db 0x00
db 0x01
db 0x04
db 0x05
db 0x04
db 0x05
db 0x08
db 0x09
db 0x08
db 0x09
db 0x0c
db 0x0d
db 0x0c
db 0x0d
RConst dw 0
dw 5743
dw 0
dw 5743
dw 0
dw 5743
dw 0
dw 5743
GConst dw -1409
dw -2925
dw -1409
dw -2925
dw -1409
dw -2925
dw -1409
dw -2925
BConst dw 7258
dw 0
dw 7258
dw 0
dw 7258
dw 0
dw 7258
dw 0
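The YMask/UVMask tables drive the SSSE3 input step: pshufb zeroes any destination byte whose mask byte has bit 7 set (0x80), so a single shuffle both deinterleaves the YUYV bytes and widens them to 16-bit words. A hedged intrinsics sketch (illustrative names, not part of the commit):

#include <tmmintrin.h>

static inline void
split_yuyv(__m128i yuyv, __m128i* y8, __m128i* uv8)
{
	// -128 (0x80) in a pshufb mask byte writes zero to that destination byte
	const __m128i ymask = _mm_setr_epi8(
		0, -128, 2, -128, 4, -128, 6, -128,
		8, -128, 10, -128, 12, -128, 14, -128);
	const __m128i uvmask = _mm_setr_epi8(
		1, -128, 3, -128, 5, -128, 7, -128,
		9, -128, 11, -128, 13, -128, 15, -128);
	*y8 = _mm_shuffle_epi8(yuyv, ymask);	// y0 y1 ... y7 as 16-bit words
	*uv8 = _mm_shuffle_epi8(yuyv, uvmask);	// u0 v0 u1 v1 ... as 16-bit words
}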
; conversion code
%macro yuv2rgbssse3 0
; u = u - 128
; v = v - 128
; r = y + 0 * u + 1.402 * v
; g = y + -0.344 * u + -0.714 * v
; b = y + 1.772 * u + 0 * v
; subtract 128 from u and v
psubsw xmm3, [Const128] ; u = u - 128, v = v -128
pshufd xmm5, xmm3, 0xE4 ; duplicate
movdqa xmm4, xmm3 ; duplicate
; subtract 16 from y
; psubsw xmm0, [Const16] ; y = y - 16
pmaddwd xmm3, [RConst] ; multiply and add
pmaddwd xmm4, [GConst] ; to get RGB offsets to Y
pmaddwd xmm5, [BConst] ;
psrad xmm3, 12 ; Scale back to original range
psrad xmm4, 12 ;
psrad xmm5, 12 ;
pshufb xmm3, [shuffconst] ; duplicate results
pshufb xmm4, [shuffconst] ; 2 y values per const
pshufb xmm5, [shuffconst] ;
paddsw xmm3, xmm0 ; and add to y
paddsw xmm4, xmm0 ;
paddsw xmm5, xmm0 ;
%endmacro
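Here shuffconst lets a single pshufb do what the SSE2 macro needed a pshuflw/pshufhw pair for: copy the low word of each 32-bit pmaddwd result into both 16-bit lanes of its pixel pair. A hedged sketch (illustrative name, not part of the commit):

#include <tmmintrin.h>

static inline __m128i
broadcast_low_words(__m128i dot)
{
	// same byte pattern as shuffconst: 0,1,0,1, 4,5,4,5, 8,9,8,9, 12,13,12,13
	const __m128i shuf = _mm_setr_epi8(
		0, 1, 0, 1, 4, 5, 4, 5,
		8, 9, 8, 9, 12, 13, 12, 13);
	return _mm_shuffle_epi8(dot, shuf);
}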
; outputer
%macro rgba32ssse3output 0
; clamp values
pxor xmm7,xmm7
packuswb xmm3,xmm7 ; clamp to 0,255 and pack R to 8 bit per pixel
packuswb xmm4,xmm7 ; clamp to 0,255 and pack G to 8 bit per pixel
packuswb xmm5,xmm7 ; clamp to 0,255 and pack B to 8 bit per pixel
; convert to bgra32 packed
punpcklbw xmm5,xmm4 ; bgbgbgbgbgbgbgbg
movdqa xmm0, xmm5 ; save bg values
punpcklbw xmm3,xmm7 ; r0r0r0r0r0r0r0r0
punpcklwd xmm5,xmm3 ; lower half bgr0bgr0bgr0bgr0
punpckhwd xmm0,xmm3 ; upper half bgr0bgr0bgr0bgr0
; write to output ptr
movntdq [edi], xmm5 ; output first 4 pixels bypassing cache
movntdq [edi+16], xmm0 ; output second 4 pixels bypassing cache
%endmacro
; void Convert_YUV422_RGBA32_SSSE3(void *fromPtr, void *toPtr, int width)
width equ ebp+16
@@ -214,20 +247,18 @@ cglobal Convert_YUV422_RGBA32_SSSE3
test ecx,ecx
jng ENDLOOP
REPEATLOOP: ; loop over width / 8
prefetchnta [esi+256]
; YUV422 packed inputer
movdqa xmm0, [esi] ; should have yuyv yuyv yuyv yuyv
pshufd xmm1, xmm0, 0xE4 ; copy to xmm1
movdqa xmm2, xmm0 ; copy to xmm2
pshufd xmm3, xmm0, 0xE4 ; copy to xmm3
; extract both y giving y0y0
pshufb xmm0, [YMask]
; extract u and duplicate so each u in yuyv becomes u0u0
pshufb xmm1, [UMask]
; extract v and duplicate so each v in yuyv becomes v0v0
pshufb xmm2, [VMask]
; extract u and v to have u0v0
pshufb xmm3, [UVMask]
yuv2rgbsse2
yuv2rgbssse3
rgba32sse2output
rgba32ssse3output
; endloop
add edi,32
@@ -263,28 +294,26 @@ cglobal Convert_YUV420P_RGBA32_SSSE3
test ecx,ecx
jng ENDLOOP1
REPEATLOOP1: ; loop over width / 8
prefetchnta [esi+256]
prefetchnta [eax+128]
prefetchnta [ebx+128]
; YUV420 Planar inputer
movq xmm0, [esi] ; fetch 8 y values (8 bit) yyyyyyyy00000000
movd xmm1, [eax] ; fetch 4 u values (8 bit) uuuu000000000000
movd xmm2, [ebx] ; fetch 4 v values (8 bit) vvvv000000000000
movd xmm3, [eax] ; fetch 4 u values (8 bit) uuuu000000000000
movd xmm1, [ebx] ; fetch 4 v values (8 bit) vvvv000000000000
; extract y
; convert y to 16 bit
pxor xmm7,xmm7 ; 00000000000000000000000000000000
punpcklbw xmm0,xmm7 ; interleave xmm7 into xmm0 y0y0y0y0y0y0y0y0
; extract u and duplicate so each becomes 0u0u
punpcklbw xmm1,xmm7 ; interleave xmm7 into xmm1 u0u0u0u000000000
punpcklwd xmm1,xmm7 ; interleave again u000u000u000u000
pshuflw xmm1,xmm1, 0xA0 ; copy u values
pshufhw xmm1,xmm1, 0xA0 ; to get u0u0
; extract v
punpcklbw xmm2,xmm7 ; interleave xmm7 into xmm1 v0v0v0v000000000
punpcklwd xmm2,xmm7 ; interleave again v000v000v000v000
pshuflw xmm2,xmm2, 0xA0 ; copy v values
pshufhw xmm2,xmm2, 0xA0 ; to get v0v0
yuv2rgbsse2
rgba32sse2output
; combine u and v
punpcklbw xmm3,xmm1 ; uvuvuvuv00000000
punpcklbw xmm3,xmm7 ; u0v0u0v0u0v0u0v0
yuv2rgbssse3
rgba32ssse3output
; endloop
add edi,32