OpenH264 YUV data conversion with intel SSSE3 in assembly

This commit is contained in:
erbth 2014-08-13 20:56:40 +02:00
parent a8945306a1
commit 095a7aba99
11 changed files with 574 additions and 337 deletions

2
.gitignore vendored
View File

@ -106,7 +106,7 @@ client/DirectFB/dfreerdp
server/Sample/sfreerdp-server server/Sample/sfreerdp-server
server/X11/xfreerdp-server server/X11/xfreerdp-server
xcode xcode
libfreerdp/codec/test/TestOpenH264 libfreerdp/codec/test/TestOpenH264ASM
# Other # Other
*~ *~

View File

@ -478,7 +478,6 @@ int dvcman_receive_channel_data_first(IWTSVirtualChannelManager* pChannelMgr, UI
Stream_Release(channel->dvc_data); Stream_Release(channel->dvc_data);
channel->dvc_data = StreamPool_Take(channel->dvcman->pool, length); channel->dvc_data = StreamPool_Take(channel->dvcman->pool, length);
//Stream_AddRef(channel->dvc_data);
return 0; return 0;
} }
@ -488,6 +487,7 @@ int dvcman_receive_channel_data(IWTSVirtualChannelManager* pChannelMgr, UINT32 C
int status = 0; int status = 0;
DVCMAN_CHANNEL* channel; DVCMAN_CHANNEL* channel;
UINT32 dataSize = Stream_GetRemainingLength(data); UINT32 dataSize = Stream_GetRemainingLength(data);
wStream* s;
channel = (DVCMAN_CHANNEL*) dvcman_find_channel_by_id(pChannelMgr, ChannelId); channel = (DVCMAN_CHANNEL*) dvcman_find_channel_by_id(pChannelMgr, ChannelId);
@ -500,7 +500,6 @@ int dvcman_receive_channel_data(IWTSVirtualChannelManager* pChannelMgr, UINT32 C
if (channel->dvc_data) if (channel->dvc_data)
{ {
/* Fragmented data */ /* Fragmented data */
//if (Stream_GetPosition(channel->dvc_data) + dataSize > (UINT32) Stream_Capacity(channel->dvc_data))
if (Stream_GetPosition(channel->dvc_data) + dataSize > (UINT32) Stream_Length(channel->dvc_data)) if (Stream_GetPosition(channel->dvc_data) + dataSize > (UINT32) Stream_Length(channel->dvc_data))
{ {
DEBUG_WARN("data exceeding declared length!"); DEBUG_WARN("data exceeding declared length!");
@ -511,14 +510,15 @@ int dvcman_receive_channel_data(IWTSVirtualChannelManager* pChannelMgr, UINT32 C
Stream_Write(channel->dvc_data, Stream_Pointer(data), dataSize); Stream_Write(channel->dvc_data, Stream_Pointer(data), dataSize);
//if (((size_t) Stream_GetPosition(channel->dvc_data)) >= Stream_Capacity(channel->dvc_data)-1)
if (((size_t) Stream_GetPosition(channel->dvc_data)) >= Stream_Length(channel->dvc_data)-1) if (((size_t) Stream_GetPosition(channel->dvc_data)) >= Stream_Length(channel->dvc_data)-1)
{ {
Stream_SealLength(channel->dvc_data); Stream_SealLength(channel->dvc_data);
Stream_SetPosition(channel->dvc_data, 0); Stream_SetPosition(channel->dvc_data, 0);
status = channel->channel_callback->OnDataReceived(channel->channel_callback, channel->dvc_data); s=channel->dvc_data;
Stream_Release(channel->dvc_data);
channel->dvc_data = NULL; channel->dvc_data = NULL;
status = channel->channel_callback->OnDataReceived(channel->channel_callback, s);
Stream_Release(s);
} }
} }
else else

View File

@ -139,7 +139,7 @@ int xf_OutputUpdate(xfContext* xfc)
int xf_OutputExpose(xfContext* xfc, int x, int y, int width, int height) int xf_OutputExpose(xfContext* xfc, int x, int y, int width, int height)
{ {
/** ********************************* /** *********************************
* to be improved * to be improved?
* *********************************/ * *********************************/
RECTANGLE_16 invalidRect; RECTANGLE_16 invalidRect;
@ -366,15 +366,6 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_
RDPGFX_H264_METABLOCK* meta; RDPGFX_H264_METABLOCK* meta;
RDPGFX_H264_BITMAP_STREAM* bs; RDPGFX_H264_BITMAP_STREAM* bs;
static struct timeval TGES1;
struct timeval TGES2,TDEC1,TDEC2;
TGES2.tv_usec=TGES1.tv_usec;
TGES2.tv_sec=TGES1.tv_sec;
gettimeofday(&TGES1,NULL);
printf("time since last xf_SurfaceCommand_H264: %d sec %d usec\n",(int)(TGES1.tv_sec-TGES2.tv_sec),(int)(TGES1.tv_usec-TGES2.tv_usec));
h264 = xfc->h264; h264 = xfc->h264;
@ -392,13 +383,14 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_
DstData = surface->data; DstData = surface->data;
gettimeofday(&TDEC1,NULL);
status = h264_decompress(xfc->h264, bs->data, bs->length, &DstData, status = h264_decompress(xfc->h264, bs->data, bs->length, &DstData,
PIXEL_FORMAT_XRGB32, surface->scanline, cmd->left, cmd->top, cmd->width, cmd->height); PIXEL_FORMAT_XRGB32, surface->scanline, cmd->left, cmd->top, cmd->width, cmd->height);
gettimeofday(&TDEC2,NULL);
//printf("decoding took %d sec %d usec\n",(int)(TDEC2.tv_sec-TDEC1.tv_sec),(int)(TDEC2.tv_usec-TDEC1.tv_usec));
//printf("xf_SurfaceCommand_H264: status: %d\n", status); if (status < 0)
{
printf("h264_decompress failure: %d\n",status);
return -1;
}
if (status < 0) if (status < 0)
return -1; return -1;
@ -427,9 +419,6 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_
updateRects = (RECTANGLE_16*) region16_rects(&updateRegion, &nbUpdateRects); updateRects = (RECTANGLE_16*) region16_rects(&updateRegion, &nbUpdateRects);
#if 0
printf("numRegionRects: %d nbUpdateRects: %d\n", meta->numRegionRects, nbUpdateRects);
#endif
for (j = 0; j < nbUpdateRects; j++) for (j = 0; j < nbUpdateRects; j++)
{ {
@ -439,13 +428,6 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_
nHeight = updateRects[j].bottom - updateRects[j].top; nHeight = updateRects[j].bottom - updateRects[j].top;
/* update region from decoded H264 buffer */ /* update region from decoded H264 buffer */
#if 0
printf("nXDst: %d nYDst: %d nWidth: %d nHeight: %d decoded: width: %d height: %d cmd: left: %d top: %d right: %d bottom: %d\n",
nXDst, nYDst, nWidth, nHeight, h264->width, h264->height,
cmd->left, cmd->top, cmd->right, cmd->bottom);
#endif
freerdp_image_copy(surface->data, PIXEL_FORMAT_XRGB32, surface->scanline, freerdp_image_copy(surface->data, PIXEL_FORMAT_XRGB32, surface->scanline,
nXDst, nYDst, nWidth, nHeight, nXDst, nYDst, nWidth, nHeight,
h264->data, PIXEL_FORMAT_XRGB32, h264->scanline, nXDst, nYDst); h264->data, PIXEL_FORMAT_XRGB32, h264->scanline, nXDst, nYDst);
@ -457,19 +439,9 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_
region16_uninit(&updateRegion); region16_uninit(&updateRegion);
region16_uninit(&clippingRects); region16_uninit(&clippingRects);
#if 0
/* fill with red for now to distinguish from the rest */
freerdp_image_fill(surface->data, PIXEL_FORMAT_XRGB32, surface->scanline, if (!xfc->inGfxFrame)
cmd->left, cmd->top, cmd->width, cmd->height, 0xFF0000);
#endif
if (!xfc->inGfxFrame){
xf_OutputUpdate(xfc); xf_OutputUpdate(xfc);
}
gettimeofday(&TGES2,NULL);
printf("the whole command took %d sec %d usec\n",(int)(TGES2.tv_sec-TGES1.tv_sec),(int)(TGES2.tv_usec-TGES1.tv_usec));
return 1; return 1;
} }

View File

@ -92,17 +92,44 @@ if(WITH_OPENH264)
add_definitions(-DWITH_OPENH264) add_definitions(-DWITH_OPENH264)
include_directories(${OPENH264_INCLUDE_DIR}) include_directories(${OPENH264_INCLUDE_DIR})
set(FREERDP_OPENH264_LIBS ${OPENH264_LIBRARIES}) set(FREERDP_OPENH264_LIBS ${OPENH264_LIBRARIES})
if(CMAKE_SIZEOF_VOID_P EQUAL 8)
set(arch64 TRUE)
else()
set(arch64 FALSE)
endif()
if(WITH_OPENH264_ASM) if(WITH_OPENH264_ASM)
set(OPENH264_ASM OPENH264_ASM_o) set(OPENH264_ASM OPENH264_ASM_o)
set(OBJ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${OPENH264_ASM}.dir/h264.asm.o)
set(SRC ${CMAKE_CURRENT_SOURCE_DIR}/h264.asm)
add_definitions(-DWITH_OPENH264_ASM) add_definitions(-DWITH_OPENH264_ASM)
add_custom_target(${OPENH264_ASM}) add_custom_target(${OPENH264_ASM})
add_custom_command(TARGET ${OPENH264_ASM}
COMMAND nasm ARGS -f elf64 -o ${OBJ} ${SRC} if(arch64)
COMMENT "building H.264 asm objects ...") set(SRC ${CMAKE_CURRENT_SOURCE_DIR}/h264_x64.asm)
set(OBJ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${OPENH264_ASM}.dir/h264_x64.asm.o)
add_custom_command(TARGET ${OPENH264_ASM}
COMMAND nasm ARGS -f elf64 -o ${OBJ} ${SRC})
else()
message(FATAL_ERROR "OpenH264 YUV data converting is not implemented in 32 bit assembly yet.")
endif()
set(FREERDP_OPENH264_LIBS ${OPENH264_LIBRARIES} ${OBJ})
endif()
if(WITH_OPENH264_SSSE3)
set(OPENH264_ASM OPENH264_ASM_o)
add_definitions(-DWITH_OPENH264_SSSE3)
add_custom_target(${OPENH264_ASM})
if(arch64)
set(SRC ${CMAKE_CURRENT_SOURCE_DIR}/h264_ssse3_x64.asm)
set(OBJ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${OPENH264_ASM}.dir/h264_ssse3_x64.asm.o)
add_custom_command(TARGET ${OPENH264_ASM}
COMMAND nasm ARGS -f elf64 -o ${OBJ} ${SRC})
else()
message(FATAL_ERROR "OpenH264 YUV data converting with SSSE3 is not implemented in 32 bit assembly yet.")
endif()
set(FREERDP_OPENH264_LIBS ${OPENH264_LIBRARIES} ${OBJ}) set(FREERDP_OPENH264_LIBS ${OPENH264_LIBRARIES} ${OBJ})
endif() endif()
endif() endif()
@ -144,7 +171,7 @@ else()
install(TARGETS ${MODULE_NAME} DESTINATION ${CMAKE_INSTALL_LIBDIR} EXPORT FreeRDPTargets) install(TARGETS ${MODULE_NAME} DESTINATION ${CMAKE_INSTALL_LIBDIR} EXPORT FreeRDPTargets)
endif() endif()
if(WITH_OPENH264_ASM) if(WITH_OPENH264_ASM OR WITH_OPENH264_SSSE3)
add_dependencies(${MODULE_NAME} ${OPENH264_ASM}) add_dependencies(${MODULE_NAME} ${OPENH264_ASM})
endif() endif()

View File

@ -1,262 +0,0 @@
;R=(256*Y+403*(V-128)+128)/256 =(256*Y+403*V-51456)/256
;G=(256*Y-48*(U-128)-120*(V-128)+128)/256 =(256*Y-48*U-120*V+21632)/256
;B=(256*Y+475*(U-128)+128)/256 =(256*Y+475*U-60672)/256
; Initialised data: four newline-terminated debug strings.
; None of them is referenced by the code visible in this file.
section .data
dbg1: db "DEBUG1",10
dbg2: db "DEBUG2",10
dbg3: db "DEBUG3",10
dbg4: db "DEBUG4",10
; dbg = byte length of the dbg4 string (distance from dbg4 to here).
dbg equ $-dbg4
; Uninitialised scratch dwords shared by the two pixel helpers below:
;   temp1 = V*403, temp2 = U*475, temp3 = V*120, temp4 = U*48
; YUV_to_RGB_asm writes them, YUV_to_RGB_2asm reads them back.
; NOTE(review): this is static storage, so the helpers presumably must not
; run on two threads at once - confirm against the callers.
section .bss
temp1: resd 1
temp2: resd 1
temp3: resd 1
temp4: resd 1
section .text
; printf is declared here but never called in the visible code.
extern printf
;global YUV_to_RGB_asm
; YUV_to_RGB_asm - convert one pixel and cache the chroma products.
;
; Register interface (private, not the C calling convention):
;   in : edi = Y, esi = U, edx = V   (expected to hold plain 0..255 values)
;   out: eax = 0x00RRGGBB
;   clobbers: ebx, edx, esi, edi, flags
;   side effect: temp1..temp4 receive V*403, U*475, V*120, U*48 so that
;                YUV_to_RGB_2asm can reuse them for neighbouring pixels.
;
; Fixed-point formulas (see the comment at the top of the file):
;   R = (256*Y + 403*V - 51456) / 256
;   G = (256*Y -  48*U - 120*V + 21632) / 256
;   B = (256*Y + 475*U - 60672) / 256
; Each channel is clamped to 0..255 before being packed.
YUV_to_RGB_asm:
    shl edi,8                   ; edi = 256*Y, shared by all three channels

    ; ---- red -> bits 16..23 of eax ----
    mov eax,edx
    imul eax,403
    mov [temp1],eax             ; cache V*403
    add eax,edi
    sub eax,51456
    jae YUV_to_RGB_asm1         ; no borrow: value is >= 0
    mov eax,0                   ; went below zero: clamp to 0
    jmp YUV_to_RGB_asm11
YUV_to_RGB_asm1:
    cmp eax, 0xFFFF
    jbe YUV_to_RGB_asm11
    mov eax,0xFF00              ; above 255.x: clamp to 255
YUV_to_RGB_asm11:
    and eax,0xFF00              ; keep the integer part (value / 256)
    shl eax,8

    ; ---- blue -> bits 0..7 of ebx ----
    mov ebx,esi
    imul ebx,475
    mov [temp2],ebx             ; cache U*475
    add ebx,edi
    sub ebx,60672
    jae YUV_to_RGB_asm2
    mov ebx, 0
    jmp YUV_to_RGB_asm21
YUV_to_RGB_asm2:
    cmp ebx,0xFFFF
    jbe YUV_to_RGB_asm21
    mov ebx,0xFF00
YUV_to_RGB_asm21:
    and ebx,0xFF00
    shr ebx,8

    ; ---- green -> bits 8..15 of edi ----
    imul edx,120
    mov [temp3],edx             ; cache V*120
    sub edi,edx
    imul esi,48
    mov [temp4],esi             ; cache U*48
    sub edi,esi
    add edi,21632
    ; The intermediate may be negative before the add, so the carry flag is
    ; not a usable "below zero" indicator here (a negative value that the add
    ; brings back to >= 0 sets carry). Test the sign of the result instead.
    jns YUV_to_RGB_asm3
    mov edi, 0
    jmp YUV_to_RGB_asm31
YUV_to_RGB_asm3:
    cmp edi,0xFFFF
    jbe YUV_to_RGB_asm31
    mov edi, 0xFF00
YUV_to_RGB_asm31:
    and edi,0xFF00

    ; ---- pack ----
    or eax,edi
    or eax,ebx
    ret
; YUV_to_RGB_2asm - convert one pixel reusing cached chroma products.
;
; Must be called after YUV_to_RGB_asm has filled temp1..temp4 for the
; chroma sample this pixel shares.
;
; Register interface (private, not the C calling convention):
;   in : edi = Y (expected to hold a plain 0..255 value)
;   out: eax = 0x00RRGGBB
;   clobbers: ebx, edi, flags
YUV_to_RGB_2asm:
    shl edi,8                   ; edi = 256*Y

    ; ---- red: 256*Y + V*403 - 51456 -> bits 16..23 of eax ----
    mov eax,[temp1]
    add eax,edi
    sub eax,51456
    jae YUV_to_RGB_2asm1        ; no borrow: value is >= 0
    mov eax,0
    jmp YUV_to_RGB_2asm11
YUV_to_RGB_2asm1:
    cmp eax, 0xFFFF
    jbe YUV_to_RGB_2asm11
    mov eax,0xFF00
YUV_to_RGB_2asm11:
    and eax,0xFF00
    shl eax,8

    ; ---- blue: 256*Y + U*475 - 60672 -> bits 0..7 of ebx ----
    mov ebx,[temp2]
    add ebx,edi
    sub ebx,60672
    jae YUV_to_RGB_2asm2
    mov ebx, 0
    jmp YUV_to_RGB_2asm21
YUV_to_RGB_2asm2:
    cmp ebx,0xFFFF
    jbe YUV_to_RGB_2asm21
    mov ebx,0xFF00
YUV_to_RGB_2asm21:
    and ebx,0xFF00
    shr ebx,8

    ; ---- green: 256*Y - V*120 - U*48 + 21632 -> bits 8..15 of edi ----
    sub edi,[temp3]
    sub edi,[temp4]
    add edi,21632
    ; The intermediate may be negative before the add, so the carry flag does
    ; not tell whether the sum is below zero; use the sign flag instead.
    jns YUV_to_RGB_2asm3
    mov edi, 0
    jmp YUV_to_RGB_2asm31
YUV_to_RGB_2asm3:
    cmp edi,0xFFFF
    jbe YUV_to_RGB_2asm31
    mov edi, 0xFF00
YUV_to_RGB_2asm31:
    and edi,0xFF00

    ; ---- pack ----
    or eax,edi
    or eax,ebx
    ret
; freerdp_image_yuv_to_xrgb_asm - scalar YUV 4:2:0 -> XRGB conversion.
; Walks the image in 2x2 pixel groups: one U/V sample is converted together
; with four Y samples via YUV_to_RGB_asm / YUV_to_RGB_2asm.
;
; Arguments as read by the code (SysV AMD64 registers):
;   rdi = pDstData, rsi = pSrcData (array of three plane pointers Y,U,V),
;   rdx = nWidth, rcx = nHeight
; Returns 0 in rax.
;
; NOTE(review): the C declarations elsewhere in this change pass two more
; arguments (iStride0, iStride1); r8/r9 are never read here, and the second
; source row is addressed with nWidth - confirm the planes have no padding.
; NOTE(review): the plane reads below are 32-bit loads from byte planes and
; the helpers do not mask their inputs - verify this is intended.
; NOTE(review): the destination advances 4 bytes per pixel, but the second
; row and the end-of-row skip are offset by nWidth (not nWidth*4) - confirm.
;extern int freerdp_image_yuv_to_xrgb_asm(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight);
global freerdp_image_yuv_to_xrgb_asm
freerdp_image_yuv_to_xrgb_asm:
push rbp
mov rbp, rsp
;cWidth: cx
; Local slots (relative to rbp):
;   -8 dst pointer, -16 Y pointer, -24 U pointer, -32 V pointer,
;   -40 nWidth, -48 nHeight/2, -56 remaining row pairs
sub rsp,56 ;pDstData,pSrcData[3],nWidth,nHeight,cHeight
; rbx is callee-saved and is overwritten by the helpers and by this function.
push rbx
mov [rbp-8],rdi
mov rax,[rsi]
mov [rbp-16],rax
mov rax,[rsi+8]
mov [rbp-24],rax
mov rax,[rsi+16]
mov [rbp-32],rax
mov [rbp-40],rdx
shr rcx,1 ;/2
mov [rbp-48],rcx
mov rax,[rbp-48]
mov [rbp-56],rax
; ---- outer loop: one iteration per pair of rows ----
freerdp_image_yuv_to_xrgb_asm_loopH:
; column counter = nWidth/2 (two pixels per inner iteration)
mov rcx,[rbp-40]
shr rcx,1
; ---- inner loop: one iteration per 2x2 pixel group ----
freerdp_image_yuv_to_xrgb_asm_loopW:
; top-left pixel: load Y, U, V; the U and V pointers advance by one byte
mov rax,[rbp-16]
mov edi,[rax]
mov rax,[rbp-24]
mov esi,[rax]
inc rax
mov [rbp-24],rax
mov rax,[rbp-32]
mov edx,[rax]
inc rax
mov [rbp-32],rax
; full conversion; also caches the chroma products for the next three pixels
call YUV_to_RGB_asm
mov rbx,[rbp-8]
mov [rbx],eax
; bottom-left pixel: Y taken nWidth bytes further on, then Y pointer += 1
mov rax,[rbp-16]
mov rbx,[rbp-40]
mov edi,[rax+rbx]
inc rax
mov [rbp-16],rax
call YUV_to_RGB_2asm
mov rbx,[rbp-8]
mov rdx,[rbp-40]
mov [rbx+rdx],eax
; dst pointer += 4 (one XRGB pixel)
add rbx,4
mov [rbp-8],rbx
; top-right pixel
mov rax,[rbp-16]
mov edi,[rax]
call YUV_to_RGB_2asm
mov rbx,[rbp-8]
mov [rbx],eax
; bottom-right pixel, then Y pointer += 1
mov rax,[rbp-16]
mov rbx,[rbp-40]
mov edi,[rax+rbx]
inc rax
mov [rbp-16],rax
call YUV_to_RGB_2asm
mov rbx,[rbp-8]
mov rdx,[rbp-40]
mov [rbx+rdx],eax
add rbx,4
mov [rbp-8],rbx
; only the low 16 bits of the column counter are decremented and tested
dec cx
jne freerdp_image_yuv_to_xrgb_asm_loopW
; end of row pair: skip the row that was written/read via the +nWidth offset
mov rax,[rbp-8]
add rax,[rbp-40]
mov [rbp-8],rax
mov rax,[rbp-16]
add rax,[rbp-40]
mov [rbp-16],rax
dec qword [rbp-56]
jne freerdp_image_yuv_to_xrgb_asm_loopH
;END
; return value 0
mov rax,0
; (the END label is not referenced by the visible code)
END:
pop rbx
mov rsp,rbp
pop rbp
ret

View File

@ -30,9 +30,14 @@
#include <sys/time.h> #include <sys/time.h>
#ifdef WITH_OPENH264_SSSE3
extern int check_ssse3();
extern int freerdp_image_yuv420p_to_xrgb(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int iStride0,int iStride1);
#else
#ifdef WITH_OPENH264_ASM #ifdef WITH_OPENH264_ASM
extern int freerdp_image_yuv_to_xrgb_asm(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int iStride0,int iStride1); extern int freerdp_image_yuv_to_xrgb_asm(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int iStride0,int iStride1);
#endif #endif
#endif
#define USE_GRAY_SCALE 0 #define USE_GRAY_SCALE 0
#define USE_UPCONVERT 0 #define USE_UPCONVERT 0
@ -381,7 +386,7 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz
state = (*h264->pDecoder)->DecodeFrame2(h264->pDecoder, NULL, 0, pYUVData, &sBufferInfo); state = (*h264->pDecoder)->DecodeFrame2(h264->pDecoder, NULL, 0, pYUVData, &sBufferInfo);
gettimeofday(&T2,NULL); gettimeofday(&T2,NULL);
printf("\tdecoding took: %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec)); //printf("\tdecoding took: %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec));
pSystemBuffer = &sBufferInfo.UsrData.sSystemBuffer; pSystemBuffer = &sBufferInfo.UsrData.sSystemBuffer;
@ -416,14 +421,15 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz
if (h264_prepare_rgb_buffer(h264, pSystemBuffer->iWidth, pSystemBuffer->iHeight) < 0) if (h264_prepare_rgb_buffer(h264, pSystemBuffer->iWidth, pSystemBuffer->iHeight) < 0)
return -1; return -1;
#ifdef WITH_OPENH264_SSSE3
freerdp_image_yuv420p_to_xrgb(h264->data,pYUVData,h264->width,h264->height,pSystemBuffer->iStride[0],pSystemBuffer->iStride[1]);
#else
#ifdef WITH_OPENH264_ASM #ifdef WITH_OPENH264_ASM
gettimeofday(&T1,NULL);
freerdp_image_yuv_to_xrgb_asm(h264->data,pYUVData,h264->width,h264->height,pSystemBuffer->iStride[0],pSystemBuffer->iStride[1]); freerdp_image_yuv_to_xrgb_asm(h264->data,pYUVData,h264->width,h264->height,pSystemBuffer->iStride[0],pSystemBuffer->iStride[1]);
gettimeofday(&T2,NULL);
printf("\tconverting took: %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec));
#else #else
freerdp_image_copy_yuv420p_to_xrgb(h264->data, h264->scanline, 0, 0, freerdp_image_copy_yuv420p_to_xrgb(h264->data, h264->scanline, 0, 0,
h264->width, h264->height, pYUVData, pSystemBuffer->iStride, 0, 0); h264->width, h264->height, pYUVData, pSystemBuffer->iStride, 0, 0);
#endif
#endif #endif
return 1; return 1;
@ -448,6 +454,13 @@ static BOOL openh264_init(H264_CONTEXT* h264)
SDecodingParam sDecParam; SDecodingParam sDecParam;
long status; long status;
#ifdef WITH_OPENH264_SSSE3
if(check_ssse3()){
printf("SSSE3 seems to be not supported on this system, try without WITH_OPENH264_ASM ...");
return FALSE;
}
#endif
WelsCreateDecoder(&h264->pDecoder); WelsCreateDecoder(&h264->pDecoder);

View File

@ -0,0 +1,447 @@
section .text
; int check_ssse3(void)
; Returns 0 when CPUID reports SSE, SSE2, SSE3 and SSSE3; returns 1 when the
; CPUID instruction is unavailable or any of those feature bits is missing.
global check_ssse3
check_ssse3:
    push rbx                            ; cpuid overwrites rbx (callee-saved)

    ; CPUID is usable when bit 21 (ID) of the flags register can be set.
    pushfq
    pop rax
    or rax,1<<21
    push rax
    popfq
    pushfq
    pop rax
    test rax,1<<21
    jz .unsupported

    ; Put the ID bit back to 0 before continuing.
    and rax,~(1<<21)
    push rax
    popfq

    ; Leaf 1: feature bits are returned in edx and ecx.
    mov eax,1
    mov ebx,0
    cpuid

    ; edx bit 25 = SSE, bit 26 = SSE2: both must be set
    and edx,(1<<25)|(1<<26)
    cmp edx,(1<<25)|(1<<26)
    jne .unsupported

    ; ecx bit 0 = SSE3, bit 9 = SSSE3: both must be set
    and ecx,(1<<0)|(1<<9)
    cmp ecx,(1<<0)|(1<<9)
    jne .unsupported

    mov eax,0
    jmp .done
.unsupported:
    mov eax,1
.done:
    pop rbx
    ret
; freerdp_image_yuv420p_to_xrgb - YUV 4:2:0 planar -> XRGB with SSSE3.
;
;extern int freerdp_image_yuv420p_to_xrgb(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int istride0,int istride1)
;
; Arguments (SysV AMD64):
;   rdi = pDstData  destination, 4 bytes per pixel, row pitch nWidth*4
;   rsi = pSrcData  array of three plane pointers: Y, U, V
;   edx = nWidth    pixels per row
;   ecx = nHeight   rows; an odd height is handled (last row has no partner)
;   r8d = istride0  byte pitch of the Y plane
;   r9d = istride1  byte pitch of the U and V planes
; Returns 0 in eax.
;
; Per pixel, with C = Y, D = U-128, E = V-128:
;   R = clip((256*C + 403*E + 128) >> 8)
;   G = clip((256*C -  48*D - 120*E + 128) >> 8)
;   B = clip((256*C + 475*D + 128) >> 8)
; Each output dword is 00 RR GG BB (B in the lowest byte).
;
; Limitations visible in the code:
;   * All sizes are handled in 16-bit registers, so nWidth*4 and
;     2*istride0 must fit in 16 bits.
;   * Pixels are produced in groups of 4 (nWidth/4 groups per row); a
;     remainder of nWidth modulo 4 is not converted.
;   * Chroma is consumed 4 bytes per pair of groups while the row advance
;     assumes nWidth/2 bytes, so rows line up only when nWidth is a multiple
;     of 8.
;
; Stack frame, offsets from rbp (16-byte slots are 16-byte aligned):
;    -8 pDstData (stored, not reloaded)   -16 Y ptr   -24 U ptr   -32 V ptr
;   -34 nWidth   -38 istride0   -40 istride1
;   -58 G term spill   -74 B term spill   -90 R term spill
;  -106 dwords 128     -122 words 128
;  -138 words 48       -154 words 475     -170 words 403   -186 words 120
;  -218 clamp max (dwords 0x00FF0000)     -234 clamp min (zero)
;  -250 R and-mask     -266 G shuffle     -282 B shuffle
;  -298 Y shuffle      -314 U/V shuffle
global freerdp_image_yuv420p_to_xrgb
freerdp_image_yuv420p_to_xrgb:
    push rbx
    push rbp
    ; r12-r15 are used as scratch below; they are callee-saved in the
    ; SysV AMD64 ABI and therefore have to be preserved for the caller.
    push r12
    push r13
    push r14
    push r15

    ; Pad the stack so that rbp ends up at 10 modulo 16, which makes every
    ; 16-byte slot listed above (rbp-58, rbp-74, ...) usable with movdqa.
    ; rsp is a multiple of 8 here, so al is 0 or 8 and the pad is 22 or 14.
    ; r15 keeps the pad size until the epilogue.
    mov rax,rsp
    and rax,1111B
    mov r15,22
    sub r15b,al
    sub rsp,r15
    mov rbp,rsp

    ; Clear the registers that are later written only through their 16-bit
    ; or 8-bit parts but used as full 64-bit offsets.
    xor r10,r10
    xor r11,r11
    xor r12,r12
    xor r13,r13
    xor r14,r14

    sub rsp,316                 ; reserve the frame described in the header

    ; ---- save arguments ----
    mov [rbp-8],rdi
    mov rax,[rsi]
    mov [rbp-16],rax            ; Y plane
    mov rax,[rsi+8]
    mov [rbp-24],rax            ; U plane
    mov rax,[rsi+16]
    mov [rbp-32],rax            ; V plane
    mov [rbp-34],dx             ; nWidth
    mov r13w,cx                 ; nHeight
    and r8,0FFFFH
    mov [rbp-38],r8w            ; istride0
    and r9,0FFFFH
    mov [rbp-40],r9w            ; istride1

    ; ---- per-row-pair pointer increments ----
    shl r8w,1
    sub r8w,dx
    mov r11w,r8w                ; r11 = 2*istride0 - nWidth  (Y advance after a row)
    mov r10w,dx
    shr dx,1
    sub r9w,dx
    mov r12w,r9w                ; r12 = istride1 - nWidth/2  (U/V advance after a row)
    mov r8w,[rbp-34]
    shr r8w,2                   ; r8  = nWidth/4 = 4-pixel groups per row
    shl r10w,2                  ; r10 = nWidth*4 = destination row pitch
    mov r9w,[rbp-38]            ; r9  = istride0 = offset to the second Y row

    ;and al,11B
    ;jz no_column_rest
    ;inc word [rbp-34]
    ;no_column_rest:
    ;mov [rbp-41],al

    ; r14b = 1 when the height is odd; r13w = number of row pairs, rounded up
    mov r14b,r13b
    and r14b,1B
    ;jz no_line_rest
    inc r13w
    ;no_line_rest:
    shr r13w,1

    ; ---- constant vectors ----
    ;init masks
    mov eax,00000080H           ; 4 dwords of 128 (rounding / G bias)
    mov [rbp-106],eax
    mov [rbp-102],eax
    mov [rbp-98],eax
    mov [rbp-94],eax
    mov eax,00800080H           ; 8 words of 128 (chroma offset)
    mov [rbp-122],eax
    mov [rbp-118],eax
    mov [rbp-114],eax
    mov [rbp-110],eax
    mov eax,00300030H           ; 8 words of 48
    mov [rbp-138],eax
    mov [rbp-134],eax
    mov [rbp-130],eax
    mov [rbp-126],eax
    mov eax,01DB01DBH           ; 8 words of 475
    mov [rbp-154],eax
    mov [rbp-150],eax
    mov [rbp-146],eax
    mov [rbp-142],eax
    mov eax,01930193H           ; 8 words of 403
    mov [rbp-170],eax
    mov [rbp-166],eax
    mov [rbp-162],eax
    mov [rbp-158],eax
    mov eax,00780078H           ; 8 words of 120
    mov [rbp-186],eax
    mov [rbp-182],eax
    mov [rbp-178],eax
    mov [rbp-174],eax
    mov eax,000FF0000H          ; clamp maximum: 255 in the high word, 0 in the low word
    mov [rbp-218],eax
    mov [rbp-214],eax
    mov [rbp-210],eax
    mov [rbp-206],eax
    mov eax,00000000H           ; clamp minimum
    mov [rbp-234],eax
    mov [rbp-230],eax
    mov [rbp-226],eax
    mov [rbp-222],eax

    ;shuffle masks
    ;00 xx 00 00 00 xx 00 00 00 xx 00 00 00 xx 00 00
    ;00 rr gg bb 00 rr gg bb 00 rr gg bb 00 rr gg bb
    ; After clamping, each channel value sits in byte 2 of its dword.
    ; R stays there (plain and-mask), G is moved to byte 1, B to byte 0.
    mov eax,00FF0000H
    mov [rbp-250],eax
    mov [rbp-246],eax
    mov [rbp-242],eax
    mov [rbp-238],eax
    mov eax,80800280H
    mov [rbp-266],eax
    mov eax,80800680H
    mov [rbp-262],eax
    mov eax,80800A80H
    mov [rbp-258],eax
    mov eax,80800E80H
    mov [rbp-254],eax
    mov eax,80808002H
    mov [rbp-282],eax
    mov eax,80808006H
    mov [rbp-278],eax
    mov eax,8080800AH
    mov [rbp-274],eax
    mov eax,8080800EH
    mov [rbp-270],eax

    ;dd cc bb aa
    ;00 00 dd 00 00 00 cc 00 00 00 bb 00 00 00 aa 00
    ; Y shuffle: each of 4 Y bytes goes to byte 1 of its own dword (= 256*Y)
    mov eax,80800080H
    mov [rbp-298],eax
    mov eax,80800180H
    mov [rbp-294],eax
    mov eax,80800280H
    mov [rbp-290],eax
    mov eax,80800380H
    mov [rbp-286],eax

    ;dd cc bb aa
    ;00 dd 00 dd 00 cc 00 cc 00 bb 00 bb 00 aa 00 aa
    ; U/V shuffle: each of 4 chroma bytes becomes two adjacent words
    mov eax,80008000H
    mov [rbp-314],eax
    mov eax,80018001H
    mov [rbp-310],eax
    mov eax,80028002H
    mov [rbp-306],eax
    mov eax,80038003H
    mov [rbp-302],eax

    ; rsi = Y cursor, rax = U cursor, rbx = V cursor, rdi = destination cursor
    mov rsi,[rbp-16]
    mov rax,[rbp-24]
    mov rbx,[rbp-32]

; ---- outer loop: one iteration per pair of rows ----
freerdp_image_yuv420p_to_xrgb_hloop:
    dec r13w
    js freerdp_image_yuv420p_to_xrgb_hloop_end
    jnz not_last_line
    ; last pair: r14b becomes 2 when the height is odd, which makes the
    ; inner loop skip the (non-existent) second row
    shl r14b,1
not_last_line:
    xor cx,cx                   ; cx = index of the current 4-pixel group
    ; Nothing to convert when the row has fewer than 4 pixels; without this
    ; check the inc/cmp/jne below would wrap through all 65536 values of cx.
    test r8w,r8w
    jz freerdp_image_yuv420p_to_xrgb_wloop_end

; ---- inner loop: one iteration per 4-pixel group (in up to two rows) ----
freerdp_image_yuv420p_to_xrgb_wloop:
    ;main loop
    ;	C = Y;
    ;	D = U - 128;
    ;	E = V - 128;
    ;
    ;	R = clip(( 256 * C           + 403 * E + 128) >> 8);
    ;	G = clip(( 256 * C -  48 * D - 120 * E + 128) >> 8);
    ;	B = clip(( 256 * C + 475 * D           + 128) >> 8);

    ; Four chroma bytes cover eight pixels. Even groups compute the chroma
    ; terms for eight pixels and spill the upper half; odd groups reload it.
    test cx,1B
    jnz load_yuv_data

    ;prepare U data
    movd xmm0,[rax]
    movdqa xmm5,[rbp-314]
    pshufb xmm0,xmm5            ; u0 u0 u1 u1 u2 u2 u3 u3 as words
    add rax,4

    movdqa xmm3,[rbp-122]
    psubsw xmm0,xmm3            ; D = U - 128

    movdqa xmm2,xmm0
    movdqa xmm4,xmm0

    ; 48*D as 32-bit values: low halves from pmullw, high halves from pmulhw,
    ; interleaved back together by punpck{l,h}wd
    movdqa xmm7,[rbp-138]
    pmullw xmm0,xmm7
    pmulhw xmm4,xmm7

    movdqa xmm7,xmm0
    punpcklwd xmm0,xmm4         ;what an awesome instruction!
    punpckhwd xmm7,xmm4
    movdqa xmm4,xmm7

    movdqa xmm6,[rbp-106]
    psubd xmm0,xmm6             ; 48*D - 128 (subtracted from 256*Y later)
    psubd xmm4,xmm6

    ; 475*D + 128 -> blue term (xmm1 = first four pixels, spill = next four)
    movdqa xmm1,xmm2
    movdqa xmm7,[rbp-154]
    pmullw xmm1,xmm7
    pmulhw xmm2,xmm7

    movdqa xmm7,xmm1
    punpcklwd xmm1,xmm2
    punpckhwd xmm7,xmm2

    paddd xmm1,xmm6
    paddd xmm7,xmm6
    movdqa [rbp-74],xmm7

    ;prepare V data
    movd xmm2,[rbx]
    pshufb xmm2,xmm5
    add rbx,4

    psubsw xmm2,xmm3            ; E = V - 128

    movdqa xmm5,xmm2
    movdqa xmm3,xmm2

    ; 403*E + 128 -> red term (xmm2 = first four pixels, spill = next four)
    movdqa xmm7,[rbp-170]
    pmullw xmm2,xmm7
    pmulhw xmm3,xmm7

    movdqa xmm7,xmm2
    punpcklwd xmm2,xmm3
    punpckhwd xmm7,xmm3

    paddd xmm2,xmm6
    paddd xmm7,xmm6
    movdqa [rbp-90],xmm7

    ; 120*E, added to 48*D - 128 -> green term (xmm0 / spill)
    movdqa xmm3,xmm5
    movdqa xmm7,[rbp-186]
    pmullw xmm3,xmm7
    pmulhw xmm5,xmm7

    movdqa xmm7,xmm3
    punpcklwd xmm3,xmm5
    punpckhwd xmm7,xmm5

    paddd xmm0,xmm3
    paddd xmm4,xmm7
    movdqa [rbp-58],xmm4

    jmp valid_yuv_data

load_yuv_data:
    ; odd group: use the terms spilled by the preceding even group
    movdqa xmm1,[rbp-74]
    movdqa xmm2,[rbp-90]
    movdqa xmm0,[rbp-58]

valid_yuv_data:
    ; here: xmm2 = red term, xmm0 = green term, xmm1 = blue term

    ;Y data processing
    movd xmm4,[rsi]
    pshufb xmm4,[rbp-298]       ; four dwords of 256*Y

    movdqa xmm5,xmm4
    movdqa xmm6,xmm4

    paddd xmm4,xmm2             ; R
    psubd xmm5,xmm0             ; G
    paddd xmm6,xmm1             ; B

    ; move the integer part (bits 8 and up) into the high word of each dword
    pslld xmm4,8
    pslld xmm5,8
    pslld xmm6,8

    ; word-wise clamp: negative high word -> 0, high word above 255 -> 255,
    ; low word (fraction) -> 0
    movdqa xmm7,[rbp-234]
    pmaxsw xmm4,xmm7            ;what an awesome instruction!
    pmaxsw xmm5,xmm7
    pmaxsw xmm6,xmm7

    movdqa xmm7,[rbp-218]
    pminsw xmm4,xmm7
    pminsw xmm5,xmm7
    pminsw xmm6,xmm7

    ; place R, G, B in bytes 2, 1, 0 of each dword and merge
    pand xmm4,[rbp-250]
    pshufb xmm5,[rbp-266]
    pshufb xmm6,[rbp-282]

    por xmm4,xmm5
    por xmm4,xmm6

    ; unaligned store: the caller's buffer is not guaranteed to be
    ; 16-byte aligned
    movdqu [rdi],xmm4

    ;Y data processing in second line
    test r14b,2
    jnz skip_last_line1

    movd xmm4,[rsi+r9]          ; same columns, next Y row
    pshufb xmm4,[rbp-298]

    movdqa xmm5,xmm4
    movdqa xmm6,xmm4

    paddd xmm4,xmm2
    psubd xmm5,xmm0
    paddd xmm6,xmm1

    pslld xmm4,8
    pslld xmm5,8
    pslld xmm6,8

    movdqa xmm7,[rbp-234]
    pmaxsw xmm4,xmm7            ;what an awesome instruction!
    pmaxsw xmm5,xmm7
    pmaxsw xmm6,xmm7

    movdqa xmm7,[rbp-218]
    pminsw xmm4,xmm7
    pminsw xmm5,xmm7
    pminsw xmm6,xmm7

    pand xmm4,[rbp-250]
    pshufb xmm5,[rbp-266]
    pshufb xmm6,[rbp-282]

    por xmm4,xmm5
    por xmm4,xmm6

    ; unaligned store: rdi+r10 is 16-byte aligned only for suitable widths
    movdqu [rdi+r10],xmm4

skip_last_line1:
    add rdi,16                  ; 4 pixels written
    add rsi,4                   ; 4 Y bytes consumed

    inc cx
    cmp cx,r8w
    jne freerdp_image_yuv420p_to_xrgb_wloop

freerdp_image_yuv420p_to_xrgb_wloop_end:
    ; step every cursor over the row that was handled via the +pitch offset
    add rdi,r10
    add rsi,r11
    add rax,r12
    add rbx,r12
    ;mov eax,r12d
    ;jmp freerdp_image_yuv420p_to_xrgb_end
    jmp freerdp_image_yuv420p_to_xrgb_hloop

freerdp_image_yuv420p_to_xrgb_hloop_end:

    mov eax,0                   ; return value
freerdp_image_yuv420p_to_xrgb_end:
    mov rsp,rbp
    add rsp,r15                 ; undo the alignment pad (r15 is unchanged since the prologue)
    pop r15
    pop r14
    pop r13
    pop r12
    pop rbp
    pop rbx
    ret

View File

@ -2,10 +2,6 @@
;G=(256*Y-48*(U-128)-120*(V-128)+128)/256 =(256*Y-48*U-120*V+21632)/256 ;G=(256*Y-48*(U-128)-120*(V-128)+128)/256 =(256*Y-48*U-120*V+21632)/256
;B=(256*Y+475*(U-128)+128)/256 =(256*Y+475*U-60672)/256 ;B=(256*Y+475*(U-128)+128)/256 =(256*Y+475*U-60672)/256
section .data
debug: db "DEBUG",10
dblen: equ $-debug
section .text section .text
;global YUV_to_RGB_asm ;global YUV_to_RGB_asm
YUV_to_RGB_asm: YUV_to_RGB_asm:

View File

@ -0,0 +1,20 @@
# Stand-alone build of the YUV->XRGB converter test.
#   make        link the test against the SSSE3 converter
#   make old    link the test against the scalar converter (h264.asm.o)
#   make clean  remove the binary and all object files
# NOTE(review): the scalar rule assembles ../h264.asm - confirm that this
# path still matches the assembly file name used by the CMake build.

# clean and old do not produce files with those names
.PHONY: clean old

TestOpenH264ASM: h264_ssse3.asm.o TestOpenH264ASM.c.o h264.c.o
	gcc -o TestOpenH264ASM h264_ssse3.asm.o TestOpenH264ASM.c.o h264.c.o

h264_ssse3.asm.o: ../h264_ssse3_x64.asm
	nasm -f elf64 -o h264_ssse3.asm.o ../h264_ssse3_x64.asm

h264.asm.o: ../h264.asm
	nasm -f elf64 -o h264.asm.o ../h264.asm

TestOpenH264ASM.c.o: TestOpenH264ASM.c
	gcc -c -o TestOpenH264ASM.c.o TestOpenH264ASM.c

h264.c.o: ../h264.c
	gcc -c -O3 -o h264.c.o ../h264.c

clean:
	rm -f TestOpenH264ASM TestOpenH264ASM.c.o h264_ssse3.asm.o h264.c.o h264.asm.o

old: h264.asm.o TestOpenH264ASM.c.o h264.c.o
	gcc -o TestOpenH264ASM h264.asm.o TestOpenH264ASM.c.o h264.c.o

View File

@ -4,49 +4,70 @@
#include "TestOpenH264ASM.h" #include "TestOpenH264ASM.h"
#define WIDTH 1920
#define HEIGHT 1080
int main(void){ int main(void){
int ret,i; int i,j,k;
int ret;
unsigned char *pDstData_c,*pDstData_asm,*pSrcData[3]; unsigned char *pDstData_c,*pDstData_asm,*pSrcData[3];
int nSrcStep[2]; int nSrcStep[2];
if(check_ssse3()){
fprintf(stderr,"ssse3 not supported!\n");
return EXIT_FAILURE;
}
struct timeval t1,t2,t3; struct timeval t1,t2,t3;
pSrcData[0]=malloc(1920*1080*sizeof(char)); pSrcData[0]=malloc(1984*HEIGHT*sizeof(char));
pSrcData[1]=malloc(1920*1080/4*sizeof(char)); pSrcData[1]=malloc(1984*HEIGHT/4*sizeof(char));
pSrcData[2]=malloc(1920*1080/4*sizeof(char)); pSrcData[2]=malloc(1984*HEIGHT/4*sizeof(char));
pDstData_asm=malloc(1920*1080*4*sizeof(char)); pDstData_asm=malloc(WIDTH*HEIGHT*4*sizeof(char));
pDstData_c=malloc(1920*1080*4*sizeof(char)); pDstData_c=malloc(WIDTH*HEIGHT*4*sizeof(char));
for(i=0;i<1920*1080;i++){ for(i=0;i<WIDTH*HEIGHT;i++){
pSrcData[0][i]=i%255; pSrcData[0][i]=i%255;
pSrcData[1][i/4]=pSrcData[0][i]; pSrcData[1][i/4]=pSrcData[0][i];
pSrcData[2][i/4]=255-pSrcData[0][i]; pSrcData[2][i/4]=255-pSrcData[0][i];
} }
printf("%X\n",pSrcData[0][0]); nSrcStep[0]=1984;
nSrcStep[1]=992;
nSrcStep[0]=1088;
nSrcStep[1]=544;
gettimeofday(&t1,NULL); gettimeofday(&t1,NULL);
ret=freerdp_image_yuv_to_xrgb_asm(pDstData_asm,pSrcData,1024,768,1088,544); ret=freerdp_image_yuv420p_to_xrgb(pDstData_asm,pSrcData,WIDTH,HEIGHT,nSrcStep[0],nSrcStep[1]);
gettimeofday(&t2,NULL); gettimeofday(&t2,NULL);
freerdp_image_copy_yuv420p_to_xrgb(pDstData_c,1024*4,0,0,1024,768,pSrcData,nSrcStep,0,0); freerdp_image_copy_yuv420p_to_xrgb(pDstData_c,WIDTH*4,0,0,WIDTH,HEIGHT,pSrcData,nSrcStep,0,0);
gettimeofday(&t3,NULL); gettimeofday(&t3,NULL);
printf("in asm (%d) it took %u sec %u usec,\nin c %u sec %u usec.\n",ret,(int)(t2.tv_sec-t1.tv_sec),(int)(t2.tv_usec-t1.tv_usec), printf("in asm (0x%08X) it took %u sec %u usec,\nin c %u sec %u usec.\n",ret,(int)(t2.tv_sec-t1.tv_sec),(int)(t2.tv_usec-t1.tv_usec),
(int)(t3.tv_sec-t2.tv_sec),(int)(t3.tv_usec-t2.tv_usec)); (int)(t3.tv_sec-t2.tv_sec),(int)(t3.tv_usec-t2.tv_usec));
printf("in asm the result was %X %X %X\n in c %X %X %X.\n",(unsigned char *)pDstData_asm[92],(unsigned char *)pDstData_asm[93],(unsigned char *)pDstData_asm[94], printf("in asm the result was %X %X %X\n in c %X %X %X.\n",pDstData_asm[0],pDstData_asm[1],pDstData_asm[2],
(unsigned char *)pDstData_c[92],(unsigned char *)pDstData_c[93],(unsigned char *)pDstData_c[94]); pDstData_c[0],pDstData_c[1],pDstData_c[2]);
for(i=0;i<(1920*1080*4);i++){ /*k=0;
for(i=0;i<HEIGHT+1;i++){
for(j=0;j<WIDTH;j++){
printf("%08X:%08X ",((unsigned int*)pDstData_asm)[k],((unsigned int*)pDstData_c)[k]);
k++;
}
puts("\n");
}*/
k=1;
for(i=0;i<(WIDTH*HEIGHT*4);i++){
if(pDstData_c[i]!=pDstData_asm[i]){ if(pDstData_c[i]!=pDstData_asm[i]){
k=0;
printf("MISSMATCH at %d: %2X != %2X\n",i,(unsigned char)pDstData_asm[i],(unsigned char)pDstData_c[i]); printf("MISSMATCH at %d: %2X != %2X\n",i,(unsigned char)pDstData_asm[i],(unsigned char)pDstData_c[i]);
break; break;
} }
} }
if(k)
printf("everything OK\n");
free(pSrcData[0]); free(pSrcData[0]);
free(pSrcData[1]); free(pSrcData[1]);
free(pSrcData[2]); free(pSrcData[2]);

View File

@ -4,4 +4,7 @@ extern int YUV_to_RGB(unsigned char Y,unsigned char U,unsigned char V);
extern int freerdp_image_yuv_to_xrgb_asm(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int istride0,int istride1); extern int freerdp_image_yuv_to_xrgb_asm(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int istride0,int istride1);
int freerdp_image_copy_yuv420p_to_xrgb(unsigned char* pDstData, int nDstStep, int nXDst, int nYDst, int freerdp_image_copy_yuv420p_to_xrgb(unsigned char* pDstData, int nDstStep, int nXDst, int nYDst,
int nWidth, int nHeight, unsigned char* pSrcData[3], int nSrcStep[2], int nXSrc, int nYSrc); int nWidth, int nHeight, unsigned char* pSrcData[3], int nSrcStep[2], int nXSrc, int nYSrc);
extern int check_ssse3();
extern int freerdp_image_yuv420p_to_xrgb(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int istride0,int istride1);