From 095a7aba999b9a50257a700ff8c2c927d2d4fac5 Mon Sep 17 00:00:00 2001 From: erbth Date: Wed, 13 Aug 2014 20:56:40 +0200 Subject: [PATCH] OpenH264 YUV data conversion with intel SSSE3 in assembly --- .gitignore | 2 +- channels/drdynvc/client/dvcman.c | 10 +- client/X11/xf_gfx.c | 42 +- libfreerdp/codec/CMakeLists.txt | 43 +- libfreerdp/codec/h264.asm.alt | 262 ---------- libfreerdp/codec/h264.c | 21 +- libfreerdp/codec/h264_ssse3_x64.asm | 447 ++++++++++++++++++ libfreerdp/codec/{h264.asm => h264_x64.asm} | 4 - .../codec/test/Makefile.TestOpenH264ASM | 20 + libfreerdp/codec/test/TestOpenH264ASM.c | 55 ++- libfreerdp/codec/test/TestOpenH264ASM.h | 5 +- 11 files changed, 574 insertions(+), 337 deletions(-) delete mode 100644 libfreerdp/codec/h264.asm.alt create mode 100644 libfreerdp/codec/h264_ssse3_x64.asm rename libfreerdp/codec/{h264.asm => h264_x64.asm} (98%) create mode 100644 libfreerdp/codec/test/Makefile.TestOpenH264ASM diff --git a/.gitignore b/.gitignore index 928ef7b95..94ec2bf89 100644 --- a/.gitignore +++ b/.gitignore @@ -106,7 +106,7 @@ client/DirectFB/dfreerdp server/Sample/sfreerdp-server server/X11/xfreerdp-server xcode -libfreerdp/codec/test/TestOpenH264 +libfreerdp/codec/test/TestOpenH264ASM # Other *~ diff --git a/channels/drdynvc/client/dvcman.c b/channels/drdynvc/client/dvcman.c index 532a68575..dd51a95ca 100644 --- a/channels/drdynvc/client/dvcman.c +++ b/channels/drdynvc/client/dvcman.c @@ -478,7 +478,6 @@ int dvcman_receive_channel_data_first(IWTSVirtualChannelManager* pChannelMgr, UI Stream_Release(channel->dvc_data); channel->dvc_data = StreamPool_Take(channel->dvcman->pool, length); - //Stream_AddRef(channel->dvc_data); return 0; } @@ -488,6 +487,7 @@ int dvcman_receive_channel_data(IWTSVirtualChannelManager* pChannelMgr, UINT32 C int status = 0; DVCMAN_CHANNEL* channel; UINT32 dataSize = Stream_GetRemainingLength(data); + wStream* s; channel = (DVCMAN_CHANNEL*) dvcman_find_channel_by_id(pChannelMgr, ChannelId); @@ -500,7 +500,6 @@ int 
dvcman_receive_channel_data(IWTSVirtualChannelManager* pChannelMgr, UINT32 C if (channel->dvc_data) { /* Fragmented data */ - //if (Stream_GetPosition(channel->dvc_data) + dataSize > (UINT32) Stream_Capacity(channel->dvc_data)) if (Stream_GetPosition(channel->dvc_data) + dataSize > (UINT32) Stream_Length(channel->dvc_data)) { DEBUG_WARN("data exceeding declared length!"); @@ -511,14 +510,15 @@ int dvcman_receive_channel_data(IWTSVirtualChannelManager* pChannelMgr, UINT32 C Stream_Write(channel->dvc_data, Stream_Pointer(data), dataSize); - //if (((size_t) Stream_GetPosition(channel->dvc_data)) >= Stream_Capacity(channel->dvc_data)-1) if (((size_t) Stream_GetPosition(channel->dvc_data)) >= Stream_Length(channel->dvc_data)-1) { Stream_SealLength(channel->dvc_data); Stream_SetPosition(channel->dvc_data, 0); - status = channel->channel_callback->OnDataReceived(channel->channel_callback, channel->dvc_data); - Stream_Release(channel->dvc_data); + s=channel->dvc_data; channel->dvc_data = NULL; + + status = channel->channel_callback->OnDataReceived(channel->channel_callback, s); + Stream_Release(s); } } else diff --git a/client/X11/xf_gfx.c b/client/X11/xf_gfx.c index 0b6ab8899..b7b7cbccc 100644 --- a/client/X11/xf_gfx.c +++ b/client/X11/xf_gfx.c @@ -139,7 +139,7 @@ int xf_OutputUpdate(xfContext* xfc) int xf_OutputExpose(xfContext* xfc, int x, int y, int width, int height) { /** ********************************* - * to be improved + * to be improved? 
* *********************************/ RECTANGLE_16 invalidRect; @@ -366,15 +366,6 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_ RDPGFX_H264_METABLOCK* meta; RDPGFX_H264_BITMAP_STREAM* bs; - static struct timeval TGES1; - struct timeval TGES2,TDEC1,TDEC2; - - TGES2.tv_usec=TGES1.tv_usec; - TGES2.tv_sec=TGES1.tv_sec; - - gettimeofday(&TGES1,NULL); - printf("time since last xf_SurfaceCommand_H264: %d sec %d usec\n",(int)(TGES1.tv_sec-TGES2.tv_sec),(int)(TGES1.tv_usec-TGES2.tv_usec)); - h264 = xfc->h264; @@ -392,13 +383,14 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_ DstData = surface->data; - gettimeofday(&TDEC1,NULL); status = h264_decompress(xfc->h264, bs->data, bs->length, &DstData, PIXEL_FORMAT_XRGB32, surface->scanline, cmd->left, cmd->top, cmd->width, cmd->height); - gettimeofday(&TDEC2,NULL); - //printf("decoding took %d sec %d usec\n",(int)(TDEC2.tv_sec-TDEC1.tv_sec),(int)(TDEC2.tv_usec-TDEC1.tv_usec)); - //printf("xf_SurfaceCommand_H264: status: %d\n", status); + if (status < 0) + { + printf("h264_decompress failure: %d\n",status); + return -1; + } if (status < 0) return -1; @@ -427,9 +419,6 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_ updateRects = (RECTANGLE_16*) region16_rects(&updateRegion, &nbUpdateRects); -#if 0 - printf("numRegionRects: %d nbUpdateRects: %d\n", meta->numRegionRects, nbUpdateRects); -#endif for (j = 0; j < nbUpdateRects; j++) { @@ -439,13 +428,6 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_ nHeight = updateRects[j].bottom - updateRects[j].top; /* update region from decoded H264 buffer */ - -#if 0 - printf("nXDst: %d nYDst: %d nWidth: %d nHeight: %d decoded: width: %d height: %d cmd: left: %d top: %d right: %d bottom: %d\n", - nXDst, nYDst, nWidth, nHeight, h264->width, h264->height, - cmd->left, cmd->top, cmd->right, cmd->bottom); -#endif - freerdp_image_copy(surface->data, 
PIXEL_FORMAT_XRGB32, surface->scanline, nXDst, nYDst, nWidth, nHeight, h264->data, PIXEL_FORMAT_XRGB32, h264->scanline, nXDst, nYDst); @@ -457,19 +439,9 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_ region16_uninit(&updateRegion); region16_uninit(&clippingRects); -#if 0 - /* fill with red for now to distinguish from the rest */ - freerdp_image_fill(surface->data, PIXEL_FORMAT_XRGB32, surface->scanline, - cmd->left, cmd->top, cmd->width, cmd->height, 0xFF0000); -#endif - - if (!xfc->inGfxFrame){ + if (!xfc->inGfxFrame) xf_OutputUpdate(xfc); - } - - gettimeofday(&TGES2,NULL); - printf("the whole command took %d sec %d usec\n",(int)(TGES2.tv_sec-TGES1.tv_sec),(int)(TGES2.tv_usec-TGES1.tv_usec)); return 1; } diff --git a/libfreerdp/codec/CMakeLists.txt b/libfreerdp/codec/CMakeLists.txt index ea20105ff..1289cd45e 100644 --- a/libfreerdp/codec/CMakeLists.txt +++ b/libfreerdp/codec/CMakeLists.txt @@ -92,17 +92,44 @@ if(WITH_OPENH264) add_definitions(-DWITH_OPENH264) include_directories(${OPENH264_INCLUDE_DIR}) set(FREERDP_OPENH264_LIBS ${OPENH264_LIBRARIES}) - + + if(CMAKE_SIZEOF_VOID_P EQUAL 8) + set(arch64 TRUE) + else() + set(arch64 FALSE) + endif() + - if(WITH_OPENH264_ASM) + if(WITH_OPENH264_ASM AND NOT WITH_OPENH264_SSSE3) set(OPENH264_ASM OPENH264_ASM_o) - set(OBJ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${OPENH264_ASM}.dir/h264.asm.o) - set(SRC ${CMAKE_CURRENT_SOURCE_DIR}/h264.asm) - add_definitions(-DWITH_OPENH264_ASM) add_custom_target(${OPENH264_ASM}) - add_custom_command(TARGET ${OPENH264_ASM} - COMMAND nasm ARGS -f elf64 -o ${OBJ} ${SRC} - COMMENT "building H.264 asm objects ...") + + if(arch64) + set(SRC ${CMAKE_CURRENT_SOURCE_DIR}/h264_x64.asm) + set(OBJ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${OPENH264_ASM}.dir/h264_x64.asm.o) + add_custom_command(TARGET ${OPENH264_ASM} + COMMAND nasm ARGS -f elf64 -o ${OBJ} ${SRC}) + else() + message(FATAL_ERROR "OpenH264 YUV data converting is not implemented in 32 bit assembly yet.") + endif() + + set(FREERDP_OPENH264_LIBS
${OPENH264_LIBRARIES} ${OBJ}) + endif() + + if(WITH_OPENH264_SSSE3) + set(OPENH264_ASM OPENH264_ASM_o) + add_definitions(-DWITH_OPENH264_SSSE3) + add_custom_target(${OPENH264_ASM}) + + if(arch64) + set(SRC ${CMAKE_CURRENT_SOURCE_DIR}/h264_ssse3_x64.asm) + set(OBJ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${OPENH264_ASM}.dir/h264_ssse3_x64.asm.o) + add_custom_command(TARGET ${OPENH264_ASM} + COMMAND nasm ARGS -f elf64 -o ${OBJ} ${SRC}) + else() + message(FATAL_ERROR "OpenH264 YUV data converting with SSSE3 is not implemented in 32 bit assembly yet.") + endif() + set(FREERDP_OPENH264_LIBS ${OPENH264_LIBRARIES} ${OBJ}) endif() endif() @@ -144,7 +171,7 @@ else() install(TARGETS ${MODULE_NAME} DESTINATION ${CMAKE_INSTALL_LIBDIR} EXPORT FreeRDPTargets) endif() -if(WITH_OPENH264_ASM) +if(WITH_OPENH264_ASM OR WITH_OPENH264_SSSE3) add_dependencies(${MODULE_NAME} ${OPENH264_ASM}) endif() diff --git a/libfreerdp/codec/h264.asm.alt b/libfreerdp/codec/h264.asm.alt deleted file mode 100644 index 98ae6f950..000000000 --- a/libfreerdp/codec/h264.asm.alt +++ /dev/null @@ -1,262 +0,0 @@ -;R=(256*Y+403*(V-128)+128)/265 =(256*Y+403*V-51456)/256 -;G=(256*Y-48*(U-128)-120*(V-128)+128)/256 =(256*Y-48*U-120*V+21632)/256 -;B=(256*Y+475*(U-128)+128)/256 =(256*Y+475*U-60672)/256 - -section .data - dbg1: db "DEBUG1",10 - dbg2: db "DEBUG2",10 - dbg3: db "DEBUG3",10 - dbg4: db "DEBUG4",10 - dbg equ $-dbg4 - -section .bss - temp1: resd 1 - temp2: resd 1 - temp3: resd 1 - temp4: resd 1 - -section .text - extern printf - - ;global YUV_to_RGB_asm -YUV_to_RGB_asm: - shl edi,8 - - mov eax,edx - imul eax,403 - mov [temp1],eax - add eax,edi - sub eax,51456 - - jae YUV_to_RGB_asm1 - mov eax,0 - jmp YUV_to_RGB_asm11 - -YUV_to_RGB_asm1: - cmp eax, 0xFFFF - jbe YUV_to_RGB_asm11 - mov eax,0xFF00 - -YUV_to_RGB_asm11: - and eax,0xFF00 - shl eax,8 - - mov ebx,esi - imul ebx,475 - mov [temp2],ebx - add ebx,edi - sub ebx,60672 - - jae YUV_to_RGB_asm2 - mov ebx, 0 - jmp YUV_to_RGB_asm21 - -YUV_to_RGB_asm2: - cmp 
ebx,0xFFFF - jbe YUV_to_RGB_asm21 - mov ebx,0xFF00 - -YUV_to_RGB_asm21: - and ebx,0xFF00 - shr ebx,8 - - imul edx,120 - mov [temp3],edx - sub edi,edx - imul esi,48 - mov [temp4],esi - sub edi,esi - add edi,21632 - - jae YUV_to_RGB_asm3 - mov edi, 0 - jmp YUV_to_RGB_asm31 - -YUV_to_RGB_asm3: - cmp edi,0xFFFF - jbe YUV_to_RGB_asm31 - mov edi, 0xFF00 - -YUV_to_RGB_asm31: - and edi,0xFF00 - - or eax,edi - or eax,ebx - - ret - - - -YUV_to_RGB_2asm: - shl edi,8 - - mov eax,[temp1] - add eax,edi - sub eax,51456 - - jae YUV_to_RGB_2asm1 - mov eax,0 - jmp YUV_to_RGB_2asm11 - -YUV_to_RGB_2asm1: - cmp eax, 0xFFFF - jbe YUV_to_RGB_2asm11 - mov eax,0xFF00 - -YUV_to_RGB_2asm11: - and eax,0xFF00 - shl eax,8 - - mov ebx,[temp2] - add ebx,edi - sub ebx,60672 - - jae YUV_to_RGB_2asm2 - mov ebx, 0 - jmp YUV_to_RGB_2asm21 - -YUV_to_RGB_2asm2: - cmp ebx,0xFFFF - jbe YUV_to_RGB_2asm21 - mov ebx,0xFF00 - -YUV_to_RGB_2asm21: - and ebx,0xFF00 - shr ebx,8 - - sub edi,[temp3] - sub edi,[temp4] - add edi,21632 - - jae YUV_to_RGB_2asm3 - mov edi, 0 - jmp YUV_to_RGB_2asm31 - -YUV_to_RGB_2asm3: - cmp edi,0xFFFF - jbe YUV_to_RGB_2asm31 - mov edi, 0xFF00 - -YUV_to_RGB_2asm31: - and edi,0xFF00 - - or eax,edi - or eax,ebx - - ret - - -;extern int freerdp_image_yuv_to_xrgb_asm(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight); - global freerdp_image_yuv_to_xrgb_asm -freerdp_image_yuv_to_xrgb_asm: - push rbp - mov rbp, rsp - ;cWidth: cx - sub rsp,56 ;pDstData,pSrcData[3],nWidth,nHeight,cHeight - push rbx - - - mov [rbp-8],rdi - - mov rax,[rsi] - mov [rbp-16],rax - mov rax,[rsi+8] - mov [rbp-24],rax - mov rax,[rsi+16] - mov [rbp-32],rax - - mov [rbp-40],rdx - - - shr rcx,1 ;/2 - mov [rbp-48],rcx - - - mov rax,[rbp-48] - mov [rbp-56],rax - -freerdp_image_yuv_to_xrgb_asm_loopH: - mov rcx,[rbp-40] - shr rcx,1 - - -freerdp_image_yuv_to_xrgb_asm_loopW: - mov rax,[rbp-16] - mov edi,[rax] - - mov rax,[rbp-24] - mov esi,[rax] - inc rax - mov [rbp-24],rax - - mov rax,[rbp-32] - mov 
edx,[rax] - inc rax - mov [rbp-32],rax - - call YUV_to_RGB_asm - - mov rbx,[rbp-8] - mov [rbx],eax - - - mov rax,[rbp-16] - mov rbx,[rbp-40] - mov edi,[rax+rbx] - inc rax - mov [rbp-16],rax - - call YUV_to_RGB_2asm - - mov rbx,[rbp-8] - mov rdx,[rbp-40] - mov [rbx+rdx],eax - add rbx,4 - mov [rbp-8],rbx - - - mov rax,[rbp-16] - mov edi,[rax] - - call YUV_to_RGB_2asm - - mov rbx,[rbp-8] - mov [rbx],eax - - - mov rax,[rbp-16] - mov rbx,[rbp-40] - mov edi,[rax+rbx] - inc rax - mov [rbp-16],rax - - call YUV_to_RGB_2asm - - mov rbx,[rbp-8] - mov rdx,[rbp-40] - mov [rbx+rdx],eax - add rbx,4 - mov [rbp-8],rbx - - dec cx - jne freerdp_image_yuv_to_xrgb_asm_loopW - - - mov rax,[rbp-8] - add rax,[rbp-40] - mov [rbp-8],rax - - mov rax,[rbp-16] - add rax,[rbp-40] - mov [rbp-16],rax - - dec qword [rbp-56] - jne freerdp_image_yuv_to_xrgb_asm_loopH - -;END - mov rax,0 -END: - pop rbx - mov rsp,rbp - pop rbp - ret \ No newline at end of file diff --git a/libfreerdp/codec/h264.c b/libfreerdp/codec/h264.c index abc8f9e0b..50d8cb330 100644 --- a/libfreerdp/codec/h264.c +++ b/libfreerdp/codec/h264.c @@ -30,9 +30,14 @@ #include +#ifdef WITH_OPENH264_SSSE3 +extern int check_ssse3(); +extern int freerdp_image_yuv420p_to_xrgb(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int iStride0,int iStride1); +#else #ifdef WITH_OPENH264_ASM extern int freerdp_image_yuv_to_xrgb_asm(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int iStride0,int iStride1); #endif +#endif #define USE_GRAY_SCALE 0 #define USE_UPCONVERT 0 @@ -381,7 +386,7 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz state = (*h264->pDecoder)->DecodeFrame2(h264->pDecoder, NULL, 0, pYUVData, &sBufferInfo); gettimeofday(&T2,NULL); - printf("\tdecoding took: %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec)); + //printf("\tdecoding took: %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec)); pSystemBuffer = 
&sBufferInfo.UsrData.sSystemBuffer; @@ -416,14 +421,15 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz if (h264_prepare_rgb_buffer(h264, pSystemBuffer->iWidth, pSystemBuffer->iHeight) < 0) return -1; +#ifdef WITH_OPENH264_SSSE3 + freerdp_image_yuv420p_to_xrgb(h264->data,pYUVData,h264->width,h264->height,pSystemBuffer->iStride[0],pSystemBuffer->iStride[1]); +#else #ifdef WITH_OPENH264_ASM - gettimeofday(&T1,NULL); freerdp_image_yuv_to_xrgb_asm(h264->data,pYUVData,h264->width,h264->height,pSystemBuffer->iStride[0],pSystemBuffer->iStride[1]); - gettimeofday(&T2,NULL); - printf("\tconverting took: %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec)); #else freerdp_image_copy_yuv420p_to_xrgb(h264->data, h264->scanline, 0, 0, h264->width, h264->height, pYUVData, pSystemBuffer->iStride, 0, 0); +#endif #endif return 1; @@ -448,6 +454,13 @@ static BOOL openh264_init(H264_CONTEXT* h264) SDecodingParam sDecParam; long status; + +#ifdef WITH_OPENH264_SSSE3 + if(check_ssse3()){ + printf("SSSE3 seems to be not supported on this system, try without WITH_OPENH264_SSSE3 ..."); + return FALSE; + } +#endif WelsCreateDecoder(&h264->pDecoder); diff --git a/libfreerdp/codec/h264_ssse3_x64.asm b/libfreerdp/codec/h264_ssse3_x64.asm new file mode 100644 index 000000000..f2198c9c6 --- /dev/null +++ b/libfreerdp/codec/h264_ssse3_x64.asm @@ -0,0 +1,456 @@ +section .text + global check_ssse3 + +check_ssse3: + push rbx + + pushf + pop rax + or rax,1<<21 + push rax + popf + pushf + pop rax + test rax,1<<21 + jz check_ssse3_end + + and rax,~(1<<21) + push rax + popf + + + mov eax,1 + mov ebx,0 + cpuid + test edx,1<<25 ;sse + jz check_ssse3_end + test edx,1<<26 ;sse2 + jz check_ssse3_end + test ecx,1<<0 ;sse3 + jz check_ssse3_end + test ecx,1<<9 ;ssse3 + jz check_ssse3_end + + + pop rbx + mov eax,0 + ret + + +check_ssse3_end: + pop rbx + mov eax,1 + ret + + +;extern int freerdp_image_yuv420p_to_xrgb(unsigned char
*pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int istride0,int istride1) + global freerdp_image_yuv420p_to_xrgb +freerdp_image_yuv420p_to_xrgb: +;rbx, rbp and r12-r15 are callee-saved in the System V AMD64 ABI; all six are modified below + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + +;check whether stack is aligned to 16 byte boundary + mov rax,rsp + and rax,1111B + mov r15,22 + sub r15b,al + sub rsp,r15 + + mov rbp,rsp + + xor r10,r10 + xor r11,r11 + xor r12,r12 + xor r13,r13 + xor r14,r14 + + sub rsp,316 ;pDstData 8,Y 8,U 8,V 8,nWidth 2,nHeight 2,iStride0 2,iStride1 2,last_column 1,last_line 1,G 16,B 16,R 16,add:128 16 + ;sub:128 16,mul:48 16,mul:475 16,mul:403 16,mul:120 16,VaddY 2,VaddUV 2,res 12,cmp:255 16,cmp:0 16,shuffleR 16,andG 16,shuffleB 16,shuffleY 16,shuffleUV 16,scanline 2 + +;last_line: if the last (U,V doubled) line should be skipped, set to 1B +;last_column: if the last 4 columns should be skipped, set to 1B + + mov [rbp-8],rdi + + mov rax,[rsi] + mov [rbp-16],rax + mov rax,[rsi+8] + mov [rbp-24],rax + mov rax,[rsi+16] + mov [rbp-32],rax + + mov [rbp-34],dx + mov r13w,cx + + and r8,0FFFFH + mov [rbp-38],r8w + and r9,0FFFFH + mov [rbp-40],r9w + + + shl r8w,1 + sub r8w,dx + mov r11w,r8w + + mov r10w,dx + shr dx,1 + sub r9w,dx + mov r12w,r9w + + + mov r8w,[rbp-34] + shr r8w,2 + shl r10w,2 + + mov r9w,[rbp-38] + + ;and al,11B + ;jz no_column_rest + + ;inc word [rbp-34] + +;no_column_rest: + ;mov [rbp-41],al + + + + mov r14b,r13b + and r14b,1B + ;jz no_line_rest + + inc r13w + +;no_line_rest: + shr r13w,1 + + + +;init masks + mov eax,00000080H + mov [rbp-106],eax + mov [rbp-102],eax + mov [rbp-98],eax + mov [rbp-94],eax + + mov eax,00800080H + mov [rbp-122],eax + mov [rbp-118],eax + mov [rbp-114],eax + mov [rbp-110],eax + + mov eax,00300030H + mov [rbp-138],eax + mov [rbp-134],eax + mov [rbp-130],eax + mov [rbp-126],eax + + mov eax,01DB01DBH + mov [rbp-154],eax + mov [rbp-150],eax + mov [rbp-146],eax + mov [rbp-142],eax + + mov eax,01930193H + mov [rbp-170],eax + mov [rbp-166],eax + mov [rbp-162],eax + mov [rbp-158],eax + + mov eax,00780078H + mov
[rbp-186],eax + mov [rbp-182],eax + mov [rbp-178],eax + mov [rbp-174],eax + + mov eax,000FF0000H + mov [rbp-218],eax + mov [rbp-214],eax + mov [rbp-210],eax + mov [rbp-206],eax + + mov eax,00000000H + mov [rbp-234],eax + mov [rbp-230],eax + mov [rbp-226],eax + mov [rbp-222],eax + +;shuffle masks + ;00 xx 00 00 00 xx 00 00 00 xx 00 00 00 xx 00 00 + ;00 rr gg bb 00 rr gg bb 00 rr gg bb 00 rr gg bb + mov eax,00FF0000H + mov [rbp-250],eax + mov [rbp-246],eax + mov [rbp-242],eax + mov [rbp-238],eax + + mov eax,80800280H + mov [rbp-266],eax + mov eax,80800680H + mov [rbp-262],eax + mov eax,80800A80H + mov [rbp-258],eax + mov eax,80800E80H + mov [rbp-254],eax + + mov eax,80808002H + mov [rbp-282],eax + mov eax,80808006H + mov [rbp-278],eax + mov eax,8080800AH + mov [rbp-274],eax + mov eax,8080800EH + mov [rbp-270],eax + + ;dd cc bb aa + ;00 00 dd 00 00 00 cc 00 00 00 bb 00 00 00 aa 00 + mov eax,80800080H + mov [rbp-298],eax + mov eax,80800180H + mov [rbp-294],eax + mov eax,80800280H + mov [rbp-290],eax + mov eax,80800380H + mov [rbp-286],eax + + ;dd cc bb aa + ;00 dd 00 dd 00 cc 00 cc 00 bb 00 bb 00 aa 00 aa + mov eax,80008000H + mov [rbp-314],eax + mov eax,80018001H + mov [rbp-310],eax + mov eax,80028002H + mov [rbp-306],eax + mov eax,80038003H + mov [rbp-302],eax + + + mov rsi,[rbp-16] + mov rax,[rbp-24] + mov rbx,[rbp-32] + + +freerdp_image_yuv420p_to_xrgb_hloop: + dec r13w + js freerdp_image_yuv420p_to_xrgb_hloop_end + jnz not_last_line + + shl r14b,1 +not_last_line: + + xor cx,cx +freerdp_image_yuv420p_to_xrgb_wloop: +;main loop +; C = Y; +; D = U - 128; +; E = V - 128; +; +; R = clip(( 256 * C + 403 * E + 128) >> 8); +; G = clip(( 256 * C - 48 * D - 120 * E + 128) >> 8); +; B = clip(( 256 * C + 475 * D + 128) >> 8); + + test cx,1B + jnz load_yuv_data + + + ;prepare U data + movd xmm0,[rax] + movdqa xmm5,[rbp-314] + pshufb xmm0,xmm5 + + add rax,4 + + movdqa xmm3,[rbp-122] + psubsw xmm0,xmm3 + + movdqa xmm2,xmm0 + + movdqa xmm4,xmm0 + movdqa xmm7,[rbp-138] + pmullw
xmm0,xmm7 + pmulhw xmm4,xmm7 + + movdqa xmm7,xmm0 + punpcklwd xmm0,xmm4 ;what an awesome instruction! + punpckhwd xmm7,xmm4 + movdqa xmm4,xmm7 + + movdqa xmm6,[rbp-106] + psubd xmm0,xmm6 + psubd xmm4,xmm6 + + + movdqa xmm1,xmm2 + movdqa xmm7,[rbp-154] + pmullw xmm1,xmm7 + pmulhw xmm2,xmm7 + + movdqa xmm7,xmm1 + punpcklwd xmm1,xmm2 + punpckhwd xmm7,xmm2 + + paddd xmm1,xmm6 + paddd xmm7,xmm6 + + movdqa [rbp-74],xmm7 + + + ;prepare V data + movd xmm2,[rbx] + pshufb xmm2,xmm5 + + add rbx,4 + + psubsw xmm2,xmm3 + + movdqa xmm5,xmm2 + + movdqa xmm3,xmm2 + movdqa xmm7,[rbp-170] + pmullw xmm2,xmm7 + pmulhw xmm3,xmm7 + + movdqa xmm7,xmm2 + punpcklwd xmm2,xmm3 + punpckhwd xmm7,xmm3 + + paddd xmm2,xmm6 + paddd xmm7,xmm6 + + movdqa [rbp-90],xmm7 + + + movdqa xmm3,xmm5 + movdqa xmm7,[rbp-186] + pmullw xmm3,xmm7 + pmulhw xmm5,xmm7 + + movdqa xmm7,xmm3 + punpcklwd xmm3,xmm5 + punpckhwd xmm7,xmm5 + + paddd xmm0,xmm3 + paddd xmm4,xmm7 + + movdqa [rbp-58],xmm4 + + jmp valid_yuv_data + +load_yuv_data: + movdqa xmm1,[rbp-74] + movdqa xmm2,[rbp-90] + movdqa xmm0,[rbp-58] + +valid_yuv_data: + + + ;Y data processing + movd xmm4,[rsi] + pshufb xmm4,[rbp-298] + + movdqa xmm5,xmm4 + movdqa xmm6,xmm4 + + paddd xmm4,xmm2 + psubd xmm5,xmm0 + paddd xmm6,xmm1 + + pslld xmm4,8 + pslld xmm5,8 + pslld xmm6,8 + + movdqa xmm7,[rbp-234] + pmaxsw xmm4,xmm7 ;what an awesome instruction! + pmaxsw xmm5,xmm7 + pmaxsw xmm6,xmm7 + + movdqa xmm7,[rbp-218] + pminsw xmm4,xmm7 + pminsw xmm5,xmm7 + pminsw xmm6,xmm7 + + pand xmm4,[rbp-250] + pshufb xmm5,[rbp-266] + pshufb xmm6,[rbp-282] + + por xmm4,xmm5 + por xmm4,xmm6 + + movdqa [rdi],xmm4 + + + ;Y data processing in second line + test r14b,2 + jnz skip_last_line1 + + movd xmm4,[rsi+r9] + pshufb xmm4,[rbp-298] + + + movdqa xmm5,xmm4 + movdqa xmm6,xmm4 + + paddd xmm4,xmm2 + psubd xmm5,xmm0 + paddd xmm6,xmm1 + + pslld xmm4,8 + pslld xmm5,8 + pslld xmm6,8 + + movdqa xmm7,[rbp-234] + pmaxsw xmm4,xmm7 ;what an awesome instruction!
+ pmaxsw xmm5,xmm7 + pmaxsw xmm6,xmm7 + + movdqa xmm7,[rbp-218] + pminsw xmm4,xmm7 + pminsw xmm5,xmm7 + pminsw xmm6,xmm7 + + pand xmm4,[rbp-250] + pshufb xmm5,[rbp-266] + pshufb xmm6,[rbp-282] + + por xmm4,xmm5 + por xmm4,xmm6 + + movdqa [rdi+r10],xmm4 + +skip_last_line1: + add rdi,16 + add rsi,4 + + inc cx + cmp cx,r8w + jne freerdp_image_yuv420p_to_xrgb_wloop + +freerdp_image_yuv420p_to_xrgb_wloop_end: + add rdi,r10 + + add rsi,r11 + + add rax,r12 + add rbx,r12 + ;mov eax,r12d + ;jmp freerdp_image_yuv420p_to_xrgb_end + + jmp freerdp_image_yuv420p_to_xrgb_hloop + +freerdp_image_yuv420p_to_xrgb_hloop_end: + + mov eax,0 +freerdp_image_yuv420p_to_xrgb_end: + mov rsp,rbp + add rsp,r15 + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + ret \ No newline at end of file diff --git a/libfreerdp/codec/h264.asm b/libfreerdp/codec/h264_x64.asm similarity index 98% rename from libfreerdp/codec/h264.asm rename to libfreerdp/codec/h264_x64.asm index 1473849e0..f0bf1d640 100644 --- a/libfreerdp/codec/h264.asm +++ b/libfreerdp/codec/h264_x64.asm @@ -2,10 +2,6 @@ ;G=(256*Y-48*(U-128)-120*(V-128)+128)/256 =(256*Y-48*U-120*V+21632)/256 ;B=(256*Y+475*(U-128)+128)/256 =(256*Y+475*U-60672)/256 -section .data - debug: db "DEBUG",10 - dblen: equ $-debug - section .text ;global YUV_to_RGB_asm YUV_to_RGB_asm: diff --git a/libfreerdp/codec/test/Makefile.TestOpenH264ASM b/libfreerdp/codec/test/Makefile.TestOpenH264ASM new file mode 100644 index 000000000..8e747a647 --- /dev/null +++ b/libfreerdp/codec/test/Makefile.TestOpenH264ASM @@ -0,0 +1,20 @@ +TestOpenH264ASM: h264_ssse3.asm.o TestOpenH264ASM.c.o h264.c.o + gcc -o TestOpenH264ASM h264_ssse3.asm.o TestOpenH264ASM.c.o h264.c.o + +h264_ssse3.asm.o: ../h264_ssse3_x64.asm + nasm -f elf64 -o h264_ssse3.asm.o ../h264_ssse3_x64.asm + +h264.asm.o: ../h264_x64.asm + nasm -f elf64 -o h264.asm.o ../h264_x64.asm + +TestOpenH264ASM.c.o: TestOpenH264ASM.c + gcc -c -o TestOpenH264ASM.c.o TestOpenH264ASM.c + +h264.c.o: ../h264.c + gcc -c -O3 -o h264.c.o ../h264.c + +clean: + rm -f
TestOpenH264ASM TestOpenH264ASM.c.o h264_ssse3.asm.o h264.c.o h264.asm.o + +old: h264.asm.o TestOpenH264ASM.c.o h264.c.o + gcc -o TestOpenH264ASM h264.asm.o TestOpenH264ASM.c.o h264.c.o diff --git a/libfreerdp/codec/test/TestOpenH264ASM.c b/libfreerdp/codec/test/TestOpenH264ASM.c index 27dd46b08..f1c463f0b 100644 --- a/libfreerdp/codec/test/TestOpenH264ASM.c +++ b/libfreerdp/codec/test/TestOpenH264ASM.c @@ -4,49 +4,70 @@ #include "TestOpenH264ASM.h" +#define WIDTH 1920 +#define HEIGHT 1080 + int main(void){ - int ret,i; + int i,j,k; + int ret; unsigned char *pDstData_c,*pDstData_asm,*pSrcData[3]; int nSrcStep[2]; + if(check_ssse3()){ + fprintf(stderr,"ssse3 not supported!\n"); + return EXIT_FAILURE; + } + struct timeval t1,t2,t3; - pSrcData[0]=malloc(1920*1080*sizeof(char)); - pSrcData[1]=malloc(1920*1080/4*sizeof(char)); - pSrcData[2]=malloc(1920*1080/4*sizeof(char)); - pDstData_asm=malloc(1920*1080*4*sizeof(char)); - pDstData_c=malloc(1920*1080*4*sizeof(char)); + pSrcData[0]=malloc(1984*HEIGHT*sizeof(char)); + pSrcData[1]=malloc(1984*HEIGHT/4*sizeof(char)); + pSrcData[2]=malloc(1984*HEIGHT/4*sizeof(char)); + pDstData_asm=malloc(WIDTH*HEIGHT*4*sizeof(char)); + pDstData_c=malloc(WIDTH*HEIGHT*4*sizeof(char)); - for(i=0;i<1920*1080;i++){ + for(i=0;i