H.264 hack and first port of YUV to XRGB format conversion to assembly

This commit is contained in:
erbth 2014-07-29 21:42:04 +02:00
parent 9501b6c58e
commit 20e76411dc
8 changed files with 631 additions and 4 deletions

2
.gitignore vendored
View File

@ -92,6 +92,7 @@ RelWithDebInfo
# Binaries
*.a
*.o
*.so
*.so.*
*.dylib
@ -105,6 +106,7 @@ client/DirectFB/dfreerdp
server/Sample/sfreerdp-server
server/X11/xfreerdp-server
xcode
libfreerdp/codec/test/TestOpenH264
# Other
*~

View File

@ -23,6 +23,8 @@
#include "xf_gfx.h"
#include <sys/time.h>
int xf_ResetGraphics(RdpgfxClientContext* context, RDPGFX_RESET_GRAPHICS_PDU* resetGraphics)
{
xfContext* xfc = (xfContext*) context->custom;
@ -353,6 +355,16 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_
RDPGFX_H264_METABLOCK* meta;
RDPGFX_H264_BITMAP_STREAM* bs;
static struct timeval TGES1;
struct timeval TGES2,TDEC1,TDEC2;
TGES2.tv_usec=TGES1.tv_usec;
TGES2.tv_sec=TGES1.tv_sec;
gettimeofday(&TGES1,NULL);
printf("time since last xf_SurfaceCommand_H264: %d sec %d usec\n",(int)(TGES1.tv_sec-TGES2.tv_sec),(int)(TGES1.tv_usec-TGES2.tv_usec));
h264 = xfc->h264;
bs = (RDPGFX_H264_BITMAP_STREAM*) cmd->extra;
@ -369,8 +381,13 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_
DstData = surface->data;
gettimeofday(&TDEC1,NULL);
status = h264_decompress(xfc->h264, bs->data, bs->length, &DstData,
PIXEL_FORMAT_XRGB32, surface->scanline, cmd->left, cmd->top, cmd->width, cmd->height);
gettimeofday(&TDEC2,NULL);
printf("decoding took %d sec %d usec\n",(int)(TDEC2.tv_sec-TDEC1.tv_sec),(int)(TDEC2.tv_usec-TDEC1.tv_usec));
free(bs->data);
printf("xf_SurfaceCommand_H264: status: %d\n", status);
@ -440,6 +457,9 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_
if (!xfc->inGfxFrame){
xf_OutputUpdate(xfc);
}
gettimeofday(&TGES2,NULL);
printf("the whole command took %d sec %d usec\n",(int)(TGES2.tv_sec-TGES1.tv_sec),(int)(TGES2.tv_usec-TGES1.tv_usec));
return 1;
}

View File

@ -91,6 +91,19 @@ if(WITH_OPENH264)
add_definitions(-DWITH_OPENH264)
include_directories(${OPENH264_INCLUDE_DIR})
set(FREERDP_OPENH264_LIBS ${OPENH264_LIBRARIES})
# Optional assembly path: when WITH_OPENH264_ASM is set, h264.asm is assembled
# with nasm and the resulting object file is appended to the OpenH264 link
# libraries. C sources see the option through the WITH_OPENH264_ASM define.
if(WITH_OPENH264_ASM)
# Name of the custom target that carries the nasm build step.
set(OPENH264_ASM OPENH264_ASM_o)
# Object file produced by nasm and the assembly source it is built from.
set(OBJ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${OPENH264_ASM}.dir/h264.asm.o)
set(SRC ${CMAKE_CURRENT_SOURCE_DIR}/h264.asm)
add_definitions(-DWITH_OPENH264_ASM)
add_custom_target(${OPENH264_ASM})
# The output format is hard-coded to elf64, i.e. 64-bit ELF objects only.
# NOTE(review): nothing visible here creates the directory that ${OBJ} lives in
# - confirm it exists before nasm runs.
add_custom_command(TARGET ${OPENH264_ASM}
COMMAND nasm ARGS -f elf64 -o ${OBJ} ${SRC}
COMMENT "building H.264 asm objects ...")
# Replaces the list set above so that the assembled object is linked as well.
set(FREERDP_OPENH264_LIBS ${OPENH264_LIBRARIES} ${OBJ})
endif()
endif()
add_complex_library(MODULE ${MODULE_NAME} TYPE "OBJECT"
@ -121,6 +134,10 @@ else()
install(TARGETS ${MODULE_NAME} DESTINATION ${CMAKE_INSTALL_LIBDIR} EXPORT FreeRDPTargets)
endif()
if(WITH_OPENH264_ASM)
add_dependencies(${MODULE_NAME} ${OPENH264_ASM})
endif()
set_property(TARGET ${MODULE_NAME} PROPERTY FOLDER "FreeRDP/libfreerdp")
if(BUILD_TESTING)

236
libfreerdp/codec/h264.asm Normal file
View File

@ -0,0 +1,236 @@
;R=(256*Y+403*(V-128)+128)/256 =(256*Y+403*V-51456)/256
;G=(256*Y-48*(U-128)-120*(V-128)+128)/256 =(256*Y-48*U-120*V+21632)/256
;B=(256*Y+475*(U-128)+128)/256 =(256*Y+475*U-60672)/256
section .data
debug: db "DEBUG",10
dblen: equ $-debug
section .text
;global YUV_to_RGB_asm
; YUV_to_RGB_asm: convert one Y/U/V sample triple into a packed pixel using
; the fixed-point formulas at the top of this file (all terms scaled by 256).
;
; In:       edi = Y, esi = U, edx = V. The caller in this file masks each
;           value with 0xFF before the call, so the upper bits are clear.
; Out:      eax = (R << 16) | (G << 8) | B, bits 24..31 are zero.
; Clobbers: rdi, esi, edx, ebx and the flags. ebx is NOT preserved here; the
;           only caller in this file saves rbx in its own prologue.
; The export above is commented out, so the label is local to this object.
YUV_to_RGB_asm:
; edi = 256*Y, shared by all three channels.
shl rdi,8
; --- red: 256*Y + 403*V - 51456 ---
mov eax,edx
imul eax,403
add eax,edi
sub eax,51456
; The sum before the subtraction is non-negative, so a borrow (CF=1) means the
; result went below zero: clamp to 0.
jae YUV_to_RGB_asm1
mov eax,0
jmp YUV_to_RGB_asm11
YUV_to_RGB_asm1:
; Anything above 0xFFFF would overflow the 8.8 fixed-point value: clamp to 255.
cmp eax, 0xFFFF
jbe YUV_to_RGB_asm11
mov eax,0xFF00
YUV_to_RGB_asm11:
; Keep the integer byte (bits 8..15) and move it to bits 16..23.
and eax,0xFF00
shl eax,8
; --- blue: 256*Y + 475*U - 60672 ---
mov ebx,esi
imul ebx,475
add ebx,edi
sub ebx,60672
; Same borrow-based underflow test as for red.
jae YUV_to_RGB_asm2
mov ebx, 0
jmp YUV_to_RGB_asm21
YUV_to_RGB_asm2:
cmp ebx,0xFFFF
jbe YUV_to_RGB_asm21
mov ebx,0xFF00
YUV_to_RGB_asm21:
; Keep the integer byte and move it down to bits 0..7.
and ebx,0xFF00
shr ebx,8
; --- green: 256*Y - 120*V - 48*U + 21632 (edi, esi and edx are consumed) ---
imul edx,120
sub edi,edx
imul esi,48
sub edi,esi
add edi,21632
; The intermediate value may be negative here, so the carry flag of the add is
; not a usable sign indicator; copy the sign bit into CF explicitly instead.
bt edi,31
jae YUV_to_RGB_asm3
mov edi, 0
jmp YUV_to_RGB_asm31
YUV_to_RGB_asm3:
cmp edi,0xFFFF
jbe YUV_to_RGB_asm31
mov edi, 0xFF00
YUV_to_RGB_asm31:
; Green already sits in bits 8..15; merge the three channels.
and edi,0xFF00
or eax,edi
or eax,ebx
ret
; extern int freerdp_image_yuv_to_xrgb_asm(unsigned char *pDstData, unsigned char **pSrcData,
;                                          int nWidth, int nHeight, int iStride0, int iStride1);
;
; Converts a planar YUV 4:2:0 image to packed 32-bit pixels, one 2x2 pixel
; quad (four luma samples sharing one U and one V sample) per inner iteration.
;
; In (System V AMD64):
;   rdi = pDstData   destination, written with a row pitch of nWidth*4 bytes
;   rsi = pSrcData   array of three plane pointers: [0]=Y, [1]=U, [2]=V
;   edx = nWidth     width in pixels
;   ecx = nHeight    height in pixels
;   r8d = iStride0   bytes between luma rows
;   r9d = iStride1   bytes between chroma rows
; Out:
;   eax = 0 (always)
;
; Only complete 2x2 quads are converted: with an odd width or height the last
; column or row is left untouched. Nothing is written when nWidth or nHeight
; is smaller than 2.
;
; Stack frame:
;   [rbp-8]  current destination pointer      [rbp-48] nHeight/2
;   [rbp-16] current Y pointer                [rbp-56] remaining row pairs
;   [rbp-24] current U pointer                [rbp-64] destination pitch in bytes
;   [rbp-32] current V pointer                [rbp-72] luma advance after a row pair
;   [rbp-40] nWidth
; Registers kept live across the calls to YUV_to_RGB_asm (which leaves them
; alone): rcx = remaining quads in the row, r8 = luma stride,
; r9 = chroma advance after a row, r10 = destination advance after a row pair.
global freerdp_image_yuv_to_xrgb_asm
freerdp_image_yuv_to_xrgb_asm:
	push rbp
	mov rbp, rsp
	sub rsp,72
	push rbx

	; The four int arguments arrive in 32-bit registers and the ABI leaves the
	; upper halves undefined, so widen them before any 64-bit arithmetic.
	movsxd rdx,edx
	movsxd rcx,ecx
	movsxd r8,r8d
	movsxd r9,r9d

	; Without at least one full 2x2 quad the loop counters would start at zero
	; (or negative) and the decrement-and-test loops would run away.
	cmp rdx,2
	jl freerdp_image_yuv_to_xrgb_asm_done
	cmp rcx,2
	jl freerdp_image_yuv_to_xrgb_asm_done

	mov [rbp-8],rdi
	mov rax,[rsi]
	mov [rbp-16],rax
	mov rax,[rsi+8]
	mov [rbp-24],rax
	mov rax,[rsi+16]
	mov [rbp-32],rax
	mov [rbp-40],rdx
	shr rcx,1			; row pairs
	mov [rbp-48],rcx
	mov [rbp-56],rcx
	shl rdx,2			; destination pitch = nWidth * 4
	mov [rbp-64],rdx

	mov rax,[rbp-40]
	shr rax,1			; quads per row
	sub r9,rax			; chroma advance = iStride1 - quads
	shl rax,1			; luma bytes consumed per row = 2 * quads
	mov rdx,r8
	shl rdx,1
	sub rdx,rax
	mov [rbp-72],rdx		; luma advance = 2*iStride0 - 2*quads
	mov r10,[rbp-64]
	shl r10,1
	shl rax,2			; destination bytes written per row = 8 * quads
	sub r10,rax			; destination advance = 2*pitch - 8*quads

freerdp_image_yuv_to_xrgb_asm_loopH:
	mov rcx,[rbp-40]
	shr rcx,1

freerdp_image_yuv_to_xrgb_asm_loopW:
	; Samples are loaded with byte-sized movzx so that nothing past the end of
	; a plane is read.

	; top-left pixel
	mov rax,[rbp-16]
	movzx edi,byte [rax]
	mov rax,[rbp-24]
	movzx esi,byte [rax]
	mov rax,[rbp-32]
	movzx edx,byte [rax]
	call YUV_to_RGB_asm
	mov rbx,[rbp-8]
	mov [rbx],eax

	; bottom-left pixel, then step the luma pointer one column right
	mov rax,[rbp-16]
	movzx edi,byte [rax+r8]
	inc rax
	mov [rbp-16],rax
	mov rax,[rbp-24]
	movzx esi,byte [rax]
	mov rax,[rbp-32]
	movzx edx,byte [rax]
	call YUV_to_RGB_asm
	mov rbx,[rbp-8]
	mov rdx,[rbp-64]
	mov [rbx+rdx],eax
	add rbx,4
	mov [rbp-8],rbx

	; top-right pixel
	mov rax,[rbp-16]
	movzx edi,byte [rax]
	mov rax,[rbp-24]
	movzx esi,byte [rax]
	mov rax,[rbp-32]
	movzx edx,byte [rax]
	call YUV_to_RGB_asm
	mov rbx,[rbp-8]
	mov [rbx],eax

	; bottom-right pixel; the quad is finished, so step Y, U and V
	mov rax,[rbp-16]
	movzx edi,byte [rax+r8]
	inc rax
	mov [rbp-16],rax
	mov rax,[rbp-24]
	movzx esi,byte [rax]
	inc rax
	mov [rbp-24],rax
	mov rax,[rbp-32]
	movzx edx,byte [rax]
	inc rax
	mov [rbp-32],rax
	call YUV_to_RGB_asm
	mov rbx,[rbp-8]
	mov rdx,[rbp-64]
	mov [rbx+rdx],eax
	add rbx,4
	mov [rbp-8],rbx

	; full-width counter: a 16-bit "dec cx" would wrap for very wide images
	dec rcx
	jne freerdp_image_yuv_to_xrgb_asm_loopW

	; Move every pointer to the start of the next row pair.
	mov rax,[rbp-8]
	add rax,r10
	mov [rbp-8],rax
	mov rax,[rbp-16]
	add rax,[rbp-72]
	mov [rbp-16],rax
	mov rax,[rbp-24]
	add rax,r9
	mov [rbp-24],rax
	mov rax,[rbp-32]
	add rax,r9
	mov [rbp-32],rax
	dec qword [rbp-56]
	jne freerdp_image_yuv_to_xrgb_asm_loopH

freerdp_image_yuv_to_xrgb_asm_done:
	xor eax,eax			; return 0
END:
	pop rbx
	mov rsp,rbp
	pop rbp
	ret

View File

@ -0,0 +1,262 @@
;R=(256*Y+403*(V-128)+128)/256 =(256*Y+403*V-51456)/256
;G=(256*Y-48*(U-128)-120*(V-128)+128)/256 =(256*Y-48*U-120*V+21632)/256
;B=(256*Y+475*(U-128)+128)/256 =(256*Y+475*U-60672)/256
section .data
dbg1: db "DEBUG1",10
dbg2: db "DEBUG2",10
dbg3: db "DEBUG3",10
dbg4: db "DEBUG4",10
dbg equ $-dbg4
section .bss
temp1: resd 1
temp2: resd 1
temp3: resd 1
temp4: resd 1
section .text
extern printf
;global YUV_to_RGB_asm
; YUV_to_RGB_asm: convert one Y/U/V sample triple into a packed pixel using
; the fixed-point formulas at the top of this file (all terms scaled by 256).
; The four chroma products are also stored in temp1..temp4 so that
; YUV_to_RGB_2asm can convert further pixels that share the same U and V.
;
; In:       edi = Y, esi = U, edx = V. Only the low byte of each is used.
; Out:      eax = (R << 16) | (G << 8) | B, bits 24..31 are zero.
;           temp1 = 403*V, temp2 = 475*U, temp3 = 120*V, temp4 = 48*U
; Clobbers: edi, esi, edx, ebx and the flags (ebx is not preserved).
; The temp variables are shared static storage, so this routine is neither
; re-entrant nor safe to run from several threads at once.
YUV_to_RGB_asm:
	; Callers in this file load samples with 32-bit moves, so drop whatever
	; neighbouring bytes came along with the sample.
	and edi,0xFF
	and esi,0xFF
	and edx,0xFF
	shl edi,8			; edi = 256*Y, shared by all channels

	; --- red: 256*Y + 403*V - 51456 ---
	mov eax,edx
	imul eax,403
	mov [rel temp1],eax		; RIP-relative so the object also links into PIC code
	add eax,edi
	sub eax,51456
	; the sum was non-negative, so a borrow means the result is below zero
	jae YUV_to_RGB_asm1
	mov eax,0
	jmp YUV_to_RGB_asm11
YUV_to_RGB_asm1:
	cmp eax, 0xFFFF
	jbe YUV_to_RGB_asm11
	mov eax,0xFF00
YUV_to_RGB_asm11:
	and eax,0xFF00
	shl eax,8			; red -> bits 16..23

	; --- blue: 256*Y + 475*U - 60672 ---
	mov ebx,esi
	imul ebx,475
	mov [rel temp2],ebx
	add ebx,edi
	sub ebx,60672
	jae YUV_to_RGB_asm2
	mov ebx, 0
	jmp YUV_to_RGB_asm21
YUV_to_RGB_asm2:
	cmp ebx,0xFFFF
	jbe YUV_to_RGB_asm21
	mov ebx,0xFF00
YUV_to_RGB_asm21:
	and ebx,0xFF00
	shr ebx,8			; blue -> bits 0..7

	; --- green: 256*Y - 120*V - 48*U + 21632 ---
	imul edx,120
	mov [rel temp3],edx
	sub edi,edx
	imul esi,48
	mov [rel temp4],esi
	sub edi,esi
	add edi,21632
	; edi may be negative before the add, so the carry of the add says nothing
	; about the sign of the result; test the sign bit itself.
	bt edi,31
	jae YUV_to_RGB_asm3
	mov edi, 0
	jmp YUV_to_RGB_asm31
YUV_to_RGB_asm3:
	cmp edi,0xFFFF
	jbe YUV_to_RGB_asm31
	mov edi, 0xFF00
YUV_to_RGB_asm31:
	and edi,0xFF00			; green already sits in bits 8..15
	or eax,edi
	or eax,ebx
	ret
; YUV_to_RGB_2asm: convert a further pixel that shares U and V with the most
; recent YUV_to_RGB_asm call. The chroma products are taken from temp1..temp4
; instead of being multiplied again.
;
; In:       edi = Y (only the low byte is used)
;           temp1 = 403*V, temp2 = 475*U, temp3 = 120*V, temp4 = 48*U
; Out:      eax = (R << 16) | (G << 8) | B, bits 24..31 are zero.
; Clobbers: edi, ebx and the flags (ebx is not preserved).
YUV_to_RGB_2asm:
	; Callers in this file load the sample with a 32-bit move, so drop the
	; neighbouring bytes that came along with it.
	and edi,0xFF
	shl edi,8			; edi = 256*Y

	; --- red: 256*Y + 403*V - 51456 ---
	mov eax,[rel temp1]		; RIP-relative so the object also links into PIC code
	add eax,edi
	sub eax,51456
	; the sum was non-negative, so a borrow means the result is below zero
	jae YUV_to_RGB_2asm1
	mov eax,0
	jmp YUV_to_RGB_2asm11
YUV_to_RGB_2asm1:
	cmp eax, 0xFFFF
	jbe YUV_to_RGB_2asm11
	mov eax,0xFF00
YUV_to_RGB_2asm11:
	and eax,0xFF00
	shl eax,8			; red -> bits 16..23

	; --- blue: 256*Y + 475*U - 60672 ---
	mov ebx,[rel temp2]
	add ebx,edi
	sub ebx,60672
	jae YUV_to_RGB_2asm2
	mov ebx, 0
	jmp YUV_to_RGB_2asm21
YUV_to_RGB_2asm2:
	cmp ebx,0xFFFF
	jbe YUV_to_RGB_2asm21
	mov ebx,0xFF00
YUV_to_RGB_2asm21:
	and ebx,0xFF00
	shr ebx,8			; blue -> bits 0..7

	; --- green: 256*Y - 120*V - 48*U + 21632 ---
	sub edi,[rel temp3]
	sub edi,[rel temp4]
	add edi,21632
	; edi may be negative before the add, so the carry of the add says nothing
	; about the sign of the result; test the sign bit itself.
	bt edi,31
	jae YUV_to_RGB_2asm3
	mov edi, 0
	jmp YUV_to_RGB_2asm31
YUV_to_RGB_2asm3:
	cmp edi,0xFFFF
	jbe YUV_to_RGB_2asm31
	mov edi, 0xFF00
YUV_to_RGB_2asm31:
	and edi,0xFF00			; green already sits in bits 8..15
	or eax,edi
	or eax,ebx
	ret
;extern int freerdp_image_yuv_to_xrgb_asm(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight);
;
; Converts a planar YUV 4:2:0 image to packed 32-bit pixels, one 2x2 pixel
; quad per inner iteration. The first pixel of a quad goes through
; YUV_to_RGB_asm, which caches the chroma products; the other three reuse
; them through YUV_to_RGB_2asm.
;
; In (System V AMD64):
;   rdi = pDstData   destination, written with a row pitch of nWidth*4 bytes
;   rsi = pSrcData   array of three plane pointers: [0]=Y, [1]=U, [2]=V
;   edx = nWidth     width in pixels
;   ecx = nHeight    height in pixels
; Out:
;   eax = 0 (always)
;
; This version takes no stride arguments: luma rows are read nWidth bytes
; apart and the chroma pointers simply run on from one row to the next, so
; all three planes must be tightly packed. Only complete 2x2 quads are
; converted and the row arithmetic assumes an even nWidth. Nothing is written
; when nWidth or nHeight is smaller than 2.
;
; Stack frame:
;   [rbp-8]  current destination pointer      [rbp-40] nWidth
;   [rbp-16] current Y pointer                [rbp-48] nHeight/2
;   [rbp-24] current U pointer                [rbp-56] remaining row pairs
;   [rbp-32] current V pointer                [rbp-64] destination pitch in bytes
; rcx holds the remaining quads of the current row; neither conversion helper
; touches it.
global freerdp_image_yuv_to_xrgb_asm
freerdp_image_yuv_to_xrgb_asm:
	push rbp
	mov rbp, rsp
	sub rsp,72
	push rbx

	; int arguments arrive in 32-bit registers with undefined upper halves.
	movsxd rdx,edx
	movsxd rcx,ecx

	; Without at least one full quad the decrement-and-test loops would start
	; from zero and run away.
	cmp rdx,2
	jl freerdp_image_yuv_to_xrgb_asm_done
	cmp rcx,2
	jl freerdp_image_yuv_to_xrgb_asm_done

	mov [rbp-8],rdi
	mov rax,[rsi]
	mov [rbp-16],rax
	mov rax,[rsi+8]
	mov [rbp-24],rax
	mov rax,[rsi+16]
	mov [rbp-32],rax
	mov [rbp-40],rdx
	shr rcx,1			; row pairs
	mov [rbp-48],rcx
	mov [rbp-56],rcx
	; Destination pixels are 4 bytes wide, so its row pitch is nWidth*4 and
	; not nWidth.
	shl rdx,2
	mov [rbp-64],rdx

freerdp_image_yuv_to_xrgb_asm_loopH:
	mov rcx,[rbp-40]
	shr rcx,1

freerdp_image_yuv_to_xrgb_asm_loopW:
	; Samples are loaded with byte-sized movzx so that only the sample itself
	; reaches the helpers and nothing past the end of a plane is read.

	; top-left pixel: full conversion, steps U and V to the next quad
	mov rax,[rbp-16]
	movzx edi,byte [rax]
	mov rax,[rbp-24]
	movzx esi,byte [rax]
	inc rax
	mov [rbp-24],rax
	mov rax,[rbp-32]
	movzx edx,byte [rax]
	inc rax
	mov [rbp-32],rax
	call YUV_to_RGB_asm
	mov rbx,[rbp-8]
	mov [rbx],eax

	; bottom-left pixel, then step the luma pointer one column right
	mov rax,[rbp-16]
	mov rbx,[rbp-40]
	movzx edi,byte [rax+rbx]
	inc rax
	mov [rbp-16],rax
	call YUV_to_RGB_2asm
	mov rbx,[rbp-8]
	mov rdx,[rbp-64]
	mov [rbx+rdx],eax
	add rbx,4
	mov [rbp-8],rbx

	; top-right pixel
	mov rax,[rbp-16]
	movzx edi,byte [rax]
	call YUV_to_RGB_2asm
	mov rbx,[rbp-8]
	mov [rbx],eax

	; bottom-right pixel, then step the luma pointer again
	mov rax,[rbp-16]
	mov rbx,[rbp-40]
	movzx edi,byte [rax+rbx]
	inc rax
	mov [rbp-16],rax
	call YUV_to_RGB_2asm
	mov rbx,[rbp-8]
	mov rdx,[rbp-64]
	mov [rbx+rdx],eax
	add rbx,4
	mov [rbp-8],rbx

	; full-width counter: a 16-bit "dec cx" would wrap for very wide images
	dec rcx
	jne freerdp_image_yuv_to_xrgb_asm_loopW

	; Skip the second row of the pair in the destination and in the luma plane.
	mov rax,[rbp-8]
	add rax,[rbp-64]
	mov [rbp-8],rax
	mov rax,[rbp-16]
	add rax,[rbp-40]
	mov [rbp-16],rax
	dec qword [rbp-56]
	jne freerdp_image_yuv_to_xrgb_asm_loopH

freerdp_image_yuv_to_xrgb_asm_done:
	xor eax,eax			; return 0
END:
	pop rbx
	mov rsp,rbp
	pop rbp
	ret

View File

@ -32,6 +32,12 @@
#define USE_UPCONVERT 0
#define USE_TRACE 0
#include <sys/time.h>
#ifdef WITH_OPENH264_ASM
extern int freerdp_image_yuv_to_xrgb_asm(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int iStride0,int iStride1);
#endif
static BYTE clip(int x)
{
if (x < 0) return 0;
@ -39,7 +45,7 @@ static BYTE clip(int x)
return (BYTE)x;
}
static UINT32 YUV_to_RGB(BYTE Y, BYTE U, BYTE V)
UINT32 YUV_to_RGB(BYTE Y, BYTE U, BYTE V)
{
BYTE R, G, B;
@ -297,11 +303,15 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize,
BYTE* pV;
int Y, U, V;
int i, j;
struct timeval T1,T2,T3;
gettimeofday(&T2,NULL);
if (!h264 || !h264->pDecoder)
return -1;
pSrcData = h264_strip_nal_unit_au_delimiter(pSrcData, &SrcSize);
//pSrcData = h264_strip_nal_unit_au_delimiter(pSrcData, &SrcSize);
#if 0
printf("h264_decompress: pSrcData=%p, SrcSize=%u, pDstData=%p, nDstStep=%d, nXDst=%d, nYDst=%d, nWidth=%d, nHeight=%d)\n",
@ -349,6 +359,10 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize,
ZeroMemory(&sBufferInfo, sizeof(sBufferInfo));
gettimeofday(&T1,NULL);
printf("\ttime before first DecodeFrame2: %d sec %d usec\n",(int)(T1.tv_sec-T2.tv_sec),(int)(T1.tv_usec-T2.tv_usec));
gettimeofday(&T1,NULL);
state = (*h264->pDecoder)->DecodeFrame2(
h264->pDecoder,
pSrcData,
@ -356,13 +370,17 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize,
pYUVData,
&sBufferInfo);
state = (*h264->pDecoder)->DecodeFrame2(
gettimeofday(&T2,NULL);
state = (*h264->pDecoder)->DecodeFrame2(
h264->pDecoder,
NULL,
0,
pYUVData,
&sBufferInfo);
gettimeofday(&T3,NULL);
// printf("\tfirst DecodeFrame2 took %d sec %d usec, second %d sec %d usec\n",(int)(T2.tv_sec-T1.tv_sec),(int)(T2.tv_usec-T1.tv_usec),
// (int)(T3.tv_sec-T2.tv_sec),(int)(T3.tv_usec-T2.tv_usec));
pSystemBuffer = &sBufferInfo.UsrData.sSystemBuffer;
@ -420,8 +438,16 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize,
if (h264_prepare_rgb_buffer(h264, pSystemBuffer->iWidth, pSystemBuffer->iHeight) < 0)
return -1;
gettimeofday(&T3,NULL);
#ifdef WITH_OPENH264_ASM
freerdp_image_yuv_to_xrgb_asm(h264->data,pYUVData,h264->width,h264->height,pSystemBuffer->iStride[0],pSystemBuffer->iStride[1]);
#else
freerdp_image_copy_yuv420p_to_xrgb(h264->data, h264->scanline, 0, 0,
h264->width, h264->height, pYUVData, pSystemBuffer->iStride, 0, 0);
#endif
gettimeofday(&T1,NULL);//takes about 35ms!!
printf("\tfreerdp_image_copy_yuv420p_to_xrgb took %d sec %d usec\n",(int)(T1.tv_sec-T3.tv_sec),(int)(T1.tv_usec-T3.tv_usec));
if (g_H264DumpFrames)
{

View File

@ -0,0 +1,57 @@
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include "TestOpenH264ASM.h"
/* Geometry used by the comparison run. */
enum
{
	BUFFER_WIDTH = 1920,	/* the buffers are sized for a 1920x1080 frame */
	BUFFER_HEIGHT = 1080,
	IMAGE_WIDTH = 1024,	/* only a 1024x768 image is actually converted */
	IMAGE_HEIGHT = 768,
	LUMA_STRIDE = 1088,
	CHROMA_STRIDE = 544
};

/* Returns end - start, normalised so that 0 <= tv_usec < 1000000. */
static struct timeval time_elapsed(const struct timeval *start, const struct timeval *end)
{
	struct timeval diff;

	diff.tv_sec = end->tv_sec - start->tv_sec;
	diff.tv_usec = end->tv_usec - start->tv_usec;

	if (diff.tv_usec < 0)
	{
		diff.tv_sec--;
		diff.tv_usec += 1000000;
	}

	return diff;
}

/**
 * Converts the same synthetic YUV 4:2:0 image with the assembly routine and
 * with the C routine, prints how long each took and reports the first byte
 * at which the two results differ.
 *
 * @return 0 once the comparison has run (a mismatch is reported on stdout
 *         only), EXIT_FAILURE if a buffer could not be allocated.
 */
int main(void){
	const size_t bufferPixels = (size_t) BUFFER_WIDTH * BUFFER_HEIGHT;
	const size_t imageBytes = (size_t) IMAGE_WIDTH * IMAGE_HEIGHT * 4;
	int status = EXIT_FAILURE;
	int ret;
	size_t i;
	unsigned char *pDstData_c = NULL;
	unsigned char *pDstData_asm = NULL;
	unsigned char *pSrcData[3] = { NULL, NULL, NULL };
	int nSrcStep[2];
	struct timeval t1, t2, t3;
	struct timeval asmTime, cTime;

	pSrcData[0] = malloc(bufferPixels);
	pSrcData[1] = malloc(bufferPixels / 4);
	pSrcData[2] = malloc(bufferPixels / 4);
	pDstData_asm = malloc(bufferPixels * 4);
	pDstData_c = malloc(bufferPixels * 4);

	if (!pSrcData[0] || !pSrcData[1] || !pSrcData[2] || !pDstData_asm || !pDstData_c)
	{
		fprintf(stderr, "out of memory\n");
		goto out;
	}

	/* Deterministic test pattern; every chroma byte ends up holding the
	 * value derived from the last of the four luma indices that map to it. */
	for (i = 0; i < bufferPixels; i++)
	{
		pSrcData[0][i] = (unsigned char) (i % 255);
		pSrcData[1][i / 4] = pSrcData[0][i];
		pSrcData[2][i / 4] = (unsigned char) (255 - pSrcData[0][i]);
	}

	printf("%X\n", pSrcData[0][0]);

	nSrcStep[0] = LUMA_STRIDE;
	nSrcStep[1] = CHROMA_STRIDE;

	gettimeofday(&t1, NULL);
	ret = freerdp_image_yuv_to_xrgb_asm(pDstData_asm, pSrcData, IMAGE_WIDTH, IMAGE_HEIGHT,
			LUMA_STRIDE, CHROMA_STRIDE);
	gettimeofday(&t2, NULL);
	freerdp_image_copy_yuv420p_to_xrgb(pDstData_c, IMAGE_WIDTH * 4, 0, 0,
			IMAGE_WIDTH, IMAGE_HEIGHT, pSrcData, nSrcStep, 0, 0);
	gettimeofday(&t3, NULL);

	asmTime = time_elapsed(&t1, &t2);
	cTime = time_elapsed(&t2, &t3);

	/* The conversion specifiers now match the int arguments (they were %u). */
	printf("in asm (%d) it took %d sec %d usec,\nin c %d sec %d usec.\n", ret,
			(int) asmTime.tv_sec, (int) asmTime.tv_usec,
			(int) cTime.tv_sec, (int) cTime.tv_usec);

	/* %X expects an unsigned int; the bytes used to be cast to pointers. */
	printf("in asm the result was %X %X %X\n in c %X %X %X.\n",
			(unsigned int) pDstData_asm[92], (unsigned int) pDstData_asm[93],
			(unsigned int) pDstData_asm[94],
			(unsigned int) pDstData_c[92], (unsigned int) pDstData_c[93],
			(unsigned int) pDstData_c[94]);

	/* Compare only what the two conversions wrote: with a destination pitch
	 * of IMAGE_WIDTH * 4 that is the first imageBytes bytes. Everything
	 * beyond is uninitialised heap memory. */
	for (i = 0; i < imageBytes; i++)
	{
		if (pDstData_c[i] != pDstData_asm[i])
		{
			printf("MISSMATCH at %d: %2X != %2X\n", (int) i,
					(unsigned int) pDstData_asm[i], (unsigned int) pDstData_c[i]);
			break;
		}
	}

	status = 0;

out:
	free(pSrcData[0]);
	free(pSrcData[1]);
	free(pSrcData[2]);
	free(pDstData_c);
	free(pDstData_asm);
	return status;
}

View File

@ -0,0 +1,7 @@
#ifndef FREERDP_TEST_OPENH264_ASM_H
#define FREERDP_TEST_OPENH264_ASM_H

/*
 * Prototypes for the YUV -> XRGB conversion routines that the OpenH264
 * assembly test compares against each other.
 */

/* Single-pixel conversion helpers. The int result carries the packed pixel.
 * NOTE(review): the assembly sources keep the "global" directive for these
 * labels commented out - confirm they are exported before calling them
 * from C. */
extern int YUV_to_RGB_asm(unsigned char Y,unsigned char U,unsigned char V);
extern int YUV_to_RGB_2asm(unsigned char Y);
extern int YUV_to_RGB(unsigned char Y,unsigned char U,unsigned char V);

/* Assembly conversion of a planar YUV 4:2:0 image (pSrcData = Y, U and V
 * plane pointers) into pDstData; istride0 / istride1 are the luma and chroma
 * row strides in bytes. */
extern int freerdp_image_yuv_to_xrgb_asm(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int istride0,int istride1);

/* C reference conversion that the assembly result is compared against. */
int freerdp_image_copy_yuv420p_to_xrgb(unsigned char* pDstData, int nDstStep, int nXDst, int nYDst,
int nWidth, int nHeight, unsigned char* pSrcData[3], int nSrcStep[2], int nXSrc, int nYSrc);

#endif /* FREERDP_TEST_OPENH264_ASM_H */