OpenH264 YUV data conversion with Intel SSSE3 in assembly
This commit is contained in:
parent
a8945306a1
commit
095a7aba99
2
.gitignore
vendored
2
.gitignore
vendored
@ -106,7 +106,7 @@ client/DirectFB/dfreerdp
|
||||
server/Sample/sfreerdp-server
|
||||
server/X11/xfreerdp-server
|
||||
xcode
|
||||
libfreerdp/codec/test/TestOpenH264
|
||||
libfreerdp/codec/test/TestOpenH264ASM
|
||||
|
||||
# Other
|
||||
*~
|
||||
|
@ -478,7 +478,6 @@ int dvcman_receive_channel_data_first(IWTSVirtualChannelManager* pChannelMgr, UI
|
||||
Stream_Release(channel->dvc_data);
|
||||
|
||||
channel->dvc_data = StreamPool_Take(channel->dvcman->pool, length);
|
||||
//Stream_AddRef(channel->dvc_data);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -488,6 +487,7 @@ int dvcman_receive_channel_data(IWTSVirtualChannelManager* pChannelMgr, UINT32 C
|
||||
int status = 0;
|
||||
DVCMAN_CHANNEL* channel;
|
||||
UINT32 dataSize = Stream_GetRemainingLength(data);
|
||||
wStream* s;
|
||||
|
||||
channel = (DVCMAN_CHANNEL*) dvcman_find_channel_by_id(pChannelMgr, ChannelId);
|
||||
|
||||
@ -500,7 +500,6 @@ int dvcman_receive_channel_data(IWTSVirtualChannelManager* pChannelMgr, UINT32 C
|
||||
if (channel->dvc_data)
|
||||
{
|
||||
/* Fragmented data */
|
||||
//if (Stream_GetPosition(channel->dvc_data) + dataSize > (UINT32) Stream_Capacity(channel->dvc_data))
|
||||
if (Stream_GetPosition(channel->dvc_data) + dataSize > (UINT32) Stream_Length(channel->dvc_data))
|
||||
{
|
||||
DEBUG_WARN("data exceeding declared length!");
|
||||
@ -511,14 +510,15 @@ int dvcman_receive_channel_data(IWTSVirtualChannelManager* pChannelMgr, UINT32 C
|
||||
|
||||
Stream_Write(channel->dvc_data, Stream_Pointer(data), dataSize);
|
||||
|
||||
//if (((size_t) Stream_GetPosition(channel->dvc_data)) >= Stream_Capacity(channel->dvc_data)-1)
|
||||
if (((size_t) Stream_GetPosition(channel->dvc_data)) >= Stream_Length(channel->dvc_data)-1)
|
||||
{
|
||||
Stream_SealLength(channel->dvc_data);
|
||||
Stream_SetPosition(channel->dvc_data, 0);
|
||||
status = channel->channel_callback->OnDataReceived(channel->channel_callback, channel->dvc_data);
|
||||
Stream_Release(channel->dvc_data);
|
||||
s=channel->dvc_data;
|
||||
channel->dvc_data = NULL;
|
||||
|
||||
status = channel->channel_callback->OnDataReceived(channel->channel_callback, s);
|
||||
Stream_Release(s);
|
||||
}
|
||||
}
|
||||
else
|
||||
|
@ -139,7 +139,7 @@ int xf_OutputUpdate(xfContext* xfc)
|
||||
int xf_OutputExpose(xfContext* xfc, int x, int y, int width, int height)
|
||||
{
|
||||
/** *********************************
|
||||
* to be improved
|
||||
* to be improved?
|
||||
* *********************************/
|
||||
RECTANGLE_16 invalidRect;
|
||||
|
||||
@ -366,15 +366,6 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_
|
||||
RDPGFX_H264_METABLOCK* meta;
|
||||
RDPGFX_H264_BITMAP_STREAM* bs;
|
||||
|
||||
static struct timeval TGES1;
|
||||
struct timeval TGES2,TDEC1,TDEC2;
|
||||
|
||||
TGES2.tv_usec=TGES1.tv_usec;
|
||||
TGES2.tv_sec=TGES1.tv_sec;
|
||||
|
||||
gettimeofday(&TGES1,NULL);
|
||||
printf("time since last xf_SurfaceCommand_H264: %d sec %d usec\n",(int)(TGES1.tv_sec-TGES2.tv_sec),(int)(TGES1.tv_usec-TGES2.tv_usec));
|
||||
|
||||
|
||||
h264 = xfc->h264;
|
||||
|
||||
@ -392,13 +383,14 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_
|
||||
|
||||
DstData = surface->data;
|
||||
|
||||
gettimeofday(&TDEC1,NULL);
|
||||
status = h264_decompress(xfc->h264, bs->data, bs->length, &DstData,
|
||||
PIXEL_FORMAT_XRGB32, surface->scanline, cmd->left, cmd->top, cmd->width, cmd->height);
|
||||
gettimeofday(&TDEC2,NULL);
|
||||
//printf("decoding took %d sec %d usec\n",(int)(TDEC2.tv_sec-TDEC1.tv_sec),(int)(TDEC2.tv_usec-TDEC1.tv_usec));
|
||||
|
||||
//printf("xf_SurfaceCommand_H264: status: %d\n", status);
|
||||
if (status < 0)
|
||||
{
|
||||
printf("h264_decompress failure: %d\n",status);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (status < 0)
|
||||
return -1;
|
||||
@ -427,9 +419,6 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_
|
||||
|
||||
updateRects = (RECTANGLE_16*) region16_rects(&updateRegion, &nbUpdateRects);
|
||||
|
||||
#if 0
|
||||
printf("numRegionRects: %d nbUpdateRects: %d\n", meta->numRegionRects, nbUpdateRects);
|
||||
#endif
|
||||
|
||||
for (j = 0; j < nbUpdateRects; j++)
|
||||
{
|
||||
@ -439,13 +428,6 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_
|
||||
nHeight = updateRects[j].bottom - updateRects[j].top;
|
||||
|
||||
/* update region from decoded H264 buffer */
|
||||
|
||||
#if 0
|
||||
printf("nXDst: %d nYDst: %d nWidth: %d nHeight: %d decoded: width: %d height: %d cmd: left: %d top: %d right: %d bottom: %d\n",
|
||||
nXDst, nYDst, nWidth, nHeight, h264->width, h264->height,
|
||||
cmd->left, cmd->top, cmd->right, cmd->bottom);
|
||||
#endif
|
||||
|
||||
freerdp_image_copy(surface->data, PIXEL_FORMAT_XRGB32, surface->scanline,
|
||||
nXDst, nYDst, nWidth, nHeight,
|
||||
h264->data, PIXEL_FORMAT_XRGB32, h264->scanline, nXDst, nYDst);
|
||||
@ -457,19 +439,9 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_
|
||||
region16_uninit(&updateRegion);
|
||||
region16_uninit(&clippingRects);
|
||||
|
||||
#if 0
|
||||
/* fill with red for now to distinguish from the rest */
|
||||
|
||||
freerdp_image_fill(surface->data, PIXEL_FORMAT_XRGB32, surface->scanline,
|
||||
cmd->left, cmd->top, cmd->width, cmd->height, 0xFF0000);
|
||||
#endif
|
||||
|
||||
if (!xfc->inGfxFrame){
|
||||
if (!xfc->inGfxFrame)
|
||||
xf_OutputUpdate(xfc);
|
||||
}
|
||||
|
||||
gettimeofday(&TGES2,NULL);
|
||||
printf("the whole command took %d sec %d usec\n",(int)(TGES2.tv_sec-TGES1.tv_sec),(int)(TGES2.tv_usec-TGES1.tv_usec));
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
@ -93,16 +93,43 @@ if(WITH_OPENH264)
|
||||
include_directories(${OPENH264_INCLUDE_DIR})
|
||||
set(FREERDP_OPENH264_LIBS ${OPENH264_LIBRARIES})
|
||||
|
||||
if(CMAKE_SIZEOF_VOID_P EQUAL 8)
|
||||
set(arch64 TRUE)
|
||||
else()
|
||||
set(arch64 FALSE)
|
||||
endif()
|
||||
|
||||
if(WITH_OPENH264_ASM)
|
||||
set(OPENH264_ASM OPENH264_ASM_o)
|
||||
set(OBJ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${OPENH264_ASM}.dir/h264.asm.o)
|
||||
set(SRC ${CMAKE_CURRENT_SOURCE_DIR}/h264.asm)
|
||||
|
||||
add_definitions(-DWITH_OPENH264_ASM)
|
||||
add_custom_target(${OPENH264_ASM})
|
||||
|
||||
if(arch64)
|
||||
set(SRC ${CMAKE_CURRENT_SOURCE_DIR}/h264_x64.asm)
|
||||
set(OBJ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${OPENH264_ASM}.dir/h264_x64.asm.o)
|
||||
add_custom_command(TARGET ${OPENH264_ASM}
|
||||
COMMAND nasm ARGS -f elf64 -o ${OBJ} ${SRC}
|
||||
COMMENT "building H.264 asm objects ...")
|
||||
COMMAND nasm ARGS -f elf64 -o ${OBJ} ${SRC})
|
||||
else()
|
||||
message(FATAL_ERROR "OpenH264 YUV data converting is not implemented in 32 bit assembly yet.")
|
||||
endif()
|
||||
|
||||
set(FREERDP_OPENH264_LIBS ${OPENH264_LIBRARIES} ${OBJ})
|
||||
endif()
|
||||
|
||||
if(WITH_OPENH264_SSSE3)
|
||||
set(OPENH264_ASM OPENH264_ASM_o)
|
||||
add_definitions(-DWITH_OPENH264_SSSE3)
|
||||
add_custom_target(${OPENH264_ASM})
|
||||
|
||||
if(arch64)
|
||||
set(SRC ${CMAKE_CURRENT_SOURCE_DIR}/h264_ssse3_x64.asm)
|
||||
set(OBJ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${OPENH264_ASM}.dir/h264_ssse3_x64.asm.o)
|
||||
add_custom_command(TARGET ${OPENH264_ASM}
|
||||
COMMAND nasm ARGS -f elf64 -o ${OBJ} ${SRC})
|
||||
else()
|
||||
message(FATAL_ERROR "OpenH264 YUV data converting with SSSE3 is not implemented in 32 bit assembly yet.")
|
||||
endif()
|
||||
|
||||
set(FREERDP_OPENH264_LIBS ${OPENH264_LIBRARIES} ${OBJ})
|
||||
endif()
|
||||
endif()
|
||||
@ -144,7 +171,7 @@ else()
|
||||
install(TARGETS ${MODULE_NAME} DESTINATION ${CMAKE_INSTALL_LIBDIR} EXPORT FreeRDPTargets)
|
||||
endif()
|
||||
|
||||
if(WITH_OPENH264_ASM)
|
||||
if(WITH_OPENH264_ASM OR WITH_OPENH264_SSSE3)
|
||||
add_dependencies(${MODULE_NAME} ${OPENH264_ASM})
|
||||
endif()
|
||||
|
||||
|
@ -1,262 +0,0 @@
|
||||
;R=(256*Y+403*(V-128)+128)/256 =(256*Y+403*V-51456)/256
|
||||
;G=(256*Y-48*(U-128)-120*(V-128)+128)/256 =(256*Y-48*U-120*V+21632)/256
|
||||
;B=(256*Y+475*(U-128)+128)/256 =(256*Y+475*U-60672)/256
|
||||
|
||||
section .data
|
||||
dbg1: db "DEBUG1",10
|
||||
dbg2: db "DEBUG2",10
|
||||
dbg3: db "DEBUG3",10
|
||||
dbg4: db "DEBUG4",10
|
||||
dbg equ $-dbg4
|
||||
|
||||
section .bss
|
||||
temp1: resd 1
|
||||
temp2: resd 1
|
||||
temp3: resd 1
|
||||
temp4: resd 1
|
||||
|
||||
section .text
|
||||
extern printf
|
||||
|
||||
;global YUV_to_RGB_asm
|
||||
YUV_to_RGB_asm:
|
||||
shl edi,8
|
||||
|
||||
mov eax,edx
|
||||
imul eax,403
|
||||
mov [temp1],eax
|
||||
add eax,edi
|
||||
sub eax,51456
|
||||
|
||||
jae YUV_to_RGB_asm1
|
||||
mov eax,0
|
||||
jmp YUV_to_RGB_asm11
|
||||
|
||||
YUV_to_RGB_asm1:
|
||||
cmp eax, 0xFFFF
|
||||
jbe YUV_to_RGB_asm11
|
||||
mov eax,0xFF00
|
||||
|
||||
YUV_to_RGB_asm11:
|
||||
and eax,0xFF00
|
||||
shl eax,8
|
||||
|
||||
mov ebx,esi
|
||||
imul ebx,475
|
||||
mov [temp2],ebx
|
||||
add ebx,edi
|
||||
sub ebx,60672
|
||||
|
||||
jae YUV_to_RGB_asm2
|
||||
mov ebx, 0
|
||||
jmp YUV_to_RGB_asm21
|
||||
|
||||
YUV_to_RGB_asm2:
|
||||
cmp ebx,0xFFFF
|
||||
jbe YUV_to_RGB_asm21
|
||||
mov ebx,0xFF00
|
||||
|
||||
YUV_to_RGB_asm21:
|
||||
and ebx,0xFF00
|
||||
shr ebx,8
|
||||
|
||||
imul edx,120
|
||||
mov [temp3],edx
|
||||
sub edi,edx
|
||||
imul esi,48
|
||||
mov [temp4],esi
|
||||
sub edi,esi
|
||||
add edi,21632
|
||||
|
||||
jae YUV_to_RGB_asm3
|
||||
mov edi, 0
|
||||
jmp YUV_to_RGB_asm31
|
||||
|
||||
YUV_to_RGB_asm3:
|
||||
cmp edi,0xFFFF
|
||||
jbe YUV_to_RGB_asm31
|
||||
mov edi, 0xFF00
|
||||
|
||||
YUV_to_RGB_asm31:
|
||||
and edi,0xFF00
|
||||
|
||||
or eax,edi
|
||||
or eax,ebx
|
||||
|
||||
ret
|
||||
|
||||
|
||||
|
||||
YUV_to_RGB_2asm:
|
||||
shl edi,8
|
||||
|
||||
mov eax,[temp1]
|
||||
add eax,edi
|
||||
sub eax,51456
|
||||
|
||||
jae YUV_to_RGB_2asm1
|
||||
mov eax,0
|
||||
jmp YUV_to_RGB_2asm11
|
||||
|
||||
YUV_to_RGB_2asm1:
|
||||
cmp eax, 0xFFFF
|
||||
jbe YUV_to_RGB_2asm11
|
||||
mov eax,0xFF00
|
||||
|
||||
YUV_to_RGB_2asm11:
|
||||
and eax,0xFF00
|
||||
shl eax,8
|
||||
|
||||
mov ebx,[temp2]
|
||||
add ebx,edi
|
||||
sub ebx,60672
|
||||
|
||||
jae YUV_to_RGB_2asm2
|
||||
mov ebx, 0
|
||||
jmp YUV_to_RGB_2asm21
|
||||
|
||||
YUV_to_RGB_2asm2:
|
||||
cmp ebx,0xFFFF
|
||||
jbe YUV_to_RGB_2asm21
|
||||
mov ebx,0xFF00
|
||||
|
||||
YUV_to_RGB_2asm21:
|
||||
and ebx,0xFF00
|
||||
shr ebx,8
|
||||
|
||||
sub edi,[temp3]
|
||||
sub edi,[temp4]
|
||||
add edi,21632
|
||||
|
||||
jae YUV_to_RGB_2asm3
|
||||
mov edi, 0
|
||||
jmp YUV_to_RGB_2asm31
|
||||
|
||||
YUV_to_RGB_2asm3:
|
||||
cmp edi,0xFFFF
|
||||
jbe YUV_to_RGB_2asm31
|
||||
mov edi, 0xFF00
|
||||
|
||||
YUV_to_RGB_2asm31:
|
||||
and edi,0xFF00
|
||||
|
||||
or eax,edi
|
||||
or eax,ebx
|
||||
|
||||
ret
|
||||
|
||||
|
||||
;extern int freerdp_image_yuv_to_xrgb_asm(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight);
|
||||
global freerdp_image_yuv_to_xrgb_asm
|
||||
freerdp_image_yuv_to_xrgb_asm:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
;cWidth: cx
|
||||
sub rsp,56 ;pDstData,pSrcData[3],nWidth,nHeight,cHeight
|
||||
push rbx
|
||||
|
||||
|
||||
mov [rbp-8],rdi
|
||||
|
||||
mov rax,[rsi]
|
||||
mov [rbp-16],rax
|
||||
mov rax,[rsi+8]
|
||||
mov [rbp-24],rax
|
||||
mov rax,[rsi+16]
|
||||
mov [rbp-32],rax
|
||||
|
||||
mov [rbp-40],rdx
|
||||
|
||||
|
||||
shr rcx,1 ;/2
|
||||
mov [rbp-48],rcx
|
||||
|
||||
|
||||
mov rax,[rbp-48]
|
||||
mov [rbp-56],rax
|
||||
|
||||
freerdp_image_yuv_to_xrgb_asm_loopH:
|
||||
mov rcx,[rbp-40]
|
||||
shr rcx,1
|
||||
|
||||
|
||||
freerdp_image_yuv_to_xrgb_asm_loopW:
|
||||
mov rax,[rbp-16]
|
||||
mov edi,[rax]
|
||||
|
||||
mov rax,[rbp-24]
|
||||
mov esi,[rax]
|
||||
inc rax
|
||||
mov [rbp-24],rax
|
||||
|
||||
mov rax,[rbp-32]
|
||||
mov edx,[rax]
|
||||
inc rax
|
||||
mov [rbp-32],rax
|
||||
|
||||
call YUV_to_RGB_asm
|
||||
|
||||
mov rbx,[rbp-8]
|
||||
mov [rbx],eax
|
||||
|
||||
|
||||
mov rax,[rbp-16]
|
||||
mov rbx,[rbp-40]
|
||||
mov edi,[rax+rbx]
|
||||
inc rax
|
||||
mov [rbp-16],rax
|
||||
|
||||
call YUV_to_RGB_2asm
|
||||
|
||||
mov rbx,[rbp-8]
|
||||
mov rdx,[rbp-40]
|
||||
mov [rbx+rdx],eax
|
||||
add rbx,4
|
||||
mov [rbp-8],rbx
|
||||
|
||||
|
||||
mov rax,[rbp-16]
|
||||
mov edi,[rax]
|
||||
|
||||
call YUV_to_RGB_2asm
|
||||
|
||||
mov rbx,[rbp-8]
|
||||
mov [rbx],eax
|
||||
|
||||
|
||||
mov rax,[rbp-16]
|
||||
mov rbx,[rbp-40]
|
||||
mov edi,[rax+rbx]
|
||||
inc rax
|
||||
mov [rbp-16],rax
|
||||
|
||||
call YUV_to_RGB_2asm
|
||||
|
||||
mov rbx,[rbp-8]
|
||||
mov rdx,[rbp-40]
|
||||
mov [rbx+rdx],eax
|
||||
add rbx,4
|
||||
mov [rbp-8],rbx
|
||||
|
||||
dec cx
|
||||
jne freerdp_image_yuv_to_xrgb_asm_loopW
|
||||
|
||||
|
||||
mov rax,[rbp-8]
|
||||
add rax,[rbp-40]
|
||||
mov [rbp-8],rax
|
||||
|
||||
mov rax,[rbp-16]
|
||||
add rax,[rbp-40]
|
||||
mov [rbp-16],rax
|
||||
|
||||
dec qword [rbp-56]
|
||||
jne freerdp_image_yuv_to_xrgb_asm_loopH
|
||||
|
||||
;END
|
||||
mov rax,0
|
||||
END:
|
||||
pop rbx
|
||||
mov rsp,rbp
|
||||
pop rbp
|
||||
ret
|
@ -30,9 +30,14 @@
|
||||
|
||||
#include <sys/time.h>
|
||||
|
||||
#ifdef WITH_OPENH264_SSSE3
|
||||
extern int check_ssse3();
|
||||
extern int freerdp_image_yuv420p_to_xrgb(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int iStride0,int iStride1);
|
||||
#else
|
||||
#ifdef WITH_OPENH264_ASM
|
||||
extern int freerdp_image_yuv_to_xrgb_asm(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int iStride0,int iStride1);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define USE_GRAY_SCALE 0
|
||||
#define USE_UPCONVERT 0
|
||||
@ -381,7 +386,7 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz
|
||||
state = (*h264->pDecoder)->DecodeFrame2(h264->pDecoder, NULL, 0, pYUVData, &sBufferInfo);
|
||||
|
||||
gettimeofday(&T2,NULL);
|
||||
printf("\tdecoding took: %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec));
|
||||
//printf("\tdecoding took: %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec));
|
||||
|
||||
pSystemBuffer = &sBufferInfo.UsrData.sSystemBuffer;
|
||||
|
||||
@ -416,14 +421,15 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz
|
||||
if (h264_prepare_rgb_buffer(h264, pSystemBuffer->iWidth, pSystemBuffer->iHeight) < 0)
|
||||
return -1;
|
||||
|
||||
#ifdef WITH_OPENH264_SSSE3
|
||||
freerdp_image_yuv420p_to_xrgb(h264->data,pYUVData,h264->width,h264->height,pSystemBuffer->iStride[0],pSystemBuffer->iStride[1]);
|
||||
#else
|
||||
#ifdef WITH_OPENH264_ASM
|
||||
gettimeofday(&T1,NULL);
|
||||
freerdp_image_yuv_to_xrgb_asm(h264->data,pYUVData,h264->width,h264->height,pSystemBuffer->iStride[0],pSystemBuffer->iStride[1]);
|
||||
gettimeofday(&T2,NULL);
|
||||
printf("\tconverting took: %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec));
|
||||
#else
|
||||
freerdp_image_copy_yuv420p_to_xrgb(h264->data, h264->scanline, 0, 0,
|
||||
h264->width, h264->height, pYUVData, pSystemBuffer->iStride, 0, 0);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
return 1;
|
||||
@ -449,6 +455,13 @@ static BOOL openh264_init(H264_CONTEXT* h264)
|
||||
SDecodingParam sDecParam;
|
||||
long status;
|
||||
|
||||
#ifdef WITH_OPENH264_SSSE3
|
||||
if(check_ssse3()){
|
||||
printf("SSSE3 seems to be not supported on this system, try without WITH_OPENH264_ASM ...");
|
||||
return FALSE;
|
||||
}
|
||||
#endif
|
||||
|
||||
WelsCreateDecoder(&h264->pDecoder);
|
||||
|
||||
if (!h264->pDecoder)
|
||||
|
447
libfreerdp/codec/h264_ssse3_x64.asm
Normal file
447
libfreerdp/codec/h264_ssse3_x64.asm
Normal file
@ -0,0 +1,447 @@
|
||||
section .text
|
||||
global check_ssse3
|
||||
|
||||
check_ssse3:
|
||||
push rbx
|
||||
|
||||
pushf
|
||||
pop rax
|
||||
or rax,1<<21
|
||||
push rax
|
||||
popf
|
||||
pushf
|
||||
pop rax
|
||||
test rax,1<<21
|
||||
jz check_ssse3_end
|
||||
|
||||
and rax,~(1<<21)
|
||||
push rax
|
||||
popf
|
||||
|
||||
|
||||
mov eax,1
|
||||
mov ebx,0
|
||||
cpuid
|
||||
test edx,1<<25 ;sse
|
||||
jz check_ssse3_end
|
||||
test edx,1<<26 ;sse2
|
||||
jz check_ssse3_end
|
||||
test ecx,1<<0 ;sse3
|
||||
jz check_ssse3_end
|
||||
test ecx,1<<9 ;ssse3
|
||||
jz check_ssse3_end
|
||||
|
||||
|
||||
pop rbx
|
||||
mov eax,0
|
||||
ret
|
||||
|
||||
|
||||
check_ssse3_end:
|
||||
pop rbx
|
||||
mov eax,1
|
||||
ret
|
||||
|
||||
|
||||
;extern int freerdp_image_yuv420p_to_xrgb(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int istride0,int istride1)
|
||||
global freerdp_image_yuv420p_to_xrgb
|
||||
freerdp_image_yuv420p_to_xrgb:
|
||||
push rbx
|
||||
push rbp
|
||||
|
||||
;check whether stack is aligned to 16 byte boundary
|
||||
mov rax,rsp
|
||||
and rax,1111B
|
||||
mov r15,22
|
||||
sub r15b,al
|
||||
sub rsp,r15
|
||||
|
||||
mov rbp,rsp
|
||||
|
||||
xor r10,r10
|
||||
xor r11,r11
|
||||
xor r12,r12
|
||||
xor r13,r13
|
||||
xor r14,r14
|
||||
|
||||
sub rsp,316 ;pDstData 8,Y 8,U 8,V 8,nWidth 2,nHeight 2,iStride0 2,iStride1 2,last_column 1,last_line 1,G 16,B 16,R 16,add:128 16
|
||||
;sub:128 16,mul:48 16,mul:475 16,mul:403 16,mul:120 16,VaddY 2,VaddUV 2,res 12,cmp:255 16,cmp:0 16,shuffleR 16,andG 16,shuffleB 16,shuffleY 16,shuffleUV 16,scanline 2
|
||||
|
||||
;last_line: if the last (U,V doubled) line should be skipped, set to 1B
|
||||
;last_column: if the last 4 columns should be skipped, set to 1B
|
||||
|
||||
mov [rbp-8],rdi
|
||||
|
||||
mov rax,[rsi]
|
||||
mov [rbp-16],rax
|
||||
mov rax,[rsi+8]
|
||||
mov [rbp-24],rax
|
||||
mov rax,[rsi+16]
|
||||
mov [rbp-32],rax
|
||||
|
||||
mov [rbp-34],dx
|
||||
mov r13w,cx
|
||||
|
||||
and r8,0FFFFH
|
||||
mov [rbp-38],r8w
|
||||
and r9,0FFFFH
|
||||
mov [rbp-40],r9w
|
||||
|
||||
|
||||
shl r8w,1
|
||||
sub r8w,dx
|
||||
mov r11w,r8w
|
||||
|
||||
mov r10w,dx
|
||||
shr dx,1
|
||||
sub r9w,dx
|
||||
mov r12w,r9w
|
||||
|
||||
|
||||
mov r8w,[rbp-34]
|
||||
shr r8w,2
|
||||
shl r10w,2
|
||||
|
||||
mov r9w,[rbp-38]
|
||||
|
||||
;and al,11B
|
||||
;jz no_column_rest
|
||||
|
||||
;inc word [rbp-34]
|
||||
|
||||
;no_column_rest:
|
||||
;mov [rbp-41],al
|
||||
|
||||
|
||||
|
||||
mov r14b,r13b
|
||||
and r14b,1B
|
||||
;jz no_line_rest
|
||||
|
||||
inc r13w
|
||||
|
||||
;no_line_rest:
|
||||
shr r13w,1
|
||||
|
||||
|
||||
|
||||
;init masks
|
||||
mov eax,00000080H
|
||||
mov [rbp-106],eax
|
||||
mov [rbp-102],eax
|
||||
mov [rbp-98],eax
|
||||
mov [rbp-94],eax
|
||||
|
||||
mov eax,00800080H
|
||||
mov [rbp-122],eax
|
||||
mov [rbp-118],eax
|
||||
mov [rbp-114],eax
|
||||
mov [rbp-110],eax
|
||||
|
||||
mov eax,00300030H
|
||||
mov [rbp-138],eax
|
||||
mov [rbp-134],eax
|
||||
mov [rbp-130],eax
|
||||
mov [rbp-126],eax
|
||||
|
||||
mov eax,01DB01DBH
|
||||
mov [rbp-154],eax
|
||||
mov [rbp-150],eax
|
||||
mov [rbp-146],eax
|
||||
mov [rbp-142],eax
|
||||
|
||||
mov eax,01930193H
|
||||
mov [rbp-170],eax
|
||||
mov [rbp-166],eax
|
||||
mov [rbp-162],eax
|
||||
mov [rbp-158],eax
|
||||
|
||||
mov eax,00780078H
|
||||
mov [rbp-186],eax
|
||||
mov [rbp-182],eax
|
||||
mov [rbp-178],eax
|
||||
mov [rbp-174],eax
|
||||
|
||||
mov eax,000FF0000H
|
||||
mov [rbp-218],eax
|
||||
mov [rbp-214],eax
|
||||
mov [rbp-210],eax
|
||||
mov [rbp-206],eax
|
||||
|
||||
mov eax,00000000H
|
||||
mov [rbp-234],eax
|
||||
mov [rbp-230],eax
|
||||
mov [rbp-226],eax
|
||||
mov [rbp-222],eax
|
||||
|
||||
;shuffle masks
|
||||
;00 xx 00 00 00 xx 00 00 00 xx 00 00 00 xx 00 00
|
||||
;00 rr gg bb 00 rr gg bb 00 rr gg bb 00 rr gg bb
|
||||
mov eax,00FF0000H
|
||||
mov [rbp-250],eax
|
||||
mov [rbp-246],eax
|
||||
mov [rbp-242],eax
|
||||
mov [rbp-238],eax
|
||||
|
||||
mov eax,80800280H
|
||||
mov [rbp-266],eax
|
||||
mov eax,80800680H
|
||||
mov [rbp-262],eax
|
||||
mov eax,80800A80H
|
||||
mov [rbp-258],eax
|
||||
mov eax,80800E80H
|
||||
mov [rbp-254],eax
|
||||
|
||||
mov eax,80808002H
|
||||
mov [rbp-282],eax
|
||||
mov eax,80808006H
|
||||
mov [rbp-278],eax
|
||||
mov eax,8080800AH
|
||||
mov [rbp-274],eax
|
||||
mov eax,8080800EH
|
||||
mov [rbp-270],eax
|
||||
|
||||
;dd cc bb aa
|
||||
;00 00 dd 00 00 00 cc 00 00 00 bb 00 00 00 aa 00
|
||||
mov eax,80800080H
|
||||
mov [rbp-298],eax
|
||||
mov eax,80800180H
|
||||
mov [rbp-294],eax
|
||||
mov eax,80800280H
|
||||
mov [rbp-290],eax
|
||||
mov eax,80800380H
|
||||
mov [rbp-286],eax
|
||||
|
||||
;dd cc bb aa
|
||||
;00 dd 00 dd 00 cc 00 cc 00 bb 00 bb 00 aa 00 aa
|
||||
mov eax,80008000H
|
||||
mov [rbp-314],eax
|
||||
mov eax,80018001H
|
||||
mov [rbp-310],eax
|
||||
mov eax,80028002H
|
||||
mov [rbp-306],eax
|
||||
mov eax,80038003H
|
||||
mov [rbp-302],eax
|
||||
|
||||
|
||||
mov rsi,[rbp-16]
|
||||
mov rax,[rbp-24]
|
||||
mov rbx,[rbp-32]
|
||||
|
||||
|
||||
freerdp_image_yuv420p_to_xrgb_hloop:
|
||||
dec r13w
|
||||
js freerdp_image_yuv420p_to_xrgb_hloop_end
|
||||
jnz not_last_line
|
||||
|
||||
shl r14b,1
|
||||
not_last_line:
|
||||
|
||||
xor cx,cx
|
||||
freerdp_image_yuv420p_to_xrgb_wloop:
|
||||
;main loop
|
||||
; C = Y;
|
||||
; D = U - 128;
|
||||
; E = V - 128;
|
||||
;
|
||||
; R = clip(( 256 * C + 403 * E + 128) >> 8);
|
||||
; G = clip(( 256 * C - 48 * D - 120 * E + 128) >> 8);
|
||||
; B = clip(( 256 * C + 475 * D + 128) >> 8);
|
||||
|
||||
test cx,1B
|
||||
jnz load_yuv_data
|
||||
|
||||
|
||||
;prepare U data
|
||||
movd xmm0,[rax]
|
||||
movdqa xmm5,[rbp-314]
|
||||
pshufb xmm0,xmm5
|
||||
|
||||
add rax,4
|
||||
|
||||
movdqa xmm3,[rbp-122]
|
||||
psubsw xmm0,xmm3
|
||||
|
||||
movdqa xmm2,xmm0
|
||||
|
||||
movdqa xmm4,xmm0
|
||||
movdqa xmm7,[rbp-138]
|
||||
pmullw xmm0,xmm7
|
||||
pmulhw xmm4,xmm7
|
||||
|
||||
movdqa xmm7,xmm0
|
||||
punpcklwd xmm0,xmm4 ;what an awesome instruction!
|
||||
punpckhwd xmm7,xmm4
|
||||
movdqa xmm4,xmm7
|
||||
|
||||
movdqa xmm6,[rbp-106]
|
||||
psubd xmm0,xmm6
|
||||
psubd xmm4,xmm6
|
||||
|
||||
|
||||
movdqa xmm1,xmm2
|
||||
movdqa xmm7,[rbp-154]
|
||||
pmullw xmm1,xmm7
|
||||
pmulhw xmm2,xmm7
|
||||
|
||||
movdqa xmm7,xmm1
|
||||
punpcklwd xmm1,xmm2
|
||||
punpckhwd xmm7,xmm2
|
||||
|
||||
paddd xmm1,xmm6
|
||||
paddd xmm7,xmm6
|
||||
|
||||
movdqa [rbp-74],xmm7
|
||||
|
||||
|
||||
;prepare V data
|
||||
movd xmm2,[rbx]
|
||||
pshufb xmm2,xmm5
|
||||
|
||||
add rbx,4
|
||||
|
||||
psubsw xmm2,xmm3
|
||||
|
||||
movdqa xmm5,xmm2
|
||||
|
||||
movdqa xmm3,xmm2
|
||||
movdqa xmm7,[rbp-170]
|
||||
pmullw xmm2,xmm7
|
||||
pmulhw xmm3,xmm7
|
||||
|
||||
movdqa xmm7,xmm2
|
||||
punpcklwd xmm2,xmm3
|
||||
punpckhwd xmm7,xmm3
|
||||
|
||||
paddd xmm2,xmm6
|
||||
paddd xmm7,xmm6
|
||||
|
||||
movdqa [rbp-90],xmm7
|
||||
|
||||
|
||||
movdqa xmm3,xmm5
|
||||
movdqa xmm7,[rbp-186]
|
||||
pmullw xmm3,xmm7
|
||||
pmulhw xmm5,xmm7
|
||||
|
||||
movdqa xmm7,xmm3
|
||||
punpcklwd xmm3,xmm5
|
||||
punpckhwd xmm7,xmm5
|
||||
|
||||
paddd xmm0,xmm3
|
||||
paddd xmm4,xmm7
|
||||
|
||||
movdqa [rbp-58],xmm4
|
||||
|
||||
jmp valid_yuv_data
|
||||
|
||||
load_yuv_data:
|
||||
movdqa xmm1,[rbp-74]
|
||||
movdqa xmm2,[rbp-90]
|
||||
movdqa xmm0,[rbp-58]
|
||||
|
||||
valid_yuv_data:
|
||||
|
||||
|
||||
;Y data processing
|
||||
movd xmm4,[rsi]
|
||||
pshufb xmm4,[rbp-298]
|
||||
|
||||
movdqa xmm5,xmm4
|
||||
movdqa xmm6,xmm4
|
||||
|
||||
paddd xmm4,xmm2
|
||||
psubd xmm5,xmm0
|
||||
paddd xmm6,xmm1
|
||||
|
||||
pslld xmm4,8
|
||||
pslld xmm5,8
|
||||
pslld xmm6,8
|
||||
|
||||
movdqa xmm7,[rbp-234]
|
||||
pmaxsw xmm4,xmm7 ;what an awesome instruction!
|
||||
pmaxsw xmm5,xmm7
|
||||
pmaxsw xmm6,xmm7
|
||||
|
||||
movdqa xmm7,[rbp-218]
|
||||
pminsw xmm4,xmm7
|
||||
pminsw xmm5,xmm7
|
||||
pminsw xmm6,xmm7
|
||||
|
||||
pand xmm4,[rbp-250]
|
||||
pshufb xmm5,[rbp-266]
|
||||
pshufb xmm6,[rbp-282]
|
||||
|
||||
por xmm4,xmm5
|
||||
por xmm4,xmm6
|
||||
|
||||
movdqa [rdi],xmm4
|
||||
|
||||
|
||||
;Y data processing in second line
|
||||
test r14b,2
|
||||
jnz skip_last_line1
|
||||
|
||||
movd xmm4,[rsi+r9]
|
||||
pshufb xmm4,[rbp-298]
|
||||
|
||||
|
||||
movdqa xmm5,xmm4
|
||||
movdqa xmm6,xmm4
|
||||
|
||||
paddd xmm4,xmm2
|
||||
psubd xmm5,xmm0
|
||||
paddd xmm6,xmm1
|
||||
|
||||
pslld xmm4,8
|
||||
pslld xmm5,8
|
||||
pslld xmm6,8
|
||||
|
||||
movdqa xmm7,[rbp-234]
|
||||
pmaxsw xmm4,xmm7 ;what an awesome instruction!
|
||||
pmaxsw xmm5,xmm7
|
||||
pmaxsw xmm6,xmm7
|
||||
|
||||
movdqa xmm7,[rbp-218]
|
||||
pminsw xmm4,xmm7
|
||||
pminsw xmm5,xmm7
|
||||
pminsw xmm6,xmm7
|
||||
|
||||
pand xmm4,[rbp-250]
|
||||
pshufb xmm5,[rbp-266]
|
||||
pshufb xmm6,[rbp-282]
|
||||
|
||||
por xmm4,xmm5
|
||||
por xmm4,xmm6
|
||||
|
||||
movdqa [rdi+r10],xmm4
|
||||
|
||||
skip_last_line1:
|
||||
add rdi,16
|
||||
add rsi,4
|
||||
|
||||
inc cx
|
||||
cmp cx,r8w
|
||||
jne freerdp_image_yuv420p_to_xrgb_wloop
|
||||
|
||||
freerdp_image_yuv420p_to_xrgb_wloop_end:
|
||||
add rdi,r10
|
||||
|
||||
add rsi,r11
|
||||
|
||||
add rax,r12
|
||||
add rbx,r12
|
||||
;mov eax,r12d
|
||||
;jmp freerdp_image_yuv420p_to_xrgb_end
|
||||
|
||||
jmp freerdp_image_yuv420p_to_xrgb_hloop
|
||||
|
||||
freerdp_image_yuv420p_to_xrgb_hloop_end:
|
||||
|
||||
mov eax,0
|
||||
freerdp_image_yuv420p_to_xrgb_end:
|
||||
mov rsp,rbp
|
||||
add rsp,r15
|
||||
pop rbp
|
||||
pop rbx
|
||||
ret
|
@ -2,10 +2,6 @@
|
||||
;G=(256*Y-48*(U-128)-120*(V-128)+128)/256 =(256*Y-48*U-120*V+21632)/256
|
||||
;B=(256*Y+475*(U-128)+128)/256 =(256*Y+475*U-60672)/256
|
||||
|
||||
section .data
|
||||
debug: db "DEBUG",10
|
||||
dblen: equ $-debug
|
||||
|
||||
section .text
|
||||
;global YUV_to_RGB_asm
|
||||
YUV_to_RGB_asm:
|
20
libfreerdp/codec/test/Makefile.TestOpenH264ASM
Normal file
20
libfreerdp/codec/test/Makefile.TestOpenH264ASM
Normal file
@ -0,0 +1,20 @@
|
||||
TestOpenH264ASM: h264_ssse3.asm.o TestOpenH264ASM.c.o h264.c.o
|
||||
gcc -o TestOpenH264ASM h264_ssse3.asm.o TestOpenH264ASM.c.o h264.c.o
|
||||
|
||||
h264_ssse3.asm.o: ../h264_ssse3_x64.asm
|
||||
nasm -f elf64 -o h264_ssse3.asm.o ../h264_ssse3_x64.asm
|
||||
|
||||
h264.asm.o: ../h264.asm
|
||||
nasm -f elf64 -o h264.asm.o ../h264.asm
|
||||
|
||||
TestOpenH264ASM.c.o: TestOpenH264ASM.c
|
||||
gcc -c -o TestOpenH264ASM.c.o TestOpenH264ASM.c
|
||||
|
||||
h264.c.o: ../h264.c
|
||||
gcc -c -O3 -o h264.c.o ../h264.c
|
||||
|
||||
clean:
|
||||
rm -f TestOpenH264ASM TestOpenH264ASM.c.o h264_ssse3.asm.o h264.c.o h264.asm.o
|
||||
|
||||
old: h264.asm.o TestOpenH264ASM.c.o h264.c.o
|
||||
gcc -o TestOpenH264ASM h264.asm.o TestOpenH264ASM.c.o h264.c.o
|
@ -4,49 +4,70 @@
|
||||
|
||||
#include "TestOpenH264ASM.h"
|
||||
|
||||
#define WIDTH 1920
|
||||
#define HEIGHT 1080
|
||||
|
||||
int main(void){
|
||||
int ret,i;
|
||||
int i,j,k;
|
||||
int ret;
|
||||
unsigned char *pDstData_c,*pDstData_asm,*pSrcData[3];
|
||||
int nSrcStep[2];
|
||||
|
||||
if(check_ssse3()){
|
||||
fprintf(stderr,"ssse3 not supported!\n");
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
struct timeval t1,t2,t3;
|
||||
|
||||
pSrcData[0]=malloc(1920*1080*sizeof(char));
|
||||
pSrcData[1]=malloc(1920*1080/4*sizeof(char));
|
||||
pSrcData[2]=malloc(1920*1080/4*sizeof(char));
|
||||
pDstData_asm=malloc(1920*1080*4*sizeof(char));
|
||||
pDstData_c=malloc(1920*1080*4*sizeof(char));
|
||||
pSrcData[0]=malloc(1984*HEIGHT*sizeof(char));
|
||||
pSrcData[1]=malloc(1984*HEIGHT/4*sizeof(char));
|
||||
pSrcData[2]=malloc(1984*HEIGHT/4*sizeof(char));
|
||||
pDstData_asm=malloc(WIDTH*HEIGHT*4*sizeof(char));
|
||||
pDstData_c=malloc(WIDTH*HEIGHT*4*sizeof(char));
|
||||
|
||||
for(i=0;i<1920*1080;i++){
|
||||
for(i=0;i<WIDTH*HEIGHT;i++){
|
||||
pSrcData[0][i]=i%255;
|
||||
pSrcData[1][i/4]=pSrcData[0][i];
|
||||
pSrcData[2][i/4]=255-pSrcData[0][i];
|
||||
}
|
||||
|
||||
printf("%X\n",pSrcData[0][0]);
|
||||
|
||||
nSrcStep[0]=1088;
|
||||
nSrcStep[1]=544;
|
||||
nSrcStep[0]=1984;
|
||||
nSrcStep[1]=992;
|
||||
|
||||
gettimeofday(&t1,NULL);
|
||||
ret=freerdp_image_yuv_to_xrgb_asm(pDstData_asm,pSrcData,1024,768,1088,544);
|
||||
ret=freerdp_image_yuv420p_to_xrgb(pDstData_asm,pSrcData,WIDTH,HEIGHT,nSrcStep[0],nSrcStep[1]);
|
||||
gettimeofday(&t2,NULL);
|
||||
freerdp_image_copy_yuv420p_to_xrgb(pDstData_c,1024*4,0,0,1024,768,pSrcData,nSrcStep,0,0);
|
||||
freerdp_image_copy_yuv420p_to_xrgb(pDstData_c,WIDTH*4,0,0,WIDTH,HEIGHT,pSrcData,nSrcStep,0,0);
|
||||
gettimeofday(&t3,NULL);
|
||||
|
||||
printf("in asm (%d) it took %u sec %u usec,\nin c %u sec %u usec.\n",ret,(int)(t2.tv_sec-t1.tv_sec),(int)(t2.tv_usec-t1.tv_usec),
|
||||
printf("in asm (0x%08X) it took %u sec %u usec,\nin c %u sec %u usec.\n",ret,(int)(t2.tv_sec-t1.tv_sec),(int)(t2.tv_usec-t1.tv_usec),
|
||||
(int)(t3.tv_sec-t2.tv_sec),(int)(t3.tv_usec-t2.tv_usec));
|
||||
|
||||
printf("in asm the result was %X %X %X\n in c %X %X %X.\n",(unsigned char *)pDstData_asm[92],(unsigned char *)pDstData_asm[93],(unsigned char *)pDstData_asm[94],
|
||||
(unsigned char *)pDstData_c[92],(unsigned char *)pDstData_c[93],(unsigned char *)pDstData_c[94]);
|
||||
printf("in asm the result was %X %X %X\n in c %X %X %X.\n",pDstData_asm[0],pDstData_asm[1],pDstData_asm[2],
|
||||
pDstData_c[0],pDstData_c[1],pDstData_c[2]);
|
||||
|
||||
for(i=0;i<(1920*1080*4);i++){
|
||||
/*k=0;
|
||||
for(i=0;i<HEIGHT+1;i++){
|
||||
for(j=0;j<WIDTH;j++){
|
||||
printf("%08X:%08X ",((unsigned int*)pDstData_asm)[k],((unsigned int*)pDstData_c)[k]);
|
||||
k++;
|
||||
}
|
||||
puts("\n");
|
||||
}*/
|
||||
|
||||
k=1;
|
||||
for(i=0;i<(WIDTH*HEIGHT*4);i++){
|
||||
if(pDstData_c[i]!=pDstData_asm[i]){
|
||||
k=0;
|
||||
printf("MISSMATCH at %d: %2X != %2X\n",i,(unsigned char)pDstData_asm[i],(unsigned char)pDstData_c[i]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if(k)
|
||||
printf("everything OK\n");
|
||||
|
||||
free(pSrcData[0]);
|
||||
free(pSrcData[1]);
|
||||
free(pSrcData[2]);
|
||||
|
@ -5,3 +5,6 @@ extern int YUV_to_RGB(unsigned char Y,unsigned char U,unsigned char V);
|
||||
extern int freerdp_image_yuv_to_xrgb_asm(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int istride0,int istride1);
|
||||
int freerdp_image_copy_yuv420p_to_xrgb(unsigned char* pDstData, int nDstStep, int nXDst, int nYDst,
|
||||
int nWidth, int nHeight, unsigned char* pSrcData[3], int nSrcStep[2], int nXSrc, int nYSrc);
|
||||
|
||||
extern int check_ssse3();
|
||||
extern int freerdp_image_yuv420p_to_xrgb(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int istride0,int istride1);
|
Loading…
Reference in New Issue
Block a user