YUV data conversion with SSSE3 using intrinsics
This commit is contained in:
parent
25593c7250
commit
fee370e4b2
@ -102,50 +102,21 @@ if(WITH_LIBAVCODEC)
|
||||
endif()
|
||||
|
||||
if(WITH_LIBAVCODEC OR WITH_OPENH264)
|
||||
if(CMAKE_SIZEOF_VOID_P EQUAL 8)
|
||||
set(arch64 TRUE)
|
||||
else()
|
||||
set(arch64 FALSE)
|
||||
endif()
|
||||
|
||||
if(WITH_H264_ASM)
|
||||
set(H264_ASM H264_ASM_o)
|
||||
add_definitions(-DWITH_H264_ASM)
|
||||
add_custom_target(${H264_ASM})
|
||||
|
||||
if(arch64)
|
||||
set(SRC ${CMAKE_CURRENT_SOURCE_DIR}/h264_x64.asm)
|
||||
set(OBJ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${H264_ASM}.dir/h264_x64.asm.o)
|
||||
add_custom_command(TARGET ${H264_ASM}
|
||||
COMMAND nasm ARGS -f elf64 -o ${OBJ} ${SRC})
|
||||
else()
|
||||
set(SRC ${CMAKE_CURRENT_SOURCE_DIR}/h264_x32.asm)
|
||||
set(OBJ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${H264_ASM}.dir/h264_x32.asm.o)
|
||||
add_custom_command(TARGET ${H264_ASM}
|
||||
COMMAND nasm ARGS -f elf32 -o ${OBJ} ${SRC})
|
||||
endif()
|
||||
|
||||
set(FREERDP_OPENH264_LIBS ${OPENH264_LIBRARIES} ${OBJ})
|
||||
endif()
|
||||
|
||||
if(WITH_H264_SSSE3)
|
||||
set(H264_ASM H264_ASM_o)
|
||||
add_definitions(-DWITH_H264_SSSE3)
|
||||
add_custom_target(${H264_ASM})
|
||||
|
||||
if(arch64)
|
||||
set(SRC ${CMAKE_CURRENT_SOURCE_DIR}/h264_ssse3_x64.asm)
|
||||
set(OBJ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${H264_ASM}.dir/h264_ssse3_x64.asm.o)
|
||||
add_custom_command(TARGET ${H264_ASM}
|
||||
COMMAND nasm ARGS -f elf64 -o ${OBJ} ${SRC})
|
||||
else()
|
||||
set(SRC ${CMAKE_CURRENT_SOURCE_DIR}/h264_ssse3_x32.asm)
|
||||
set(OBJ ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${H264_ASM}.dir/h264_ssse3_x32.asm.o)
|
||||
add_custom_command(TARGET ${H264_ASM}
|
||||
COMMAND nasm ARGS -f elf32 -o ${OBJ} ${SRC})
|
||||
set(${MODULE_PREFIX}_SRCS
|
||||
${${MODULE_PREFIX}_SRCS}
|
||||
h264_ssse3.c)
|
||||
|
||||
if(CMAKE_COMPILER_IS_GNUCC)
|
||||
set(OPTIMIZATION "${OPTIMIZATION} -msse2 -mssse3")
|
||||
endif()
|
||||
|
||||
set(FREERDP_OPENH264_LIBS ${OPENH264_LIBRARIES} ${OBJ})
|
||||
|
||||
if(MSVC)
|
||||
set(OPTIMIZATION "${OPTIMIZATION} /arch:SSE2")
|
||||
endif()
|
||||
|
||||
set_property(SOURCE h264_ssse3.c PROPERTY COMPILE_FLAGS ${OPTIMIZATION})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@ -179,10 +150,6 @@ else()
|
||||
install(TARGETS ${MODULE_NAME} DESTINATION ${CMAKE_INSTALL_LIBDIR} EXPORT FreeRDPTargets)
|
||||
endif()
|
||||
|
||||
if(WITH_H264_ASM OR WITH_H264_SSSE3)
|
||||
add_dependencies(${MODULE_NAME} ${H264_ASM})
|
||||
endif()
|
||||
|
||||
set_property(TARGET ${MODULE_NAME} PROPERTY FOLDER "FreeRDP/libfreerdp")
|
||||
|
||||
if(BUILD_TESTING)
|
||||
|
@ -31,12 +31,8 @@
|
||||
#include <sys/time.h>
|
||||
|
||||
#ifdef WITH_H264_SSSE3
|
||||
extern int check_ssse3();
|
||||
extern int freerdp_image_yuv420p_to_xrgb(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int *iStride,int scanline);
|
||||
#else
|
||||
#ifdef WITH_H264_ASM
|
||||
extern int freerdp_image_yuv_to_xrgb_asm(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int iStride0,int iStride1);
|
||||
#endif
|
||||
extern int freerdp_check_ssse3();
|
||||
extern int freerdp_image_yuv420p_to_xrgb_ssse3(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int *iStride,int scanline);
|
||||
#endif
|
||||
|
||||
#define USE_GRAY_SCALE 0
|
||||
@ -408,7 +404,6 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz
|
||||
if (pSystemBuffer->iFormat != videoFormatI420)
|
||||
return -1;
|
||||
|
||||
/* Convert I420 (same as IYUV) to XRGB. */
|
||||
|
||||
if (g_H264DumpFrames)
|
||||
{
|
||||
@ -416,31 +411,12 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz
|
||||
}
|
||||
|
||||
g_H264FrameId++;
|
||||
|
||||
|
||||
h264->iStride[0] = pSystemBuffer->iStride[0];
|
||||
h264->iStride[1] = pSystemBuffer->iStride[1];
|
||||
h264->width = pSystemBuffer->iWidth;
|
||||
h264->height = pSystemBuffer->iHeight;
|
||||
|
||||
|
||||
#if 0
|
||||
if (h264_prepare_rgb_buffer(h264, pSystemBuffer->iWidth, pSystemBuffer->iHeight) < 0)
|
||||
return -1;
|
||||
|
||||
gettimeofday(&T1,NULL);
|
||||
#ifdef WITH_H264_SSSE3
|
||||
freerdp_image_yuv420p_to_xrgb(h264->data,pYUVData,h264->width,h264->height,pSystemBuffer->iStride[0],pSystemBuffer->iStride[1]);
|
||||
#else
|
||||
#ifdef WITH_H264_ASM
|
||||
freerdp_image_yuv_to_xrgb_asm(h264->data,pYUVData,h264->width,h264->height,pSystemBuffer->iStride[0],pSystemBuffer->iStride[1]);
|
||||
#else
|
||||
freerdp_image_copy_yuv420p_to_xrgb(h264->data, h264->scanline, 0, 0,
|
||||
h264->width, h264->height, pYUVData, pSystemBuffer->iStride, 0, 0);
|
||||
#endif
|
||||
#endif
|
||||
gettimeofday(&T2,NULL);
|
||||
printf("\tconverting took %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec));
|
||||
#endif
|
||||
|
||||
return 1;
|
||||
}
|
||||
@ -677,7 +653,7 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize,
|
||||
BYTE* pDstPoint;
|
||||
|
||||
BYTE** pYUVData;
|
||||
BYTE* pYUVPoint[2];
|
||||
BYTE* pYUVPoint[3];
|
||||
|
||||
RDPGFX_RECT16* rect;
|
||||
int* iStride;
|
||||
@ -743,13 +719,16 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize,
|
||||
pYUVPoint[1] = pYUVData[1] + ret;
|
||||
pYUVPoint[2] = pYUVData[2] + ret;
|
||||
|
||||
#if 1
|
||||
#if 0
|
||||
printf("regionRect: x: %d, y: %d, cx: %d, cy: %d\n",
|
||||
rect->left, rect->top, cx, cy);
|
||||
#endif
|
||||
|
||||
#ifdef WITH_H264_SSSE3
|
||||
freerdp_image_yuv420p_to_xrgb(pDstPoint, pYUVPoint, cx, cy, iStride, nDstStep);
|
||||
freerdp_image_yuv420p_to_xrgb_ssse3(pDstPoint, pYUVPoint, cx, cy, iStride, nDstStep);
|
||||
#else
|
||||
freerdp_image_copy_yuv420p_to_xrgb(pDstPoint, nDstStep, 0, 0,
|
||||
cx, cy, pYUVPoint, iStride, 0, 0);
|
||||
#endif
|
||||
}
|
||||
gettimeofday(&T2,NULL);
|
||||
@ -774,9 +753,9 @@ H264_CONTEXT* h264_context_new(BOOL Compressor)
|
||||
h264 = (H264_CONTEXT*) calloc(1, sizeof(H264_CONTEXT));
|
||||
|
||||
#ifdef WITH_H264_SSSE3
|
||||
if(check_ssse3()){
|
||||
if(freerdp_check_ssse3()){
|
||||
printf("SSSE3 seems to be not supported on this system, try without WITH_H264_SSSE3 ...");
|
||||
return FALSE;
|
||||
return NULL;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
298
libfreerdp/codec/h264_ssse3.c
Normal file
298
libfreerdp/codec/h264_ssse3.c
Normal file
@ -0,0 +1,298 @@
|
||||
/** function for converting YUV420p data to the RGB format (but without any special upconverting)
|
||||
* It's completely written in nasm-x86-assembly for intel processors supporting SSSE3 and higher.
|
||||
* The target scanline (6th parameter) must be a multiple of 16.
|
||||
* iStride[0] must be (target scanline) / 4 or bigger and iStride[1] the next multiple of four
|
||||
* of the half of iStride[0] or bigger
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include <emmintrin.h>
|
||||
//#include <immintrin.h>
|
||||
#include <tmmintrin.h>
|
||||
|
||||
#include <winpr/sysinfo.h>
|
||||
#include <winpr/crt.h>
|
||||
|
||||
int freerdp_check_ssse3()
|
||||
{
|
||||
if(IsProcessorFeaturePresentEx(PF_EX_SSSE3)&&IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
|
||||
return 0;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
int freerdp_image_yuv420p_to_xrgb_ssse3(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int *iStride,int scanline)
|
||||
{
|
||||
char last_line,last_column;
|
||||
int i,VaddDst,VaddY,VaddUV;
|
||||
|
||||
BYTE *UData,*VData,*YData;
|
||||
|
||||
__m128i r0,r1,r2,r3,r4,r5,r6,r7;
|
||||
__m128i *buffer;
|
||||
|
||||
|
||||
buffer=_aligned_malloc(4*16,16);
|
||||
|
||||
|
||||
YData=pSrcData[0];
|
||||
UData=pSrcData[1];
|
||||
VData=pSrcData[2];
|
||||
|
||||
|
||||
if((last_column=nWidth&3)){
|
||||
switch(last_column){
|
||||
case 1: r7=_mm_set_epi32(0,0,0,0xFFFFFFFF); break;
|
||||
case 2: r7=_mm_set_epi32(0,0,0xFFFFFFFF,0xFFFFFFFF); break;
|
||||
case 3: r7=_mm_set_epi32(0,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF); break;
|
||||
}
|
||||
_mm_store_si128(buffer+48,r7);
|
||||
last_column=1;
|
||||
}
|
||||
|
||||
nWidth+=3;
|
||||
nWidth=nWidth>>2;
|
||||
|
||||
|
||||
last_line=nHeight&1;
|
||||
nHeight++;
|
||||
nHeight=nHeight>>1;
|
||||
|
||||
|
||||
VaddDst=(scanline<<1)-(nWidth<<4);
|
||||
VaddY=(iStride[0]<<1)-(nWidth<<2);
|
||||
VaddUV=iStride[1]-(((nWidth<<1)+2)&0xFFFC);
|
||||
|
||||
|
||||
|
||||
while(nHeight-- >0){
|
||||
if(nHeight==0){
|
||||
last_line=last_line<<1;
|
||||
}
|
||||
|
||||
i=0;
|
||||
do{
|
||||
/*
|
||||
* Well, in the end it should look like this:
|
||||
* C = Y;
|
||||
* D = U - 128;
|
||||
* E = V - 128;
|
||||
*
|
||||
* R = clip(( 256 * C + 403 * E + 128) >> 8);
|
||||
* G = clip(( 256 * C - 48 * D - 120 * E + 128) >> 8);
|
||||
* B = clip(( 256 * C + 475 * D + 128) >> 8);
|
||||
*/
|
||||
if(!(i&0x01)){
|
||||
/* Y-, U- and V-data is stored in different arrays.
|
||||
* We start with processing U-data.
|
||||
*
|
||||
* at first we fetch four U-values from its array and shuffle them like this:
|
||||
* 0d0d 0c0c 0b0b 0a0a
|
||||
* we've done two things: converting the values to signed words and duplicating
|
||||
* each value, because always two pixel "share" the same U- (and V-) data
|
||||
*/
|
||||
r0=_mm_cvtsi32_si128(*(UINT32 *)UData);
|
||||
r5=_mm_set_epi32(0x80038003,0x80028002,0x80018001,0x80008000);
|
||||
r0=_mm_shuffle_epi8(r0,r5);
|
||||
|
||||
UData+=4;
|
||||
|
||||
r3=_mm_set_epi16(128,128,128,128,128,128,128,128);
|
||||
r0=_mm_subs_epi16(r0,r3);
|
||||
|
||||
r2=r0;
|
||||
|
||||
r4=r0;
|
||||
r7=_mm_set_epi16(48,48,48,48,48,48,48,48);
|
||||
r0=_mm_mullo_epi16(r0,r7);
|
||||
r4=_mm_mulhi_epi16(r4,r7);
|
||||
r7=r0;
|
||||
r0=_mm_unpacklo_epi16(r0,r4);
|
||||
r4=_mm_unpackhi_epi16(r7,r4);
|
||||
|
||||
|
||||
r6=_mm_set_epi32(128,128,128,128);
|
||||
r0=_mm_sub_epi32(r0,r6);
|
||||
r4=_mm_sub_epi32(r4,r6);
|
||||
|
||||
|
||||
r1=r2;
|
||||
r7=_mm_set_epi16(475,475,475,475,475,475,475,475);
|
||||
r1=_mm_mullo_epi16(r1,r7);
|
||||
r2=_mm_mulhi_epi16(r2,r7);
|
||||
r7=r1;
|
||||
r1=_mm_unpacklo_epi16(r1,r2);
|
||||
r7=_mm_unpackhi_epi16(r7,r2);
|
||||
|
||||
r1=_mm_add_epi32(r1,r6);
|
||||
r7=_mm_add_epi32(r7,r6);
|
||||
|
||||
_mm_store_si128(buffer+16,r7);
|
||||
|
||||
/* Now we've prepared U-data. Preparing V-data is actually the same, just with other coefficients */
|
||||
r2=_mm_cvtsi32_si128(*(UINT32 *)VData);
|
||||
r2=_mm_shuffle_epi8(r2,r5);
|
||||
|
||||
VData+=4;
|
||||
|
||||
r2=_mm_subs_epi16(r2,r3);
|
||||
|
||||
r5=r2;
|
||||
|
||||
|
||||
r3=r2;
|
||||
r7=_mm_set_epi16(403,403,403,403,403,403,403,403);
|
||||
r2=_mm_mullo_epi16(r2,r7);
|
||||
r3=_mm_mulhi_epi16(r3,r7);
|
||||
r7=r2;
|
||||
r2=_mm_unpacklo_epi16(r2,r3);
|
||||
r7=_mm_unpackhi_epi16(r7,r3);
|
||||
|
||||
r2=_mm_add_epi32(r2,r6);
|
||||
r7=_mm_add_epi32(r7,r6);
|
||||
|
||||
_mm_store_si128(buffer+32,r7);
|
||||
|
||||
|
||||
|
||||
r3=r5;
|
||||
r7=_mm_set_epi16(120,120,120,120,120,120,120,120);
|
||||
r3=_mm_mullo_epi16(r3,r7);
|
||||
r5=_mm_mulhi_epi16(r5,r7);
|
||||
r7=r3;
|
||||
r3=_mm_unpacklo_epi16(r3,r5);
|
||||
r7=_mm_unpackhi_epi16(r7,r5);
|
||||
|
||||
r0=_mm_add_epi32(r0,r3);
|
||||
r4=_mm_add_epi32(r4,r7);
|
||||
|
||||
_mm_store_si128(buffer,r4);
|
||||
}else{
|
||||
r1=_mm_load_si128(buffer+16);
|
||||
r2=_mm_load_si128(buffer+32);
|
||||
r0=_mm_load_si128(buffer);
|
||||
}
|
||||
|
||||
if(++i==nWidth)
|
||||
last_column=last_column<<1;
|
||||
|
||||
//processing Y data
|
||||
r4=_mm_cvtsi32_si128(*(UINT32 *)YData);
|
||||
r7=_mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080);
|
||||
r4=_mm_shuffle_epi8(r4,r7);
|
||||
|
||||
r5=r4;
|
||||
r6=r4;
|
||||
|
||||
r4=_mm_add_epi32(r4,r2);
|
||||
r5=_mm_sub_epi32(r5,r0);
|
||||
r6=_mm_add_epi32(r6,r1);
|
||||
|
||||
|
||||
r4=_mm_slli_epi32(r4,8);
|
||||
r5=_mm_slli_epi32(r5,8);
|
||||
r6=_mm_slli_epi32(r6,8);
|
||||
|
||||
r7=_mm_set_epi32(0,0,0,0);
|
||||
r4=_mm_max_epi16(r4,r7);
|
||||
r5=_mm_max_epi16(r5,r7);
|
||||
r6=_mm_max_epi16(r6,r7);
|
||||
|
||||
r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
|
||||
r4=_mm_min_epi16(r4,r7);
|
||||
r5=_mm_min_epi16(r5,r7);
|
||||
r6=_mm_min_epi16(r6,r7);
|
||||
|
||||
//r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
|
||||
r4=_mm_and_si128(r4,r7);
|
||||
|
||||
r7=_mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280);
|
||||
r5=_mm_shuffle_epi8(r5,r7);
|
||||
|
||||
r7=_mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002);
|
||||
r6=_mm_shuffle_epi8(r6,r7);
|
||||
|
||||
|
||||
r4=_mm_or_si128(r4,r5);
|
||||
r4=_mm_or_si128(r4,r6);
|
||||
|
||||
|
||||
if(last_column&0x02){
|
||||
r6=_mm_load_si128(buffer+48);
|
||||
r4=_mm_and_si128(r4,r6);
|
||||
r5=_mm_lddqu_si128((__m128i *)pDstData);
|
||||
r6=_mm_andnot_si128(r6,r5);
|
||||
r4=_mm_or_si128(r4,r6);
|
||||
}
|
||||
_mm_storeu_si128((__m128i *)pDstData,r4);
|
||||
|
||||
//Y data processing in secound line
|
||||
if(!(last_line&0x02)){
|
||||
r4=_mm_cvtsi32_si128(*(UINT32 *)(YData+iStride[0]));
|
||||
r7=_mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080);
|
||||
r4=_mm_shuffle_epi8(r4,r7);
|
||||
|
||||
r5=r4;
|
||||
r6=r4;
|
||||
|
||||
r4=_mm_add_epi32(r4,r2);
|
||||
r5=_mm_sub_epi32(r5,r0);
|
||||
r6=_mm_add_epi32(r6,r1);
|
||||
|
||||
|
||||
r4=_mm_slli_epi32(r4,8);
|
||||
r5=_mm_slli_epi32(r5,8);
|
||||
r6=_mm_slli_epi32(r6,8);
|
||||
|
||||
r7=_mm_set_epi32(0,0,0,0);
|
||||
r4=_mm_max_epi16(r4,r7);
|
||||
r5=_mm_max_epi16(r5,r7);
|
||||
r6=_mm_max_epi16(r6,r7);
|
||||
|
||||
r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
|
||||
r4=_mm_min_epi16(r4,r7);
|
||||
r5=_mm_min_epi16(r5,r7);
|
||||
r6=_mm_min_epi16(r6,r7);
|
||||
|
||||
r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
|
||||
r4=_mm_and_si128(r4,r7);
|
||||
|
||||
r7=_mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280);
|
||||
r5=_mm_shuffle_epi8(r5,r7);
|
||||
|
||||
r7=_mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002);
|
||||
r6=_mm_shuffle_epi8(r6,r7);
|
||||
|
||||
|
||||
r4=_mm_or_si128(r4,r5);
|
||||
r4=_mm_or_si128(r4,r6);
|
||||
|
||||
|
||||
if(last_column&0x02){
|
||||
r6=_mm_load_si128(buffer+48);
|
||||
r4=_mm_and_si128(r4,r6);
|
||||
r5=_mm_lddqu_si128((__m128i *)(pDstData+scanline));
|
||||
r6=_mm_andnot_si128(r6,r5);
|
||||
r4=_mm_or_si128(r4,r6);
|
||||
|
||||
last_column=last_column>>1;
|
||||
}
|
||||
_mm_storeu_si128((__m128i *)(pDstData+scanline),r4);
|
||||
}
|
||||
|
||||
pDstData+=16;
|
||||
YData+=4;
|
||||
|
||||
}while(i<nWidth);
|
||||
|
||||
pDstData+=VaddDst;
|
||||
YData+=VaddY;
|
||||
UData+=VaddUV;
|
||||
VData+=VaddUV;
|
||||
}
|
||||
|
||||
_aligned_free(buffer);
|
||||
return 0;
|
||||
}
|
@ -1,454 +0,0 @@
|
||||
; a entire function for converting YUV420p data to the RGB format (without any special upconverting)
|
||||
; It's completely written in nasm-x86-assembly for intel processors supporting SSSE3 and higher.
|
||||
; Restrictions are that output data has to be aligned to 16 byte (a question of REAL performance!)
|
||||
; and the width of resolution must be divisable by four.
|
||||
;
|
||||
section .text
|
||||
global check_ssse3
|
||||
|
||||
check_ssse3:
|
||||
push ebx
|
||||
|
||||
pushf
|
||||
pop eax
|
||||
or eax,1<<21
|
||||
push eax
|
||||
popf
|
||||
pushf
|
||||
pop eax
|
||||
test eax,1<<21
|
||||
jz check_ssse3_end
|
||||
|
||||
and eax,~(1<<21)
|
||||
push eax
|
||||
popf
|
||||
|
||||
|
||||
mov eax,1
|
||||
mov ebx,0
|
||||
cpuid
|
||||
test edx,1<<25 ;sse
|
||||
jz check_ssse3_end
|
||||
test edx,1<<26 ;sse2
|
||||
jz check_ssse3_end
|
||||
test ecx,1<<0 ;sse3
|
||||
jz check_ssse3_end
|
||||
test ecx,1<<9 ;ssse3
|
||||
jz check_ssse3_end
|
||||
|
||||
|
||||
pop ebx
|
||||
mov eax,0
|
||||
ret
|
||||
|
||||
|
||||
check_ssse3_end:
|
||||
pop ebx
|
||||
mov eax,1
|
||||
ret
|
||||
|
||||
|
||||
;extern int freerdp_image_yuv420p_to_xrgb(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int istride0,int istride1)
|
||||
global freerdp_image_yuv420p_to_xrgb
|
||||
freerdp_image_yuv420p_to_xrgb:
|
||||
push ebx
|
||||
push ebp
|
||||
|
||||
;check wether stack is aligned to 16 byte boundary
|
||||
;
|
||||
; ---current stack value---|-----x-----|----42 byte---|---16 byte aligned stack---
|
||||
; lets say 508 2 506 464
|
||||
; 1FCH 2H 1FAH 1D0H
|
||||
; 1F0H 1D0H
|
||||
; |------1FCH&FH----|1FCH&^FH
|
||||
; |1FCH&FH-AH |--AH-|---16 byte aligned stack------------
|
||||
; We've got only one problem: what if 1FCH&FH was smaller than AH?
|
||||
; We could either add something to sp (impossible) or subtract 10H-(AH-1FCH&FH) [%10H]
|
||||
; That's the same like (1FCH&FH-AH+10H)&FH and (1FCH+6H)&FH
|
||||
mov eax,esp
|
||||
add eax,6H
|
||||
and eax,1111B
|
||||
sub esp,eax
|
||||
|
||||
mov ebp,esp
|
||||
|
||||
;"local variables"
|
||||
sub esp,318 ;res 8 -8,res 8 -16,res 8 -24,U 8 -32,nWidth 2 -34,nHeight 2 -36,iStride0 2 -38,iStride1 2 -40,last_line 1 -41,res 1 -42,G 16 -58,B 16 -74,
|
||||
;R 16 -90,add:128 16 -106,sub:128 16 -122,mul:48 16 -138,mul:475 16 -154,mul:403 16 -170,mul:120 16 -186,VaddY 4 -190,VaddUV 4 -194,stack offset 8 -202,
|
||||
;cmp:255 16 -218,cmp:0 16 -234,shuflleR 16 -250,andG 16 -266,shuffleB 16 -280,shuffleY 16 -296,shuffleUV 16 -314,scanline 4 -318
|
||||
|
||||
;pDstData:edi,
|
||||
|
||||
mov [ebp-202],eax
|
||||
|
||||
;last_line: if the last (U,V doubled) line should be skipped, set to 1B
|
||||
|
||||
mov edi,[ebp+eax+12]
|
||||
|
||||
mov ecx,[ebp+eax+16]
|
||||
mov esi,[ecx]
|
||||
mov ebx,[ecx+4]
|
||||
mov [ebp-32],ebx
|
||||
mov ebx,[ecx+8]
|
||||
|
||||
|
||||
mov edx,[ebp+eax+20]
|
||||
mov [ebp-34],dx
|
||||
|
||||
shr word [ebp-34],2
|
||||
|
||||
mov [ebp-318],edx
|
||||
shl dword [ebp-318],2
|
||||
|
||||
|
||||
mov ecx,[ebp+eax+24]
|
||||
|
||||
mov [ebp-41],cl
|
||||
and byte [ebp-41],1B
|
||||
|
||||
inc cx
|
||||
shr cx,1
|
||||
mov [ebp-36],cx
|
||||
|
||||
|
||||
mov ecx,[ebp+eax+28]
|
||||
mov [ebp-38],cx
|
||||
|
||||
shl cx,1
|
||||
sub cx,dx
|
||||
mov [ebp-190],ecx
|
||||
|
||||
|
||||
mov ecx,[ebp+eax+32]
|
||||
mov [ebp-40],cx
|
||||
|
||||
|
||||
shr dx,1
|
||||
sub cx,dx
|
||||
mov [ebp-194],ecx
|
||||
|
||||
|
||||
mov eax,[ebp-32]
|
||||
|
||||
|
||||
;init masks
|
||||
mov ecx,00000080H
|
||||
mov [ebp-106],ecx
|
||||
mov [ebp-102],ecx
|
||||
mov [ebp-98],ecx
|
||||
mov [ebp-94],ecx
|
||||
|
||||
mov ecx,00800080H
|
||||
mov [ebp-122],ecx
|
||||
mov [ebp-118],ecx
|
||||
mov [ebp-114],ecx
|
||||
mov [ebp-110],ecx
|
||||
|
||||
mov ecx,00300030H
|
||||
mov [ebp-138],ecx
|
||||
mov [ebp-134],ecx
|
||||
mov [ebp-130],ecx
|
||||
mov [ebp-126],ecx
|
||||
|
||||
mov ecx,01DB01DBH
|
||||
mov [ebp-154],ecx
|
||||
mov [ebp-150],ecx
|
||||
mov [ebp-146],ecx
|
||||
mov [ebp-142],ecx
|
||||
|
||||
mov ecx,01930193H
|
||||
mov [ebp-170],ecx
|
||||
mov [ebp-166],ecx
|
||||
mov [ebp-162],ecx
|
||||
mov [ebp-158],ecx
|
||||
|
||||
mov ecx,00780078H
|
||||
mov [ebp-186],ecx
|
||||
mov [ebp-182],ecx
|
||||
mov [ebp-178],ecx
|
||||
mov [ebp-174],ecx
|
||||
|
||||
mov ecx,000FF0000H
|
||||
mov [ebp-218],ecx
|
||||
mov [ebp-214],ecx
|
||||
mov [ebp-210],ecx
|
||||
mov [ebp-206],ecx
|
||||
|
||||
mov ecx,00000000H
|
||||
mov [ebp-234],ecx
|
||||
mov [ebp-230],ecx
|
||||
mov [ebp-226],ecx
|
||||
mov [ebp-222],ecx
|
||||
|
||||
;shuffle masks
|
||||
;00 xx 00 00 00 xx 00 00 00 xx 00 00 00 xx 00 00
|
||||
;00 rr gg bb 00 rr gg bb 00 rr gg bb 00 rr gg bb
|
||||
mov ecx,00FF0000H
|
||||
mov [ebp-250],ecx
|
||||
mov [ebp-246],ecx
|
||||
mov [ebp-242],ecx
|
||||
mov [ebp-238],ecx
|
||||
|
||||
mov ecx,80800280H
|
||||
mov [ebp-266],ecx
|
||||
mov ecx,80800680H
|
||||
mov [ebp-262],ecx
|
||||
mov ecx,80800A80H
|
||||
mov [ebp-258],ecx
|
||||
mov ecx,80800E80H
|
||||
mov [ebp-254],ecx
|
||||
|
||||
mov ecx,80808002H
|
||||
mov [ebp-282],ecx
|
||||
mov ecx,80808006H
|
||||
mov [ebp-278],ecx
|
||||
mov ecx,8080800AH
|
||||
mov [ebp-274],ecx
|
||||
mov ecx,8080800EH
|
||||
mov [ebp-270],ecx
|
||||
|
||||
;dd cc bb aa
|
||||
;00 00 dd 00 00 00 cc 00 00 00 bb 00 00 00 aa 00
|
||||
mov ecx,80800080H
|
||||
mov [ebp-298],ecx
|
||||
mov ecx,80800180H
|
||||
mov [ebp-294],ecx
|
||||
mov ecx,80800280H
|
||||
mov [ebp-290],ecx
|
||||
mov ecx,80800380H
|
||||
mov [ebp-286],ecx
|
||||
|
||||
;dd cc bb aa
|
||||
;00 dd 00 dd 00 cc 00 cc 00 bb 00 bb 00 aa 00 aa
|
||||
mov ecx,80008000H
|
||||
mov [ebp-314],ecx
|
||||
mov ecx,80018001H
|
||||
mov [ebp-310],ecx
|
||||
mov ecx,80028002H
|
||||
mov [ebp-306],ecx
|
||||
mov ecx,80038003H
|
||||
mov [ebp-302],ecx
|
||||
|
||||
|
||||
|
||||
freerdp_image_yuv420p_to_xrgb_hloop:
|
||||
dec word [ebp-36]
|
||||
js freerdp_image_yuv420p_to_xrgb_hloop_end
|
||||
jnz not_last_line
|
||||
|
||||
shl byte [ebp-41],1
|
||||
not_last_line:
|
||||
|
||||
mov cx,[ebp-34]
|
||||
freerdp_image_yuv420p_to_xrgb_wloop:
|
||||
;main loop
|
||||
; C = Y;
|
||||
; D = U - 128;
|
||||
; E = V - 128;
|
||||
;
|
||||
; R = clip(( 256 * C + 403 * E + 128) >> 8);
|
||||
; G = clip(( 256 * C - 48 * D - 120 * E + 128) >> 8);
|
||||
; B = clip(( 256 * C + 475 * D + 128) >> 8);
|
||||
|
||||
test cx,1B
|
||||
jnz load_yuv_data
|
||||
|
||||
|
||||
;prepare U data
|
||||
movd xmm0,[eax]
|
||||
movdqa xmm5,[ebp-314]
|
||||
pshufb xmm0,xmm5 ;but this is the omest instruction of all!!
|
||||
|
||||
add eax,4
|
||||
|
||||
movdqa xmm3,[ebp-122]
|
||||
psubsw xmm0,xmm3
|
||||
|
||||
movdqa xmm2,xmm0
|
||||
|
||||
movdqa xmm4,xmm0
|
||||
movdqa xmm7,[ebp-138]
|
||||
pmullw xmm0,xmm7
|
||||
pmulhw xmm4,xmm7
|
||||
|
||||
movdqa xmm7,xmm0
|
||||
punpcklwd xmm0,xmm4 ;what an awesome instruction!
|
||||
punpckhwd xmm7,xmm4
|
||||
movdqa xmm4,xmm7
|
||||
|
||||
movdqa xmm6,[ebp-106]
|
||||
psubd xmm0,xmm6
|
||||
psubd xmm4,xmm6
|
||||
|
||||
|
||||
movdqa xmm1,xmm2
|
||||
movdqa xmm7,[ebp-154]
|
||||
pmullw xmm1,xmm7
|
||||
pmulhw xmm2,xmm7
|
||||
|
||||
movdqa xmm7,xmm1
|
||||
punpcklwd xmm1,xmm2
|
||||
punpckhwd xmm7,xmm2
|
||||
|
||||
paddd xmm1,xmm6
|
||||
paddd xmm7,xmm6
|
||||
|
||||
movdqa [ebp-74],xmm7
|
||||
|
||||
|
||||
;prepare V data
|
||||
movd xmm2,[ebx]
|
||||
pshufb xmm2,xmm5
|
||||
|
||||
add ebx,4
|
||||
|
||||
psubsw xmm2,xmm3
|
||||
|
||||
movdqa xmm5,xmm2
|
||||
|
||||
movdqa xmm3,xmm2
|
||||
movdqa xmm7,[ebp-170]
|
||||
pmullw xmm2,xmm7
|
||||
pmulhw xmm3,xmm7
|
||||
|
||||
movdqa xmm7,xmm2
|
||||
punpcklwd xmm2,xmm3
|
||||
punpckhwd xmm7,xmm3
|
||||
|
||||
paddd xmm2,xmm6
|
||||
paddd xmm7,xmm6
|
||||
|
||||
movdqa [ebp-90],xmm7
|
||||
|
||||
|
||||
movdqa xmm3,xmm5
|
||||
movdqa xmm7,[ebp-186]
|
||||
pmullw xmm3,xmm7
|
||||
pmulhw xmm5,xmm7
|
||||
|
||||
movdqa xmm7,xmm3
|
||||
punpcklwd xmm3,xmm5
|
||||
punpckhwd xmm7,xmm5
|
||||
|
||||
paddd xmm0,xmm3
|
||||
paddd xmm4,xmm7
|
||||
|
||||
movdqa [ebp-58],xmm4
|
||||
|
||||
jmp valid_yuv_data
|
||||
|
||||
load_yuv_data:
|
||||
movdqa xmm1,[ebp-74]
|
||||
movdqa xmm2,[ebp-90]
|
||||
movdqa xmm0,[ebp-58]
|
||||
|
||||
valid_yuv_data:
|
||||
|
||||
|
||||
;Y data processing
|
||||
movd xmm4,[esi]
|
||||
pshufb xmm4,[ebp-298]
|
||||
|
||||
movdqa xmm5,xmm4
|
||||
movdqa xmm6,xmm4
|
||||
|
||||
paddd xmm4,xmm2
|
||||
psubd xmm5,xmm0
|
||||
paddd xmm6,xmm1
|
||||
|
||||
pslld xmm4,8
|
||||
pslld xmm5,8
|
||||
pslld xmm6,8
|
||||
|
||||
movdqa xmm7,[ebp-234]
|
||||
pmaxsw xmm4,xmm7 ;what an awesome instruction!
|
||||
pmaxsw xmm5,xmm7
|
||||
pmaxsw xmm6,xmm7
|
||||
|
||||
movdqa xmm7,[ebp-218]
|
||||
pminsw xmm4,xmm7
|
||||
pminsw xmm5,xmm7
|
||||
pminsw xmm6,xmm7
|
||||
|
||||
pand xmm4,[ebp-250]
|
||||
pshufb xmm5,[ebp-266]
|
||||
pshufb xmm6,[ebp-282]
|
||||
|
||||
por xmm4,xmm5
|
||||
por xmm4,xmm6
|
||||
|
||||
movdqa [edi],xmm4
|
||||
|
||||
|
||||
;Y data processing in secound line
|
||||
test byte [ebp-41],2
|
||||
jnz skip_last_line1
|
||||
|
||||
mov dx,[ebp-38]
|
||||
and edx,0FFFFH
|
||||
movd xmm4,[esi+edx]
|
||||
pshufb xmm4,[ebp-298]
|
||||
|
||||
|
||||
movdqa xmm5,xmm4
|
||||
movdqa xmm6,xmm4
|
||||
|
||||
paddd xmm4,xmm2
|
||||
psubd xmm5,xmm0
|
||||
paddd xmm6,xmm1
|
||||
|
||||
pslld xmm4,8
|
||||
pslld xmm5,8
|
||||
pslld xmm6,8
|
||||
|
||||
movdqa xmm7,[ebp-234]
|
||||
pmaxsw xmm4,xmm7 ;what an awesome instruction!
|
||||
pmaxsw xmm5,xmm7
|
||||
pmaxsw xmm6,xmm7
|
||||
|
||||
movdqa xmm7,[ebp-218]
|
||||
pminsw xmm4,xmm7
|
||||
pminsw xmm5,xmm7
|
||||
pminsw xmm6,xmm7
|
||||
|
||||
pand xmm4,[ebp-250]
|
||||
pshufb xmm5,[ebp-266]
|
||||
pshufb xmm6,[ebp-282]
|
||||
|
||||
por xmm4,xmm5
|
||||
por xmm4,xmm6
|
||||
|
||||
mov edx,[ebp-318]
|
||||
movdqa [edi+edx],xmm4
|
||||
|
||||
skip_last_line1:
|
||||
add edi,16
|
||||
add esi,4
|
||||
|
||||
dec cx
|
||||
jne freerdp_image_yuv420p_to_xrgb_wloop
|
||||
|
||||
freerdp_image_yuv420p_to_xrgb_wloop_end:
|
||||
mov edx,[ebp-318]
|
||||
add edi,edx
|
||||
|
||||
mov edx,[ebp-190]
|
||||
add esi,edx
|
||||
|
||||
mov edx,[ebp-194]
|
||||
add eax,edx
|
||||
add ebx,edx
|
||||
|
||||
jmp freerdp_image_yuv420p_to_xrgb_hloop
|
||||
|
||||
freerdp_image_yuv420p_to_xrgb_hloop_end:
|
||||
|
||||
mov eax,0
|
||||
freerdp_image_yuv420p_to_xrgb_end:
|
||||
mov edx,[ebp-202]
|
||||
|
||||
mov esp,ebp
|
||||
add esp,edx
|
||||
pop ebp
|
||||
pop ebx
|
||||
ret
|
@ -1,628 +0,0 @@
|
||||
; function for converting YUV420p data to the RGB format (but without any special upconverting)
|
||||
; It's completely written in nasm-x86-assembly for intel processors supporting SSSE3 and higher.
|
||||
; The target scanline (6th parameter) must be a multiple of 16.
|
||||
; iStride[0] must be (target scanline) / 4 or bigger and iStride[1] the next multiple of four
|
||||
; of the half of iStride[0] or bigger
|
||||
;
|
||||
section .text
|
||||
global check_ssse3
|
||||
|
||||
check_ssse3:
|
||||
push rbx
|
||||
|
||||
pushf
|
||||
pop rax
|
||||
or rax,1<<21
|
||||
push rax
|
||||
popf
|
||||
pushf
|
||||
pop rax
|
||||
test rax,1<<21
|
||||
jz check_ssse3_end
|
||||
|
||||
and rax,~(1<<21)
|
||||
push rax
|
||||
popf
|
||||
|
||||
|
||||
mov eax,1
|
||||
mov ebx,0
|
||||
cpuid
|
||||
test edx,1<<25 ;sse
|
||||
jz check_ssse3_end
|
||||
test edx,1<<26 ;sse2
|
||||
jz check_ssse3_end
|
||||
test ecx,1<<0 ;sse3
|
||||
jz check_ssse3_end
|
||||
test ecx,1<<9 ;ssse3
|
||||
jz check_ssse3_end
|
||||
|
||||
|
||||
pop rbx
|
||||
mov eax,0
|
||||
ret
|
||||
|
||||
|
||||
check_ssse3_end:
|
||||
pop rbx
|
||||
mov eax,1
|
||||
ret
|
||||
|
||||
|
||||
;extern int freerdp_image_yuv420p_to_xrgb(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int *istride,int scanline)
|
||||
global freerdp_image_yuv420p_to_xrgb
|
||||
freerdp_image_yuv420p_to_xrgb:
|
||||
push rbx
|
||||
push rbp
|
||||
|
||||
;check wether stack is aligned to 16 byte boundary
|
||||
;
|
||||
; ---current stack value---|-----x-----|----42 byte---|---16 byte aligned stack---
|
||||
; lets say 508 2 506 464
|
||||
; 1FCH 2H 1FAH 1D0H
|
||||
; 1F0H 1D0H
|
||||
; |------1FCH&FH----|1FCH&^FH
|
||||
; |1FCH&FH-AH |--AH-|---16 byte aligned stack------------
|
||||
; We've got only one problem: what if 1FCH&FH was smaller than AH?
|
||||
; We could either add something to sp (impossible) or subtract 10H-(AH-1FCH&FH) [%10H]
|
||||
; That's the same like (1FCH&FH-AH+10H)&FH and (1FCH+6H)&FH
|
||||
mov r15,rsp
|
||||
add r15,6H
|
||||
and r15,1111B
|
||||
sub rsp,r15
|
||||
|
||||
mov rbp,rsp
|
||||
|
||||
xor r10,r10
|
||||
xor r11,r11
|
||||
xor r12,r12
|
||||
xor r13,r13
|
||||
xor r14,r14
|
||||
|
||||
;"local variables"
|
||||
sub rsp,338 ;pDstData 8 -8,Y 8 -16,U 8 -24,V 8 -32,nWidth 2 -34,nHeight 2 -36,iStride0 2 -38,iStride1 2 -40,last_line 1 -41,last_column 1 -42,
|
||||
;G 16 -58,B 16 -74,R 16 -90,add:128 16 -106,sub:128 16 -122,mul:48 16 -138,mul:475 16 -154,mul:403 16 -170,mul:120 16 -186,VaddY 2 -188,VaddUV 2 -190,
|
||||
;res 12 -202,cmp:255 16 -218,cmp:0 16 -234,shuflleR 16 -250,andG 16 -266,shuffleB 16 -280,shuffleY 16 -296,shuffleUV 16 -314,andRemainingColumns 16 -330,
|
||||
;VddDst 8 -338
|
||||
|
||||
;last_line: if the last (U,V doubled) line should be skipped, set to 10B
|
||||
;last_column: if it's the last column in a line, set to 10B (for handling line-endings not multiple by four)
|
||||
|
||||
mov [rbp-8],rdi
|
||||
|
||||
mov rax,[rsi]
|
||||
mov [rbp-16],rax
|
||||
mov rax,[rsi+8]
|
||||
mov [rbp-24],rax
|
||||
mov rax,[rsi+16]
|
||||
mov [rbp-32],rax
|
||||
|
||||
mov [rbp-34],dx
|
||||
mov r13w,cx
|
||||
|
||||
mov r10w,r9w
|
||||
and r10,0FFFFH
|
||||
|
||||
|
||||
mov ecx,[r8]
|
||||
mov [rbp-38],ecx
|
||||
mov r12d,[r8+4]
|
||||
mov [rbp-40],r12w
|
||||
|
||||
|
||||
mov [rbp-42],dl
|
||||
and byte [rbp-42],11B
|
||||
|
||||
|
||||
mov [rbp-338],r10
|
||||
shr word [rbp-338],1
|
||||
shl cx,1
|
||||
|
||||
mov r8w,[rbp-34]
|
||||
add r8w,3
|
||||
and r8w, 0FFFCH
|
||||
|
||||
sub [rbp-338],r8w
|
||||
sub cx,r8w
|
||||
|
||||
shr r8w,1
|
||||
|
||||
mov dx,r8w
|
||||
add dx,2
|
||||
and dx,0FFFCH
|
||||
sub r12w,dx
|
||||
|
||||
shl dword [rbp-338],2
|
||||
mov r11w,cx
|
||||
|
||||
shr r8w,1
|
||||
|
||||
mov r9w,[rbp-38]
|
||||
|
||||
|
||||
;and al,11B
|
||||
;jz no_column_rest
|
||||
|
||||
;inc word [rbp-34]
|
||||
|
||||
;no_column_rest:
|
||||
;mov [rbp-41],al
|
||||
|
||||
|
||||
|
||||
mov r14b,r13b
|
||||
and r14b,1B
|
||||
;jz no_line_rest
|
||||
|
||||
inc r13w
|
||||
|
||||
;no_line_rest:
|
||||
shr r13w,1
|
||||
|
||||
|
||||
|
||||
;init masks
|
||||
mov eax,00000080H
|
||||
mov [rbp-106],eax
|
||||
mov [rbp-102],eax
|
||||
mov [rbp-98],eax
|
||||
mov [rbp-94],eax
|
||||
|
||||
mov eax,00800080H
|
||||
mov [rbp-122],eax
|
||||
mov [rbp-118],eax
|
||||
mov [rbp-114],eax
|
||||
mov [rbp-110],eax
|
||||
|
||||
mov eax,00300030H
|
||||
mov [rbp-138],eax
|
||||
mov [rbp-134],eax
|
||||
mov [rbp-130],eax
|
||||
mov [rbp-126],eax
|
||||
|
||||
mov eax,01DB01DBH
|
||||
mov [rbp-154],eax
|
||||
mov [rbp-150],eax
|
||||
mov [rbp-146],eax
|
||||
mov [rbp-142],eax
|
||||
|
||||
mov eax,01930193H
|
||||
mov [rbp-170],eax
|
||||
mov [rbp-166],eax
|
||||
mov [rbp-162],eax
|
||||
mov [rbp-158],eax
|
||||
|
||||
mov eax,00780078H
|
||||
mov [rbp-186],eax
|
||||
mov [rbp-182],eax
|
||||
mov [rbp-178],eax
|
||||
mov [rbp-174],eax
|
||||
|
||||
mov eax,000FF0000H
|
||||
mov [rbp-218],eax
|
||||
mov [rbp-214],eax
|
||||
mov [rbp-210],eax
|
||||
mov [rbp-206],eax
|
||||
|
||||
mov eax,00000000H
|
||||
mov [rbp-234],eax
|
||||
mov [rbp-230],eax
|
||||
mov [rbp-226],eax
|
||||
mov [rbp-222],eax
|
||||
|
||||
;shuffle masks
|
||||
;00 xx 00 00 00 xx 00 00 00 xx 00 00 00 xx 00 00
|
||||
;00 rr gg bb 00 rr gg bb 00 rr gg bb 00 rr gg bb
|
||||
mov eax,00FF0000H
|
||||
mov [rbp-250],eax
|
||||
mov [rbp-246],eax
|
||||
mov [rbp-242],eax
|
||||
mov [rbp-238],eax
|
||||
|
||||
mov eax,80800280H
|
||||
mov [rbp-266],eax
|
||||
mov eax,80800680H
|
||||
mov [rbp-262],eax
|
||||
mov eax,80800A80H
|
||||
mov [rbp-258],eax
|
||||
mov eax,80800E80H
|
||||
mov [rbp-254],eax
|
||||
|
||||
mov eax,80808002H
|
||||
mov [rbp-282],eax
|
||||
mov eax,80808006H
|
||||
mov [rbp-278],eax
|
||||
mov eax,8080800AH
|
||||
mov [rbp-274],eax
|
||||
mov eax,8080800EH
|
||||
mov [rbp-270],eax
|
||||
|
||||
;dd cc bb aa
|
||||
;00 00 dd 00 00 00 cc 00 00 00 bb 00 00 00 aa 00
|
||||
mov eax,80800080H
|
||||
mov [rbp-298],eax
|
||||
mov eax,80800180H
|
||||
mov [rbp-294],eax
|
||||
mov eax,80800280H
|
||||
mov [rbp-290],eax
|
||||
mov eax,80800380H
|
||||
mov [rbp-286],eax
|
||||
|
||||
;dd cc bb aa
|
||||
;00 dd 00 dd 00 cc 00 cc 00 bb 00 bb 00 aa 00 aa
|
||||
mov eax,80008000H
|
||||
mov [rbp-314],eax
|
||||
mov eax,80018001H
|
||||
mov [rbp-310],eax
|
||||
mov eax,80028002H
|
||||
mov [rbp-306],eax
|
||||
mov eax,80038003H
|
||||
mov [rbp-302],eax
|
||||
|
||||
;remaining columns and mask
|
||||
cmp byte [rbp-42],0
|
||||
je freerdp_image_yuv420p_to_xrgb_no_columns_remain
|
||||
|
||||
mov dl,[rbp-42]
|
||||
xor ebx,ebx
|
||||
xor ecx,ecx
|
||||
xor esi,esi
|
||||
|
||||
mov eax,0FFFFFFFFH
|
||||
cmp dl,1H
|
||||
je freerdp_image_yuv420p_to_xrgb_write_columns_remain
|
||||
|
||||
mov ebx,0FFFFFFFFH
|
||||
cmp dl,2H
|
||||
je freerdp_image_yuv420p_to_xrgb_write_columns_remain
|
||||
|
||||
mov ecx,0FFFFFFFFH
|
||||
|
||||
freerdp_image_yuv420p_to_xrgb_write_columns_remain:
|
||||
mov [rbp-330],eax
|
||||
mov [rbp-326],ebx
|
||||
mov [rbp-322],ecx
|
||||
mov [rbp-318],esi
|
||||
mov byte [rbp-42],1
|
||||
|
||||
freerdp_image_yuv420p_to_xrgb_no_columns_remain:
|
||||
|
||||
|
||||
mov rsi,[rbp-16]
|
||||
mov rax,[rbp-24]
|
||||
mov rbx,[rbp-32]
|
||||
|
||||
;jmp freerdp_image_yuv420p_to_xrgb_end
|
||||
|
||||
freerdp_image_yuv420p_to_xrgb_hloop:
|
||||
dec r13w
|
||||
js freerdp_image_yuv420p_to_xrgb_hloop_end
|
||||
jnz not_last_line
|
||||
|
||||
shl r14b,1
|
||||
not_last_line:
|
||||
|
||||
xor cx,cx
|
||||
freerdp_image_yuv420p_to_xrgb_wloop:
|
||||
; Well, in the end it should look like this:
|
||||
; C = Y;
|
||||
; D = U - 128;
|
||||
; E = V - 128;
|
||||
;
|
||||
; R = clip(( 256 * C + 403 * E + 128) >> 8);
|
||||
; G = clip(( 256 * C - 48 * D - 120 * E + 128) >> 8);
|
||||
; B = clip(( 256 * C + 475 * D + 128) >> 8);
|
||||
|
||||
test cx,1B
|
||||
jnz freerdp_image_yuv420p_to_xrgb_load_yuv_data
|
||||
|
||||
|
||||
; Y-, U- and V-data is stored in different arrays.
|
||||
; We start with processing U-data.
|
||||
|
||||
; at first we fetch four U-values from its array and shuffle them like this:
|
||||
; 0d0d 0c0c 0b0b 0a0a
|
||||
; we've done two things: converting the values to signed words and duplicating
|
||||
; each value, because always two pixel "share" the same U- (and V-) data
|
||||
movd xmm0,[rax]
|
||||
movdqa xmm5,[rbp-314]
|
||||
pshufb xmm0,xmm5 ;but this is the awesomest instruction of all!!
|
||||
|
||||
add rax,4
|
||||
|
||||
; then we subtract 128 from each value, so we get D
|
||||
movdqa xmm3,[rbp-122]
|
||||
psubsw xmm0,xmm3
|
||||
|
||||
; we need to do two things with our D, so let's store it for later use
|
||||
movdqa xmm2,xmm0
|
||||
|
||||
; now we can multiply our D with 48 and unpack it to xmm4:xmm0
|
||||
; this is what we need to get G data later on
|
||||
movdqa xmm4,xmm0
|
||||
movdqa xmm7,[rbp-138]
|
||||
pmullw xmm0,xmm7
|
||||
pmulhw xmm4,xmm7
|
||||
|
||||
movdqa xmm7,xmm0
|
||||
punpcklwd xmm0,xmm4 ;what an awesome instruction!
|
||||
punpckhwd xmm7,xmm4
|
||||
movdqa xmm4,xmm7
|
||||
|
||||
; to complete this step, add (?) 128 to each value (rounding ?!)
|
||||
; yeah, add. in the end this will be subtracted from something,
|
||||
; because it's part of G: 256*C - (48*D + 120*E - 128), 48*D-128 !
|
||||
; by the way, our values have become signed dwords during multiplication!
|
||||
movdqa xmm6,[rbp-106]
|
||||
psubd xmm0,xmm6
|
||||
psubd xmm4,xmm6
|
||||
|
||||
|
||||
; to get B data, we need to prepare a secound value, D*475+128
|
||||
movdqa xmm1,xmm2
|
||||
movdqa xmm7,[rbp-154]
|
||||
pmullw xmm1,xmm7
|
||||
pmulhw xmm2,xmm7
|
||||
|
||||
movdqa xmm7,xmm1
|
||||
punpcklwd xmm1,xmm2
|
||||
punpckhwd xmm7,xmm2
|
||||
|
||||
paddd xmm1,xmm6
|
||||
paddd xmm7,xmm6
|
||||
|
||||
; so we got something like this: xmm7:xmm1
|
||||
; this pair contains values for 16 pixel:
|
||||
; aabbccdd
|
||||
; aabbccdd, but we can only work on four pixel at once, so we need to save upper values
|
||||
movdqa [rbp-74],xmm7
|
||||
|
||||
|
||||
; Now we've prepared U-data. Preparing V-data is actually the same, just with other coefficients.
|
||||
movd xmm2,[rbx]
|
||||
pshufb xmm2,xmm5
|
||||
|
||||
add rbx,4
|
||||
|
||||
psubsw xmm2,xmm3
|
||||
|
||||
movdqa xmm5,xmm2
|
||||
|
||||
; this is also known as E*403+128, we need it to convert R data
|
||||
movdqa xmm3,xmm2
|
||||
movdqa xmm7,[rbp-170]
|
||||
pmullw xmm2,xmm7
|
||||
pmulhw xmm3,xmm7
|
||||
|
||||
movdqa xmm7,xmm2
|
||||
punpcklwd xmm2,xmm3
|
||||
punpckhwd xmm7,xmm3
|
||||
|
||||
paddd xmm2,xmm6
|
||||
paddd xmm7,xmm6
|
||||
|
||||
; and preserve upper four values for future ...
|
||||
movdqa [rbp-90],xmm7
|
||||
|
||||
|
||||
; doing this step: E*120
|
||||
movdqa xmm3,xmm5
|
||||
movdqa xmm7,[rbp-186]
|
||||
pmullw xmm3,xmm7
|
||||
pmulhw xmm5,xmm7
|
||||
|
||||
movdqa xmm7,xmm3
|
||||
punpcklwd xmm3,xmm5
|
||||
punpckhwd xmm7,xmm5
|
||||
|
||||
; now we complete what we've begun above:
|
||||
; (48*D-128) + (120*E) = (48*D +120*E -128)
|
||||
paddd xmm0,xmm3
|
||||
paddd xmm4,xmm7
|
||||
|
||||
; and store to memory !
|
||||
movdqa [rbp-58],xmm4
|
||||
|
||||
; real assembly programmers do not only produce best results between 0 and 5 o'clock,
|
||||
; but are also kangaroos!
|
||||
jmp freerdp_image_yuv420p_to_xrgb_valid_yuv_data
|
||||
|
||||
freerdp_image_yuv420p_to_xrgb_load_yuv_data:
|
||||
; maybe you've wondered about the conditional jump to this label above ?
|
||||
; Well, we prepared UV data for eight pixel in each line, but can only process four
|
||||
; per loop. So we need to load the upper four pixel data from memory each secound loop!
|
||||
movdqa xmm1,[rbp-74]
|
||||
movdqa xmm2,[rbp-90]
|
||||
movdqa xmm0,[rbp-58]
|
||||
|
||||
freerdp_image_yuv420p_to_xrgb_valid_yuv_data:
|
||||
|
||||
inc cx
|
||||
cmp cx,r8w
|
||||
jne freerdp_image_yuv420p_to_xrgb_not_last_columns
|
||||
|
||||
shl byte [rbp-42],1
|
||||
|
||||
|
||||
freerdp_image_yuv420p_to_xrgb_not_last_columns:
|
||||
|
||||
; We didn't produce any output yet, so let's do so!
|
||||
; Ok, fetch four pixel from the Y-data array and shuffle them like this:
|
||||
; 00d0 00c0 00b0 00a0, to get signed dwords and multiply by 256
|
||||
movd xmm4,[rsi]
|
||||
pshufb xmm4,[rbp-298]
|
||||
|
||||
movdqa xmm5,xmm4
|
||||
movdqa xmm6,xmm4
|
||||
|
||||
; no we can perform the "real" conversion itself and produce output!
|
||||
paddd xmm4,xmm2
|
||||
psubd xmm5,xmm0
|
||||
paddd xmm6,xmm1
|
||||
|
||||
; in the end, we only need bytes for RGB values.
|
||||
; So, what do we do? right! shifting left makes values bigger and thats always good.
|
||||
; before we had dwords of data, and by shifting left and treating the result
|
||||
; as packed words, we get not only signed words, but do also divide by 256
|
||||
; imagine, data is now ordered this way: ddx0 ccx0 bbx0 aax0, and x is the least
|
||||
; significant byte, that we don't need anymore, because we've done some rounding
|
||||
pslld xmm4,8
|
||||
pslld xmm5,8
|
||||
pslld xmm6,8
|
||||
|
||||
; one thing we still have to face is the clip() function ...
|
||||
; we have still signed words, and there are those min/max instructions in SSE2 ...
|
||||
; the max instruction takes always the bigger of the two operands and stores it in the first one,
|
||||
; and it operates with signs !
|
||||
; if we feed it with our values and zeros, it takes the zeros if our values are smaller than
|
||||
; zero and otherwise our values
|
||||
movdqa xmm7,[rbp-234]
|
||||
pmaxsw xmm4,xmm7 ;what an awesome instruction!
|
||||
pmaxsw xmm5,xmm7
|
||||
pmaxsw xmm6,xmm7
|
||||
|
||||
; the same thing just completely different can be used to limit our values to 255,
|
||||
; but now using the min instruction and 255s
|
||||
movdqa xmm7,[rbp-218]
|
||||
pminsw xmm4,xmm7
|
||||
pminsw xmm5,xmm7
|
||||
pminsw xmm6,xmm7
|
||||
|
||||
; Now we got our bytes.
|
||||
; the moment has come to assemble the three channels R,G and B to the xrgb dwords
|
||||
; on Red channel we just have to and each futural dword with 00FF0000H
|
||||
pand xmm4,[rbp-250]
|
||||
; on Green channel we have to shuffle somehow, so we get something like this:
|
||||
; 00d0 00c0 00b0 00a0
|
||||
pshufb xmm5,[rbp-266]
|
||||
; and on Blue channel that one:
|
||||
; 000d 000c 000b 000a
|
||||
pshufb xmm6,[rbp-282]
|
||||
|
||||
; and at last we or it together and get this one:
|
||||
; xrgb xrgb xrgb xrgb
|
||||
por xmm4,xmm5
|
||||
por xmm4,xmm6
|
||||
|
||||
; Only thing to do know is writing data to memory, but this gets a bit more
|
||||
; complicated if the width is not a multiple of four and it is the last column in line.
|
||||
; but otherwise just play the kangaroo
|
||||
test byte [rbp-42],2
|
||||
je freerdp_image_yuv420p_to_xrgb_column_process_complete
|
||||
|
||||
; let's say, we need to only convert six pixel in width
|
||||
; Ok, the first 4 pixel will be converted just like every 4 pixel else, but
|
||||
; if it's the last loop in line, [rbp-42] is shifted left by one (curious? have a look above),
|
||||
; and we land here. Through initialisation a mask was prepared. In this case it looks like
|
||||
; 0000FFFFH 0000FFFFH 0000FFFFH 0000FFFFH
|
||||
movdqa xmm6,[rbp-330]
|
||||
; we and our output data with this mask to get only the valid pixel
|
||||
pand xmm4,xmm6
|
||||
; then we fetch memory from the destination array ...
|
||||
movdqu xmm5,[rdi]
|
||||
; ... and and it with the inverse mask. We get only those pixel, which should not be updated
|
||||
pandn xmm6,xmm5
|
||||
; we only have to or the two values together and write it back to the destination array,
|
||||
; and only the pixel that should be updated really get changed.
|
||||
por xmm4,xmm6
|
||||
|
||||
freerdp_image_yuv420p_to_xrgb_column_process_complete:
|
||||
movdqu [rdi],xmm4
|
||||
|
||||
|
||||
; Because UV data is the same for two lines, we can process the secound line just here,
|
||||
; in the same loop. Only thing we need to do is to add some offsets to the Y- and destination
|
||||
; pointer. These offsets are iStride[0] and the target scanline.
|
||||
; But if we don't need to process the secound line, like if we are in the last line of processing nine lines,
|
||||
; we just skip all this.
|
||||
test r14b,2
|
||||
jnz freerdp_yuv420p_to_xrgb_skip_last_line
|
||||
|
||||
movd xmm4,[rsi+r9]
|
||||
pshufb xmm4,[rbp-298]
|
||||
|
||||
|
||||
movdqa xmm5,xmm4
|
||||
movdqa xmm6,xmm4
|
||||
|
||||
paddd xmm4,xmm2
|
||||
psubd xmm5,xmm0
|
||||
paddd xmm6,xmm1
|
||||
|
||||
pslld xmm4,8
|
||||
pslld xmm5,8
|
||||
pslld xmm6,8
|
||||
|
||||
movdqa xmm7,[rbp-234]
|
||||
pmaxsw xmm4,xmm7 ;what an awesome instruction!
|
||||
pmaxsw xmm5,xmm7
|
||||
pmaxsw xmm6,xmm7
|
||||
|
||||
movdqa xmm7,[rbp-218]
|
||||
pminsw xmm4,xmm7
|
||||
pminsw xmm5,xmm7
|
||||
pminsw xmm6,xmm7
|
||||
|
||||
pand xmm4,[rbp-250]
|
||||
pshufb xmm5,[rbp-266]
|
||||
pshufb xmm6,[rbp-282]
|
||||
|
||||
por xmm4,xmm5
|
||||
por xmm4,xmm6
|
||||
|
||||
test byte [rbp-42],2
|
||||
je freerdp_image_yuv420p_to_xrgb_column_process_complete2
|
||||
|
||||
movdqa xmm6,[rbp-330]
|
||||
pand xmm4,xmm6
|
||||
movdqu xmm5,[rdi+r10]
|
||||
pandn xmm6,xmm5
|
||||
por xmm4,xmm6
|
||||
|
||||
; only thing is, we should shift [rbp-42] back here, because we have processed the last column,
|
||||
; and this "special condition" can be released
|
||||
shr byte [rbp-42],1
|
||||
|
||||
freerdp_image_yuv420p_to_xrgb_column_process_complete2:
|
||||
movdqu [rdi+r10],xmm4
|
||||
|
||||
|
||||
freerdp_yuv420p_to_xrgb_skip_last_line:
|
||||
; after all we have to increase the destination- and Y-data pointer by four pixel
|
||||
add rdi,16
|
||||
add rsi,4
|
||||
|
||||
cmp cx,r8w
|
||||
jne freerdp_image_yuv420p_to_xrgb_wloop
|
||||
|
||||
freerdp_image_yuv420p_to_xrgb_wloop_end:
|
||||
; after each line we have to add the scanline to the destination pointer, because
|
||||
; we are processing two lines at once, but only increasing the destination pointer
|
||||
; in the first line. Well, we only have one pointer, so it's the easiest way to access
|
||||
; the secound line with the one pointer and an offset (scanline)
|
||||
; if we're not converting the full width of the scanline, like only 64 pixel, but the
|
||||
; output buffer was "designed" for 1920p HD, we have to add the remaining length for each line,
|
||||
; to get into the next line.
|
||||
add rdi,[rbp-338]
|
||||
|
||||
; same thing has to be done for Y-data, but with iStride[0] instead of the target scanline
|
||||
add rsi,r11
|
||||
|
||||
; and again for UV data, but here it's enough to add the remaining length, because
|
||||
; UV data is the same for two lines and there exists only one "UV line" on two "real lines"
|
||||
add rax,r12
|
||||
add rbx,r12
|
||||
;mov eax,r12d
|
||||
;jmp freerdp_image_yuv420p_to_xrgb_end
|
||||
|
||||
jmp freerdp_image_yuv420p_to_xrgb_hloop
|
||||
|
||||
freerdp_image_yuv420p_to_xrgb_hloop_end:
|
||||
|
||||
mov eax,0
|
||||
freerdp_image_yuv420p_to_xrgb_end:
|
||||
mov rsp,rbp
|
||||
add rsp,r15
|
||||
pop rbp
|
||||
pop rbx
|
||||
ret
|
@ -1,240 +0,0 @@
|
||||
;R=(256*Y+403*(V-128)+128)/265 =(256*Y+403*V-51456)/256
|
||||
;G=(256*Y-48*(U-128)-120*(V-128)+128)/256 =(256*Y-48*U-120*V+21632)/256
|
||||
;B=(256*Y+475*(U-128)+128)/256 =(256*Y+475*U-60672)/256
|
||||
|
||||
section .text
|
||||
;global YUV_to_RGB_asm
|
||||
YUV_to_RGB_asm:
|
||||
shl edi,8
|
||||
|
||||
mov eax,edx
|
||||
imul eax,403
|
||||
add eax,edi
|
||||
sub eax,51456
|
||||
|
||||
jae YUV_to_RGB_asm1
|
||||
mov eax,0
|
||||
jmp YUV_to_RGB_asm11
|
||||
|
||||
YUV_to_RGB_asm1:
|
||||
cmp eax, 0xFFFF
|
||||
jbe YUV_to_RGB_asm11
|
||||
mov eax,0xFF00
|
||||
|
||||
YUV_to_RGB_asm11:
|
||||
and eax,0xFF00
|
||||
shl eax,8
|
||||
|
||||
mov ebx,esi
|
||||
imul ebx,475
|
||||
add ebx,edi
|
||||
sub ebx,60672
|
||||
|
||||
jae YUV_to_RGB_asm2
|
||||
mov ebx, 0
|
||||
jmp YUV_to_RGB_asm21
|
||||
|
||||
YUV_to_RGB_asm2:
|
||||
cmp ebx,0xFFFF
|
||||
jbe YUV_to_RGB_asm21
|
||||
mov ebx,0xFF00
|
||||
|
||||
YUV_to_RGB_asm21:
|
||||
and ebx,0xFF00
|
||||
shr ebx,8
|
||||
|
||||
imul edx,120
|
||||
sub edi,edx
|
||||
imul esi,48
|
||||
sub edi,esi
|
||||
add edi,21632
|
||||
|
||||
bt edi,31
|
||||
jae YUV_to_RGB_asm3
|
||||
mov edi, 0
|
||||
jmp YUV_to_RGB_asm31
|
||||
|
||||
YUV_to_RGB_asm3:
|
||||
cmp edi,0xFFFF
|
||||
jbe YUV_to_RGB_asm31
|
||||
mov edi, 0xFF00
|
||||
|
||||
YUV_to_RGB_asm31:
|
||||
and edi,0xFF00
|
||||
|
||||
or eax,edi
|
||||
or eax,ebx
|
||||
|
||||
ret
|
||||
|
||||
;extern int freerdp_image_yuv_to_xrgb_asm(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight);
|
||||
global freerdp_image_yuv_to_xrgb_asm
|
||||
freerdp_image_yuv_to_xrgb_asm:
|
||||
push ebp
|
||||
mov ebp, esp
|
||||
;cWidth: cx
|
||||
sub esp,36 ;pDstData,pSrcData[3],nWidth,nHeight,cHeight,scanline,iStride[0] addition
|
||||
push ebx
|
||||
|
||||
|
||||
mov edi,[ebp+8]
|
||||
mov [ebp-4],edi
|
||||
|
||||
mov esi,[ebp+12]
|
||||
mov eax,[esi]
|
||||
mov [ebp-8],eax
|
||||
mov eax,[esi+4]
|
||||
mov [ebp-12],eax
|
||||
mov eax,[esi+8]
|
||||
mov [ebp-16],eax
|
||||
|
||||
mov edx,[ebp+16]
|
||||
mov [ebp-20],edx
|
||||
|
||||
|
||||
mov ecx,[ebp+20]
|
||||
shr ecx,1 ;/2
|
||||
mov [ebp-24],ecx
|
||||
|
||||
|
||||
shl edx,2
|
||||
mov [ebp-32],edx
|
||||
|
||||
|
||||
mov eax,[ebp-24]
|
||||
mov [ebp-28],eax
|
||||
|
||||
|
||||
mov ebx,[ebp+24]
|
||||
mov [ebp-36],ebx
|
||||
mov eax,[ebp-20]
|
||||
shl dword [ebp-36],1
|
||||
sub [ebp-36],eax
|
||||
|
||||
shr eax,1
|
||||
sub [ebp+28],eax
|
||||
|
||||
freerdp_image_yuv_to_xrgb_asm_loopH:
|
||||
mov ecx,[ebp-20]
|
||||
shr ecx,1
|
||||
|
||||
|
||||
freerdp_image_yuv_to_xrgb_asm_loopW:
|
||||
mov eax,[ebp-8]
|
||||
mov edi,[eax]
|
||||
and edi,0xFF
|
||||
|
||||
mov eax,[ebp-12]
|
||||
mov esi,[eax]
|
||||
and esi,0xFF
|
||||
|
||||
mov eax,[ebp-16]
|
||||
mov edx,[eax]
|
||||
and edx,0xFF
|
||||
|
||||
call YUV_to_RGB_asm
|
||||
|
||||
mov ebx,[ebp-4]
|
||||
mov [ebx],eax
|
||||
|
||||
|
||||
mov eax,[ebp-8]
|
||||
mov ebx,[ebp+24]
|
||||
mov edi,[eax+ebx]
|
||||
inc eax
|
||||
mov [ebp-8],eax
|
||||
and edi,0xFF
|
||||
|
||||
mov eax,[ebp-12]
|
||||
mov esi,[eax]
|
||||
and esi,0xFF
|
||||
|
||||
mov eax,[ebp-16]
|
||||
mov edx,[eax]
|
||||
and edx,0xFF
|
||||
|
||||
call YUV_to_RGB_asm
|
||||
|
||||
mov ebx,[ebp-4]
|
||||
mov edx,[ebp-32]
|
||||
mov [ebx+edx],eax
|
||||
add ebx,4
|
||||
mov [ebp-4],ebx
|
||||
|
||||
|
||||
mov eax,[ebp-8]
|
||||
mov edi,[eax]
|
||||
and edi,0xFF
|
||||
|
||||
mov eax,[ebp-12]
|
||||
mov esi,[eax]
|
||||
and esi,0xFF
|
||||
|
||||
mov eax,[ebp-16]
|
||||
mov edx,[eax]
|
||||
and edx,0xFF
|
||||
|
||||
call YUV_to_RGB_asm
|
||||
|
||||
mov ebx,[ebp-4]
|
||||
mov [ebx],eax
|
||||
|
||||
|
||||
mov eax,[ebp-8]
|
||||
mov ebx,[ebp+24]
|
||||
mov edi,[eax+ebx]
|
||||
inc eax
|
||||
mov [ebp-8],eax
|
||||
and edi,0xFF
|
||||
|
||||
mov eax,[ebp-12]
|
||||
mov esi,[eax]
|
||||
inc eax
|
||||
mov [ebp-12],eax
|
||||
and esi,0xFF
|
||||
|
||||
mov eax,[ebp-16]
|
||||
mov edx,[eax]
|
||||
inc eax
|
||||
mov [ebp-16],eax
|
||||
and edx,0xFF
|
||||
|
||||
call YUV_to_RGB_asm
|
||||
|
||||
mov ebx,[ebp-4]
|
||||
mov edx,[ebp-32]
|
||||
mov [ebx+edx],eax
|
||||
add ebx,4
|
||||
mov [ebp-4],ebx
|
||||
|
||||
dec cx
|
||||
jne freerdp_image_yuv_to_xrgb_asm_loopW
|
||||
|
||||
|
||||
mov eax,[ebp-4]
|
||||
add eax,[ebp-32]
|
||||
mov [ebp-4],eax
|
||||
|
||||
mov eax,[ebp-8]
|
||||
add eax,[ebp-36]
|
||||
mov [ebp-8],eax
|
||||
|
||||
mov ebx,[ebp+28]
|
||||
mov eax,[ebp-12]
|
||||
add eax,ebx
|
||||
mov [ebp-12],eax
|
||||
|
||||
mov eax,[ebp-16]
|
||||
add eax,ebx
|
||||
mov [ebp-16],eax
|
||||
|
||||
dec dword [ebp-28]
|
||||
jne freerdp_image_yuv_to_xrgb_asm_loopH
|
||||
|
||||
;END
|
||||
mov eax,0
|
||||
END:
|
||||
pop ebx
|
||||
mov esp,ebp
|
||||
pop ebp
|
||||
ret
|
@ -1,269 +0,0 @@
|
||||
;R=(256*Y+403*(V-128)+128)/265 =(256*Y+403*V-51456)/256
|
||||
;G=(256*Y-48*(U-128)-120*(V-128)+128)/256 =(256*Y-48*U-120*V+21632)/256
|
||||
;B=(256*Y+475*(U-128)+128)/256 =(256*Y+475*U-60672)/256
|
||||
|
||||
section .text
|
||||
;global YUV_to_RGB_asm
|
||||
YUV_to_RGB_asm:
|
||||
shl rdi,8
|
||||
|
||||
mov eax,edx
|
||||
imul eax,403
|
||||
add eax,edi
|
||||
sub eax,51456
|
||||
|
||||
jae YUV_to_RGB_asm1
|
||||
mov eax,0
|
||||
jmp YUV_to_RGB_asm11
|
||||
|
||||
YUV_to_RGB_asm1:
|
||||
cmp eax, 0xFFFF
|
||||
jbe YUV_to_RGB_asm11
|
||||
mov eax,0xFF00
|
||||
|
||||
YUV_to_RGB_asm11:
|
||||
and eax,0xFF00
|
||||
shl eax,8
|
||||
|
||||
mov ebx,esi
|
||||
imul ebx,475
|
||||
add ebx,edi
|
||||
sub ebx,60672
|
||||
|
||||
jae YUV_to_RGB_asm2
|
||||
mov ebx, 0
|
||||
jmp YUV_to_RGB_asm21
|
||||
|
||||
YUV_to_RGB_asm2:
|
||||
cmp ebx,0xFFFF
|
||||
jbe YUV_to_RGB_asm21
|
||||
mov ebx,0xFF00
|
||||
|
||||
YUV_to_RGB_asm21:
|
||||
and ebx,0xFF00
|
||||
shr ebx,8
|
||||
|
||||
imul edx,120
|
||||
sub edi,edx
|
||||
imul esi,48
|
||||
sub edi,esi
|
||||
add edi,21632
|
||||
|
||||
bt edi,31
|
||||
jae YUV_to_RGB_asm3
|
||||
mov edi, 0
|
||||
jmp YUV_to_RGB_asm31
|
||||
|
||||
YUV_to_RGB_asm3:
|
||||
cmp edi,0xFFFF
|
||||
jbe YUV_to_RGB_asm31
|
||||
mov edi, 0xFF00
|
||||
|
||||
YUV_to_RGB_asm31:
|
||||
and edi,0xFF00
|
||||
|
||||
or eax,edi
|
||||
or eax,ebx
|
||||
|
||||
ret
|
||||
|
||||
;extern int freerdp_image_yuv_to_xrgb_asm(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int *istride,int scanline);
|
||||
global freerdp_image_yuv_to_xrgb_asm
|
||||
freerdp_image_yuv_to_xrgb_asm:
|
||||
push rbx
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
;cWidth: cx
|
||||
sub rsp,82 ;pDstData -8,pSrcData[3] -32,nWidth -40,nHeight -48,cHeight -56,scanline -64,iStride[0] -72,VaddDst -80,last_column 1 -81,last_line 1 -82
|
||||
|
||||
;last_column: set to 10B, if last column should be skipped ('cause UV data is the same for two columns and two columns are processed at once)
|
||||
;last_line: set to 10B, if last line should be skipped ('cause UV data is the same for two lines and two lines are processed at once)
|
||||
|
||||
|
||||
mov [rbp-8],rdi
|
||||
|
||||
mov rax,[rsi]
|
||||
mov [rbp-16],rax
|
||||
mov rax,[rsi+8]
|
||||
mov [rbp-24],rax
|
||||
mov rax,[rsi+16]
|
||||
mov [rbp-32],rax
|
||||
|
||||
and rdx,0FFFFH
|
||||
;mov [rbp-40],rdx
|
||||
|
||||
|
||||
shr rcx,1 ;/2
|
||||
mov [rbp-48],rcx
|
||||
|
||||
|
||||
and r9,0FFFFH
|
||||
mov [rbp-64],r9
|
||||
|
||||
shr r9d,1
|
||||
sub r9d,edx
|
||||
shl r9d,2
|
||||
mov [rbp-80],r9
|
||||
|
||||
|
||||
mov rax,[rbp-48]
|
||||
mov [rbp-56],rax
|
||||
|
||||
|
||||
mov rcx,[r8]
|
||||
and rcx,0FFFFH
|
||||
mov [rbp-72],rcx
|
||||
shl dword [rbp-72],1
|
||||
sub [rbp-72],rdx
|
||||
|
||||
mov r9,[r8+4]
|
||||
mov r8,rcx
|
||||
|
||||
and r9,0FFFFH
|
||||
shr rax,1
|
||||
sub r9,rax
|
||||
|
||||
|
||||
mov al,dl
|
||||
and al,1B
|
||||
mov [rbp-81],al
|
||||
inc dx
|
||||
shr edx,1
|
||||
mov [rbp-40],rdx
|
||||
|
||||
freerdp_image_yuv_to_xrgb_asm_loopH:
|
||||
mov cx,[rbp-40]
|
||||
|
||||
|
||||
freerdp_image_yuv_to_xrgb_asm_loopW:
|
||||
dec cx
|
||||
jne freerdp_image_yuv_to_xrgb_asm_not_last_column
|
||||
|
||||
shl byte [rbp-81],1
|
||||
|
||||
freerdp_image_yuv_to_xrgb_asm_not_last_column:
|
||||
|
||||
|
||||
mov rax,[rbp-16]
|
||||
mov edi,[rax]
|
||||
and edi,0xFF
|
||||
|
||||
mov rax,[rbp-24]
|
||||
mov esi,[rax]
|
||||
and esi,0xFF
|
||||
|
||||
mov rax,[rbp-32]
|
||||
mov edx,[rax]
|
||||
and edx,0xFF
|
||||
|
||||
call YUV_to_RGB_asm
|
||||
|
||||
mov rbx,[rbp-8]
|
||||
mov [rbx],eax
|
||||
|
||||
|
||||
test byte [rbp-81],2
|
||||
jne freerdp_image_yuv_to_xrgb_asm_skip_last_column
|
||||
|
||||
mov rax,[rbp-16]
|
||||
mov edi,[rax+r8]
|
||||
and edi,0xFF
|
||||
|
||||
mov rax,[rbp-24]
|
||||
mov esi,[rax]
|
||||
and esi,0xFF
|
||||
|
||||
mov rax,[rbp-32]
|
||||
mov edx,[rax]
|
||||
and edx,0xFF
|
||||
|
||||
call YUV_to_RGB_asm
|
||||
|
||||
mov rbx,[rbp-8]
|
||||
mov rdx,[rbp-64]
|
||||
mov [rbx+rdx],eax
|
||||
|
||||
freerdp_image_yuv_to_xrgb_asm_skip_last_column:
|
||||
add qword [rbp-8],4
|
||||
inc qword [rbp-16]
|
||||
|
||||
|
||||
mov rax,[rbp-16]
|
||||
mov edi,[rax]
|
||||
and edi,0xFF
|
||||
|
||||
mov rax,[rbp-24]
|
||||
mov esi,[rax]
|
||||
and esi,0xFF
|
||||
|
||||
mov rax,[rbp-32]
|
||||
mov edx,[rax]
|
||||
and edx,0xFF
|
||||
|
||||
call YUV_to_RGB_asm
|
||||
|
||||
mov rbx,[rbp-8]
|
||||
mov [rbx],eax
|
||||
|
||||
|
||||
test byte [rbp-81],2
|
||||
jne freerdp_image_yuv_to_xrgb_asm_skip_last_column2
|
||||
|
||||
mov rax,[rbp-16]
|
||||
mov edi,[rax+r8]
|
||||
and edi,0xFF
|
||||
|
||||
mov rax,[rbp-24]
|
||||
mov esi,[rax]
|
||||
and esi,0xFF
|
||||
|
||||
mov rax,[rbp-32]
|
||||
mov edx,[rax]
|
||||
and edx,0xFF
|
||||
|
||||
call YUV_to_RGB_asm
|
||||
|
||||
;shr [rbp-81],1
|
||||
|
||||
mov rbx,[rbp-8]
|
||||
mov rdx,[rbp-64]
|
||||
mov [rbx+rdx],eax
|
||||
|
||||
freerdp_image_yuv_to_xrgb_asm_skip_last_column2:
|
||||
add qword [rbp-8],4
|
||||
inc qword [rbp-16]
|
||||
inc qword [rbp-24]
|
||||
inc qword [rbp-32]
|
||||
|
||||
|
||||
test cx,0FFFFH
|
||||
jne freerdp_image_yuv_to_xrgb_asm_loopW
|
||||
jmp END
|
||||
|
||||
|
||||
mov rax,[rbp-8]
|
||||
add rax,[rbp-80]
|
||||
mov [rbp-8],rax
|
||||
|
||||
mov rax,[rbp-16]
|
||||
add rax,[rbp-72]
|
||||
mov [rbp-16],rax
|
||||
|
||||
mov rax,[rbp-24]
|
||||
add rax,r9
|
||||
mov [rbp-24],rax
|
||||
|
||||
mov rax,[rbp-32]
|
||||
add rax,r9
|
||||
mov [rbp-32],rax
|
||||
|
||||
dec qword [rbp-56]
|
||||
jne freerdp_image_yuv_to_xrgb_asm_loopH
|
||||
|
||||
;END
|
||||
mov rax,0
|
||||
END:
|
||||
mov rsp,rbp
|
||||
pop rbp
|
||||
pop rbx
|
||||
ret
|
14
libfreerdp/codec/test/Makefile.TestOpenH264SSSE3
Normal file
14
libfreerdp/codec/test/Makefile.TestOpenH264SSSE3
Normal file
@ -0,0 +1,14 @@
|
||||
TestOpenH264: TestOpenH264ASM.c.o h264.c.o h264_ssse3.c.o
|
||||
gcc -o TestOpenH264 TestOpenH264ASM.c.o h264.c.o h264_ssse3.c.o -lwinpr
|
||||
|
||||
h264_ssse3.c.o: ../h264_ssse3.c
|
||||
gcc -c -O3 -o h264_ssse3.c.o ../h264_ssse3.c -mssse3
|
||||
|
||||
TestOpenH264ASM.c.o: TestOpenH264ASM.c
|
||||
gcc -c -o TestOpenH264ASM.c.o TestOpenH264ASM.c
|
||||
|
||||
h264.c.o: ../h264.c
|
||||
gcc -c -o h264.c.o ../h264.c
|
||||
|
||||
clean:
|
||||
rm -f TestOpenH264 TestOpenH264ASM.c.o h264_ssse3.asm.o h264.c.o h264.asm.o
|
BIN
libfreerdp/codec/test/TestOpenH264
Executable file
BIN
libfreerdp/codec/test/TestOpenH264
Executable file
Binary file not shown.
Loading…
Reference in New Issue
Block a user