YUV data conversion of H.264 implementation (egfx):

only convert invalid areas
SIMD SSSE3 conversion in primitives
compiling all primitives sources with optimization

and cleanup after last merge
This commit is contained in:
erbth 2014-09-09 00:13:18 +02:00
parent cbc8b3a7e1
commit 7828725413
15 changed files with 199 additions and 363 deletions

View File

@ -486,7 +486,6 @@ int dvcman_receive_channel_data(IWTSVirtualChannelManager* pChannelMgr, UINT32 C
int status = 0;
DVCMAN_CHANNEL* channel;
UINT32 dataSize = Stream_GetRemainingLength(data);
wStream* s;
channel = (DVCMAN_CHANNEL*) dvcman_find_channel_by_id(pChannelMgr, ChannelId);
@ -499,7 +498,7 @@ int dvcman_receive_channel_data(IWTSVirtualChannelManager* pChannelMgr, UINT32 C
if (channel->dvc_data)
{
/* Fragmented data */
if (Stream_GetPosition(channel->dvc_data) + dataSize > (UINT32) Stream_Length(channel->dvc_data))
if (Stream_GetPosition(channel->dvc_data) + dataSize > (UINT32) Stream_Capacity(channel->dvc_data))
{
CLOG_ERR("data exceeding declared length!");
Stream_Release(channel->dvc_data);
@ -513,11 +512,9 @@ int dvcman_receive_channel_data(IWTSVirtualChannelManager* pChannelMgr, UINT32 C
{
Stream_SealLength(channel->dvc_data);
Stream_SetPosition(channel->dvc_data, 0);
s=channel->dvc_data;
status = channel->channel_callback->OnDataReceived(channel->channel_callback, channel->dvc_data);
Stream_Release(channel->dvc_data);
channel->dvc_data = NULL;
status = channel->channel_callback->OnDataReceived(channel->channel_callback, s);
Stream_Release(s);
}
}
else

View File

@ -29,8 +29,7 @@ typedef struct _H264_CONTEXT H264_CONTEXT;
typedef BOOL (*pfnH264SubsystemInit)(H264_CONTEXT* h264);
typedef void (*pfnH264SubsystemUninit)(H264_CONTEXT* h264);
typedef int (*pfnH264SubsystemDecompress)(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize,
BYTE* pDstData, DWORD DstFormat, int nDstStep, int nXDst, int nYDst, int nWidth, int nHeight);
typedef int (*pfnH264SubsystemDecompress)(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize);
struct _H264_CONTEXT_SUBSYSTEM
{
@ -50,6 +49,9 @@ struct _H264_CONTEXT
UINT32 width;
UINT32 height;
//int scanline;
BYTE* pYUVData[3];
int iStride[3];
/*
<<<<<<< HEAD

View File

@ -101,24 +101,6 @@ if(WITH_LIBAVCODEC)
set(FREERDP_LIBAVCODEC_LIBS ${LIBAVCODEC_LIB} ${LIBAVUTIL_LIB})
endif()
if(WITH_LIBAVCODEC OR WITH_OPENH264)
if(WITH_H264_SSSE3)
add_definitions(-DWITH_H264_SSSE3)
set(${MODULE_PREFIX}_SRCS
${${MODULE_PREFIX}_SRCS}
h264_ssse3.c)
if(CMAKE_COMPILER_IS_GNUCC)
set(OPTIMIZATION "${OPTIMIZATION} -msse2 -mssse3")
endif()
if(MSVC)
set(OPTIMIZATION "${OPTIMIZATION} /arch:SSE2")
endif()
set_property(SOURCE h264_ssse3.c PROPERTY COMPILE_FLAGS ${OPTIMIZATION})
endif()
endif()
add_complex_library(MODULE ${MODULE_NAME} TYPE "OBJECT"
MONOLITHIC ${MONOLITHIC_BUILD}

View File

@ -28,39 +28,14 @@
#include <freerdp/primitives.h>
#include <freerdp/codec/h264.h>
#ifdef WITH_LIBAVCODEC
int h264_prepare_rgb_buffer(H264_CONTEXT* h264, int width, int height)
{
UINT32 size;
#include <sys/time.h>
h264->width = width;
h264->height = height;
h264->scanline = h264->width * 4;
size = h264->scanline * h264->height;
if (size > h264->size)
{
h264->size = size;
if (!h264->data)
h264->data = (BYTE*) _aligned_malloc(h264->size, 16);
else
h264->data = (BYTE*) _aligned_realloc(h264->data, h264->size, 16);
}
if (!h264->data)
return -1;
return 1;
}
#endif
/**
* Dummy subsystem
*/
static int dummy_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize,
BYTE* pDstData, DWORD DstFormat, int nDstStep, int nXDst, int nYDst, int nWidth, int nHeight)
static int dummy_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize)
{
return -1;
}
@ -107,13 +82,9 @@ static void openh264_trace_callback(H264_CONTEXT* h264, int level, const char* m
static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize)
{
int srcStep[3];
prim_size_t roi;
BYTE* pYUVData[3];
DECODING_STATE state;
SBufferInfo sBufferInfo;
SSysMEMBuffer* pSystemBuffer;
primitives_t* prims = primitives_get();
H264_CONTEXT_OPENH264* sys = (H264_CONTEXT_OPENH264*) h264->pSystemData;
struct timeval T1,T2;
@ -147,7 +118,7 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz
*/
if (sBufferInfo.iBufferStatus != 1)
state = (*sys->pDecoder)->DecodeFrame2(sys->pDecoder, NULL, 0, pYUVData, &sBufferInfo);
state = (*sys->pDecoder)->DecodeFrame2(sys->pDecoder, NULL, 0, h264->pYUVData, &sBufferInfo);
gettimeofday(&T2,NULL);
printf("OpenH264: decoding took: %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec));
@ -164,17 +135,19 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz
if (state != 0)
return -1;
if (!h264->pYUVData[0] || !h264->pYUVData[1] || !h264->pYUVData[2])
return -1;
if (sBufferInfo.iBufferStatus != 1)
return -1;
return -2;
if (pSystemBuffer->iFormat != videoFormatI420)
return -1;
if (!h264->pYUVData[0] || !h264->pYUVData[1] || !h264->pYUVData[2])
return -1;
h264->iStride[0] = pSystemBuffer->iStride[0];
h264->iStride[1] = pSystemBuffer->iStride[1];
h264->iStride[2] = pSystemBuffer->iStride[1];
h264->width = pSystemBuffer->iWidth;
h264->height = pSystemBuffer->iHeight;
@ -305,16 +278,11 @@ struct _H264_CONTEXT_LIBAVCODEC
};
typedef struct _H264_CONTEXT_LIBAVCODEC H264_CONTEXT_LIBAVCODEC;
static int libavcodec_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize,
BYTE* pDstData, DWORD DstFormat, int nDstStep, int nXDst, int nYDst, int nWidth, int nHeight)
static int libavcodec_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize)
{
int status;
int srcStep[3];
int gotFrame = 0;
AVPacket packet;
prim_size_t roi;
const BYTE* pSrc[3];
primitives_t* prims = primitives_get();
H264_CONTEXT_LIBAVCODEC* sys = (H264_CONTEXT_LIBAVCODEC*) h264->pSystemData;
struct timeval T1,T2;
@ -346,22 +314,19 @@ static int libavcodec_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcS
if (gotFrame)
{
if (h264_prepare_rgb_buffer(h264, sys->videoFrame->width, sys->videoFrame->height) < 0)
return -1;
h264->pYUVData[0] = sys->videoFrame->data[0];
h264->pYUVData[1] = sys->videoFrame->data[1];
h264->pYUVData[2] = sys->videoFrame->data[2];
roi.width = h264->width;
roi.height = h264->height;
h264->iStride[0] = sys->videoFrame->linesize[0];
h264->iStride[1] = sys->videoFrame->linesize[1];
h264->iStride[2] = sys->videoFrame->linesize[2];
pSrc[0] = sys->videoFrame->data[0];
pSrc[1] = sys->videoFrame->data[1];
pSrc[2] = sys->videoFrame->data[2];
srcStep[0] = sys->videoFrame->linesize[0];
srcStep[1] = sys->videoFrame->linesize[1];
srcStep[2] = sys->videoFrame->linesize[2];
prims->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, h264->data, h264->scanline, &roi);
h264->width = sys->videoFrame->width;
h264->height = sys->videoFrame->height;
}
else
return -2;
return 1;
}
@ -482,6 +447,8 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize,
int* iStride;
int ret, i, cx, cy;
int UncompressedSize;
primitives_t *prims = primitives_get();
prim_size_t roi;
struct timeval T1,T2;
@ -489,24 +456,24 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize,
return -1;
#if 0
printf("h264_decompress: pSrcData=%p, SrcSize=%u, pDstData=%p, nDstStep=%d, nXDst=%d, nYDst=%d, nWidth=%d, nHeight=%d)\n",
pSrcData, SrcSize, *ppDstData, nDstStep, nXDst, nYDst, nWidth, nHeight);
printf("h264_decompress: pSrcData=%p, SrcSize=%u, pDstData=%p, nDstStep=%d, nDstHeight=%d, numRegionRects=%d\n",
pSrcData, SrcSize, *ppDstData, nDstStep, nDstHeight, numRegionRects);
#endif
if (!(pDstData = *ppDstData))
return -1;
<<<<<<< HEAD
if (h264->subsystem->Decompress(h264, pSrcData, SrcSize,
pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight))
return -1;
if ((ret = h264->subsystem->Decompress(h264, pSrcData, SrcSize)) < 0)
return ret;
UncompressedSize = h264->width * h264->height * 4;
if (UncompressedSize > (nDstStep * nDstHeight))
return -1;
pYUVData = h264->pYUVData;
iStride = h264->iStride;
gettimeofday(&T1,NULL);
for (i = 0; i < numRegionRects; i++){
@ -517,32 +484,18 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize,
pDstPoint = pDstData + rect->top * nDstStep + rect->left * 4;
pYUVPoint[0] = pYUVData[0] + rect->top * iStride[0] + rect->left;
ret = rect->top/2 * iStride[1] + rect->left/2;
pYUVPoint[1] = pYUVData[1] + ret;
pYUVPoint[2] = pYUVData[2] + ret;
pYUVPoint[1] = pYUVData[1] + rect->top/2 * iStride[1] + rect->left/2;
pYUVPoint[2] = pYUVData[2] + rect->top/2 * iStride[2] + rect->left/2;
#if 0
printf("regionRect: x: %d, y: %d, cx: %d, cy: %d\n",
rect->left, rect->top, cx, cy);
#endif
#ifdef WITH_H264_SSSE3
freerdp_image_yuv420p_to_xrgb_ssse3(pDstPoint, pYUVPoint, cx, cy, iStride, nDstStep);
#else
/* roi.width = h264->width;
roi.height = h264->height;
roi.width = cx;
roi.height = cy;
pSrc[0] = sys->videoFrame->data[0];
pSrc[1] = sys->videoFrame->data[1];
pSrc[2] = sys->videoFrame->data[2];
srcStep[0] = sys->videoFrame->linesize[0];
srcStep[1] = sys->videoFrame->linesize[1];
srcStep[2] = sys->videoFrame->linesize[2];
prims->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, h264->data, h264->scanline, &roi)
*/
#endif
prims->YUV420ToRGB_8u_P3AC4R((const BYTE**) pYUVPoint, iStride, pDstPoint, nDstStep, &roi);
}
gettimeofday(&T2,NULL);
printf("converting took %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec));
@ -582,24 +535,12 @@ H264_CONTEXT* h264_context_new(BOOL Compressor)
h264 = (H264_CONTEXT*) calloc(1, sizeof(H264_CONTEXT));
#ifdef WITH_H264_SSSE3
if(freerdp_check_ssse3()){
printf("SSSE3 seems to be not supported on this system, try without WITH_H264_SSSE3 ...");
return NULL;
}
#endif
if (h264)
{
h264->Compressor = Compressor;
h264->subsystem = &g_Subsystem_dummy;
#ifdef WITH_LIBAVCODEC
if (h264_prepare_rgb_buffer(h264, 256, 256) < 0)
return NULL;
#endif
if (!h264_context_init(h264))
{
free(h264);
@ -614,10 +555,6 @@ void h264_context_free(H264_CONTEXT* h264)
{
if (h264)
{
#ifdef WITH_LIBAVCODEC
_aligned_free(h264->data);
#endif
h264->subsystem->Uninit(h264);
free(h264);

View File

@ -1,17 +0,0 @@
TestOpenH264ASM: TestOpenH264ASM.c.o h264.c.o h264_ssse3.asm.o h264.asm.o
gcc -o TestOpenH264ASM TestOpenH264ASM.c.o h264.c.o h264_ssse3.asm.o h264.asm.o -lwinpr
h264_ssse3.asm.o: ../h264_ssse3_x32.asm
nasm -f elf32 -o h264_ssse3.asm.o ../h264_ssse3_x32.asm
h264.asm.o: ../h264_x32.asm
nasm -f elf32 -o h264.asm.o ../h264_x32.asm
TestOpenH264ASM.c.o: TestOpenH264ASM.c
gcc -c -o TestOpenH264ASM.c.o TestOpenH264ASM.c
h264.c.o: ../h264.c
gcc -c -o h264.c.o ../h264.c
clean:
rm -f TestOpenH264ASM TestOpenH264ASM.c.o h264_ssse3.asm.o h264.c.o h264.asm.o

View File

@ -1,17 +0,0 @@
TestOpenH264ASM: TestOpenH264ASM.c.o h264.c.o h264_ssse3.asm.o h264.asm.o
gcc -o TestOpenH264ASM TestOpenH264ASM.c.o h264.c.o h264_ssse3.asm.o h264.asm.o -lwinpr
h264_ssse3.asm.o: ../h264_ssse3_x64.asm
nasm -f elf64 -o h264_ssse3.asm.o ../h264_ssse3_x64.asm
h264.asm.o: ../h264_x64.asm
nasm -f elf64 -o h264.asm.o ../h264_x64.asm
TestOpenH264ASM.c.o: TestOpenH264ASM.c
gcc -c -o TestOpenH264ASM.c.o TestOpenH264ASM.c
h264.c.o: ../h264.c
gcc -c -o h264.c.o ../h264.c
clean:
rm -f TestOpenH264ASM TestOpenH264ASM.c.o h264_ssse3.asm.o h264.c.o h264.asm.o

View File

@ -1,14 +0,0 @@
TestOpenH264: TestOpenH264ASM.c.o h264.c.o h264_ssse3.c.o
gcc -o TestOpenH264 TestOpenH264ASM.c.o h264.c.o h264_ssse3.c.o -lwinpr
h264_ssse3.c.o: ../h264_ssse3.c
gcc -c -O3 -o h264_ssse3.c.o ../h264_ssse3.c -mssse3
TestOpenH264ASM.c.o: TestOpenH264ASM.c
gcc -c -o TestOpenH264ASM.c.o TestOpenH264ASM.c
h264.c.o: ../h264.c
gcc -c -o h264.c.o ../h264.c
clean:
rm -f TestOpenH264 TestOpenH264ASM.c.o h264_ssse3.asm.o h264.c.o h264.asm.o

Binary file not shown.

View File

@ -1,92 +0,0 @@
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <winpr/crt.h>
#include "TestOpenH264ASM.h"
#define WIDTH 1920
#define HEIGHT 1080
#define SSSE3 1
int main(void){
int i,j,k;
int ret;
unsigned char *pDstData_c,*pDstData_asm,*pSrcData[3];
int nSrcStep[2];
#if SSSE3
if(freerdp_check_ssse3()){
fprintf(stderr,"ssse3 not supported!\n");
return EXIT_FAILURE;
}
#endif
struct timeval t1,t2,t3;
pSrcData[0]=malloc(1984*HEIGHT*sizeof(char));
pSrcData[1]=malloc(1984*HEIGHT/4*sizeof(char));
pSrcData[2]=malloc(1984*HEIGHT/4*sizeof(char));
pDstData_asm=_aligned_malloc(WIDTH*(HEIGHT+1)*4*sizeof(char),16);
pDstData_c=malloc(WIDTH*(HEIGHT+1)*4*sizeof(char));
memset(pDstData_asm,0xFF,WIDTH*(HEIGHT+1)*4*sizeof(char));
memset(pDstData_c,0xFF,WIDTH*(HEIGHT+1)*4*sizeof(char));
for(i=0;i<WIDTH*HEIGHT;i++){
pSrcData[0][i]=i%255;
pSrcData[1][i/4]=pSrcData[0][i];
pSrcData[2][i/4]=255-pSrcData[0][i];
}
nSrcStep[0]=1984;
nSrcStep[1]=992;
gettimeofday(&t1,NULL);
#if SSSE3
ret=freerdp_image_yuv420p_to_xrgb_ssse3(pDstData_asm,pSrcData,WIDTH,HEIGHT,nSrcStep,WIDTH*4);
#else
ret=freerdp_image_yuv_to_xrgb_asm(pDstData_asm,pSrcData,WIDTH,HEIGHT,nSrcStep,WIDTH*4);
#endif
gettimeofday(&t2,NULL);
freerdp_image_copy_yuv420p_to_xrgb(pDstData_c,WIDTH*4,0,0,WIDTH,HEIGHT,pSrcData,nSrcStep,0,0);
gettimeofday(&t3,NULL);
printf("in asm (0x%08X) it took %u sec %u usec,\nin c %u sec %u usec.\n",ret,(int)(t2.tv_sec-t1.tv_sec),(int)(t2.tv_usec-t1.tv_usec),
(int)(t3.tv_sec-t2.tv_sec),(int)(t3.tv_usec-t2.tv_usec));
printf("in asm the result was %X %X %X\n in c %X %X %X.\n",pDstData_asm[0],pDstData_asm[1],pDstData_asm[2],
pDstData_c[0],pDstData_c[1],pDstData_c[2]);
/*k=0;
for(i=0;i<HEIGHT+1;i++){
for(j=0;j<WIDTH;j++){
printf("%08X:%08X ",((unsigned int*)pDstData_asm)[k],((unsigned int*)pDstData_c)[k]);
k++;
}
puts("\n");
}*/
k=1;
for(i=0;i<(WIDTH*HEIGHT*4);i++){
if(pDstData_c[i]!=pDstData_asm[i]){
k=0;
printf("MISSMATCH at %d: %2X != %2X\n",i,(unsigned char)pDstData_asm[i],(unsigned char)pDstData_c[i]);
break;
}
}
if(k)
printf("everything OK\n");
free(pSrcData[0]);
free(pSrcData[1]);
free(pSrcData[2]);
free(pDstData_c);
_aligned_free(pDstData_asm);
return 0;
}

View File

@ -1,7 +0,0 @@
int freerdp_image_copy_yuv420p_to_xrgb(unsigned char* pDstData, int nDstStep, int nXDst, int nYDst,
int nWidth, int nHeight, unsigned char* pSrcData[3], int nSrcStep[2], int nXSrc, int nYSrc);
extern int freerdp_image_yuv_to_xrgb_asm(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int *istride,int scanline);
extern int freerdp_check_ssse3();
extern int freerdp_image_yuv420p_to_xrgb_ssse3(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int *istride,int scanline);

View File

@ -40,6 +40,7 @@ set(${MODULE_PREFIX}_OPT_SRCS
prim_set_opt.c
prim_shift_opt.c
prim_sign_opt.c
prim_YUV_opt.c
prim_YCoCg_opt.c)
add_definitions(-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE})
@ -55,11 +56,11 @@ endif()
if(WITH_SSE2)
if(CMAKE_COMPILER_IS_GNUCC)
set(OPTIMIZATION "${OPTIMIZATION} -msse2 -mssse3 -Wdeclaration-after-statement")
set(OPTIMIZATION "${OPTIMIZATION} -msse2 -mssse3 -O2 -Wdeclaration-after-statement")
endif()
if(MSVC)
set(OPTIMIZATION "${OPTIMIZATION} /arch:SSE2")
set(OPTIMIZATION "${OPTIMIZATION} /arch:SSE2 /O2")
endif()
elseif(WITH_NEON)
if(CMAKE_COMPILER_IS_GNUCC)
@ -70,6 +71,16 @@ endif()
set_property(SOURCE ${${MODULE_PREFIX}_OPT_SRCS} PROPERTY COMPILE_FLAGS ${OPTIMIZATION})
# always compile with optimization
if(CMAKE_COMPILER_IS_GNUCC)
set_property(SOURCE ${${MODULE_PREFIX}_SRCS} PROPERTY COMPILE_FLAGS "-O2")
endif()
if(MSVC)
set_property(SOURCE ${${MODULE_PREFIX}_SRCS} PROPERTY COMPILE_FLAGS "/O2")
endif()
set(${MODULE_PREFIX}_SRCS ${${MODULE_PREFIX}_SRCS} ${${MODULE_PREFIX}_OPT_SRCS})
add_complex_library(MODULE ${MODULE_NAME} TYPE "OBJECT"

View File

@ -44,24 +44,40 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3],
int Up48, Up475;
int Vp403, Vp120;
BYTE* pRGB = pDst;
int nWidth, nHeight;
int last_line, last_column;
pY = pSrc[0];
pU = pSrc[1];
pV = pSrc[2];
last_column = roi->width & 0x01;
last_line = roi->height & 0x01;
nWidth = (roi->width + 1) & ~0x0001;
nHeight = (roi->height + 1) & ~0x0001;
halfWidth = roi->width / 2;
halfHeight = roi->height / 2;
halfWidth = nWidth / 2;
halfHeight = nHeight / 2;
srcPad[0] = (srcStep[0] - roi->width);
srcPad[0] = (srcStep[0] - nWidth);
srcPad[1] = (srcStep[1] - halfWidth);
srcPad[2] = (srcStep[2] - halfWidth);
dstPad = (dstStep - (roi->width * 4));
dstPad = (dstStep - (nWidth * 4));
for (y = 0; y < halfHeight; y++)
for (y = 0; y < halfHeight; )
{
for (x = 0; x < halfWidth; x++)
y++;
if (y == halfHeight)
last_line = last_line << 1;
for (x = 0; x < halfWidth; )
{
x++;
if (x == halfWidth)
last_column = last_column << 1;
U = *pU++;
V = *pV++;
@ -105,32 +121,41 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3],
/* 2nd pixel */
Y = *pY++;
Yp = Y << 8;
if (!(last_column & 0x02))
{
Y = *pY++;
Yp = Y << 8;
R = (Yp + Vp403) >> 8;
G = (Yp - Up48 - Vp120) >> 8;
B = (Yp + Up475) >> 8;
R = (Yp + Vp403) >> 8;
G = (Yp - Up48 - Vp120) >> 8;
B = (Yp + Up475) >> 8;
if (R < 0)
R = 0;
else if (R > 255)
R = 255;
if (R < 0)
R = 0;
else if (R > 255)
R = 255;
if (G < 0)
G = 0;
else if (G > 255)
G = 255;
if (G < 0)
G = 0;
else if (G > 255)
G = 255;
if (B < 0)
B = 0;
else if (B > 255)
B = 255;
if (B < 0)
B = 0;
else if (B > 255)
B = 255;
*pRGB++ = (BYTE) B;
*pRGB++ = (BYTE) G;
*pRGB++ = (BYTE) R;
*pRGB++ = 0xFF;
*pRGB++ = (BYTE) B;
*pRGB++ = (BYTE) G;
*pRGB++ = (BYTE) R;
*pRGB++ = 0xFF;
}
else
{
pY++;
pRGB += 4;
last_column = last_column >> 1;
}
}
pY += srcPad[0];
@ -138,8 +163,12 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3],
pV -= halfWidth;
pRGB += dstPad;
for (x = 0; x < halfWidth; x++)
for (x = 0; x < halfWidth; )
{
x++;
if (x == halfWidth)
last_column = last_column << 1;
U = *pU++;
V = *pV++;
@ -183,32 +212,41 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3],
/* 4th pixel */
Y = *pY++;
Yp = Y << 8;
if(!(last_column & 0x02))
{
Y = *pY++;
Yp = Y << 8;
R = (Yp + Vp403) >> 8;
G = (Yp - Up48 - Vp120) >> 8;
B = (Yp + Up475) >> 8;
R = (Yp + Vp403) >> 8;
G = (Yp - Up48 - Vp120) >> 8;
B = (Yp + Up475) >> 8;
if (R < 0)
R = 0;
else if (R > 255)
R = 255;
if (R < 0)
R = 0;
else if (R > 255)
R = 255;
if (G < 0)
G = 0;
else if (G > 255)
G = 255;
if (G < 0)
G = 0;
else if (G > 255)
G = 255;
if (B < 0)
B = 0;
else if (B > 255)
B = 255;
if (B < 0)
B = 0;
else if (B > 255)
B = 255;
*pRGB++ = (BYTE) B;
*pRGB++ = (BYTE) G;
*pRGB++ = (BYTE) R;
*pRGB++ = 0xFF;
*pRGB++ = (BYTE) B;
*pRGB++ = (BYTE) G;
*pRGB++ = (BYTE) R;
*pRGB++ = 0xFF;
}
else
{
pY++;
pRGB += 4;
last_column = last_column >> 1;
}
}
pY += srcPad[0];
@ -223,6 +261,8 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3],
void primitives_init_YUV(primitives_t* prims)
{
prims->YUV420ToRGB_8u_P3AC4R = general_YUV420ToRGB_8u_P3AC4R;
primitives_init_YUV_opt(prims);
}
void primitives_deinit_YUV(primitives_t* prims)

View File

@ -22,6 +22,7 @@
pstatus_t general_yCbCrToRGB_16s8u_P3AC4R(const INT16* pSrc[3], int srcStep, BYTE* pDst, int dstStep, const prim_size_t* roi);
void primitives_init_YUV(primitives_t* prims);
void primitives_init_YUV_opt(primitives_t* prims);
void primitives_deinit_YUV(primitives_t* prims);
#endif /* FREERDP_PRIMITIVES_YUV_H */

View File

@ -1,32 +1,32 @@
/** function for converting YUV420p data to the RGB format (but without any special upconverting)
* It's completely written in nasm-x86-assembly for intel processors supporting SSSE3 and higher.
* The target scanline (6th parameter) must be a multiple of 16.
* iStride[0] must be (target scanline) / 4 or bigger and iStride[1] the next multiple of four
* of the half of iStride[0] or bigger
* The target dstStep (6th parameter) must be a multiple of 16.
* srcStep[0] must be (target dstStep) / 4 or bigger and srcStep[1] the next multiple of four
* of the half of srcStep[0] or bigger
*/
#include <stdio.h>
#include <emmintrin.h>
//#include <immintrin.h>
#include <tmmintrin.h>
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <winpr/sysinfo.h>
#include <winpr/crt.h>
int freerdp_check_ssse3()
{
if(IsProcessorFeaturePresentEx(PF_EX_SSSE3)&&IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
return 0;
return 1;
}
#include <freerdp/types.h>
#include <freerdp/primitives.h>
int freerdp_image_yuv420p_to_xrgb_ssse3(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int *iStride,int scanline)
#ifdef WITH_SSE2
#include <emmintrin.h>
#include <tmmintrin.h>
pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
BYTE *pDst, int dstStep, const prim_size_t *roi)
{
char last_line,last_column;
int i,VaddDst,VaddY,VaddUV;
int i,nWidth,nHeight,VaddDst,VaddY,VaddU,VaddV;
BYTE *UData,*VData,*YData;
@ -37,9 +37,12 @@ int freerdp_image_yuv420p_to_xrgb_ssse3(BYTE *pDstData,BYTE **pSrcData,int nWidt
buffer=_aligned_malloc(4*16,16);
YData=pSrcData[0];
UData=pSrcData[1];
VData=pSrcData[2];
YData=(BYTE *)pSrc[0];
UData=(BYTE *)pSrc[1];
VData=(BYTE *)pSrc[2];
nWidth=roi->width;
nHeight=roi->height;
if((last_column=nWidth&3)){
@ -48,7 +51,7 @@ int freerdp_image_yuv420p_to_xrgb_ssse3(BYTE *pDstData,BYTE **pSrcData,int nWidt
case 2: r7=_mm_set_epi32(0,0,0xFFFFFFFF,0xFFFFFFFF); break;
case 3: r7=_mm_set_epi32(0,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF); break;
}
_mm_store_si128(buffer+48,r7);
_mm_store_si128(buffer+3,r7);
last_column=1;
}
@ -61,10 +64,10 @@ int freerdp_image_yuv420p_to_xrgb_ssse3(BYTE *pDstData,BYTE **pSrcData,int nWidt
nHeight=nHeight>>1;
VaddDst=(scanline<<1)-(nWidth<<4);
VaddY=(iStride[0]<<1)-(nWidth<<2);
VaddUV=iStride[1]-(((nWidth<<1)+2)&0xFFFC);
VaddDst=(dstStep<<1)-(nWidth<<4);
VaddY=(srcStep[0]<<1)-(nWidth<<2);
VaddU=srcStep[1]-(((nWidth<<1)+2)&0xFFFC);
VaddV=srcStep[2]-(((nWidth<<1)+2)&0xFFFC);
while(nHeight-- >0){
@ -129,7 +132,7 @@ int freerdp_image_yuv420p_to_xrgb_ssse3(BYTE *pDstData,BYTE **pSrcData,int nWidt
r1=_mm_add_epi32(r1,r6);
r7=_mm_add_epi32(r7,r6);
_mm_store_si128(buffer+16,r7);
_mm_store_si128(buffer+1,r7);
/* Now we've prepared U-data. Preparing V-data is actually the same, just with other coefficients */
r2=_mm_cvtsi32_si128(*(UINT32 *)VData);
@ -153,7 +156,7 @@ int freerdp_image_yuv420p_to_xrgb_ssse3(BYTE *pDstData,BYTE **pSrcData,int nWidt
r2=_mm_add_epi32(r2,r6);
r7=_mm_add_epi32(r7,r6);
_mm_store_si128(buffer+32,r7);
_mm_store_si128(buffer+2,r7);
@ -170,8 +173,8 @@ int freerdp_image_yuv420p_to_xrgb_ssse3(BYTE *pDstData,BYTE **pSrcData,int nWidt
_mm_store_si128(buffer,r4);
}else{
r1=_mm_load_si128(buffer+16);
r2=_mm_load_si128(buffer+32);
r1=_mm_load_si128(buffer+1);
r2=_mm_load_si128(buffer+2);
r0=_mm_load_si128(buffer);
}
@ -220,17 +223,17 @@ int freerdp_image_yuv420p_to_xrgb_ssse3(BYTE *pDstData,BYTE **pSrcData,int nWidt
if(last_column&0x02){
r6=_mm_load_si128(buffer+48);
r6=_mm_load_si128(buffer+3);
r4=_mm_and_si128(r4,r6);
r5=_mm_lddqu_si128((__m128i *)pDstData);
r5=_mm_lddqu_si128((__m128i *)pDst);
r6=_mm_andnot_si128(r6,r5);
r4=_mm_or_si128(r4,r6);
}
_mm_storeu_si128((__m128i *)pDstData,r4);
_mm_storeu_si128((__m128i *)pDst,r4);
//Y data processing in secound line
if(!(last_line&0x02)){
r4=_mm_cvtsi32_si128(*(UINT32 *)(YData+iStride[0]));
r4=_mm_cvtsi32_si128(*(UINT32 *)(YData+srcStep[0]));
r7=_mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080);
r4=_mm_shuffle_epi8(r4,r7);
@ -271,28 +274,40 @@ int freerdp_image_yuv420p_to_xrgb_ssse3(BYTE *pDstData,BYTE **pSrcData,int nWidt
if(last_column&0x02){
r6=_mm_load_si128(buffer+48);
r6=_mm_load_si128(buffer+3);
r4=_mm_and_si128(r4,r6);
r5=_mm_lddqu_si128((__m128i *)(pDstData+scanline));
r5=_mm_lddqu_si128((__m128i *)(pDst+dstStep));
r6=_mm_andnot_si128(r6,r5);
r4=_mm_or_si128(r4,r6);
last_column=last_column>>1;
}
_mm_storeu_si128((__m128i *)(pDstData+scanline),r4);
_mm_storeu_si128((__m128i *)(pDst+dstStep),r4);
}
pDstData+=16;
pDst+=16;
YData+=4;
}while(i<nWidth);
pDstData+=VaddDst;
pDst+=VaddDst;
YData+=VaddY;
UData+=VaddUV;
VData+=VaddUV;
UData+=VaddU;
VData+=VaddV;
}
_aligned_free(buffer);
return 0;
}
return PRIMITIVES_SUCCESS;
}
#endif
void primitives_init_YUV_opt(primitives_t *prims)
{
#ifdef WITH_SSE2
if(IsProcessorFeaturePresentEx(PF_EX_SSSE3)&&IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
{
prims->YUV420ToRGB_8u_P3AC4R=ssse3_YUV420ToRGB_8u_P3AC4R;
}
#endif
}

View File

@ -155,8 +155,6 @@ wStream* StreamPool_Take(wStreamPool* pool, size_t size)
Stream_SetPosition(s, 0);
Stream_EnsureCapacity(s, size);
Stream_SetLength(s,size);
}
s->pool = pool;