libfreerdp-primitives: integrate H264 SSE3 color converter
This commit is contained in:
parent
666919d157
commit
bcf1266f51
@ -44,31 +44,12 @@ struct _H264_CONTEXT
|
|||||||
{
|
{
|
||||||
BOOL Compressor;
|
BOOL Compressor;
|
||||||
|
|
||||||
//BYTE* data;
|
|
||||||
//UINT32 size;
|
|
||||||
UINT32 width;
|
UINT32 width;
|
||||||
UINT32 height;
|
UINT32 height;
|
||||||
//int scanline;
|
|
||||||
|
|
||||||
BYTE* pYUVData[3];
|
|
||||||
int iStride[3];
|
int iStride[3];
|
||||||
|
|
||||||
/*
|
|
||||||
<<<<<<< HEAD
|
|
||||||
#ifdef WITH_OPENH264
|
|
||||||
ISVCDecoder* pDecoder;
|
|
||||||
BYTE* pYUVData[3];
|
BYTE* pYUVData[3];
|
||||||
int iStride[2];
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef WITH_LIBAVCODEC
|
|
||||||
AVCodec* codec;
|
|
||||||
AVCodecContext* codecContext;
|
|
||||||
AVCodecParserContext* codecParser;
|
|
||||||
AVFrame* videoFrame;
|
|
||||||
#endif
|
|
||||||
=======
|
|
||||||
*/
|
|
||||||
void* pSystemData;
|
void* pSystemData;
|
||||||
H264_CONTEXT_SUBSYSTEM* subsystem;
|
H264_CONTEXT_SUBSYSTEM* subsystem;
|
||||||
};
|
};
|
||||||
|
@ -28,9 +28,6 @@
|
|||||||
#include <freerdp/primitives.h>
|
#include <freerdp/primitives.h>
|
||||||
#include <freerdp/codec/h264.h>
|
#include <freerdp/codec/h264.h>
|
||||||
|
|
||||||
#include <sys/time.h>
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Dummy subsystem
|
* Dummy subsystem
|
||||||
*/
|
*/
|
||||||
@ -87,8 +84,6 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz
|
|||||||
SSysMEMBuffer* pSystemBuffer;
|
SSysMEMBuffer* pSystemBuffer;
|
||||||
H264_CONTEXT_OPENH264* sys = (H264_CONTEXT_OPENH264*) h264->pSystemData;
|
H264_CONTEXT_OPENH264* sys = (H264_CONTEXT_OPENH264*) h264->pSystemData;
|
||||||
|
|
||||||
struct timeval T1,T2;
|
|
||||||
|
|
||||||
if (!sys->pDecoder)
|
if (!sys->pDecoder)
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
@ -102,7 +97,6 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz
|
|||||||
|
|
||||||
ZeroMemory(&sBufferInfo, sizeof(sBufferInfo));
|
ZeroMemory(&sBufferInfo, sizeof(sBufferInfo));
|
||||||
|
|
||||||
gettimeofday(&T1,NULL);
|
|
||||||
state = (*sys->pDecoder)->DecodeFrame2(
|
state = (*sys->pDecoder)->DecodeFrame2(
|
||||||
sys->pDecoder,
|
sys->pDecoder,
|
||||||
pSrcData,
|
pSrcData,
|
||||||
@ -120,9 +114,6 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz
|
|||||||
if (sBufferInfo.iBufferStatus != 1)
|
if (sBufferInfo.iBufferStatus != 1)
|
||||||
state = (*sys->pDecoder)->DecodeFrame2(sys->pDecoder, NULL, 0, h264->pYUVData, &sBufferInfo);
|
state = (*sys->pDecoder)->DecodeFrame2(sys->pDecoder, NULL, 0, h264->pYUVData, &sBufferInfo);
|
||||||
|
|
||||||
gettimeofday(&T2,NULL);
|
|
||||||
printf("OpenH264: decoding took: %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec));
|
|
||||||
|
|
||||||
pSystemBuffer = &sBufferInfo.UsrData.sSystemBuffer;
|
pSystemBuffer = &sBufferInfo.UsrData.sSystemBuffer;
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
@ -285,18 +276,12 @@ static int libavcodec_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcS
|
|||||||
AVPacket packet;
|
AVPacket packet;
|
||||||
H264_CONTEXT_LIBAVCODEC* sys = (H264_CONTEXT_LIBAVCODEC*) h264->pSystemData;
|
H264_CONTEXT_LIBAVCODEC* sys = (H264_CONTEXT_LIBAVCODEC*) h264->pSystemData;
|
||||||
|
|
||||||
struct timeval T1,T2;
|
|
||||||
|
|
||||||
av_init_packet(&packet);
|
av_init_packet(&packet);
|
||||||
|
|
||||||
packet.data = pSrcData;
|
packet.data = pSrcData;
|
||||||
packet.size = SrcSize;
|
packet.size = SrcSize;
|
||||||
|
|
||||||
gettimeofday(&T1,NULL);
|
|
||||||
status = avcodec_decode_video2(sys->codecContext, sys->videoFrame, &gotFrame, &packet);
|
status = avcodec_decode_video2(sys->codecContext, sys->videoFrame, &gotFrame, &packet);
|
||||||
gettimeofday(&T2,NULL);
|
|
||||||
|
|
||||||
printf("libavcodec: decoding took: %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec));
|
|
||||||
|
|
||||||
if (status < 0)
|
if (status < 0)
|
||||||
{
|
{
|
||||||
@ -437,20 +422,18 @@ static H264_CONTEXT_SUBSYSTEM g_Subsystem_libavcodec =
|
|||||||
int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize,
|
int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize,
|
||||||
BYTE** ppDstData, DWORD DstFormat, int nDstStep, int nDstHeight, RDPGFX_RECT16* regionRects, int numRegionRects)
|
BYTE** ppDstData, DWORD DstFormat, int nDstStep, int nDstHeight, RDPGFX_RECT16* regionRects, int numRegionRects)
|
||||||
{
|
{
|
||||||
|
int index;
|
||||||
|
int status;
|
||||||
|
int* iStride;
|
||||||
BYTE* pDstData;
|
BYTE* pDstData;
|
||||||
BYTE* pDstPoint;
|
BYTE* pDstPoint;
|
||||||
|
prim_size_t roi;
|
||||||
BYTE** pYUVData;
|
BYTE** pYUVData;
|
||||||
|
int width, height;
|
||||||
BYTE* pYUVPoint[3];
|
BYTE* pYUVPoint[3];
|
||||||
|
|
||||||
RDPGFX_RECT16* rect;
|
RDPGFX_RECT16* rect;
|
||||||
int* iStride;
|
|
||||||
int ret, i, cx, cy;
|
|
||||||
int UncompressedSize;
|
int UncompressedSize;
|
||||||
primitives_t *prims = primitives_get();
|
primitives_t *prims = primitives_get();
|
||||||
prim_size_t roi;
|
|
||||||
|
|
||||||
struct timeval T1,T2;
|
|
||||||
|
|
||||||
if (!h264)
|
if (!h264)
|
||||||
return -1;
|
return -1;
|
||||||
@ -463,23 +446,23 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize,
|
|||||||
if (!(pDstData = *ppDstData))
|
if (!(pDstData = *ppDstData))
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
|
if ((status = h264->subsystem->Decompress(h264, pSrcData, SrcSize)) < 0)
|
||||||
if ((ret = h264->subsystem->Decompress(h264, pSrcData, SrcSize)) < 0)
|
return status;
|
||||||
return ret;
|
|
||||||
|
|
||||||
|
|
||||||
UncompressedSize = h264->width * h264->height * 4;
|
UncompressedSize = h264->width * h264->height * 4;
|
||||||
|
|
||||||
if (UncompressedSize > (nDstStep * nDstHeight))
|
if (UncompressedSize > (nDstStep * nDstHeight))
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
pYUVData = h264->pYUVData;
|
pYUVData = h264->pYUVData;
|
||||||
iStride = h264->iStride;
|
iStride = h264->iStride;
|
||||||
|
|
||||||
gettimeofday(&T1,NULL);
|
for (index = 0; index < numRegionRects; index++)
|
||||||
for (i = 0; i < numRegionRects; i++){
|
{
|
||||||
rect = &(regionRects[i]);
|
rect = &(regionRects[index]);
|
||||||
cx = rect->right - rect->left;
|
|
||||||
cy = rect->bottom - rect->top;
|
width = rect->right - rect->left;
|
||||||
|
height = rect->bottom - rect->top;
|
||||||
|
|
||||||
pDstPoint = pDstData + rect->top * nDstStep + rect->left * 4;
|
pDstPoint = pDstData + rect->top * nDstStep + rect->left * 4;
|
||||||
pYUVPoint[0] = pYUVData[0] + rect->top * iStride[0] + rect->left;
|
pYUVPoint[0] = pYUVData[0] + rect->top * iStride[0] + rect->left;
|
||||||
@ -488,17 +471,15 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize,
|
|||||||
pYUVPoint[2] = pYUVData[2] + rect->top/2 * iStride[2] + rect->left/2;
|
pYUVPoint[2] = pYUVData[2] + rect->top/2 * iStride[2] + rect->left/2;
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
printf("regionRect: x: %d, y: %d, cx: %d, cy: %d\n",
|
printf("regionRect: x: %d y: %d width: %d height: %d\n",
|
||||||
rect->left, rect->top, cx, cy);
|
rect->left, rect->top, width, height);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
roi.width = cx;
|
roi.width = width;
|
||||||
roi.height = cy;
|
roi.height = height;
|
||||||
|
|
||||||
prims->YUV420ToRGB_8u_P3AC4R((const BYTE**) pYUVPoint, iStride, pDstPoint, nDstStep, &roi);
|
prims->YUV420ToRGB_8u_P3AC4R((const BYTE**) pYUVPoint, iStride, pDstPoint, nDstStep, &roi);
|
||||||
}
|
}
|
||||||
gettimeofday(&T2,NULL);
|
|
||||||
printf("converting took %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec));
|
|
||||||
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
@ -27,6 +27,16 @@
|
|||||||
#include "prim_internal.h"
|
#include "prim_internal.h"
|
||||||
#include "prim_YUV.h"
|
#include "prim_YUV.h"
|
||||||
|
|
||||||
|
/**
|
||||||
|
* | R | ( | 256 0 403 | | Y | )
|
||||||
|
* | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8
|
||||||
|
* | B | ( | 256 475 0 | | V - 128 | )
|
||||||
|
*
|
||||||
|
* | Y | ( | 54 183 18 | | R | ) | 0 |
|
||||||
|
* | U | = ( | -29 -99 128 | | G | ) >> 8 + | 128 |
|
||||||
|
* | V | ( | 128 -116 -12 | | B | ) | 128 |
|
||||||
|
*/
|
||||||
|
|
||||||
pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3],
|
pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3],
|
||||||
BYTE* pDst, int dstStep, const prim_size_t* roi)
|
BYTE* pDst, int dstStep, const prim_size_t* roi)
|
||||||
{
|
{
|
||||||
@ -45,14 +55,14 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3],
|
|||||||
int Vp403, Vp120;
|
int Vp403, Vp120;
|
||||||
BYTE* pRGB = pDst;
|
BYTE* pRGB = pDst;
|
||||||
int nWidth, nHeight;
|
int nWidth, nHeight;
|
||||||
int last_line, last_column;
|
int lastRow, lastCol;
|
||||||
|
|
||||||
pY = pSrc[0];
|
pY = pSrc[0];
|
||||||
pU = pSrc[1];
|
pU = pSrc[1];
|
||||||
pV = pSrc[2];
|
pV = pSrc[2];
|
||||||
|
|
||||||
last_column = roi->width & 0x01;
|
lastCol = roi->width & 0x01;
|
||||||
last_line = roi->height & 0x01;
|
lastRow = roi->height & 0x01;
|
||||||
|
|
||||||
nWidth = (roi->width + 1) & ~0x0001;
|
nWidth = (roi->width + 1) & ~0x0001;
|
||||||
nHeight = (roi->height + 1) & ~0x0001;
|
nHeight = (roi->height + 1) & ~0x0001;
|
||||||
@ -68,15 +78,13 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3],
|
|||||||
|
|
||||||
for (y = 0; y < halfHeight; )
|
for (y = 0; y < halfHeight; )
|
||||||
{
|
{
|
||||||
y++;
|
if (++y == halfHeight)
|
||||||
if (y == halfHeight)
|
lastRow <<= 1;
|
||||||
last_line = last_line << 1;
|
|
||||||
|
|
||||||
for (x = 0; x < halfWidth; )
|
for (x = 0; x < halfWidth; )
|
||||||
{
|
{
|
||||||
x++;
|
if (++x == halfWidth)
|
||||||
if (x == halfWidth)
|
lastCol <<= 1;
|
||||||
last_column = last_column << 1;
|
|
||||||
|
|
||||||
U = *pU++;
|
U = *pU++;
|
||||||
V = *pV++;
|
V = *pV++;
|
||||||
@ -121,7 +129,7 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3],
|
|||||||
|
|
||||||
/* 2nd pixel */
|
/* 2nd pixel */
|
||||||
|
|
||||||
if (!(last_column & 0x02))
|
if (!(lastCol & 0x02))
|
||||||
{
|
{
|
||||||
Y = *pY++;
|
Y = *pY++;
|
||||||
Yp = Y << 8;
|
Yp = Y << 8;
|
||||||
@ -154,7 +162,7 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3],
|
|||||||
{
|
{
|
||||||
pY++;
|
pY++;
|
||||||
pRGB += 4;
|
pRGB += 4;
|
||||||
last_column = last_column >> 1;
|
lastCol >>= 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -165,9 +173,8 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3],
|
|||||||
|
|
||||||
for (x = 0; x < halfWidth; )
|
for (x = 0; x < halfWidth; )
|
||||||
{
|
{
|
||||||
x++;
|
if (++x == halfWidth)
|
||||||
if (x == halfWidth)
|
lastCol <<= 1;
|
||||||
last_column = last_column << 1;
|
|
||||||
|
|
||||||
U = *pU++;
|
U = *pU++;
|
||||||
V = *pV++;
|
V = *pV++;
|
||||||
@ -212,7 +219,7 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3],
|
|||||||
|
|
||||||
/* 4th pixel */
|
/* 4th pixel */
|
||||||
|
|
||||||
if(!(last_column & 0x02))
|
if (!(lastCol & 0x02))
|
||||||
{
|
{
|
||||||
Y = *pY++;
|
Y = *pY++;
|
||||||
Yp = Y << 8;
|
Yp = Y << 8;
|
||||||
@ -245,7 +252,7 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3],
|
|||||||
{
|
{
|
||||||
pY++;
|
pY++;
|
||||||
pRGB += 4;
|
pRGB += 4;
|
||||||
last_column = last_column >> 1;
|
lastCol >>= 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -25,21 +25,17 @@
|
|||||||
pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
|
pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
|
||||||
BYTE *pDst, int dstStep, const prim_size_t *roi)
|
BYTE *pDst, int dstStep, const prim_size_t *roi)
|
||||||
{
|
{
|
||||||
char last_line,last_column;
|
int lastRow, lastCol;
|
||||||
/* last_line: if the last (U,V doubled) line should be skipped, set to 10B
|
|
||||||
* last_column: if it's the last column in a line, set to 10B (for handling line-endings not multiple by four) */
|
|
||||||
|
|
||||||
int i,nWidth,nHeight,VaddDst,VaddY,VaddU,VaddV;
|
|
||||||
|
|
||||||
BYTE *UData,*VData,*YData;
|
BYTE *UData,*VData,*YData;
|
||||||
|
int i,nWidth,nHeight,VaddDst,VaddY,VaddU,VaddV;
|
||||||
__m128i r0,r1,r2,r3,r4,r5,r6,r7;
|
__m128i r0,r1,r2,r3,r4,r5,r6,r7;
|
||||||
__m128i *buffer;
|
__m128i *buffer;
|
||||||
|
|
||||||
|
/* lastRow: if the last (U,V doubled) line should be skipped, set to binary 10 (0x02)
|
||||||
|
* lastCol: if it's the last column in a line, set to binary 10 (0x02), for handling line widths that are not a multiple of four */
|
||||||
|
|
||||||
buffer = _aligned_malloc(4 * 16, 16);
|
buffer = _aligned_malloc(4 * 16, 16);
|
||||||
|
|
||||||
|
|
||||||
YData = (BYTE*) pSrc[0];
|
YData = (BYTE*) pSrc[0];
|
||||||
UData = (BYTE*) pSrc[1];
|
UData = (BYTE*) pSrc[1];
|
||||||
VData = (BYTE*) pSrc[2];
|
VData = (BYTE*) pSrc[2];
|
||||||
@ -47,51 +43,50 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
|
|||||||
nWidth = roi->width;
|
nWidth = roi->width;
|
||||||
nHeight = roi->height;
|
nHeight = roi->height;
|
||||||
|
|
||||||
|
if ((lastCol = (nWidth & 3)))
|
||||||
|
{
|
||||||
|
switch (lastCol)
|
||||||
|
{
|
||||||
|
case 1:
|
||||||
|
r7 = _mm_set_epi32(0,0,0,0xFFFFFFFF);
|
||||||
|
break;
|
||||||
|
|
||||||
if((last_column=nWidth&3)){
|
case 2:
|
||||||
switch(last_column){
|
r7 = _mm_set_epi32(0,0,0xFFFFFFFF,0xFFFFFFFF);
|
||||||
case 1: r7=_mm_set_epi32(0,0,0,0xFFFFFFFF); break;
|
break;
|
||||||
case 2: r7=_mm_set_epi32(0,0,0xFFFFFFFF,0xFFFFFFFF); break;
|
|
||||||
case 3: r7=_mm_set_epi32(0,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF); break;
|
case 3:
|
||||||
|
r7 = _mm_set_epi32(0,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF);
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
_mm_store_si128(buffer+3,r7);
|
_mm_store_si128(buffer+3,r7);
|
||||||
last_column=1;
|
lastCol = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
nWidth += 3;
|
nWidth += 3;
|
||||||
nWidth = nWidth >> 2;
|
nWidth = nWidth >> 2;
|
||||||
|
|
||||||
|
lastRow = nHeight & 1;
|
||||||
last_line=nHeight&1;
|
|
||||||
nHeight++;
|
nHeight++;
|
||||||
nHeight = nHeight >> 1;
|
nHeight = nHeight >> 1;
|
||||||
|
|
||||||
|
|
||||||
VaddDst = (dstStep << 1) - (nWidth << 4);
|
VaddDst = (dstStep << 1) - (nWidth << 4);
|
||||||
VaddY = (srcStep[0] << 1) - (nWidth << 2);
|
VaddY = (srcStep[0] << 1) - (nWidth << 2);
|
||||||
VaddU = srcStep[1] - (((nWidth << 1) + 2) & 0xFFFC);
|
VaddU = srcStep[1] - (((nWidth << 1) + 2) & 0xFFFC);
|
||||||
VaddV = srcStep[2] - (((nWidth << 1) + 2) & 0xFFFC);
|
VaddV = srcStep[2] - (((nWidth << 1) + 2) & 0xFFFC);
|
||||||
|
|
||||||
|
while (nHeight-- > 0)
|
||||||
while(nHeight-- >0){
|
{
|
||||||
if(nHeight==0){
|
if (nHeight == 0)
|
||||||
last_line=last_line<<1;
|
lastRow <<= 1;
|
||||||
}
|
|
||||||
|
|
||||||
i = 0;
|
i = 0;
|
||||||
do{
|
|
||||||
/*
|
|
||||||
* Well, in the end it should look like this:
|
|
||||||
* C = Y;
|
|
||||||
* D = U - 128;
|
|
||||||
* E = V - 128;
|
|
||||||
*
|
|
||||||
* R = clip(( 256 * C + 403 * E + 128) >> 8);
|
|
||||||
* G = clip(( 256 * C - 48 * D - 120 * E + 128) >> 8);
|
|
||||||
* B = clip(( 256 * C + 475 * D + 128) >> 8);
|
|
||||||
*/
|
|
||||||
if(!(i&0x01)){
|
|
||||||
|
|
||||||
|
do
|
||||||
|
{
|
||||||
|
if (!(i & 0x01))
|
||||||
|
{
|
||||||
/* Y-, U- and V-data is stored in different arrays.
|
/* Y-, U- and V-data is stored in different arrays.
|
||||||
* We start with processing U-data.
|
* We start with processing U-data.
|
||||||
*
|
*
|
||||||
@ -122,7 +117,6 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
|
|||||||
r0 = _mm_unpacklo_epi16(r0,r4);
|
r0 = _mm_unpacklo_epi16(r0,r4);
|
||||||
r4 = _mm_unpackhi_epi16(r7,r4);
|
r4 = _mm_unpackhi_epi16(r7,r4);
|
||||||
|
|
||||||
|
|
||||||
/* to complete this step, add (?) 128 to each value (rounding ?!)
|
/* to complete this step, add (?) 128 to each value (rounding ?!)
|
||||||
* yeah, add. in the end this will be subtracted from something,
|
* yeah, add. in the end this will be subtracted from something,
|
||||||
* because it's part of G: 256*C - (48*D + 120*E - 128), 48*D-128 !
|
* because it's part of G: 256*C - (48*D + 120*E - 128), 48*D-128 !
|
||||||
@ -131,7 +125,6 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
|
|||||||
r0 = _mm_sub_epi32(r0,r6);
|
r0 = _mm_sub_epi32(r0,r6);
|
||||||
r4 = _mm_sub_epi32(r4,r6);
|
r4 = _mm_sub_epi32(r4,r6);
|
||||||
|
|
||||||
|
|
||||||
/* to get B data, we need to prepare a second value, D*475+128 */
|
/* to get B data, we need to prepare a second value, D*475+128 */
|
||||||
r1 = r2;
|
r1 = r2;
|
||||||
r7 = _mm_set_epi16(475,475,475,475,475,475,475,475);
|
r7 = _mm_set_epi16(475,475,475,475,475,475,475,475);
|
||||||
@ -160,7 +153,6 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
|
|||||||
|
|
||||||
r5 = r2;
|
r5 = r2;
|
||||||
|
|
||||||
|
|
||||||
/* this is also known as E*403+128, we need it to convert R data */
|
/* this is also known as E*403+128, we need it to convert R data */
|
||||||
r3 = r2;
|
r3 = r2;
|
||||||
r7 = _mm_set_epi16(403,403,403,403,403,403,403,403);
|
r7 = _mm_set_epi16(403,403,403,403,403,403,403,403);
|
||||||
@ -176,8 +168,6 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
|
|||||||
/* and preserve upper four values for future ... */
|
/* and preserve upper four values for future ... */
|
||||||
_mm_store_si128(buffer+2,r7);
|
_mm_store_si128(buffer+2,r7);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/* doing this step: E*120 */
|
/* doing this step: E*120 */
|
||||||
r3 = r5;
|
r3 = r5;
|
||||||
r7 = _mm_set_epi16(120,120,120,120,120,120,120,120);
|
r7 = _mm_set_epi16(120,120,120,120,120,120,120,120);
|
||||||
@ -194,7 +184,9 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
|
|||||||
|
|
||||||
/* and store to memory ! */
|
/* and store to memory ! */
|
||||||
_mm_store_si128(buffer,r4);
|
_mm_store_si128(buffer,r4);
|
||||||
}else{
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
/* maybe you've wondered about the conditional above ?
|
/* maybe you've wondered about the conditional above ?
|
||||||
* Well, we prepared UV data for eight pixel in each line, but can only process four
|
* Well, we prepared UV data for eight pixel in each line, but can only process four
|
||||||
* per loop. So we need to load the upper four pixel data from memory every second loop! */
|
* per loop. So we need to load the upper four pixel data from memory every second loop! */
|
||||||
@ -204,7 +196,7 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (++i == nWidth)
|
if (++i == nWidth)
|
||||||
last_column=last_column<<1;
|
lastCol <<= 1;
|
||||||
|
|
||||||
/* We didn't produce any output yet, so let's do so!
|
/* We didn't produce any output yet, so let's do so!
|
||||||
* Ok, fetch four pixel from the Y-data array and shuffle them like this:
|
* Ok, fetch four pixel from the Y-data array and shuffle them like this:
|
||||||
@ -221,7 +213,6 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
|
|||||||
r5 = _mm_sub_epi32(r5,r0);
|
r5 = _mm_sub_epi32(r5,r0);
|
||||||
r6 = _mm_add_epi32(r6,r1);
|
r6 = _mm_add_epi32(r6,r1);
|
||||||
|
|
||||||
|
|
||||||
/* in the end, we only need bytes for RGB values.
|
/* in the end, we only need bytes for RGB values.
|
||||||
* So, what do we do? right! shifting left makes values bigger and thats always good.
|
* So, what do we do? right! shifting left makes values bigger and thats always good.
|
||||||
* before we had dwords of data, and by shifting left and treating the result
|
* before we had dwords of data, and by shifting left and treating the result
|
||||||
@ -266,16 +257,15 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
|
|||||||
r7 = _mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002);
|
r7 = _mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002);
|
||||||
r6 = _mm_shuffle_epi8(r6,r7);
|
r6 = _mm_shuffle_epi8(r6,r7);
|
||||||
|
|
||||||
|
|
||||||
/* and at last we or it together and get this one:
|
/* and at last we or it together and get this one:
|
||||||
* xrgb xrgb xrgb xrgb */
|
* xrgb xrgb xrgb xrgb */
|
||||||
r4 = _mm_or_si128(r4,r5);
|
r4 = _mm_or_si128(r4,r5);
|
||||||
r4 = _mm_or_si128(r4,r6);
|
r4 = _mm_or_si128(r4,r6);
|
||||||
|
|
||||||
|
|
||||||
/* The only thing left to do now is writing data to memory, but this gets a bit more
|
/* The only thing left to do now is writing data to memory, but this gets a bit more
|
||||||
* complicated if the width is not a multiple of four and it is the last column in line. */
|
* complicated if the width is not a multiple of four and it is the last column in line. */
|
||||||
if(last_column&0x02){
|
if (lastCol & 0x02)
|
||||||
|
{
|
||||||
/* let's say, we need to only convert six pixel in width
|
/* let's say, we need to only convert six pixel in width
|
||||||
* Ok, the first 4 pixel will be converted just like every 4 pixel else, but
|
* Ok, the first 4 pixel will be converted just like every 4 pixel else, but
|
||||||
* if it's the last loop in line, last_column is shifted left by one (curious? have a look above),
|
* if it's the last loop in line, lastCol is shifted left by one (curious? have a look above),
|
||||||
@ -294,8 +284,8 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
|
|||||||
}
|
}
|
||||||
_mm_storeu_si128((__m128i *)pDst,r4);
|
_mm_storeu_si128((__m128i *)pDst,r4);
|
||||||
|
|
||||||
|
if (!(lastRow & 0x02))
|
||||||
if(!(last_line&0x02)){
|
{
|
||||||
/* Because UV data is the same for two lines, we can process the second line right here,
|
/* Because UV data is the same for two lines, we can process the second line right here,
|
||||||
* in the same loop. Only thing we need to do is to add some offsets to the Y- and destination
|
* in the same loop. Only thing we need to do is to add some offsets to the Y- and destination
|
||||||
* pointer. These offsets are iStride[0] and the target scanline.
|
* pointer. These offsets are iStride[0] and the target scanline.
|
||||||
@ -312,7 +302,6 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
|
|||||||
r5 = _mm_sub_epi32(r5,r0);
|
r5 = _mm_sub_epi32(r5,r0);
|
||||||
r6 = _mm_add_epi32(r6,r1);
|
r6 = _mm_add_epi32(r6,r1);
|
||||||
|
|
||||||
|
|
||||||
r4 = _mm_slli_epi32(r4,8);
|
r4 = _mm_slli_epi32(r4,8);
|
||||||
r5 = _mm_slli_epi32(r5,8);
|
r5 = _mm_slli_epi32(r5,8);
|
||||||
r6 = _mm_slli_epi32(r6,8);
|
r6 = _mm_slli_epi32(r6,8);
|
||||||
@ -336,12 +325,11 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
|
|||||||
r7 = _mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002);
|
r7 = _mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002);
|
||||||
r6 = _mm_shuffle_epi8(r6,r7);
|
r6 = _mm_shuffle_epi8(r6,r7);
|
||||||
|
|
||||||
|
|
||||||
r4 = _mm_or_si128(r4,r5);
|
r4 = _mm_or_si128(r4,r5);
|
||||||
r4 = _mm_or_si128(r4,r6);
|
r4 = _mm_or_si128(r4,r6);
|
||||||
|
|
||||||
|
if (lastCol & 0x02)
|
||||||
if(last_column&0x02){
|
{
|
||||||
r6 = _mm_load_si128(buffer+3);
|
r6 = _mm_load_si128(buffer+3);
|
||||||
r4 = _mm_and_si128(r4,r6);
|
r4 = _mm_and_si128(r4,r6);
|
||||||
r5 = _mm_lddqu_si128((__m128i *)(pDst+dstStep));
|
r5 = _mm_lddqu_si128((__m128i *)(pDst+dstStep));
|
||||||
@ -350,7 +338,7 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
|
|||||||
|
|
||||||
/* only thing is, we should shift last_column back here, because we have processed the last column,
|
/* only thing is, we should shift lastCol back here, because we have processed the last column,
|
||||||
* and this "special condition" can be released */
|
* and this "special condition" can be released */
|
||||||
last_column=last_column>>1;
|
lastCol >>= 1;
|
||||||
}
|
}
|
||||||
_mm_storeu_si128((__m128i *)(pDst+dstStep),r4);
|
_mm_storeu_si128((__m128i *)(pDst+dstStep),r4);
|
||||||
}
|
}
|
||||||
@ -358,8 +346,8 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
|
|||||||
/* after all we have to increase the destination- and Y-data pointer by four pixel */
|
/* after all we have to increase the destination- and Y-data pointer by four pixel */
|
||||||
pDst += 16;
|
pDst += 16;
|
||||||
YData += 4;
|
YData += 4;
|
||||||
|
}
|
||||||
}while(i<nWidth);
|
while (i < nWidth);
|
||||||
|
|
||||||
/* after each line we have to add the scanline to the destination pointer, because
|
/* after each line we have to add the scanline to the destination pointer, because
|
||||||
* we are processing two lines at once, but only increasing the destination pointer
|
* we are processing two lines at once, but only increasing the destination pointer
|
||||||
|
Loading…
Reference in New Issue
Block a user