libfreerdp-primitives: integrate H264 SSE3 color converter

This commit is contained in:
Marc-André Moreau 2014-09-09 19:15:07 -04:00
parent 666919d157
commit bcf1266f51
4 changed files with 225 additions and 268 deletions

View File

@ -44,31 +44,12 @@ struct _H264_CONTEXT
{
BOOL Compressor;
//BYTE* data;
//UINT32 size;
UINT32 width;
UINT32 height;
//int scanline;
BYTE* pYUVData[3];
int iStride[3];
/*
<<<<<<< HEAD
#ifdef WITH_OPENH264
ISVCDecoder* pDecoder;
BYTE* pYUVData[3];
int iStride[2];
#endif
#ifdef WITH_LIBAVCODEC
AVCodec* codec;
AVCodecContext* codecContext;
AVCodecParserContext* codecParser;
AVFrame* videoFrame;
#endif
=======
*/
void* pSystemData;
H264_CONTEXT_SUBSYSTEM* subsystem;
};

View File

@ -28,9 +28,6 @@
#include <freerdp/primitives.h>
#include <freerdp/codec/h264.h>
#include <sys/time.h>
/**
* Dummy subsystem
*/
@ -87,8 +84,6 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz
SSysMEMBuffer* pSystemBuffer;
H264_CONTEXT_OPENH264* sys = (H264_CONTEXT_OPENH264*) h264->pSystemData;
struct timeval T1,T2;
if (!sys->pDecoder)
return -1;
@ -102,7 +97,6 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz
ZeroMemory(&sBufferInfo, sizeof(sBufferInfo));
gettimeofday(&T1,NULL);
state = (*sys->pDecoder)->DecodeFrame2(
sys->pDecoder,
pSrcData,
@ -120,9 +114,6 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz
if (sBufferInfo.iBufferStatus != 1)
state = (*sys->pDecoder)->DecodeFrame2(sys->pDecoder, NULL, 0, h264->pYUVData, &sBufferInfo);
gettimeofday(&T2,NULL);
printf("OpenH264: decoding took: %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec));
pSystemBuffer = &sBufferInfo.UsrData.sSystemBuffer;
#if 0
@ -285,18 +276,12 @@ static int libavcodec_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcS
AVPacket packet;
H264_CONTEXT_LIBAVCODEC* sys = (H264_CONTEXT_LIBAVCODEC*) h264->pSystemData;
struct timeval T1,T2;
av_init_packet(&packet);
packet.data = pSrcData;
packet.size = SrcSize;
gettimeofday(&T1,NULL);
status = avcodec_decode_video2(sys->codecContext, sys->videoFrame, &gotFrame, &packet);
gettimeofday(&T2,NULL);
printf("libavcodec: decoding took: %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec));
if (status < 0)
{
@ -437,20 +422,18 @@ static H264_CONTEXT_SUBSYSTEM g_Subsystem_libavcodec =
int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize,
BYTE** ppDstData, DWORD DstFormat, int nDstStep, int nDstHeight, RDPGFX_RECT16* regionRects, int numRegionRects)
{
int index;
int status;
int* iStride;
BYTE* pDstData;
BYTE* pDstPoint;
prim_size_t roi;
BYTE** pYUVData;
int width, height;
BYTE* pYUVPoint[3];
RDPGFX_RECT16* rect;
int* iStride;
int ret, i, cx, cy;
int UncompressedSize;
primitives_t *prims = primitives_get();
prim_size_t roi;
struct timeval T1,T2;
if (!h264)
return -1;
@ -463,23 +446,23 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize,
if (!(pDstData = *ppDstData))
return -1;
if ((ret = h264->subsystem->Decompress(h264, pSrcData, SrcSize)) < 0)
return ret;
if ((status = h264->subsystem->Decompress(h264, pSrcData, SrcSize)) < 0)
return status;
UncompressedSize = h264->width * h264->height * 4;
if (UncompressedSize > (nDstStep * nDstHeight))
return -1;
pYUVData = h264->pYUVData;
iStride = h264->iStride;
gettimeofday(&T1,NULL);
for (i = 0; i < numRegionRects; i++){
rect = &(regionRects[i]);
cx = rect->right - rect->left;
cy = rect->bottom - rect->top;
for (index = 0; index < numRegionRects; index++)
{
rect = &(regionRects[index]);
width = rect->right - rect->left;
height = rect->bottom - rect->top;
pDstPoint = pDstData + rect->top * nDstStep + rect->left * 4;
pYUVPoint[0] = pYUVData[0] + rect->top * iStride[0] + rect->left;
@ -488,17 +471,15 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize,
pYUVPoint[2] = pYUVData[2] + rect->top/2 * iStride[2] + rect->left/2;
#if 0
printf("regionRect: x: %d, y: %d, cx: %d, cy: %d\n",
rect->left, rect->top, cx, cy);
printf("regionRect: x: %d y: %d width: %d height: %d\n",
rect->left, rect->top, width, height);
#endif
roi.width = cx;
roi.height = cy;
roi.width = width;
roi.height = height;
prims->YUV420ToRGB_8u_P3AC4R((const BYTE**) pYUVPoint, iStride, pDstPoint, nDstStep, &roi);
}
gettimeofday(&T2,NULL);
printf("converting took %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec));
return 1;
}

View File

@ -27,6 +27,16 @@
#include "prim_internal.h"
#include "prim_YUV.h"
/**
* | R | ( | 256 0 403 | | Y | )
* | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8
* | B | ( | 256 475 0 | | V - 128 | )
*
* | Y | ( | 54 183 18 | | R | ) | 0 |
* | U | = ( | -29 -99 128 | | G | ) >> 8 + | 128 |
* | V | ( | 128 -116 -12 | | B | ) | 128 |
*/
pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3],
BYTE* pDst, int dstStep, const prim_size_t* roi)
{
@ -45,14 +55,14 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3],
int Vp403, Vp120;
BYTE* pRGB = pDst;
int nWidth, nHeight;
int last_line, last_column;
int lastRow, lastCol;
pY = pSrc[0];
pU = pSrc[1];
pV = pSrc[2];
last_column = roi->width & 0x01;
last_line = roi->height & 0x01;
lastCol = roi->width & 0x01;
lastRow = roi->height & 0x01;
nWidth = (roi->width + 1) & ~0x0001;
nHeight = (roi->height + 1) & ~0x0001;
@ -68,15 +78,13 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3],
for (y = 0; y < halfHeight; )
{
y++;
if (y == halfHeight)
last_line = last_line << 1;
if (++y == halfHeight)
lastRow <<= 1;
for (x = 0; x < halfWidth; )
{
x++;
if (x == halfWidth)
last_column = last_column << 1;
if (++x == halfWidth)
lastCol <<= 1;
U = *pU++;
V = *pV++;
@ -121,7 +129,7 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3],
/* 2nd pixel */
if (!(last_column & 0x02))
if (!(lastCol & 0x02))
{
Y = *pY++;
Yp = Y << 8;
@ -154,7 +162,7 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3],
{
pY++;
pRGB += 4;
last_column = last_column >> 1;
lastCol >>= 1;
}
}
@ -165,9 +173,8 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3],
for (x = 0; x < halfWidth; )
{
x++;
if (x == halfWidth)
last_column = last_column << 1;
if (++x == halfWidth)
lastCol <<= 1;
U = *pU++;
V = *pV++;
@ -212,7 +219,7 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3],
/* 4th pixel */
if(!(last_column & 0x02))
if (!(lastCol & 0x02))
{
Y = *pY++;
Yp = Y << 8;
@ -245,7 +252,7 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3],
{
pY++;
pRGB += 4;
last_column = last_column >> 1;
lastCol >>= 1;
}
}

View File

@ -25,73 +25,68 @@
pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
BYTE *pDst, int dstStep, const prim_size_t *roi)
{
char last_line,last_column;
/* last_line: if the last (U,V doubled) line should be skipped, set to 10B
* last_column: if it's the last column in a line, set to 10B (for handling line-endings not multiple by four) */
int i,nWidth,nHeight,VaddDst,VaddY,VaddU,VaddV;
int lastRow, lastCol;
BYTE *UData,*VData,*YData;
int i,nWidth,nHeight,VaddDst,VaddY,VaddU,VaddV;
__m128i r0,r1,r2,r3,r4,r5,r6,r7;
__m128i *buffer;
/* lastRow: set to 10b (binary) if the last (U,V doubled) row should be skipped
* lastCol: set to 10b (binary) on the last column of a row (for handling widths that are not a multiple of four) */
buffer=_aligned_malloc(4*16,16);
buffer = _aligned_malloc(4 * 16, 16);
YData = (BYTE*) pSrc[0];
UData = (BYTE*) pSrc[1];
VData = (BYTE*) pSrc[2];
YData=(BYTE *)pSrc[0];
UData=(BYTE *)pSrc[1];
VData=(BYTE *)pSrc[2];
nWidth = roi->width;
nHeight = roi->height;
nWidth=roi->width;
nHeight=roi->height;
if ((lastCol = (nWidth & 3)))
{
switch (lastCol)
{
case 1:
r7 = _mm_set_epi32(0,0,0,0xFFFFFFFF);
break;
case 2:
r7 = _mm_set_epi32(0,0,0xFFFFFFFF,0xFFFFFFFF);
break;
if((last_column=nWidth&3)){
switch(last_column){
case 1: r7=_mm_set_epi32(0,0,0,0xFFFFFFFF); break;
case 2: r7=_mm_set_epi32(0,0,0xFFFFFFFF,0xFFFFFFFF); break;
case 3: r7=_mm_set_epi32(0,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF); break;
case 3:
r7 = _mm_set_epi32(0,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF);
break;
}
_mm_store_si128(buffer+3,r7);
last_column=1;
lastCol = 1;
}
nWidth+=3;
nWidth=nWidth>>2;
nWidth += 3;
nWidth = nWidth >> 2;
last_line=nHeight&1;
lastRow = nHeight & 1;
nHeight++;
nHeight=nHeight>>1;
nHeight = nHeight >> 1;
VaddDst = (dstStep << 1) - (nWidth << 4);
VaddY = (srcStep[0] << 1) - (nWidth << 2);
VaddU = srcStep[1] - (((nWidth << 1) + 2) & 0xFFFC);
VaddV = srcStep[2] - (((nWidth << 1) + 2) & 0xFFFC);
VaddDst=(dstStep<<1)-(nWidth<<4);
VaddY=(srcStep[0]<<1)-(nWidth<<2);
VaddU=srcStep[1]-(((nWidth<<1)+2)&0xFFFC);
VaddV=srcStep[2]-(((nWidth<<1)+2)&0xFFFC);
while (nHeight-- > 0)
{
if (nHeight == 0)
lastRow <<= 1;
i = 0;
while(nHeight-- >0){
if(nHeight==0){
last_line=last_line<<1;
}
i=0;
do{
/*
* Well, in the end it should look like this:
* C = Y;
* D = U - 128;
* E = V - 128;
*
* R = clip(( 256 * C + 403 * E + 128) >> 8);
* G = clip(( 256 * C - 48 * D - 120 * E + 128) >> 8);
* B = clip(( 256 * C + 475 * D + 128) >> 8);
*/
if(!(i&0x01)){
do
{
if (!(i & 0x01))
{
/* Y-, U- and V-data is stored in different arrays.
* We start with processing U-data.
*
@ -99,50 +94,48 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
* 0d0d 0c0c 0b0b 0a0a
* we've done two things: converting the values to signed words and duplicating
* each value, because always two pixel "share" the same U- (and V-) data */
r0=_mm_cvtsi32_si128(*(UINT32 *)UData);
r5=_mm_set_epi32(0x80038003,0x80028002,0x80018001,0x80008000);
r0=_mm_shuffle_epi8(r0,r5);
r0 = _mm_cvtsi32_si128(*(UINT32 *)UData);
r5 = _mm_set_epi32(0x80038003,0x80028002,0x80018001,0x80008000);
r0 = _mm_shuffle_epi8(r0,r5);
UData+=4;
UData += 4;
/* then we subtract 128 from each value, so we get D */
r3=_mm_set_epi16(128,128,128,128,128,128,128,128);
r0=_mm_subs_epi16(r0,r3);
r3 = _mm_set_epi16(128,128,128,128,128,128,128,128);
r0 = _mm_subs_epi16(r0,r3);
/* we need to do two things with our D, so let's store it for later use */
r2=r0;
r2 = r0;
/* now we can multiply our D with 48 and unpack it to xmm4:xmm0
* this is what we need to get G data later on */
r4=r0;
r7=_mm_set_epi16(48,48,48,48,48,48,48,48);
r0=_mm_mullo_epi16(r0,r7);
r4=_mm_mulhi_epi16(r4,r7);
r7=r0;
r0=_mm_unpacklo_epi16(r0,r4);
r4=_mm_unpackhi_epi16(r7,r4);
r4 = r0;
r7 = _mm_set_epi16(48,48,48,48,48,48,48,48);
r0 = _mm_mullo_epi16(r0,r7);
r4 = _mm_mulhi_epi16(r4,r7);
r7 = r0;
r0 = _mm_unpacklo_epi16(r0,r4);
r4 = _mm_unpackhi_epi16(r7,r4);
/* to complete this step, subtract 128 from each value (rounding term).
* Subtracting here amounts to adding 128 in the end, because this whole term is later
* subtracted as part of G: 256*C - (48*D + 120*E - 128); so far we have 48*D - 128.
* by the way, our values have become signed dwords during multiplication! */
r6=_mm_set_epi32(128,128,128,128);
r0=_mm_sub_epi32(r0,r6);
r4=_mm_sub_epi32(r4,r6);
r6 = _mm_set_epi32(128,128,128,128);
r0 = _mm_sub_epi32(r0,r6);
r4 = _mm_sub_epi32(r4,r6);
/* to get B data, we need to prepare a second value, D*475+128 */
r1=r2;
r7=_mm_set_epi16(475,475,475,475,475,475,475,475);
r1=_mm_mullo_epi16(r1,r7);
r2=_mm_mulhi_epi16(r2,r7);
r7=r1;
r1=_mm_unpacklo_epi16(r1,r2);
r7=_mm_unpackhi_epi16(r7,r2);
r1 = r2;
r7 = _mm_set_epi16(475,475,475,475,475,475,475,475);
r1 = _mm_mullo_epi16(r1,r7);
r2 = _mm_mulhi_epi16(r2,r7);
r7 = r1;
r1 = _mm_unpacklo_epi16(r1,r2);
r7 = _mm_unpackhi_epi16(r7,r2);
r1=_mm_add_epi32(r1,r6);
r7=_mm_add_epi32(r7,r6);
r1 = _mm_add_epi32(r1,r6);
r7 = _mm_add_epi32(r7,r6);
/* so we got something like this: xmm7:xmm1
* this pair contains values for 16 pixel:
@ -151,76 +144,74 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
_mm_store_si128(buffer+1,r7);
/* Now we've prepared U-data. Preparing V-data is actually the same, just with other coefficients */
r2=_mm_cvtsi32_si128(*(UINT32 *)VData);
r2=_mm_shuffle_epi8(r2,r5);
r2 = _mm_cvtsi32_si128(*(UINT32 *)VData);
r2 = _mm_shuffle_epi8(r2,r5);
VData+=4;
VData += 4;
r2=_mm_subs_epi16(r2,r3);
r5=r2;
r2 = _mm_subs_epi16(r2,r3);
r5 = r2;
/* this is also known as E*403+128, we need it to convert R data */
r3=r2;
r7=_mm_set_epi16(403,403,403,403,403,403,403,403);
r2=_mm_mullo_epi16(r2,r7);
r3=_mm_mulhi_epi16(r3,r7);
r7=r2;
r2=_mm_unpacklo_epi16(r2,r3);
r7=_mm_unpackhi_epi16(r7,r3);
r3 = r2;
r7 = _mm_set_epi16(403,403,403,403,403,403,403,403);
r2 = _mm_mullo_epi16(r2,r7);
r3 = _mm_mulhi_epi16(r3,r7);
r7 = r2;
r2 = _mm_unpacklo_epi16(r2,r3);
r7 = _mm_unpackhi_epi16(r7,r3);
r2=_mm_add_epi32(r2,r6);
r7=_mm_add_epi32(r7,r6);
r2 = _mm_add_epi32(r2,r6);
r7 = _mm_add_epi32(r7,r6);
/* and preserve upper four values for future ... */
_mm_store_si128(buffer+2,r7);
/* doing this step: E*120 */
r3=r5;
r7=_mm_set_epi16(120,120,120,120,120,120,120,120);
r3=_mm_mullo_epi16(r3,r7);
r5=_mm_mulhi_epi16(r5,r7);
r7=r3;
r3=_mm_unpacklo_epi16(r3,r5);
r7=_mm_unpackhi_epi16(r7,r5);
r3 = r5;
r7 = _mm_set_epi16(120,120,120,120,120,120,120,120);
r3 = _mm_mullo_epi16(r3,r7);
r5 = _mm_mulhi_epi16(r5,r7);
r7 = r3;
r3 = _mm_unpacklo_epi16(r3,r5);
r7 = _mm_unpackhi_epi16(r7,r5);
/* now we complete what we've begun above:
* (48*D-128) + (120*E) = (48*D +120*E -128) */
r0=_mm_add_epi32(r0,r3);
r4=_mm_add_epi32(r4,r7);
r0 = _mm_add_epi32(r0,r3);
r4 = _mm_add_epi32(r4,r7);
/* and store to memory ! */
_mm_store_si128(buffer,r4);
}else{
}
else
{
/* Maybe you've wondered about the conditional above?
* Well, we prepared UV data for eight pixels in each line, but can only process four
* per loop iteration. So we need to load the data for the upper four pixels from memory on every second iteration! */
r1=_mm_load_si128(buffer+1);
r2=_mm_load_si128(buffer+2);
r0=_mm_load_si128(buffer);
r1 = _mm_load_si128(buffer+1);
r2 = _mm_load_si128(buffer+2);
r0 = _mm_load_si128(buffer);
}
if(++i==nWidth)
last_column=last_column<<1;
if (++i == nWidth)
lastCol <<= 1;
/* We didn't produce any output yet, so let's do so!
* Ok, fetch four pixel from the Y-data array and shuffle them like this:
* 00d0 00c0 00b0 00a0, to get signed dwords and multiply by 256 */
r4=_mm_cvtsi32_si128(*(UINT32 *)YData);
r7=_mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080);
r4=_mm_shuffle_epi8(r4,r7);
r4 = _mm_cvtsi32_si128(*(UINT32 *)YData);
r7 = _mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080);
r4 = _mm_shuffle_epi8(r4,r7);
r5=r4;
r6=r4;
r5 = r4;
r6 = r4;
/* now we can perform the "real" conversion itself and produce output! */
r4=_mm_add_epi32(r4,r2);
r5=_mm_sub_epi32(r5,r0);
r6=_mm_add_epi32(r6,r1);
r4 = _mm_add_epi32(r4,r2);
r5 = _mm_sub_epi32(r5,r0);
r6 = _mm_add_epi32(r6,r1);
/* in the end, we only need bytes for RGB values.
* So, what do we do? right! shifting left makes values bigger and thats always good.
@ -228,9 +219,9 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
* as packed words, we get not only signed words, but do also divide by 256
* imagine, data is now ordered this way: ddx0 ccx0 bbx0 aax0, and x is the least
* significant byte, that we don't need anymore, because we've done some rounding */
r4=_mm_slli_epi32(r4,8);
r5=_mm_slli_epi32(r5,8);
r6=_mm_slli_epi32(r6,8);
r4 = _mm_slli_epi32(r4,8);
r5 = _mm_slli_epi32(r5,8);
r6 = _mm_slli_epi32(r6,8);
/* one thing we still have to face is the clip() function ...
* we have still signed words, and there are those min/max instructions in SSE2 ...
@ -238,128 +229,125 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
* and it operates with signs !
* if we feed it with our values and zeros, it takes the zeros if our values are smaller than
* zero and otherwise our values */
r7=_mm_set_epi32(0,0,0,0);
r4=_mm_max_epi16(r4,r7);
r5=_mm_max_epi16(r5,r7);
r6=_mm_max_epi16(r6,r7);
r7 = _mm_set_epi32(0,0,0,0);
r4 = _mm_max_epi16(r4,r7);
r5 = _mm_max_epi16(r5,r7);
r6 = _mm_max_epi16(r6,r7);
/* the same thing just completely different can be used to limit our values to 255,
* but now using the min instruction and 255s */
r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
r4=_mm_min_epi16(r4,r7);
r5=_mm_min_epi16(r5,r7);
r6=_mm_min_epi16(r6,r7);
r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
r4 = _mm_min_epi16(r4,r7);
r5 = _mm_min_epi16(r5,r7);
r6 = _mm_min_epi16(r6,r7);
/* Now we have our bytes.
* The moment has come to assemble the three channels R, G and B into the xrgb dwords.
* For the Red channel we just have to AND each future dword with 00FF0000H */
//r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
r4=_mm_and_si128(r4,r7);
r4 = _mm_and_si128(r4,r7);
/* on Green channel we have to shuffle somehow, so we get something like this:
* 00d0 00c0 00b0 00a0 */
r7=_mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280);
r5=_mm_shuffle_epi8(r5,r7);
r7 = _mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280);
r5 = _mm_shuffle_epi8(r5,r7);
/* and on Blue channel that one:
* 000d 000c 000b 000a */
r7=_mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002);
r6=_mm_shuffle_epi8(r6,r7);
r7 = _mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002);
r6 = _mm_shuffle_epi8(r6,r7);
/* and at last we or it together and get this one:
* xrgb xrgb xrgb xrgb */
r4=_mm_or_si128(r4,r5);
r4=_mm_or_si128(r4,r6);
r4 = _mm_or_si128(r4,r5);
r4 = _mm_or_si128(r4,r6);
/* The only thing left to do now is writing the data to memory, but this gets a bit more
* complicated if the width is not a multiple of four and it is the last column in the line. */
if(last_column&0x02){
if (lastCol & 0x02)
{
/* let's say we only need to convert six pixels in width.
* Ok, the first 4 pixels will be converted just like any other group of 4 pixels, but
* if it's the last iteration in the line, the last-column flag is shifted left by one (curious? have a look above),
* and we land here. During initialisation a mask was prepared. In this case (two valid pixels) it looks like
* 00000000H 00000000H FFFFFFFFH FFFFFFFFH (highest dword first) */
r6=_mm_load_si128(buffer+3);
r6 = _mm_load_si128(buffer+3);
/* we and our output data with this mask to get only the valid pixel */
r4=_mm_and_si128(r4,r6);
r4 = _mm_and_si128(r4,r6);
/* then we fetch memory from the destination array ... */
r5=_mm_lddqu_si128((__m128i *)pDst);
r5 = _mm_lddqu_si128((__m128i *)pDst);
/* ... and and it with the inverse mask. We get only those pixel, which should not be updated */
r6=_mm_andnot_si128(r6,r5);
r6 = _mm_andnot_si128(r6,r5);
/* we only have to or the two values together and write it back to the destination array,
* and only the pixel that should be updated really get changed. */
r4=_mm_or_si128(r4,r6);
r4 = _mm_or_si128(r4,r6);
}
_mm_storeu_si128((__m128i *)pDst,r4);
if(!(last_line&0x02)){
if (!(lastRow & 0x02))
{
/* Because UV data is the same for two lines, we can process the second line right here,
* in the same loop. The only thing we need to do is to add some offsets to the Y- and destination
* pointers. These offsets are srcStep[0] and dstStep (the target scanline).
* But if we don't need to process the second line, e.g. if we are in the last line when processing nine lines,
* we just skip all this. */
r4=_mm_cvtsi32_si128(*(UINT32 *)(YData+srcStep[0]));
r7=_mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080);
r4=_mm_shuffle_epi8(r4,r7);
r4 = _mm_cvtsi32_si128(*(UINT32 *)(YData+srcStep[0]));
r7 = _mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080);
r4 = _mm_shuffle_epi8(r4,r7);
r5=r4;
r6=r4;
r5 = r4;
r6 = r4;
r4=_mm_add_epi32(r4,r2);
r5=_mm_sub_epi32(r5,r0);
r6=_mm_add_epi32(r6,r1);
r4 = _mm_add_epi32(r4,r2);
r5 = _mm_sub_epi32(r5,r0);
r6 = _mm_add_epi32(r6,r1);
r4 = _mm_slli_epi32(r4,8);
r5 = _mm_slli_epi32(r5,8);
r6 = _mm_slli_epi32(r6,8);
r4=_mm_slli_epi32(r4,8);
r5=_mm_slli_epi32(r5,8);
r6=_mm_slli_epi32(r6,8);
r7 = _mm_set_epi32(0,0,0,0);
r4 = _mm_max_epi16(r4,r7);
r5 = _mm_max_epi16(r5,r7);
r6 = _mm_max_epi16(r6,r7);
r7=_mm_set_epi32(0,0,0,0);
r4=_mm_max_epi16(r4,r7);
r5=_mm_max_epi16(r5,r7);
r6=_mm_max_epi16(r6,r7);
r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
r4 = _mm_min_epi16(r4,r7);
r5 = _mm_min_epi16(r5,r7);
r6 = _mm_min_epi16(r6,r7);
r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
r4=_mm_min_epi16(r4,r7);
r5=_mm_min_epi16(r5,r7);
r6=_mm_min_epi16(r6,r7);
r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
r4 = _mm_and_si128(r4,r7);
r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
r4=_mm_and_si128(r4,r7);
r7 = _mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280);
r5 = _mm_shuffle_epi8(r5,r7);
r7=_mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280);
r5=_mm_shuffle_epi8(r5,r7);
r7 = _mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002);
r6 = _mm_shuffle_epi8(r6,r7);
r7=_mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002);
r6=_mm_shuffle_epi8(r6,r7);
r4 = _mm_or_si128(r4,r5);
r4 = _mm_or_si128(r4,r6);
r4=_mm_or_si128(r4,r5);
r4=_mm_or_si128(r4,r6);
if(last_column&0x02){
r6=_mm_load_si128(buffer+3);
r4=_mm_and_si128(r4,r6);
r5=_mm_lddqu_si128((__m128i *)(pDst+dstStep));
r6=_mm_andnot_si128(r6,r5);
r4=_mm_or_si128(r4,r6);
if (lastCol & 0x02)
{
r6 = _mm_load_si128(buffer+3);
r4 = _mm_and_si128(r4,r6);
r5 = _mm_lddqu_si128((__m128i *)(pDst+dstStep));
r6 = _mm_andnot_si128(r6,r5);
r4 = _mm_or_si128(r4,r6);
/* the only thing left is to shift the last-column flag back here, because we have processed the last column,
* and this "special condition" can be released */
last_column=last_column>>1;
lastCol >>= 1;
}
_mm_storeu_si128((__m128i *)(pDst+dstStep),r4);
}
/* after all we have to increase the destination- and Y-data pointer by four pixel */
pDst+=16;
YData+=4;
}while(i<nWidth);
pDst += 16;
YData += 4;
}
while (i < nWidth);
/* after each line we have to add the scanline to the destination pointer, because
* we are processing two lines at once, but only increasing the destination pointer
@ -368,15 +356,15 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
* if we're not converting the full width of the scanline, like only 64 pixel, but the
* output buffer was "designed" for 1920p HD, we have to add the remaining length for each line,
* to get into the next line. */
pDst+=VaddDst;
pDst += VaddDst;
/* same thing has to be done for Y-data, but with iStride[0] instead of the target scanline */
YData+=VaddY;
YData += VaddY;
/* and again for UV data, but here it's enough to add the remaining length, because
* UV data is the same for two lines and there exists only one "UV line" on two "real lines" */
UData+=VaddU;
VData+=VaddV;
UData += VaddU;
VData += VaddV;
}
_aligned_free(buffer);
@ -388,9 +376,9 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
void primitives_init_YUV_opt(primitives_t *prims)
{
#ifdef WITH_SSE2
if(IsProcessorFeaturePresentEx(PF_EX_SSSE3)&&IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) && IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
{
prims->YUV420ToRGB_8u_P3AC4R=ssse3_YUV420ToRGB_8u_P3AC4R;
prims->YUV420ToRGB_8u_P3AC4R = ssse3_YUV420ToRGB_8u_P3AC4R;
}
#endif
}