From bcf1266f517f07212e737fd24bba548a93157a37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Moreau?= Date: Tue, 9 Sep 2014 19:15:07 -0400 Subject: [PATCH] libfreerdp-primitives: integrate H264 SSE3 color converter --- include/freerdp/codec/h264.h | 19 -- libfreerdp/codec/h264.c | 55 ++-- libfreerdp/primitives/prim_YUV.c | 39 +-- libfreerdp/primitives/prim_YUV_opt.c | 380 +++++++++++++-------------- 4 files changed, 225 insertions(+), 268 deletions(-) diff --git a/include/freerdp/codec/h264.h b/include/freerdp/codec/h264.h index 969914709..e539cb0b3 100644 --- a/include/freerdp/codec/h264.h +++ b/include/freerdp/codec/h264.h @@ -44,31 +44,12 @@ struct _H264_CONTEXT { BOOL Compressor; - //BYTE* data; - //UINT32 size; UINT32 width; UINT32 height; - //int scanline; - BYTE* pYUVData[3]; int iStride[3]; - -/* -<<<<<<< HEAD -#ifdef WITH_OPENH264 - ISVCDecoder* pDecoder; BYTE* pYUVData[3]; - int iStride[2]; -#endif -#ifdef WITH_LIBAVCODEC - AVCodec* codec; - AVCodecContext* codecContext; - AVCodecParserContext* codecParser; - AVFrame* videoFrame; -#endif -======= -*/ void* pSystemData; H264_CONTEXT_SUBSYSTEM* subsystem; }; diff --git a/libfreerdp/codec/h264.c b/libfreerdp/codec/h264.c index 5f8f688ab..cf5d2be58 100644 --- a/libfreerdp/codec/h264.c +++ b/libfreerdp/codec/h264.c @@ -28,9 +28,6 @@ #include #include -#include - - /** * Dummy subsystem */ @@ -87,8 +84,6 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz SSysMEMBuffer* pSystemBuffer; H264_CONTEXT_OPENH264* sys = (H264_CONTEXT_OPENH264*) h264->pSystemData; - struct timeval T1,T2; - if (!sys->pDecoder) return -1; @@ -102,7 +97,6 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz ZeroMemory(&sBufferInfo, sizeof(sBufferInfo)); - gettimeofday(&T1,NULL); state = (*sys->pDecoder)->DecodeFrame2( sys->pDecoder, pSrcData, @@ -119,9 +113,6 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz if (sBufferInfo.iBufferStatus != 1) state = (*sys->pDecoder)->DecodeFrame2(sys->pDecoder, NULL, 0, h264->pYUVData, &sBufferInfo); - - gettimeofday(&T2,NULL); - printf("OpenH264: decoding took: %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec)); pSystemBuffer = &sBufferInfo.UsrData.sSystemBuffer; @@ -285,18 +276,12 @@ static int libavcodec_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcS AVPacket packet; H264_CONTEXT_LIBAVCODEC* sys = (H264_CONTEXT_LIBAVCODEC*) h264->pSystemData; - struct timeval T1,T2; - av_init_packet(&packet); packet.data = pSrcData; packet.size = SrcSize; - gettimeofday(&T1,NULL); status = avcodec_decode_video2(sys->codecContext, sys->videoFrame, &gotFrame, &packet); - gettimeofday(&T2,NULL); - - printf("libavcodec: decoding took: %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec)); if (status < 0) { @@ -437,20 +422,18 @@ static H264_CONTEXT_SUBSYSTEM g_Subsystem_libavcodec = int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, BYTE** ppDstData, DWORD DstFormat, int nDstStep, int nDstHeight, RDPGFX_RECT16* regionRects, int numRegionRects) { + int index; + int status; + int* iStride; BYTE* pDstData; BYTE* pDstPoint; - + prim_size_t roi; BYTE** pYUVData; + int width, height; BYTE* pYUVPoint[3]; - RDPGFX_RECT16* rect; - int* iStride; - int ret, i, cx, cy; int UncompressedSize; primitives_t *prims = primitives_get(); - prim_size_t roi; - - struct timeval T1,T2; if (!h264) return -1; @@ -463,23 +446,23 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, if (!(pDstData = *ppDstData)) return -1; - - if ((ret = h264->subsystem->Decompress(h264, pSrcData, SrcSize)) < 0) - return ret; - + if ((status = h264->subsystem->Decompress(h264, pSrcData, SrcSize)) < 0) + return status; UncompressedSize = h264->width * h264->height * 4; + if (UncompressedSize > (nDstStep * nDstHeight)) return -1; pYUVData = h264->pYUVData; iStride = h264->iStride; - gettimeofday(&T1,NULL); - for (i = 0; i < numRegionRects; i++){ - rect = &(regionRects[i]); - cx = rect->right - rect->left; - cy = rect->bottom - rect->top; + for (index = 0; index < numRegionRects; index++) + { + rect = &(regionRects[index]); + + width = rect->right - rect->left; + height = rect->bottom - rect->top; pDstPoint = pDstData + rect->top * nDstStep + rect->left * 4; pYUVPoint[0] = pYUVData[0] + rect->top * iStride[0] + rect->left; @@ -488,17 +471,15 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, pYUVPoint[2] = pYUVData[2] + rect->top/2 * iStride[2] + rect->left/2; #if 0 - printf("regionRect: x: %d, y: %d, cx: %d, cy: %d\n", - rect->left, rect->top, cx, cy); + printf("regionRect: x: %d y: %d width: %d height: %d\n", + rect->left, rect->top, width, height); #endif - roi.width = cx; - roi.height = cy; + roi.width = width; + roi.height = height; prims->YUV420ToRGB_8u_P3AC4R((const BYTE**) pYUVPoint, iStride, pDstPoint, nDstStep, &roi); } - gettimeofday(&T2,NULL); - printf("converting took %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec)); return 1; } diff --git a/libfreerdp/primitives/prim_YUV.c b/libfreerdp/primitives/prim_YUV.c index 0425c9e8f..24ff1a49a 100644 --- a/libfreerdp/primitives/prim_YUV.c +++ b/libfreerdp/primitives/prim_YUV.c @@ -27,6 +27,16 @@ #include "prim_internal.h" #include "prim_YUV.h" +/** + * | R | ( | 256 0 403 | | Y | ) + * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8 + * | B | ( | 256 475 0 | | V - 128 | ) + * + * | Y | ( | 54 183 18 | | R | ) | 0 | + * | U | = ( | -29 -99 128 | | G | ) >> 8 + | 128 | + * | V | ( | 128 -116 -12 | | B | ) | 128 | + */ + pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3], BYTE* pDst, int dstStep, const prim_size_t* roi) { @@ -45,14 +55,14 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3], int Vp403, Vp120; BYTE* pRGB = pDst; int nWidth, nHeight; - int last_line, last_column; + int lastRow, lastCol; pY = pSrc[0]; pU = pSrc[1]; pV = pSrc[2]; - last_column = roi->width & 0x01; - last_line = roi->height & 0x01; + lastCol = roi->width & 0x01; + lastRow = roi->height & 0x01; nWidth = (roi->width + 1) & ~0x0001; nHeight = (roi->height + 1) & ~0x0001; @@ -68,15 +78,13 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3], for (y = 0; y < halfHeight; ) { - y++; - if (y == halfHeight) - last_line = last_line << 1; + if (++y == halfHeight) + lastRow <<= 1; for (x = 0; x < halfWidth; ) { - x++; - if (x == halfWidth) - last_column = last_column << 1; + if (++x == halfWidth) + lastCol <<= 1; U = *pU++; V = *pV++; @@ -121,7 +129,7 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3], /* 2nd pixel */ - if (!(last_column & 0x02)) + if (!(lastCol & 0x02)) { Y = *pY++; Yp = Y << 8; @@ -154,7 +162,7 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3], { pY++; pRGB += 4; - last_column = last_column >> 1; + lastCol >>= 1; } } @@ -165,9 +173,8 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3], for (x = 0; x < halfWidth; ) { - x++; - if (x == halfWidth) - last_column = last_column << 1; + if (++x == halfWidth) + lastCol <<= 1; U = *pU++; V = *pV++; @@ -212,7 +219,7 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3], /* 4th pixel */ - if(!(last_column & 0x02)) + if (!(lastCol & 0x02)) { Y = *pY++; Yp = Y << 8; @@ -245,7 +252,7 @@ pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], int srcStep[3], { pY++; pRGB += 4; - last_column = last_column >> 1; + lastCol >>= 1; } } diff --git a/libfreerdp/primitives/prim_YUV_opt.c b/libfreerdp/primitives/prim_YUV_opt.c index a8010b9d3..eaf7bf6d7 100644 --- a/libfreerdp/primitives/prim_YUV_opt.c +++ b/libfreerdp/primitives/prim_YUV_opt.c @@ -25,73 +25,68 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, BYTE *pDst, int dstStep, const prim_size_t *roi) { - char last_line,last_column; -/* last_line: if the last (U,V doubled) line should be skipped, set to 10B - * last_column: if it's the last column in a line, set to 10B (for handling line-endings not multiple by four) */ - - int i,nWidth,nHeight,VaddDst,VaddY,VaddU,VaddV; - + int lastRow, lastCol; BYTE *UData,*VData,*YData; - + int i,nWidth,nHeight,VaddDst,VaddY,VaddU,VaddV; __m128i r0,r1,r2,r3,r4,r5,r6,r7; __m128i *buffer; + /* last_line: if the last (U,V doubled) line should be skipped, set to 10B + * last_column: if it's the last column in a line, set to 10B (for handling line-endings not multiple by four) */ + + buffer = _aligned_malloc(4 * 16, 16); - buffer=_aligned_malloc(4*16,16); + YData = (BYTE*) pSrc[0]; + UData = (BYTE*) pSrc[1]; + VData = (BYTE*) pSrc[2]; + nWidth = roi->width; + nHeight = roi->height; - YData=(BYTE *)pSrc[0]; - UData=(BYTE *)pSrc[1]; - VData=(BYTE *)pSrc[2]; - - nWidth=roi->width; - nHeight=roi->height; - - - if((last_column=nWidth&3)){ - switch(last_column){ - case 1: r7=_mm_set_epi32(0,0,0,0xFFFFFFFF); break; - case 2: r7=_mm_set_epi32(0,0,0xFFFFFFFF,0xFFFFFFFF); break; - case 3: r7=_mm_set_epi32(0,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF); break; + if ((lastCol = (nWidth & 3))) + { + switch (lastCol) + { + case 1: + r7 = _mm_set_epi32(0,0,0,0xFFFFFFFF); + break; + + case 2: + r7 = _mm_set_epi32(0,0,0xFFFFFFFF,0xFFFFFFFF); + break; + + case 3: + r7 = _mm_set_epi32(0,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF); + break; } + _mm_store_si128(buffer+3,r7); - last_column=1; + lastCol = 1; } - nWidth+=3; - nWidth=nWidth>>2; + nWidth += 3; + nWidth = nWidth >> 2; - - last_line=nHeight&1; + lastRow = nHeight & 1; nHeight++; - nHeight=nHeight>>1; + nHeight = nHeight >> 1; + VaddDst = (dstStep << 1) - (nWidth << 4); + VaddY = (srcStep[0] << 1) - (nWidth << 2); + VaddU = srcStep[1] - (((nWidth << 1) + 2) & 0xFFFC); + VaddV = srcStep[2] - (((nWidth << 1) + 2) & 0xFFFC); - VaddDst=(dstStep<<1)-(nWidth<<4); - VaddY=(srcStep[0]<<1)-(nWidth<<2); - VaddU=srcStep[1]-(((nWidth<<1)+2)&0xFFFC); - VaddV=srcStep[2]-(((nWidth<<1)+2)&0xFFFC); - - - while(nHeight-- >0){ - if(nHeight==0){ - last_line=last_line<<1; - } + while (nHeight-- > 0) + { + if (nHeight == 0) + lastRow <<= 1; + + i = 0; - i=0; - do{ -/* - * Well, in the end it should look like this: - * C = Y; - * D = U - 128; - * E = V - 128; - * - * R = clip(( 256 * C + 403 * E + 128) >> 8); - * G = clip(( 256 * C - 48 * D - 120 * E + 128) >> 8); - * B = clip(( 256 * C + 475 * D + 128) >> 8); - */ - if(!(i&0x01)){ - + do + { + if (!(i & 0x01)) + { /* Y-, U- and V-data is stored in different arrays. * We start with processing U-data. * @@ -99,50 +94,48 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, * 0d0d 0c0c 0b0b 0a0a * we've done two things: converting the values to signed words and duplicating * each value, because always two pixel "share" the same U- (and V-) data */ - r0=_mm_cvtsi32_si128(*(UINT32 *)UData); - r5=_mm_set_epi32(0x80038003,0x80028002,0x80018001,0x80008000); - r0=_mm_shuffle_epi8(r0,r5); + r0 = _mm_cvtsi32_si128(*(UINT32 *)UData); + r5 = _mm_set_epi32(0x80038003,0x80028002,0x80018001,0x80008000); + r0 = _mm_shuffle_epi8(r0,r5); - UData+=4; + UData += 4; /* then we subtract 128 from each value, so we get D */ - r3=_mm_set_epi16(128,128,128,128,128,128,128,128); - r0=_mm_subs_epi16(r0,r3); + r3 = _mm_set_epi16(128,128,128,128,128,128,128,128); + r0 = _mm_subs_epi16(r0,r3); /* we need to do two things with our D, so let's store it for later use */ - r2=r0; + r2 = r0; /* now we can multiply our D with 48 and unpack it to xmm4:xmm0 * this is what we need to get G data later on */ - r4=r0; - r7=_mm_set_epi16(48,48,48,48,48,48,48,48); - r0=_mm_mullo_epi16(r0,r7); - r4=_mm_mulhi_epi16(r4,r7); - r7=r0; - r0=_mm_unpacklo_epi16(r0,r4); - r4=_mm_unpackhi_epi16(r7,r4); - + r4 = r0; + r7 = _mm_set_epi16(48,48,48,48,48,48,48,48); + r0 = _mm_mullo_epi16(r0,r7); + r4 = _mm_mulhi_epi16(r4,r7); + r7 = r0; + r0 = _mm_unpacklo_epi16(r0,r4); + r4 = _mm_unpackhi_epi16(r7,r4); /* to complete this step, add (?) 128 to each value (rounding ?!) * yeah, add. in the end this will be subtracted from something, * because it's part of G: 256*C - (48*D + 120*E - 128), 48*D-128 ! * by the way, our values have become signed dwords during multiplication! */ - r6=_mm_set_epi32(128,128,128,128); - r0=_mm_sub_epi32(r0,r6); - r4=_mm_sub_epi32(r4,r6); - + r6 = _mm_set_epi32(128,128,128,128); + r0 = _mm_sub_epi32(r0,r6); + r4 = _mm_sub_epi32(r4,r6); /* to get B data, we need to prepare a secound value, D*475+128 */ - r1=r2; - r7=_mm_set_epi16(475,475,475,475,475,475,475,475); - r1=_mm_mullo_epi16(r1,r7); - r2=_mm_mulhi_epi16(r2,r7); - r7=r1; - r1=_mm_unpacklo_epi16(r1,r2); - r7=_mm_unpackhi_epi16(r7,r2); + r1 = r2; + r7 = _mm_set_epi16(475,475,475,475,475,475,475,475); + r1 = _mm_mullo_epi16(r1,r7); + r2 = _mm_mulhi_epi16(r2,r7); + r7 = r1; + r1 = _mm_unpacklo_epi16(r1,r2); + r7 = _mm_unpackhi_epi16(r7,r2); - r1=_mm_add_epi32(r1,r6); - r7=_mm_add_epi32(r7,r6); + r1 = _mm_add_epi32(r1,r6); + r7 = _mm_add_epi32(r7,r6); /* so we got something like this: xmm7:xmm1 * this pair contains values for 16 pixel: @@ -151,76 +144,74 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, _mm_store_si128(buffer+1,r7); /* Now we've prepared U-data. Preparing V-data is actually the same, just with other coefficients */ - r2=_mm_cvtsi32_si128(*(UINT32 *)VData); - r2=_mm_shuffle_epi8(r2,r5); + r2 = _mm_cvtsi32_si128(*(UINT32 *)VData); + r2 = _mm_shuffle_epi8(r2,r5); - VData+=4; + VData += 4; - r2=_mm_subs_epi16(r2,r3); - - r5=r2; + r2 = _mm_subs_epi16(r2,r3); + r5 = r2; /* this is also known as E*403+128, we need it to convert R data */ - r3=r2; - r7=_mm_set_epi16(403,403,403,403,403,403,403,403); - r2=_mm_mullo_epi16(r2,r7); - r3=_mm_mulhi_epi16(r3,r7); - r7=r2; - r2=_mm_unpacklo_epi16(r2,r3); - r7=_mm_unpackhi_epi16(r7,r3); + r3 = r2; + r7 = _mm_set_epi16(403,403,403,403,403,403,403,403); + r2 = _mm_mullo_epi16(r2,r7); + r3 = _mm_mulhi_epi16(r3,r7); + r7 = r2; + r2 = _mm_unpacklo_epi16(r2,r3); + r7 = _mm_unpackhi_epi16(r7,r3); - r2=_mm_add_epi32(r2,r6); - r7=_mm_add_epi32(r7,r6); + r2 = _mm_add_epi32(r2,r6); + r7 = _mm_add_epi32(r7,r6); /* and preserve upper four values for future ... */ _mm_store_si128(buffer+2,r7); - - /* doing this step: E*120 */ - r3=r5; - r7=_mm_set_epi16(120,120,120,120,120,120,120,120); - r3=_mm_mullo_epi16(r3,r7); - r5=_mm_mulhi_epi16(r5,r7); - r7=r3; - r3=_mm_unpacklo_epi16(r3,r5); - r7=_mm_unpackhi_epi16(r7,r5); + r3 = r5; + r7 = _mm_set_epi16(120,120,120,120,120,120,120,120); + r3 = _mm_mullo_epi16(r3,r7); + r5 = _mm_mulhi_epi16(r5,r7); + r7 = r3; + r3 = _mm_unpacklo_epi16(r3,r5); + r7 = _mm_unpackhi_epi16(r7,r5); /* now we complete what we've begun above: * (48*D-128) + (120*E) = (48*D +120*E -128) */ - r0=_mm_add_epi32(r0,r3); - r4=_mm_add_epi32(r4,r7); + r0 = _mm_add_epi32(r0,r3); + r4 = _mm_add_epi32(r4,r7); /* and store to memory ! */ _mm_store_si128(buffer,r4); - }else{ + } + else + { /* maybe you've wondered about the conditional above ? * Well, we prepared UV data for eight pixel in each line, but can only process four * per loop. So we need to load the upper four pixel data from memory each secound loop! */ - r1=_mm_load_si128(buffer+1); - r2=_mm_load_si128(buffer+2); - r0=_mm_load_si128(buffer); + r1 = _mm_load_si128(buffer+1); + r2 = _mm_load_si128(buffer+2); + r0 = _mm_load_si128(buffer); } - if(++i==nWidth) - last_column=last_column<<1; + if (++i == nWidth) + lastCol <<= 1; /* We didn't produce any output yet, so let's do so! * Ok, fetch four pixel from the Y-data array and shuffle them like this: * 00d0 00c0 00b0 00a0, to get signed dwords and multiply by 256 */ - r4=_mm_cvtsi32_si128(*(UINT32 *)YData); - r7=_mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080); - r4=_mm_shuffle_epi8(r4,r7); + r4 = _mm_cvtsi32_si128(*(UINT32 *)YData); + r7 = _mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080); + r4 = _mm_shuffle_epi8(r4,r7); - r5=r4; - r6=r4; + r5 = r4; + r6 = r4; /* no we can perform the "real" conversion itself and produce output! */ - r4=_mm_add_epi32(r4,r2); - r5=_mm_sub_epi32(r5,r0); - r6=_mm_add_epi32(r6,r1); - + r4 = _mm_add_epi32(r4,r2); + r5 = _mm_sub_epi32(r5,r0); + r6 = _mm_add_epi32(r6,r1); /* in the end, we only need bytes for RGB values. * So, what do we do? right! shifting left makes values bigger and thats always good. @@ -228,9 +219,9 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, * as packed words, we get not only signed words, but do also divide by 256 * imagine, data is now ordered this way: ddx0 ccx0 bbx0 aax0, and x is the least * significant byte, that we don't need anymore, because we've done some rounding */ - r4=_mm_slli_epi32(r4,8); - r5=_mm_slli_epi32(r5,8); - r6=_mm_slli_epi32(r6,8); + r4 = _mm_slli_epi32(r4,8); + r5 = _mm_slli_epi32(r5,8); + r6 = _mm_slli_epi32(r6,8); /* one thing we still have to face is the clip() function ... * we have still signed words, and there are those min/max instructions in SSE2 ... @@ -238,128 +229,125 @@ pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, * and it operates with signs ! * if we feed it with our values and zeros, it takes the zeros if our values are smaller than * zero and otherwise our values */ - r7=_mm_set_epi32(0,0,0,0); - r4=_mm_max_epi16(r4,r7); - r5=_mm_max_epi16(r5,r7); - r6=_mm_max_epi16(r6,r7); + r7 = _mm_set_epi32(0,0,0,0); + r4 = _mm_max_epi16(r4,r7); + r5 = _mm_max_epi16(r5,r7); + r6 = _mm_max_epi16(r6,r7); /* the same thing just completely different can be used to limit our values to 255, * but now using the min instruction and 255s */ - r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); - r4=_mm_min_epi16(r4,r7); - r5=_mm_min_epi16(r5,r7); - r6=_mm_min_epi16(r6,r7); + r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); + r4 = _mm_min_epi16(r4,r7); + r5 = _mm_min_epi16(r5,r7); + r6 = _mm_min_epi16(r6,r7); /* Now we got our bytes. * the moment has come to assemble the three channels R,G and B to the xrgb dwords * on Red channel we just have to and each futural dword with 00FF0000H */ //r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); - r4=_mm_and_si128(r4,r7); + r4 = _mm_and_si128(r4,r7); /* on Green channel we have to shuffle somehow, so we get something like this: * 00d0 00c0 00b0 00a0 */ - r7=_mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280); - r5=_mm_shuffle_epi8(r5,r7); + r7 = _mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280); + r5 = _mm_shuffle_epi8(r5,r7); /* and on Blue channel that one: * 000d 000c 000b 000a */ - r7=_mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002); - r6=_mm_shuffle_epi8(r6,r7); - + r7 = _mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002); + r6 = _mm_shuffle_epi8(r6,r7); /* and at last we or it together and get this one: * xrgb xrgb xrgb xrgb */ - r4=_mm_or_si128(r4,r5); - r4=_mm_or_si128(r4,r6); - + r4 = _mm_or_si128(r4,r5); + r4 = _mm_or_si128(r4,r6); /* Only thing to do know is writing data to memory, but this gets a bit more * complicated if the width is not a multiple of four and it is the last column in line. */ - if(last_column&0x02){ + if (lastCol & 0x02) + { /* let's say, we need to only convert six pixel in width * Ok, the first 4 pixel will be converted just like every 4 pixel else, but * if it's the last loop in line, last_column is shifted left by one (curious? have a look above), * and we land here. Through initialisation a mask was prepared. In this case it looks like * 0000FFFFH 0000FFFFH 0000FFFFH 0000FFFFH */ - r6=_mm_load_si128(buffer+3); + r6 = _mm_load_si128(buffer+3); /* we and our output data with this mask to get only the valid pixel */ - r4=_mm_and_si128(r4,r6); + r4 = _mm_and_si128(r4,r6); /* then we fetch memory from the destination array ... */ - r5=_mm_lddqu_si128((__m128i *)pDst); + r5 = _mm_lddqu_si128((__m128i *)pDst); /* ... and and it with the inverse mask. We get only those pixel, which should not be updated */ - r6=_mm_andnot_si128(r6,r5); + r6 = _mm_andnot_si128(r6,r5); /* we only have to or the two values together and write it back to the destination array, * and only the pixel that should be updated really get changed. */ - r4=_mm_or_si128(r4,r6); + r4 = _mm_or_si128(r4,r6); } _mm_storeu_si128((__m128i *)pDst,r4); - - if(!(last_line&0x02)){ + if (!(lastRow & 0x02)) + { /* Because UV data is the same for two lines, we can process the secound line just here, * in the same loop. Only thing we need to do is to add some offsets to the Y- and destination * pointer. These offsets are iStride[0] and the target scanline. * But if we don't need to process the secound line, like if we are in the last line of processing nine lines, * we just skip all this. */ - r4=_mm_cvtsi32_si128(*(UINT32 *)(YData+srcStep[0])); - r7=_mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080); - r4=_mm_shuffle_epi8(r4,r7); + r4 = _mm_cvtsi32_si128(*(UINT32 *)(YData+srcStep[0])); + r7 = _mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080); + r4 = _mm_shuffle_epi8(r4,r7); - r5=r4; - r6=r4; + r5 = r4; + r6 = r4; - r4=_mm_add_epi32(r4,r2); - r5=_mm_sub_epi32(r5,r0); - r6=_mm_add_epi32(r6,r1); + r4 = _mm_add_epi32(r4,r2); + r5 = _mm_sub_epi32(r5,r0); + r6 = _mm_add_epi32(r6,r1); + r4 = _mm_slli_epi32(r4,8); + r5 = _mm_slli_epi32(r5,8); + r6 = _mm_slli_epi32(r6,8); - r4=_mm_slli_epi32(r4,8); - r5=_mm_slli_epi32(r5,8); - r6=_mm_slli_epi32(r6,8); + r7 = _mm_set_epi32(0,0,0,0); + r4 = _mm_max_epi16(r4,r7); + r5 = _mm_max_epi16(r5,r7); + r6 = _mm_max_epi16(r6,r7); - r7=_mm_set_epi32(0,0,0,0); - r4=_mm_max_epi16(r4,r7); - r5=_mm_max_epi16(r5,r7); - r6=_mm_max_epi16(r6,r7); + r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); + r4 = _mm_min_epi16(r4,r7); + r5 = _mm_min_epi16(r5,r7); + r6 = _mm_min_epi16(r6,r7); - r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); - r4=_mm_min_epi16(r4,r7); - r5=_mm_min_epi16(r5,r7); - r6=_mm_min_epi16(r6,r7); + r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); + r4 = _mm_and_si128(r4,r7); - r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); - r4=_mm_and_si128(r4,r7); + r7 = _mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280); + r5 = _mm_shuffle_epi8(r5,r7); - r7=_mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280); - r5=_mm_shuffle_epi8(r5,r7); + r7 = _mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002); + r6 = _mm_shuffle_epi8(r6,r7); - r7=_mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002); - r6=_mm_shuffle_epi8(r6,r7); + r4 = _mm_or_si128(r4,r5); + r4 = _mm_or_si128(r4,r6); - - r4=_mm_or_si128(r4,r5); - r4=_mm_or_si128(r4,r6); - - - if(last_column&0x02){ - r6=_mm_load_si128(buffer+3); - r4=_mm_and_si128(r4,r6); - r5=_mm_lddqu_si128((__m128i *)(pDst+dstStep)); - r6=_mm_andnot_si128(r6,r5); - r4=_mm_or_si128(r4,r6); + if (lastCol & 0x02) + { + r6 = _mm_load_si128(buffer+3); + r4 = _mm_and_si128(r4,r6); + r5 = _mm_lddqu_si128((__m128i *)(pDst+dstStep)); + r6 = _mm_andnot_si128(r6,r5); + r4 = _mm_or_si128(r4,r6); /* only thing is, we should shift [rbp-42] back here, because we have processed the last column, * and this "special condition" can be released */ - last_column=last_column>>1; + lastCol >>= 1; } _mm_storeu_si128((__m128i *)(pDst+dstStep),r4); } /* after all we have to increase the destination- and Y-data pointer by four pixel */ - pDst+=16; - YData+=4; - - }while(iYUV420ToRGB_8u_P3AC4R=ssse3_YUV420ToRGB_8u_P3AC4R; + prims->YUV420ToRGB_8u_P3AC4R = ssse3_YUV420ToRGB_8u_P3AC4R; } #endif }