diff --git a/libfreerdp/primitives/prim_YUV_ssse3.c b/libfreerdp/primitives/prim_YUV_ssse3.c
index a1bae6c56..184f2be5e 100644
--- a/libfreerdp/primitives/prim_YUV_ssse3.c
+++ b/libfreerdp/primitives/prim_YUV_ssse3.c
@@ -515,83 +515,184 @@ static pstatus_t ssse3_RGBToYUV420(
 /* SSSE3 RGB -> AVC444-YUV conversion **/
 /****************************************************************************/
-static INLINE void ssse3_RGBToAVC444YUV_BGRX_ROW(
-    const BYTE* src, BYTE* ydst, BYTE* udst1, BYTE* udst2, BYTE* vdst1, BYTE* vdst2, BOOL isEvenRow,
-    UINT32 width)
+static INLINE void ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
+    const BYTE* srcEven, const BYTE* srcOdd, BYTE* b1Even, BYTE* b1Odd, BYTE* b2,
+    BYTE* b3, BYTE* b4, BYTE* b5, BYTE* b6, BYTE* b7, UINT32 width)
 {
     UINT32 x;
-    __m128i vector128, y_factors, u_factors, v_factors, smask;
-    __m128i x1, x2, x3, x4, y, y1, y2, u, u1, u2, v, v1, v2;
-    const __m128i* argb = (const __m128i*) src;
-    __m128i* py = (__m128i*) ydst;
-    __m64* pu1 = (__m64*) udst1;
-    __m64* pu2 = (__m64*) udst2;
-    __m64* pv1 = (__m64*) vdst1;
-    __m64* pv2 = (__m64*) vdst2;
-    y_factors = _mm_load_si128((__m128i*)bgrx_y_factors);
-    u_factors = _mm_load_si128((__m128i*)bgrx_u_factors);
-    v_factors = _mm_load_si128((__m128i*)bgrx_v_factors);
-    vector128 = _mm_load_si128((__m128i*)const_buf_128b);
-    smask = _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
+    const __m128i* argbEven = (const __m128i*) srcEven;
+    const __m128i* argbOdd = (const __m128i*) srcOdd;
+    const __m128i y_factors = _mm_load_si128((__m128i*)bgrx_y_factors);
+    const __m128i u_factors = _mm_load_si128((__m128i*)bgrx_u_factors);
+    const __m128i v_factors = _mm_load_si128((__m128i*)bgrx_v_factors);
+    const __m128i vector128 = _mm_load_si128((__m128i*)const_buf_128b);
 
     for (x = 0; x < width; x += 16)
     {
         /* store 16 rgba pixels in 4 128 bit registers */
-        x1 = _mm_load_si128(argb++); // 1st 4 pixels
-        x2 = _mm_load_si128(argb++); // 2nd 4 pixels
-        x3 = _mm_load_si128(argb++); // 3rd 4 pixels
-        x4 = _mm_load_si128(argb++); // 4th 4 pixels
-        /* Y: multiplications with subtotals and horizontal sums */
-        y1 = _mm_hadd_epi16(_mm_maddubs_epi16(x1, y_factors), _mm_maddubs_epi16(x2, y_factors));
-        y2 = _mm_hadd_epi16(_mm_maddubs_epi16(x3, y_factors), _mm_maddubs_epi16(x4, y_factors));
-        /* Y: shift the results (logical) */
-        y1 = _mm_srli_epi16(y1, 7);
-        y2 = _mm_srli_epi16(y2, 7);
-        /* Y: pack (unsigned) 16 words into bytes */
-        y = _mm_packus_epi16(y1, y2);
-        /* U: multiplications with subtotals and horizontal sums */
-        u1 = _mm_hadd_epi16(_mm_maddubs_epi16(x1, u_factors), _mm_maddubs_epi16(x2, u_factors));
-        u2 = _mm_hadd_epi16(_mm_maddubs_epi16(x3, u_factors), _mm_maddubs_epi16(x4, u_factors));
-        /* U: shift the results (arithmetic) */
-        u1 = _mm_srai_epi16(u1, 7);
-        u2 = _mm_srai_epi16(u2, 7);
-        /* U: pack (signed) 16 words into bytes */
-        u = _mm_packs_epi16(u1, u2);
-        /* U: add 128 */
-        u = _mm_add_epi8(u, vector128);
-        /* V: multiplications with subtotals and horizontal sums */
-        v1 = _mm_hadd_epi16(_mm_maddubs_epi16(x1, v_factors), _mm_maddubs_epi16(x2, v_factors));
-        v2 = _mm_hadd_epi16(_mm_maddubs_epi16(x3, v_factors), _mm_maddubs_epi16(x4, v_factors));
-        /* V: shift the results (arithmetic) */
-        v1 = _mm_srai_epi16(v1, 7);
-        v2 = _mm_srai_epi16(v2, 7);
-        /* V: pack (signed) 16 words into bytes */
-        v = _mm_packs_epi16(v1, v2);
-        /* V: add 128 */
-        v = _mm_add_epi8(v, vector128);
-        /* store y */
-        _mm_storeu_si128(py++, y);
+        const __m128i xe1 = _mm_load_si128(argbEven++); // 1st 4 pixels
+        const __m128i xe2 = _mm_load_si128(argbEven++); // 2nd 4 pixels
+        const __m128i xe3 = _mm_load_si128(argbEven++); // 3rd 4 pixels
+        const __m128i xe4 = _mm_load_si128(argbEven++); // 4th 4 pixels
+        const __m128i xo1 = _mm_load_si128(argbOdd++); // 1st 4 pixels
+        const __m128i xo2 = _mm_load_si128(argbOdd++); // 2nd 4 pixels
+        const __m128i xo3 = _mm_load_si128(argbOdd++); // 3rd 4 pixels
+        const __m128i xo4 = _mm_load_si128(argbOdd++); // 4th 4 pixels
+        {
+            /* Y: multiplications with subtotals and horizontal sums */
+            const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
+                                _mm_maddubs_epi16(xe2, y_factors)), 7);
+            const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors),
+                                _mm_maddubs_epi16(xe4, y_factors)), 7);
+            const __m128i ye = _mm_packus_epi16(ye1, ye2);
+            const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors),
+                                _mm_maddubs_epi16(xo2, y_factors)), 7);
+            const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors),
+                                _mm_maddubs_epi16(xo4, y_factors)), 7);
+            const __m128i yo = _mm_packus_epi16(yo1, yo2);
+            /* store y [b1] */
+            _mm_storeu_si128((__m128i*)b1Even, ye);
+            b1Even += 16;
 
-        /* store u and v */
-        if (isEvenRow)
-        {
-            u = _mm_shuffle_epi8(u, smask);
-            v = _mm_shuffle_epi8(v, smask);
-            _mm_storel_pi(pu1++, _mm_castsi128_ps(u));
-            _mm_storeh_pi(pu2++, _mm_castsi128_ps(u));
-            _mm_storel_pi(pv1++, _mm_castsi128_ps(v));
-            _mm_storeh_pi(pv2++, _mm_castsi128_ps(v));
+            if (b1Odd)
+            {
+                _mm_storeu_si128((__m128i*)b1Odd, yo);
+                b1Odd += 16;
+            }
         }
-        else
         {
-            _mm_storel_pi(pu1, _mm_castsi128_ps(u));
-            _mm_storeh_pi(pu2, _mm_castsi128_ps(u));
-            _mm_storel_pi(pv1, _mm_castsi128_ps(v));
-            _mm_storeh_pi(pv2, _mm_castsi128_ps(v));
-            pu1 += 2;
-            pu2 += 2;
-            pv1 += 2;
-            pv2 += 2;
+            /* We have now
+             * 16 even U values in ue
+             * 16 odd U values in uo
+             *
+             * We need to split these according to
+             * 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */
+            __m128i ue, uo;
+            {
+                const __m128i ue1 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors),
+                                    _mm_maddubs_epi16(xe2, u_factors)), 7);
+                const __m128i ue2 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors),
+                                    _mm_maddubs_epi16(xe4, u_factors)), 7);
+                ue = _mm_add_epi8(_mm_packs_epi16(ue1, ue2), vector128);
+            }
+
+            if (b1Odd)
+            {
+                const __m128i uo1 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors),
+                                    _mm_maddubs_epi16(xo2, u_factors)), 7);
+                const __m128i uo2 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors),
+                                    _mm_maddubs_epi16(xo4, u_factors)), 7);
+                uo = _mm_add_epi8(_mm_packs_epi16(uo1, uo2), vector128);
+            }
+
+            /* Now we need the following storage distribution:
+             * 2x 2y -> b2
+             * x 2y+1 -> b4
+             * 2x+1 2y -> b6 */
+            if (b1Odd) /* b2 */
+            {
+                const __m128i ueh = _mm_unpackhi_epi8(ue, _mm_setzero_si128());
+                const __m128i uoh = _mm_unpackhi_epi8(uo, _mm_setzero_si128());
+                const __m128i hi = _mm_add_epi16(ueh, uoh);
+                const __m128i uel = _mm_unpacklo_epi8(ue, _mm_setzero_si128());
+                const __m128i uol = _mm_unpacklo_epi8(uo, _mm_setzero_si128());
+                const __m128i lo = _mm_add_epi16(uel, uol);
+                const __m128i added = _mm_hadd_epi16(lo, hi);
+                const __m128i avg16 = _mm_srai_epi16(added, 2);
+                const __m128i avg = _mm_packus_epi16(avg16, avg16);
+                _mm_storel_epi64((__m128i*)b2, avg);
+            }
+            else
+            {
+                const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+                                                  14, 12, 10, 8, 6, 4, 2, 0);
+                const __m128i ud = _mm_shuffle_epi8(ue, mask);
+                _mm_storel_epi64((__m128i*)b2, ud);
+            }
+
+            b2 += 8;
+
+            if (b1Odd) /* b4 */
+            {
+                _mm_store_si128((__m128i*)b4, uo);
+                b4 += 16;
+            }
+
+            {
+                /* b6 */
+                const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+                                                  15, 13, 11, 9, 7, 5, 3, 1);
+                const __m128i ude = _mm_shuffle_epi8(ue, mask);
+                _mm_storel_epi64((__m128i*)b6, ude);
+                b6 += 8;
+            }
+        }
+        {
+            /* We have now
+             * 16 even V values in ue
+             * 16 odd V values in uo
+             *
+             * We need to split these according to
+             * 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */
+            __m128i ve, vo;
+            {
+                const __m128i ve1 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors),
+                                    _mm_maddubs_epi16(xe2, v_factors)), 7);
+                const __m128i ve2 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors),
+                                    _mm_maddubs_epi16(xe4, v_factors)), 7);
+                ve = _mm_add_epi8(_mm_packs_epi16(ve1, ve2), vector128);
+            }
+
+            if (b1Odd)
+            {
+                const __m128i vo1 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors),
+                                    _mm_maddubs_epi16(xo2, v_factors)), 7);
+                const __m128i vo2 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors),
+                                    _mm_maddubs_epi16(xo4, v_factors)), 7);
+                vo = _mm_add_epi8(_mm_packs_epi16(vo1, vo2), vector128);
+            }
+
+            /* Now we need the following storage distribution:
+             * 2x 2y -> b3
+             * x 2y+1 -> b5
+             * 2x+1 2y -> b7 */
+            if (b1Odd) /* b3 */
+            {
+                const __m128i veh = _mm_unpackhi_epi8(ve, _mm_setzero_si128());
+                const __m128i voh = _mm_unpackhi_epi8(vo, _mm_setzero_si128());
+                const __m128i hi = _mm_add_epi16(veh, voh);
+                const __m128i vel = _mm_unpacklo_epi8(ve, _mm_setzero_si128());
+                const __m128i vol = _mm_unpacklo_epi8(vo, _mm_setzero_si128());
+                const __m128i lo = _mm_add_epi16(vel, vol);
+                const __m128i added = _mm_hadd_epi16(lo, hi);
+                const __m128i avg16 = _mm_srai_epi16(added, 2);
+                const __m128i avg = _mm_packus_epi16(avg16, avg16);
+                _mm_storel_epi64((__m128i*)b3, avg);
+            }
+            else
+            {
+                const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+                                                  14, 12, 10, 8, 6, 4, 2, 0);
+                const __m128i vd = _mm_shuffle_epi8(ve, mask);
+                _mm_storel_epi64((__m128i*)b3, vd);
+            }
+
+            b3 += 8;
+
+            if (b1Odd) /* b5 */
+            {
+                _mm_store_si128((__m128i*)b5, vo);
+                b5 += 16;
+            }
+
+            {
+                /* b7 */
+                const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+                                                  15, 13, 11, 9, 7, 5, 3, 1);
+                const __m128i vde = _mm_shuffle_epi8(ve, mask);
+                _mm_storel_epi64((__m128i*)b7, vde);
+                b7 += 8;
+            }
         }
     }
 }
@@ -603,9 +704,7 @@ static pstatus_t ssse3_RGBToAVC444YUV_BGRX(
     BYTE* pDst2[3], const UINT32 dst2Step[3],
     const prim_size_t* roi)
 {
-    UINT32 y, numRows;
-    BOOL evenRow = TRUE;
-    BYTE* b1, *b2, *b3, *b4, *b5, *b6, *b7;
+    UINT32 y;
     const BYTE* pMaxSrc = pSrc + (roi->height - 1) * srcStep;
 
     if (roi->height < 1 || roi->width < 1)
@@ -614,28 +713,23 @@ static pstatus_t ssse3_RGBToAVC444YUV_BGRX(
     if (roi->width % 16 || (unsigned long)pSrc % 16 || srcStep % 16)
         return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step, roi);
 
-    numRows = (roi->height + 1) & ~1;
-
-    for (y = 0; y < numRows; y++, evenRow = !evenRow)
+    for (y = 0; y < roi->height; y += 2)
     {
-        const BYTE* src = y < roi->height ? pSrc + y * srcStep : pMaxSrc;
-        UINT32 i = y >> 1;
-        b1 = pDst1[0] + y * dst1Step[0];
-
-        if (evenRow)
-        {
-            b2 = pDst1[1] + i * dst1Step[1];
-            b3 = pDst1[2] + i * dst1Step[2];
-            b6 = pDst2[1] + i * dst2Step[1];
-            b7 = pDst2[2] + i * dst2Step[2];
-            ssse3_RGBToAVC444YUV_BGRX_ROW(src, b1, b2, b6, b3, b7, TRUE, roi->width);
-        }
-        else
-        {
-            b4 = pDst2[0] + dst2Step[0] * ((i & ~7) + i);
-            b5 = b4 + 8 * dst2Step[0];
-            ssse3_RGBToAVC444YUV_BGRX_ROW(src, b1, b4, b4 + 8, b5, b5 + 8, FALSE, roi->width);
-        }
+        const BOOL last = (y >= (roi->height - 1));
+        const BYTE* srcEven = y < roi->height ? pSrc + y * srcStep : pMaxSrc;
+        const BYTE* srcOdd = !last ? pSrc + (y + 1) * srcStep : pMaxSrc;
+        const UINT32 i = y >> 1;
+        const UINT32 n = (i & ~7) + i;
+        BYTE* b1Even = pDst1[0] + y * dst1Step[0];
+        BYTE* b1Odd = !last ? (b1Even + dst1Step[0]) : NULL;
+        BYTE* b2 = pDst1[1] + (y / 2) * dst1Step[1];
+        BYTE* b3 = pDst1[2] + (y / 2) * dst1Step[2];
+        BYTE* b4 = pDst2[0] + dst2Step[0] * n;
+        BYTE* b5 = b4 + 8 * dst2Step[0];
+        BYTE* b6 = pDst2[1] + (y / 2) * dst2Step[1];
+        BYTE* b7 = pDst2[2] + (y / 2) * dst2Step[2];
+        ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW(srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6, b7,
+                                             roi->width);
     }
 
     return PRIMITIVES_SUCCESS;
@@ -775,8 +869,11 @@ static INLINE void ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
             const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                               14, 10, 6, 2, 12, 8, 4, 0);
             const __m128i ud = _mm_shuffle_epi8(uo, mask);
-            _mm_stream_si32((int*)uChromaDst1, ((int*)&ud)[0]);
-            _mm_stream_si32((int*)vChromaDst1, ((int*)&ud)[1]);
+            int* uDst1 = (int*)uChromaDst1;
+            int* vDst1 = (int*)vChromaDst1;
+            const int* src = (const int*)&ud;
+            _mm_stream_si32(uDst1, src[0]);
+            _mm_stream_si32(vDst1, src[1]);
             uChromaDst1 += 4;
             vChromaDst1 += 4;
         }
@@ -839,9 +936,12 @@ static INLINE void ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
         {
            const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                              14, 10, 6, 2, 12, 8, 4, 0);
-           __m128i vd = _mm_shuffle_epi8(vo, mask);
-           _mm_stream_si32((int*)uChromaDst2, ((int*)&vd)[0]);
-           _mm_stream_si32((int*)vChromaDst2, ((int*)&vd)[1]);
+           const __m128i vd = _mm_shuffle_epi8(vo, mask);
+           int* uDst2 = (int*)uChromaDst2;
+           int* vDst2 = (int*)vChromaDst2;
+           const int* src = (const int*)&vd;
+           _mm_stream_si32(uDst2, src[0]);
+           _mm_stream_si32(vDst2, src[1]);
            uChromaDst2 += 4;
            vChromaDst2 += 4;
         }
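For readers not fluent in SSSE3 intrinsics: the maddubs/hadd/shift sequences above evaluate a 7-bit fixed-point colour transform for each BGRX pixel. The scalar sketch below is illustrative only; the coefficient tables (bgrx_y_factors, bgrx_u_factors, bgrx_v_factors) are defined elsewhere in prim_YUV_ssse3.c and are simply assumed here to be signed byte arrays.

#include <stdint.h>

/* Illustrative scalar model of one pixel of the SSSE3 path (not the shipped code).
 * _mm_maddubs_epi16 multiplies the unsigned B,G,R,X bytes by signed factor bytes
 * and adds neighbouring products; _mm_hadd_epi16 completes the dot product; the
 * shift by 7 drops the fixed-point fraction.  Saturation in maddubs/packus is
 * ignored here because the real factor tables keep the sums in range, and the
 * chroma shift relies on arithmetic right shift of negative values, as the
 * _mm_srai_epi16 intrinsic does. */
static uint8_t model_luma(const uint8_t bgrx[4], const int8_t f[4])
{
    const int32_t sum = bgrx[0] * f[0] + bgrx[1] * f[1] + bgrx[2] * f[2] + bgrx[3] * f[3];
    return (uint8_t)(sum >> 7);             /* _mm_srli_epi16 + _mm_packus_epi16 */
}

static uint8_t model_chroma(const uint8_t bgrx[4], const int8_t f[4])
{
    const int32_t sum = bgrx[0] * f[0] + bgrx[1] * f[1] + bgrx[2] * f[2] + bgrx[3] * f[3];
    return (uint8_t)((sum >> 7) + 128);     /* _mm_srai_epi16, _mm_packs_epi16, +128 bias */
}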
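The b2/b4/b6 stores (and their b3/b5/b7 counterparts for V) implement the sample split the comments cite from MS-RDPEGFX 3.3.8.3.2: the main YUV420 view keeps one chroma value per 2x2 block, while the auxiliary view carries the samples that plain 4:2:0 would drop. A scalar sketch of the U distribution for one group of 16 columns follows; ue/uo stand for the per-column even-row and odd-row U values computed above, and the function name is illustrative, not taken from the patch.

#include <stdint.h>

/* Illustrative model of the U split for columns x = 0..15 of one row pair. */
static void model_u_split(const uint8_t ue[16], const uint8_t uo[16], int haveOddRow,
                          uint8_t b2[8], uint8_t b4[16], uint8_t b6[8])
{
    int i;

    for (i = 0; i < 8; i++)
    {
        if (haveOddRow) /* b2: 2x2 average, the unpack/add/hadd/srai(x, 2) sequence */
            b2[i] = (uint8_t)((ue[2 * i] + ue[2 * i + 1] + uo[2 * i] + uo[2 * i + 1]) >> 2);
        else            /* last row of an odd-height image: even columns of the even row */
            b2[i] = ue[2 * i];

        /* b6: odd columns of the even row (the 15, 13, ..., 3, 1 shuffle mask) */
        b6[i] = ue[2 * i + 1];
    }

    if (haveOddRow)     /* b4: the odd row at full horizontal resolution */
        for (i = 0; i < 16; i++)
            b4[i] = uo[i];
}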
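In the rewritten caller each iteration handles a pair of source rows: b1Even/b1Odd are consecutive main-view luma rows, b2/b3 the main-view chroma rows at index y/2, and the odd rows' full-resolution chroma is packed into the auxiliary view's luma plane (pDst2[0]) in bands of 16 rows, U in the first 8 rows of a band and V in the next 8 (hence b5 = b4 + 8 * dst2Step[0]). A sketch of the n = (i & ~7) + i index used for b4, with a hypothetical helper name:

#include <stdint.h>

/* Illustrative model of the auxiliary-view row that receives the odd-row U
 * samples of row pair i = y / 2.  (i & ~7) + i equals 16 * (i / 8) + (i % 8),
 * i.e. the i-th U row within the (i / 8)-th 16-row band; the matching V row
 * sits 8 rows further down. */
static uint32_t aux_u_row(uint32_t i)
{
    return (i & ~(uint32_t)7) + i;
}

/* Examples: i = 0..7 -> rows 0..7, i = 8..15 -> rows 16..23, i = 16..23 -> rows 32..39;
 * rows 8..15, 24..31, ... are reserved for the corresponding V samples. */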