Added SSSE3 UV average to AVC444v1

Armin Novak 2018-01-31 11:33:55 +01:00
parent 46159c4cab
commit eb8e9cb410
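
The main-view chroma of AVC444v1 is now built from the average of each full 2x2 block of U/V samples instead of keeping only the even column of the even row. A rough scalar equivalent of what the new SSSE3 path computes (the helper name and parameters below are illustrative only and not part of this commit; BYTE/UINT32 as defined by winpr):

/* Illustrative scalar sketch of the new 2x2 chroma averaging. */
static void scalar_avg_chroma(const BYTE* chromaEven, const BYTE* chromaOdd,
                              BYTE* mainViewDst, UINT32 width)
{
	UINT32 x;

	for (x = 0; x < width; x += 2)
	{
		/* truncating average of the 2x2 block, like _mm_srai_epi16(added, 2) */
		const UINT32 sum = chromaEven[x] + chromaEven[x + 1] +
		                   chromaOdd[x] + chromaOdd[x + 1];
		mainViewDst[x / 2] = (BYTE)(sum >> 2);
	}
}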


@@ -515,83 +515,184 @@ static pstatus_t ssse3_RGBToYUV420(
/* SSSE3 RGB -> AVC444-YUV conversion **/
/****************************************************************************/
static INLINE void ssse3_RGBToAVC444YUV_BGRX_ROW(
const BYTE* src, BYTE* ydst, BYTE* udst1, BYTE* udst2, BYTE* vdst1, BYTE* vdst2, BOOL isEvenRow,
UINT32 width)
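/* Converts two source rows at once:
 * b1Even/b1Odd receive the main-view luma of the even/odd row,
 * b2/b3 the main-view U/V (2x2 averaged when the odd row exists),
 * b4/b5 the full-resolution odd-row U/V for the auxiliary view,
 * b6/b7 the odd-column U/V of the even row for the auxiliary view. */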
static INLINE void ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
const BYTE* srcEven, const BYTE* srcOdd, BYTE* b1Even, BYTE* b1Odd, BYTE* b2,
BYTE* b3, BYTE* b4, BYTE* b5, BYTE* b6, BYTE* b7, UINT32 width)
{
UINT32 x;
__m128i vector128, y_factors, u_factors, v_factors, smask;
__m128i x1, x2, x3, x4, y, y1, y2, u, u1, u2, v, v1, v2;
const __m128i* argb = (const __m128i*) src;
__m128i* py = (__m128i*) ydst;
__m64* pu1 = (__m64*) udst1;
__m64* pu2 = (__m64*) udst2;
__m64* pv1 = (__m64*) vdst1;
__m64* pv2 = (__m64*) vdst2;
y_factors = _mm_load_si128((__m128i*)bgrx_y_factors);
u_factors = _mm_load_si128((__m128i*)bgrx_u_factors);
v_factors = _mm_load_si128((__m128i*)bgrx_v_factors);
vector128 = _mm_load_si128((__m128i*)const_buf_128b);
smask = _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
const __m128i* argbEven = (const __m128i*) srcEven;
const __m128i* argbOdd = (const __m128i*) srcOdd;
const __m128i y_factors = _mm_load_si128((__m128i*)bgrx_y_factors);
const __m128i u_factors = _mm_load_si128((__m128i*)bgrx_u_factors);
const __m128i v_factors = _mm_load_si128((__m128i*)bgrx_v_factors);
const __m128i vector128 = _mm_load_si128((__m128i*)const_buf_128b);
for (x = 0; x < width; x += 16)
{
/* store 16 rgba pixels in 4 128 bit registers */
x1 = _mm_load_si128(argb++); // 1st 4 pixels
x2 = _mm_load_si128(argb++); // 2nd 4 pixels
x3 = _mm_load_si128(argb++); // 3rd 4 pixels
x4 = _mm_load_si128(argb++); // 4th 4 pixels
/* Y: multiplications with subtotals and horizontal sums */
y1 = _mm_hadd_epi16(_mm_maddubs_epi16(x1, y_factors), _mm_maddubs_epi16(x2, y_factors));
y2 = _mm_hadd_epi16(_mm_maddubs_epi16(x3, y_factors), _mm_maddubs_epi16(x4, y_factors));
/* Y: shift the results (logical) */
y1 = _mm_srli_epi16(y1, 7);
y2 = _mm_srli_epi16(y2, 7);
/* Y: pack (unsigned) 16 words into bytes */
y = _mm_packus_epi16(y1, y2);
/* U: multiplications with subtotals and horizontal sums */
u1 = _mm_hadd_epi16(_mm_maddubs_epi16(x1, u_factors), _mm_maddubs_epi16(x2, u_factors));
u2 = _mm_hadd_epi16(_mm_maddubs_epi16(x3, u_factors), _mm_maddubs_epi16(x4, u_factors));
/* U: shift the results (arithmetic) */
u1 = _mm_srai_epi16(u1, 7);
u2 = _mm_srai_epi16(u2, 7);
/* U: pack (signed) 16 words into bytes */
u = _mm_packs_epi16(u1, u2);
/* U: add 128 */
u = _mm_add_epi8(u, vector128);
/* V: multiplications with subtotals and horizontal sums */
v1 = _mm_hadd_epi16(_mm_maddubs_epi16(x1, v_factors), _mm_maddubs_epi16(x2, v_factors));
v2 = _mm_hadd_epi16(_mm_maddubs_epi16(x3, v_factors), _mm_maddubs_epi16(x4, v_factors));
/* V: shift the results (arithmetic) */
v1 = _mm_srai_epi16(v1, 7);
v2 = _mm_srai_epi16(v2, 7);
/* V: pack (signed) 16 words into bytes */
v = _mm_packs_epi16(v1, v2);
/* V: add 128 */
v = _mm_add_epi8(v, vector128);
/* store y */
_mm_storeu_si128(py++, y);
const __m128i xe1 = _mm_load_si128(argbEven++); // 1st 4 pixels
const __m128i xe2 = _mm_load_si128(argbEven++); // 2nd 4 pixels
const __m128i xe3 = _mm_load_si128(argbEven++); // 3rd 4 pixels
const __m128i xe4 = _mm_load_si128(argbEven++); // 4th 4 pixels
const __m128i xo1 = _mm_load_si128(argbOdd++); // 1st 4 pixels
const __m128i xo2 = _mm_load_si128(argbOdd++); // 2nd 4 pixels
const __m128i xo3 = _mm_load_si128(argbOdd++); // 3rd 4 pixels
const __m128i xo4 = _mm_load_si128(argbOdd++); // 4th 4 pixels
{
/* Y: multiplications with subtotals and horizontal sums */
const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
_mm_maddubs_epi16(xe2, y_factors)), 7);
const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors),
_mm_maddubs_epi16(xe4, y_factors)), 7);
const __m128i ye = _mm_packus_epi16(ye1, ye2);
const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors),
_mm_maddubs_epi16(xo2, y_factors)), 7);
const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors),
_mm_maddubs_epi16(xo4, y_factors)), 7);
const __m128i yo = _mm_packus_epi16(yo1, yo2);
/* store y [b1] */
_mm_storeu_si128((__m128i*)b1Even, ye);
b1Even += 16;
/* store u and v */
if (isEvenRow)
{
u = _mm_shuffle_epi8(u, smask);
v = _mm_shuffle_epi8(v, smask);
_mm_storel_pi(pu1++, _mm_castsi128_ps(u));
_mm_storeh_pi(pu2++, _mm_castsi128_ps(u));
_mm_storel_pi(pv1++, _mm_castsi128_ps(v));
_mm_storeh_pi(pv2++, _mm_castsi128_ps(v));
if (b1Odd)
{
_mm_storeu_si128((__m128i*)b1Odd, yo);
b1Odd += 16;
}
}
else
{
_mm_storel_pi(pu1, _mm_castsi128_ps(u));
_mm_storeh_pi(pu2, _mm_castsi128_ps(u));
_mm_storel_pi(pv1, _mm_castsi128_ps(v));
_mm_storeh_pi(pv2, _mm_castsi128_ps(v));
pu1 += 2;
pu2 += 2;
pv1 += 2;
pv2 += 2;
/* We have now
* 16 even U values in ue
* 16 odd U values in uo
*
* We need to split these according to
* 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */
__m128i ue, uo;
{
const __m128i ue1 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors),
_mm_maddubs_epi16(xe2, u_factors)), 7);
const __m128i ue2 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors),
_mm_maddubs_epi16(xe4, u_factors)), 7);
ue = _mm_add_epi8(_mm_packs_epi16(ue1, ue2), vector128);
}
if (b1Odd)
{
const __m128i uo1 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors),
_mm_maddubs_epi16(xo2, u_factors)), 7);
const __m128i uo2 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors),
_mm_maddubs_epi16(xo4, u_factors)), 7);
uo = _mm_add_epi8(_mm_packs_epi16(uo1, uo2), vector128);
}
/* Now we need the following storage distribution:
* 2x 2y -> b2
* x 2y+1 -> b4
* 2x+1 2y -> b6 */
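/* When the odd source row is available, b2 receives the truncating 2x2 average
 * (ue[2x] + ue[2x+1] + uo[2x] + uo[2x+1]) >> 2; otherwise only the even
 * columns of the even row are stored. */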
if (b1Odd) /* b2 */
{
const __m128i ueh = _mm_unpackhi_epi8(ue, _mm_setzero_si128());
const __m128i uoh = _mm_unpackhi_epi8(uo, _mm_setzero_si128());
const __m128i hi = _mm_add_epi16(ueh, uoh);
const __m128i uel = _mm_unpacklo_epi8(ue, _mm_setzero_si128());
const __m128i uol = _mm_unpacklo_epi8(uo, _mm_setzero_si128());
const __m128i lo = _mm_add_epi16(uel, uol);
const __m128i added = _mm_hadd_epi16(lo, hi);
const __m128i avg16 = _mm_srai_epi16(added, 2);
const __m128i avg = _mm_packus_epi16(avg16, avg16);
_mm_storel_epi64((__m128i*)b2, avg);
}
else
{
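/* no odd source row: keep only the even-column U samples of the even row */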
const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
14, 12, 10, 8, 6, 4, 2, 0);
const __m128i ud = _mm_shuffle_epi8(ue, mask);
_mm_storel_epi64((__m128i*)b2, ud);
}
b2 += 8;
if (b1Odd) /* b4 */
{
_mm_store_si128((__m128i*)b4, uo);
b4 += 16;
}
{
/* b6 */
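/* odd-column U samples of the even row go to the auxiliary view */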
const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
15, 13, 11, 9, 7, 5, 3, 1);
const __m128i ude = _mm_shuffle_epi8(ue, mask);
_mm_storel_epi64((__m128i*)b6, ude);
b6 += 8;
}
}
{
/* We have now
* 16 even V values in ve
* 16 odd V values in vo
*
* We need to split these according to
* 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */
__m128i ve, vo;
{
const __m128i ve1 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors),
_mm_maddubs_epi16(xe2, v_factors)), 7);
const __m128i ve2 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors),
_mm_maddubs_epi16(xe4, v_factors)), 7);
ve = _mm_add_epi8(_mm_packs_epi16(ve1, ve2), vector128);
}
if (b1Odd)
{
const __m128i vo1 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors),
_mm_maddubs_epi16(xo2, v_factors)), 7);
const __m128i vo2 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors),
_mm_maddubs_epi16(xo4, v_factors)), 7);
vo = _mm_add_epi8(_mm_packs_epi16(vo1, vo2), vector128);
}
/* Now we need the following storage distribution:
* 2x 2y -> b3
* x 2y+1 -> b5
* 2x+1 2y -> b7 */
if (b1Odd) /* b3 */
{
const __m128i veh = _mm_unpackhi_epi8(ve, _mm_setzero_si128());
const __m128i voh = _mm_unpackhi_epi8(vo, _mm_setzero_si128());
const __m128i hi = _mm_add_epi16(veh, voh);
const __m128i vel = _mm_unpacklo_epi8(ve, _mm_setzero_si128());
const __m128i vol = _mm_unpacklo_epi8(vo, _mm_setzero_si128());
const __m128i lo = _mm_add_epi16(vel, vol);
const __m128i added = _mm_hadd_epi16(lo, hi);
const __m128i avg16 = _mm_srai_epi16(added, 2);
const __m128i avg = _mm_packus_epi16(avg16, avg16);
_mm_storel_epi64((__m128i*)b3, avg);
}
else
{
const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
14, 12, 10, 8, 6, 4, 2, 0);
const __m128i vd = _mm_shuffle_epi8(ve, mask);
_mm_storel_epi64((__m128i*)b3, vd);
}
b3 += 8;
if (b1Odd) /* b5 */
{
_mm_store_si128((__m128i*)b5, vo);
b5 += 16;
}
{
/* b7 */
const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
15, 13, 11, 9, 7, 5, 3, 1);
const __m128i vde = _mm_shuffle_epi8(ve, mask);
_mm_storel_epi64((__m128i*)b7, vde);
b7 += 8;
}
}
}
}
@@ -603,9 +704,7 @@ static pstatus_t ssse3_RGBToAVC444YUV_BGRX(
BYTE* pDst2[3], const UINT32 dst2Step[3],
const prim_size_t* roi)
{
UINT32 y, numRows;
BOOL evenRow = TRUE;
BYTE* b1, *b2, *b3, *b4, *b5, *b6, *b7;
UINT32 y;
const BYTE* pMaxSrc = pSrc + (roi->height - 1) * srcStep;
if (roi->height < 1 || roi->width < 1)
@@ -614,28 +713,23 @@ static pstatus_t ssse3_RGBToAVC444YUV_BGRX(
if (roi->width % 16 || (unsigned long)pSrc % 16 || srcStep % 16)
return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step, roi);
numRows = (roi->height + 1) & ~1;
for (y = 0; y < numRows; y++, evenRow = !evenRow)
for (y = 0; y < roi->height; y += 2)
{
const BYTE* src = y < roi->height ? pSrc + y * srcStep : pMaxSrc;
UINT32 i = y >> 1;
b1 = pDst1[0] + y * dst1Step[0];
if (evenRow)
{
b2 = pDst1[1] + i * dst1Step[1];
b3 = pDst1[2] + i * dst1Step[2];
b6 = pDst2[1] + i * dst2Step[1];
b7 = pDst2[2] + i * dst2Step[2];
ssse3_RGBToAVC444YUV_BGRX_ROW(src, b1, b2, b6, b3, b7, TRUE, roi->width);
}
else
{
b4 = pDst2[0] + dst2Step[0] * ((i & ~7) + i);
b5 = b4 + 8 * dst2Step[0];
ssse3_RGBToAVC444YUV_BGRX_ROW(src, b1, b4, b4 + 8, b5, b5 + 8, FALSE, roi->width);
}
const BOOL last = (y >= (roi->height - 1));
const BYTE* srcEven = y < roi->height ? pSrc + y * srcStep : pMaxSrc;
const BYTE* srcOdd = !last ? pSrc + (y + 1) * srcStep : pMaxSrc;
const UINT32 i = y >> 1;
const UINT32 n = (i & ~7) + i;
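/* i is the half-resolution row index; (i & ~7) + i maps it into 16-row bands
 * of the auxiliary Y plane: rows n..n+7 receive the odd-row U samples (b4),
 * rows n+8..n+15 the odd-row V samples (b5). */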
BYTE* b1Even = pDst1[0] + y * dst1Step[0];
BYTE* b1Odd = !last ? (b1Even + dst1Step[0]) : NULL;
BYTE* b2 = pDst1[1] + (y / 2) * dst1Step[1];
BYTE* b3 = pDst1[2] + (y / 2) * dst1Step[2];
BYTE* b4 = pDst2[0] + dst2Step[0] * n;
BYTE* b5 = b4 + 8 * dst2Step[0];
BYTE* b6 = pDst2[1] + (y / 2) * dst2Step[1];
BYTE* b7 = pDst2[2] + (y / 2) * dst2Step[2];
ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW(srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6, b7,
roi->width);
}
return PRIMITIVES_SUCCESS;
@@ -775,8 +869,11 @@ static INLINE void ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
14, 10, 6, 2, 12, 8, 4, 0);
const __m128i ud = _mm_shuffle_epi8(uo, mask);
_mm_stream_si32((int*)uChromaDst1, ((int*)&ud)[0]);
_mm_stream_si32((int*)vChromaDst1, ((int*)&ud)[1]);
int* uDst1 = (int*)uChromaDst1;
int* vDst1 = (int*)vChromaDst1;
const int* src = (const int*)&ud;
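/* bytes 0..3 of the shuffled vector go to uChromaDst1, bytes 4..7 to vChromaDst1 */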
_mm_stream_si32(uDst1, src[0]);
_mm_stream_si32(vDst1, src[1]);
uChromaDst1 += 4;
vChromaDst1 += 4;
}
@@ -839,9 +936,12 @@ static INLINE void ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
{
const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
14, 10, 6, 2, 12, 8, 4, 0);
__m128i vd = _mm_shuffle_epi8(vo, mask);
_mm_stream_si32((int*)uChromaDst2, ((int*)&vd)[0]);
_mm_stream_si32((int*)vChromaDst2, ((int*)&vd)[1]);
const __m128i vd = _mm_shuffle_epi8(vo, mask);
int* uDst2 = (int*)uChromaDst2;
int* vDst2 = (int*)vChromaDst2;
const int* src = (const int*)&vd;
_mm_stream_si32(uDst2, src[0]);
_mm_stream_si32(vDst2, src[1]);
uChromaDst2 += 4;
vChromaDst2 += 4;
}