From 04518f0b42d5b8347a00e31d2bc622cdd2bf9ab0 Mon Sep 17 00:00:00 2001
From: Norbert Federa <nfedera@thinstuff.com>
Date: Tue, 27 Dec 2011 16:18:02 +0100
Subject: [PATCH] codec: fixed and accelerated RemoteFX ycbcr-to-rgb decoder

The current ycbcr decoder was loosing some bits because cr/cb was multiplied by
the shifted factors.
Instead one should multiply by the non-shifted factors and shift the result.
The effects of these lost bits are easily seen by comparing the colors of a
RemoteFX session with the colors of a plain RDP session - they are just wrong ;)

I've replaced the bit-magic from the non non-accelerated version (rfx_decode.c)
and replaced it with simple float multiplications using the compiler's implicit
integer conversions. On several test machines this was even a little bit faster.

The accelerated SSE2 ycbcr decoder (rfx_sse2.c) was completely changed in order
to make use of the SSE2 signed 16-bit integer multiplication.
Fortunately the factors in the conversion matrix are so small that we can
easily shift them to the maximum possible 16-bit signed integer value without
loosing any information and use _mm_mulhi_epi16 which takes the upper 16 bits
of the 32-bit result.

The SSE2 ycbcr decoder is now much simpler and about 40 percent faster.
---
 libfreerdp-codec/rfx_decode.c | 20 +++++-----
 libfreerdp-codec/rfx_sse2.c   | 69 ++++++++++++++++++++---------------
 2 files changed, 48 insertions(+), 41 deletions(-)

diff --git a/libfreerdp-codec/rfx_decode.c b/libfreerdp-codec/rfx_decode.c
index e07c7dd6b..aea5be079 100644
--- a/libfreerdp-codec/rfx_decode.c
+++ b/libfreerdp-codec/rfx_decode.c
@@ -98,19 +98,17 @@ void rfx_decode_ycbcr_to_rgb(sint16* y_r_buf, sint16* cb_g_buf, sint16* cr_b_buf
 	 */
 	for (i = 0; i < 4096; i++)
 	{
-		y = (y_r_buf[i] >> 5) + 128;
+		y = y_r_buf[i] + 4096; // 128<<5 = 4096 so that we can >> 5 over the sum
 		cb = cb_g_buf[i];
 		cr = cr_b_buf[i];
-		/* 1.403 >> 5 = 0.000010110011100(b) */
-		r = y + ((cr >> 5) + (cr >> 7) + (cr >> 8) + (cr >> 11) + (cr >> 12) + (cr >> 13));
-		y_r_buf[i] = MINMAX(r, 0, 255);
-		/* 0.344 >> 5 = 0.000000101100000(b), 0.714 >> 5 = 0.000001011011011(b) */
-		g = y - ((cb >> 7) + (cb >> 9) + (cb >> 10)) -
-			((cr >> 6) + (cr >> 8) + (cr >> 9) + (cr >> 11) + (cr >> 12) + (cr >> 13));
-		cb_g_buf[i] = MINMAX(g, 0, 255);
-		/* 1.77 >> 5 = 0.000011100010100(b) */
-		b = y + ((cb >> 5) + (cb >> 6) + (cb >> 7) + (cb >> 11) + (cb >> 13));
-		cr_b_buf[i] = MINMAX(b, 0, 255);
+
+		r = y + cr*1.403f;
+		g = y - cb*0.344f - cr*0.714f;
+		b = y + cb*1.770f;
+
+		y_r_buf[i]  = MINMAX(r>>5, 0, 255);
+		cb_g_buf[i] = MINMAX(g>>5, 0, 255);
+		cr_b_buf[i] = MINMAX(b>>5, 0, 255);
 	}
 }
 
diff --git a/libfreerdp-codec/rfx_sse2.c b/libfreerdp-codec/rfx_sse2.c
index f7e474687..202d83876 100644
--- a/libfreerdp-codec/rfx_sse2.c
+++ b/libfreerdp-codec/rfx_sse2.c
@@ -3,6 +3,7 @@
  * RemoteFX Codec Library - SSE2 Optimizations
  *
  * Copyright 2011 Stephen Erisman
+ * Copyright 2011 Norbert Federa <nfedera@thinstuff.com>
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -64,6 +65,12 @@ static void rfx_decode_ycbcr_to_rgb_sse2(sint16* y_r_buffer, sint16* cb_g_buffer
 
 	int i;
 
+	__m128i r_cr = _mm_set1_epi16(22986);	//  1.403 << 14
+	__m128i g_cb = _mm_set1_epi16(-5636);	// -0.344 << 14
+	__m128i g_cr = _mm_set1_epi16(-11698);	// -0.714 << 14
+	__m128i b_cb = _mm_set1_epi16(28999);	//  1.770 << 14
+	__m128i c4096 = _mm_set1_epi16(4096);
+
 	for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i)))
 	{
 		_mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA);
@@ -72,49 +79,51 @@ static void rfx_decode_ycbcr_to_rgb_sse2(sint16* y_r_buffer, sint16* cb_g_buffer
 	}
 	for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i++)
 	{
-		/* y = (y_r_buf[i] >> 5) + 128; */
-		y = _mm_load_si128(&y_r_buf[i]);
-		y = _mm_add_epi16(_mm_srai_epi16(y, 5), _mm_set1_epi16(128));
+		/*
+		In order to use SSE2 signed 16-bit integer multiplication we need to convert
+		the floating point factors to signed int without loosing information.
+		The result of this multiplication is 32 bit and we have two SSE instructions
+		that return either the hi or lo word.
+		Thus we will multiply the factors by the highest possible 2^n, take the 
+		upper 16 bits of the signed 32-bit result (_mm_mulhi_epi16) and correct	this
+		result by multiplying it by 2^(16-n).
+		For the given factors in the conversion matrix the best possible n is 14.
 
+		Example for calculating r:
+		r = (y>>5) + 128 + (cr*1.403)>>5                       // our base formula
+		r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5      // see above
+		r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5             // simplification
+		r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
+		*/
+
+		/* y = (y_r_buf[i] + 4096) >> 2 */
+		y = _mm_load_si128(&y_r_buf[i]);
+		y = _mm_add_epi16(y, c4096);
+		y = _mm_srai_epi16(y, 2);
+		/* cb = cb_g_buf[i]; */
+		cb = _mm_load_si128(&cb_g_buf[i]);
 		/* cr = cr_b_buf[i]; */
 		cr = _mm_load_si128(&cr_b_buf[i]);
 
-		/* r = y + ((cr >> 5) + (cr >> 7) + (cr >> 8) + (cr >> 11) + (cr >> 12) + (cr >> 13)); */
+		/* (y + HIWORD(cr*22986)) >> 3 */
+		r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
+		r = _mm_srai_epi16(r, 3);
 		/* y_r_buf[i] = MINMAX(r, 0, 255); */
-		r = _mm_add_epi16(y, _mm_srai_epi16(cr, 5));
-		r = _mm_add_epi16(r, _mm_srai_epi16(cr, 7));
-		r = _mm_add_epi16(r, _mm_srai_epi16(cr, 8));
-		r = _mm_add_epi16(r, _mm_srai_epi16(cr, 11));
-		r = _mm_add_epi16(r, _mm_srai_epi16(cr, 12));
-		r = _mm_add_epi16(r, _mm_srai_epi16(cr, 13));
 		_mm_between_epi16(r, zero, max);
 		_mm_store_si128(&y_r_buf[i], r);
 
-		/* cb = cb_g_buf[i]; */
-		cb = _mm_load_si128(&cb_g_buf[i]);
-
-		/* g = y - ((cb >> 7) + (cb >> 9) + (cb >> 10)) -
-			((cr >> 6) + (cr >> 8) + (cr >> 9) + (cr >> 11) + (cr >> 12) + (cr >> 13)); */
+		/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
+		g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
+		g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
+		g = _mm_srai_epi16(g, 3);
 		/* cb_g_buf[i] = MINMAX(g, 0, 255); */
-		g = _mm_sub_epi16(y, _mm_srai_epi16(cb, 7));
-		g = _mm_sub_epi16(g, _mm_srai_epi16(cb, 9));
-		g = _mm_sub_epi16(g, _mm_srai_epi16(cb, 10));
-		g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 6));
-		g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 8));
-		g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 9));
-		g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 11));
-		g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 12));
-		g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 13));
 		_mm_between_epi16(g, zero, max);
 		_mm_store_si128(&cb_g_buf[i], g);
 
-		/* b = y + ((cb >> 5) + (cb >> 6) + (cb >> 7) + (cb >> 11) + (cb >> 13)); */
+		/* (y + HIWORD(cb*28999)) >> 3 */
+		b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
+		b = _mm_srai_epi16(b, 3);
 		/* cr_b_buf[i] = MINMAX(b, 0, 255); */
-		b = _mm_add_epi16(y, _mm_srai_epi16(cb, 5));
-		b = _mm_add_epi16(b, _mm_srai_epi16(cb, 6));
-		b = _mm_add_epi16(b, _mm_srai_epi16(cb, 7));
-		b = _mm_add_epi16(b, _mm_srai_epi16(cb, 11));
-		b = _mm_add_epi16(b, _mm_srai_epi16(cb, 13));
 		_mm_between_epi16(b, zero, max);
 		_mm_store_si128(&cr_b_buf[i], b);
 	}