From 04518f0b42d5b8347a00e31d2bc622cdd2bf9ab0 Mon Sep 17 00:00:00 2001 From: Norbert Federa Date: Tue, 27 Dec 2011 16:18:02 +0100 Subject: [PATCH] codec: fixed and accelerated RemoteFX ycbcr-to-rgb decoder The current ycbcr decoder was loosing some bits because cr/cb was multiplied by the shifted factors. Instead one should multiply by the non-shifted factors and shift the result. The effects of these lost bits are easily seen by comparing the colors of a RemoteFX session with the colors of a plain RDP session - they are just wrong ;) I've replaced the bit-magic from the non non-accelerated version (rfx_decode.c) and replaced it with simple float multiplications using the compiler's implicit integer conversions. On several test machines this was even a little bit faster. The accelerated SSE2 ycbcr decoder (rfx_sse2.c) was completely changed in order to make use of the SSE2 signed 16-bit integer multiplication. Fortunately the factors in the conversion matrix are so small that we can easily shift them to the maximum possible 16-bit signed integer value without loosing any information and use _mm_mulhi_epi16 which takes the upper 16 bits of the 32-bit result. The SSE2 ycbcr decoder is now much simpler and about 40 percent faster. --- libfreerdp-codec/rfx_decode.c | 20 +++++----- libfreerdp-codec/rfx_sse2.c | 69 ++++++++++++++++++++--------------- 2 files changed, 48 insertions(+), 41 deletions(-) diff --git a/libfreerdp-codec/rfx_decode.c b/libfreerdp-codec/rfx_decode.c index e07c7dd6b..aea5be079 100644 --- a/libfreerdp-codec/rfx_decode.c +++ b/libfreerdp-codec/rfx_decode.c @@ -98,19 +98,17 @@ void rfx_decode_ycbcr_to_rgb(sint16* y_r_buf, sint16* cb_g_buf, sint16* cr_b_buf */ for (i = 0; i < 4096; i++) { - y = (y_r_buf[i] >> 5) + 128; + y = y_r_buf[i] + 4096; // 128<<5 = 4096 so that we can >> 5 over the sum cb = cb_g_buf[i]; cr = cr_b_buf[i]; - /* 1.403 >> 5 = 0.000010110011100(b) */ - r = y + ((cr >> 5) + (cr >> 7) + (cr >> 8) + (cr >> 11) + (cr >> 12) + (cr >> 13)); - y_r_buf[i] = MINMAX(r, 0, 255); - /* 0.344 >> 5 = 0.000000101100000(b), 0.714 >> 5 = 0.000001011011011(b) */ - g = y - ((cb >> 7) + (cb >> 9) + (cb >> 10)) - - ((cr >> 6) + (cr >> 8) + (cr >> 9) + (cr >> 11) + (cr >> 12) + (cr >> 13)); - cb_g_buf[i] = MINMAX(g, 0, 255); - /* 1.77 >> 5 = 0.000011100010100(b) */ - b = y + ((cb >> 5) + (cb >> 6) + (cb >> 7) + (cb >> 11) + (cb >> 13)); - cr_b_buf[i] = MINMAX(b, 0, 255); + + r = y + cr*1.403f; + g = y - cb*0.344f - cr*0.714f; + b = y + cb*1.770f; + + y_r_buf[i] = MINMAX(r>>5, 0, 255); + cb_g_buf[i] = MINMAX(g>>5, 0, 255); + cr_b_buf[i] = MINMAX(b>>5, 0, 255); } } diff --git a/libfreerdp-codec/rfx_sse2.c b/libfreerdp-codec/rfx_sse2.c index f7e474687..202d83876 100644 --- a/libfreerdp-codec/rfx_sse2.c +++ b/libfreerdp-codec/rfx_sse2.c @@ -3,6 +3,7 @@ * RemoteFX Codec Library - SSE2 Optimizations * * Copyright 2011 Stephen Erisman + * Copyright 2011 Norbert Federa * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -64,6 +65,12 @@ static void rfx_decode_ycbcr_to_rgb_sse2(sint16* y_r_buffer, sint16* cb_g_buffer int i; + __m128i r_cr = _mm_set1_epi16(22986); // 1.403 << 14 + __m128i g_cb = _mm_set1_epi16(-5636); // -0.344 << 14 + __m128i g_cr = _mm_set1_epi16(-11698); // -0.714 << 14 + __m128i b_cb = _mm_set1_epi16(28999); // 1.770 << 14 + __m128i c4096 = _mm_set1_epi16(4096); + for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i))) { _mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA); @@ -72,49 +79,51 @@ static void rfx_decode_ycbcr_to_rgb_sse2(sint16* y_r_buffer, sint16* cb_g_buffer } for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i++) { - /* y = (y_r_buf[i] >> 5) + 128; */ - y = _mm_load_si128(&y_r_buf[i]); - y = _mm_add_epi16(_mm_srai_epi16(y, 5), _mm_set1_epi16(128)); + /* + In order to use SSE2 signed 16-bit integer multiplication we need to convert + the floating point factors to signed int without loosing information. + The result of this multiplication is 32 bit and we have two SSE instructions + that return either the hi or lo word. + Thus we will multiply the factors by the highest possible 2^n, take the + upper 16 bits of the signed 32-bit result (_mm_mulhi_epi16) and correct this + result by multiplying it by 2^(16-n). + For the given factors in the conversion matrix the best possible n is 14. + Example for calculating r: + r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula + r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above + r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification + r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3 + */ + + /* y = (y_r_buf[i] + 4096) >> 2 */ + y = _mm_load_si128(&y_r_buf[i]); + y = _mm_add_epi16(y, c4096); + y = _mm_srai_epi16(y, 2); + /* cb = cb_g_buf[i]; */ + cb = _mm_load_si128(&cb_g_buf[i]); /* cr = cr_b_buf[i]; */ cr = _mm_load_si128(&cr_b_buf[i]); - /* r = y + ((cr >> 5) + (cr >> 7) + (cr >> 8) + (cr >> 11) + (cr >> 12) + (cr >> 13)); */ + /* (y + HIWORD(cr*22986)) >> 3 */ + r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr)); + r = _mm_srai_epi16(r, 3); /* y_r_buf[i] = MINMAX(r, 0, 255); */ - r = _mm_add_epi16(y, _mm_srai_epi16(cr, 5)); - r = _mm_add_epi16(r, _mm_srai_epi16(cr, 7)); - r = _mm_add_epi16(r, _mm_srai_epi16(cr, 8)); - r = _mm_add_epi16(r, _mm_srai_epi16(cr, 11)); - r = _mm_add_epi16(r, _mm_srai_epi16(cr, 12)); - r = _mm_add_epi16(r, _mm_srai_epi16(cr, 13)); _mm_between_epi16(r, zero, max); _mm_store_si128(&y_r_buf[i], r); - /* cb = cb_g_buf[i]; */ - cb = _mm_load_si128(&cb_g_buf[i]); - - /* g = y - ((cb >> 7) + (cb >> 9) + (cb >> 10)) - - ((cr >> 6) + (cr >> 8) + (cr >> 9) + (cr >> 11) + (cr >> 12) + (cr >> 13)); */ + /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */ + g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb)); + g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr)); + g = _mm_srai_epi16(g, 3); /* cb_g_buf[i] = MINMAX(g, 0, 255); */ - g = _mm_sub_epi16(y, _mm_srai_epi16(cb, 7)); - g = _mm_sub_epi16(g, _mm_srai_epi16(cb, 9)); - g = _mm_sub_epi16(g, _mm_srai_epi16(cb, 10)); - g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 6)); - g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 8)); - g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 9)); - g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 11)); - g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 12)); - g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 13)); _mm_between_epi16(g, zero, max); _mm_store_si128(&cb_g_buf[i], g); - /* b = y + ((cb >> 5) + (cb >> 6) + (cb >> 7) + (cb >> 11) + (cb >> 13)); */ + /* (y + HIWORD(cb*28999)) >> 3 */ + b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb)); + b = _mm_srai_epi16(b, 3); /* cr_b_buf[i] = MINMAX(b, 0, 255); */ - b = _mm_add_epi16(y, _mm_srai_epi16(cb, 5)); - b = _mm_add_epi16(b, _mm_srai_epi16(cb, 6)); - b = _mm_add_epi16(b, _mm_srai_epi16(cb, 7)); - b = _mm_add_epi16(b, _mm_srai_epi16(cb, 11)); - b = _mm_add_epi16(b, _mm_srai_epi16(cb, 13)); _mm_between_epi16(b, zero, max); _mm_store_si128(&cr_b_buf[i], b); }