codec: fixed and accelerated RemoteFX ycbcr-to-rgb decoder
The current ycbcr decoder was loosing some bits because cr/cb was multiplied by the shifted factors. Instead one should multiply by the non-shifted factors and shift the result. The effects of these lost bits are easily seen by comparing the colors of a RemoteFX session with the colors of a plain RDP session - they are just wrong ;) I've replaced the bit-magic from the non non-accelerated version (rfx_decode.c) and replaced it with simple float multiplications using the compiler's implicit integer conversions. On several test machines this was even a little bit faster. The accelerated SSE2 ycbcr decoder (rfx_sse2.c) was completely changed in order to make use of the SSE2 signed 16-bit integer multiplication. Fortunately the factors in the conversion matrix are so small that we can easily shift them to the maximum possible 16-bit signed integer value without loosing any information and use _mm_mulhi_epi16 which takes the upper 16 bits of the 32-bit result. The SSE2 ycbcr decoder is now much simpler and about 40 percent faster.
This commit is contained in:
parent
c5a2cf3035
commit
04518f0b42
@ -98,19 +98,17 @@ void rfx_decode_ycbcr_to_rgb(sint16* y_r_buf, sint16* cb_g_buf, sint16* cr_b_buf
|
||||
*/
|
||||
for (i = 0; i < 4096; i++)
|
||||
{
|
||||
y = (y_r_buf[i] >> 5) + 128;
|
||||
y = y_r_buf[i] + 4096; // 128<<5 = 4096 so that we can >> 5 over the sum
|
||||
cb = cb_g_buf[i];
|
||||
cr = cr_b_buf[i];
|
||||
/* 1.403 >> 5 = 0.000010110011100(b) */
|
||||
r = y + ((cr >> 5) + (cr >> 7) + (cr >> 8) + (cr >> 11) + (cr >> 12) + (cr >> 13));
|
||||
y_r_buf[i] = MINMAX(r, 0, 255);
|
||||
/* 0.344 >> 5 = 0.000000101100000(b), 0.714 >> 5 = 0.000001011011011(b) */
|
||||
g = y - ((cb >> 7) + (cb >> 9) + (cb >> 10)) -
|
||||
((cr >> 6) + (cr >> 8) + (cr >> 9) + (cr >> 11) + (cr >> 12) + (cr >> 13));
|
||||
cb_g_buf[i] = MINMAX(g, 0, 255);
|
||||
/* 1.77 >> 5 = 0.000011100010100(b) */
|
||||
b = y + ((cb >> 5) + (cb >> 6) + (cb >> 7) + (cb >> 11) + (cb >> 13));
|
||||
cr_b_buf[i] = MINMAX(b, 0, 255);
|
||||
|
||||
r = y + cr*1.403f;
|
||||
g = y - cb*0.344f - cr*0.714f;
|
||||
b = y + cb*1.770f;
|
||||
|
||||
y_r_buf[i] = MINMAX(r>>5, 0, 255);
|
||||
cb_g_buf[i] = MINMAX(g>>5, 0, 255);
|
||||
cr_b_buf[i] = MINMAX(b>>5, 0, 255);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -3,6 +3,7 @@
|
||||
* RemoteFX Codec Library - SSE2 Optimizations
|
||||
*
|
||||
* Copyright 2011 Stephen Erisman
|
||||
* Copyright 2011 Norbert Federa <nfedera@thinstuff.com>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
@ -64,6 +65,12 @@ static void rfx_decode_ycbcr_to_rgb_sse2(sint16* y_r_buffer, sint16* cb_g_buffer
|
||||
|
||||
int i;
|
||||
|
||||
__m128i r_cr = _mm_set1_epi16(22986); // 1.403 << 14
|
||||
__m128i g_cb = _mm_set1_epi16(-5636); // -0.344 << 14
|
||||
__m128i g_cr = _mm_set1_epi16(-11698); // -0.714 << 14
|
||||
__m128i b_cb = _mm_set1_epi16(28999); // 1.770 << 14
|
||||
__m128i c4096 = _mm_set1_epi16(4096);
|
||||
|
||||
for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i)))
|
||||
{
|
||||
_mm_prefetch((char*)(&y_r_buf[i]), _MM_HINT_NTA);
|
||||
@ -72,49 +79,51 @@ static void rfx_decode_ycbcr_to_rgb_sse2(sint16* y_r_buffer, sint16* cb_g_buffer
|
||||
}
|
||||
for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i++)
|
||||
{
|
||||
/* y = (y_r_buf[i] >> 5) + 128; */
|
||||
y = _mm_load_si128(&y_r_buf[i]);
|
||||
y = _mm_add_epi16(_mm_srai_epi16(y, 5), _mm_set1_epi16(128));
|
||||
/*
|
||||
In order to use SSE2 signed 16-bit integer multiplication we need to convert
|
||||
the floating point factors to signed int without loosing information.
|
||||
The result of this multiplication is 32 bit and we have two SSE instructions
|
||||
that return either the hi or lo word.
|
||||
Thus we will multiply the factors by the highest possible 2^n, take the
|
||||
upper 16 bits of the signed 32-bit result (_mm_mulhi_epi16) and correct this
|
||||
result by multiplying it by 2^(16-n).
|
||||
For the given factors in the conversion matrix the best possible n is 14.
|
||||
|
||||
Example for calculating r:
|
||||
r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
|
||||
r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
|
||||
r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
|
||||
r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
|
||||
*/
|
||||
|
||||
/* y = (y_r_buf[i] + 4096) >> 2 */
|
||||
y = _mm_load_si128(&y_r_buf[i]);
|
||||
y = _mm_add_epi16(y, c4096);
|
||||
y = _mm_srai_epi16(y, 2);
|
||||
/* cb = cb_g_buf[i]; */
|
||||
cb = _mm_load_si128(&cb_g_buf[i]);
|
||||
/* cr = cr_b_buf[i]; */
|
||||
cr = _mm_load_si128(&cr_b_buf[i]);
|
||||
|
||||
/* r = y + ((cr >> 5) + (cr >> 7) + (cr >> 8) + (cr >> 11) + (cr >> 12) + (cr >> 13)); */
|
||||
/* (y + HIWORD(cr*22986)) >> 3 */
|
||||
r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
|
||||
r = _mm_srai_epi16(r, 3);
|
||||
/* y_r_buf[i] = MINMAX(r, 0, 255); */
|
||||
r = _mm_add_epi16(y, _mm_srai_epi16(cr, 5));
|
||||
r = _mm_add_epi16(r, _mm_srai_epi16(cr, 7));
|
||||
r = _mm_add_epi16(r, _mm_srai_epi16(cr, 8));
|
||||
r = _mm_add_epi16(r, _mm_srai_epi16(cr, 11));
|
||||
r = _mm_add_epi16(r, _mm_srai_epi16(cr, 12));
|
||||
r = _mm_add_epi16(r, _mm_srai_epi16(cr, 13));
|
||||
_mm_between_epi16(r, zero, max);
|
||||
_mm_store_si128(&y_r_buf[i], r);
|
||||
|
||||
/* cb = cb_g_buf[i]; */
|
||||
cb = _mm_load_si128(&cb_g_buf[i]);
|
||||
|
||||
/* g = y - ((cb >> 7) + (cb >> 9) + (cb >> 10)) -
|
||||
((cr >> 6) + (cr >> 8) + (cr >> 9) + (cr >> 11) + (cr >> 12) + (cr >> 13)); */
|
||||
/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
|
||||
g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
|
||||
g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
|
||||
g = _mm_srai_epi16(g, 3);
|
||||
/* cb_g_buf[i] = MINMAX(g, 0, 255); */
|
||||
g = _mm_sub_epi16(y, _mm_srai_epi16(cb, 7));
|
||||
g = _mm_sub_epi16(g, _mm_srai_epi16(cb, 9));
|
||||
g = _mm_sub_epi16(g, _mm_srai_epi16(cb, 10));
|
||||
g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 6));
|
||||
g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 8));
|
||||
g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 9));
|
||||
g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 11));
|
||||
g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 12));
|
||||
g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 13));
|
||||
_mm_between_epi16(g, zero, max);
|
||||
_mm_store_si128(&cb_g_buf[i], g);
|
||||
|
||||
/* b = y + ((cb >> 5) + (cb >> 6) + (cb >> 7) + (cb >> 11) + (cb >> 13)); */
|
||||
/* (y + HIWORD(cb*28999)) >> 3 */
|
||||
b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
|
||||
b = _mm_srai_epi16(b, 3);
|
||||
/* cr_b_buf[i] = MINMAX(b, 0, 255); */
|
||||
b = _mm_add_epi16(y, _mm_srai_epi16(cb, 5));
|
||||
b = _mm_add_epi16(b, _mm_srai_epi16(cb, 6));
|
||||
b = _mm_add_epi16(b, _mm_srai_epi16(cb, 7));
|
||||
b = _mm_add_epi16(b, _mm_srai_epi16(cb, 11));
|
||||
b = _mm_add_epi16(b, _mm_srai_epi16(cb, 13));
|
||||
_mm_between_epi16(b, zero, max);
|
||||
_mm_store_si128(&cr_b_buf[i], b);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user