Merge pull request #318 from nfedera/rgb_to_ycbcr_speedup

codec: RemoteFX YCbCr/RGB conversion optimization
Marc-André Moreau 2011-12-29 09:41:47 -08:00
commit f879bdc66d
3 changed files with 93 additions and 66 deletions


@@ -3,6 +3,7 @@
* RemoteFX Codec Library - Decode
*
* Copyright 2011 Vic Lee
* Copyright 2011 Norbert Federa <nfedera@thinstuff.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -83,8 +84,9 @@ static void rfx_decode_format_rgb(sint16* r_buf, sint16* g_buf, sint16* b_buf,
void rfx_decode_ycbcr_to_rgb(sint16* y_r_buf, sint16* cb_g_buf, sint16* cr_b_buf)
{
sint16 y, cb, cr;
sint16 r, g, b;
/* sint32 is used intentionally because we calculate with shifted factors! */
sint32 y, cb, cr;
sint32 r, g, b;
int i;
/**
@@ -98,10 +100,17 @@ void rfx_decode_ycbcr_to_rgb(sint16* y_r_buf, sint16* cb_g_buf, sint16* cr_b_buf
*/
for (i = 0; i < 4096; i++)
{
y = y_r_buf[i] + 4096; // 128<<5 = 4096 so that we can >> 5 over the sum
y = y_r_buf[i];
cb = cb_g_buf[i];
cr = cr_b_buf[i];
#if 0
/**
* This is the slow floating point version kept here for reference
*/
y = y + 4096; /* 128<<5=4096 so that we can scale the sum by >> 5 */
r = y + cr*1.403f;
g = y - cb*0.344f - cr*0.714f;
b = y + cb*1.770f;
@@ -109,6 +118,28 @@ void rfx_decode_ycbcr_to_rgb(sint16* y_r_buf, sint16* cb_g_buf, sint16* cr_b_buf
y_r_buf[i] = MINMAX(r>>5, 0, 255);
cb_g_buf[i] = MINMAX(g>>5, 0, 255);
cr_b_buf[i] = MINMAX(b>>5, 0, 255);
#else
/**
* We scale the factors by << 16 into 32-bit integers in order to avoid slower
* floating point multiplications. Since the final result needs to be scaled
* by >> 5 we will extract only the upper 11 bits (>> 21) from the final sum.
* Hence we also have to scale the other terms of the sum by << 16.
*
* R: 1.403 << 16 = 91947
* G: 0.344 << 16 = 22544, 0.714 << 16 = 46792
* B: 1.770 << 16 = 115998
*/
y = (y+4096)<<16;
r = y + cr*91947;
g = y - cb*22544 - cr*46792;
b = y + cb*115998;
y_r_buf[i] = MINMAX(r>>21, 0, 255);
cb_g_buf[i] = MINMAX(g>>21, 0, 255);
cr_b_buf[i] = MINMAX(b>>21, 0, 255);
#endif
}
}
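
The decode hunk above keeps the old floating-point math under #if 0 and replaces it with << 16 fixed-point multiplies. The following standalone sketch (not part of the patch) runs one sample through both branches to show they agree after the final clamp; the sample values are arbitrary and int32_t stands in for the library's sint32 typedef.

/* Standalone sketch, not part of the patch: runs the YCbCr->RGB conversion
 * for a single sample through both the float reference (the #if 0 branch)
 * and the new <<16 fixed-point branch. int32_t replaces the library's
 * sint32 typedef and the sample values are arbitrary. */
#include <stdio.h>
#include <stdint.h>

#define MINMAX(v, lo, hi) ((v) < (lo) ? (lo) : ((v) > (hi) ? (hi) : (v)))

int main(void)
{
    int32_t y_in = 900, cb_in = -300, cr_in = 450; /* arbitrary DWT output sample */

    /* float reference: offset by 128<<5, factors applied directly, scaled back by >>5 */
    int32_t yf = y_in + 4096;
    int32_t r_f = MINMAX((int32_t)(yf + cr_in * 1.403f) >> 5, 0, 255);
    int32_t g_f = MINMAX((int32_t)(yf - cb_in * 0.344f - cr_in * 0.714f) >> 5, 0, 255);
    int32_t b_f = MINMAX((int32_t)(yf + cb_in * 1.770f) >> 5, 0, 255);

    /* fixed point: everything pre-scaled by <<16, result extracted with >>21 (16 + 5) */
    int32_t y = (y_in + 4096) << 16;
    int32_t r_i = MINMAX((y + cr_in * 91947) >> 21, 0, 255);
    int32_t g_i = MINMAX((y - cb_in * 22544 - cr_in * 46792) >> 21, 0, 255);
    int32_t b_i = MINMAX((y + cb_in * 115998) >> 21, 0, 255);

    printf("float: R=%d G=%d B=%d\n", r_f, g_f, b_f);
    printf("fixed: R=%d G=%d B=%d\n", r_i, g_i, b_i);
    return 0;
}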


@@ -3,6 +3,7 @@
* RemoteFX Codec Library - Encode
*
* Copyright 2011 Vic Lee
* Copyright 2011 Norbert Federa <nfedera@thinstuff.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -173,8 +174,9 @@ static void rfx_encode_format_rgb(const uint8* rgb_data, int width, int height,
void rfx_encode_rgb_to_ycbcr(sint16* y_r_buf, sint16* cb_g_buf, sint16* cr_b_buf)
{
sint16 y, cb, cr;
sint16 r, g, b;
// sint32 is used intentionally because we calculate with shifted factors!
sint32 y, cb, cr;
sint32 r, g, b;
int i;
/**
@@ -191,20 +193,23 @@ void rfx_encode_rgb_to_ycbcr(sint16* y_r_buf, sint16* cb_g_buf, sint16* cr_b_buf
r = y_r_buf[i];
g = cb_g_buf[i];
b = cr_b_buf[i];
/* 0.299 << 5 = 1001.10010001(b), 0.587 << 5 = 10010.11001000(b), 0.114 << 5 = 11.10100101(b) */
y = ((r << 3) + (r) + (r >> 1) + (r >> 4) + (r >> 7)) +
((g << 4) + (g << 1) + (g >> 1) + (g >> 2) + (g >> 5)) +
((b << 1) + (b) + (b >> 1) + (b >> 3) + (b >> 6) + (b >> 7));
/*
* We scale the factors by << 15 into 32-bit integers in order to avoid slower
* floating point multiplications. Since the terms need to be scaled by << 5 we
* simply scale the final sum by >> 10.
*
* Y: 0.299000 << 15 = 9798, 0.587000 << 15 = 19235, 0.114000 << 15 = 3735
* Cb: 0.168935 << 15 = 5535, 0.331665 << 15 = 10868, 0.500590 << 15 = 16403
* Cr: 0.499813 << 15 = 16377, 0.418531 << 15 = 13714, 0.081282 << 15 = 2663
*/
y = (r*9798 + g*19235 + b*3735)>>10;
cb = (r*-5535 + g*-10868 + b*16403)>>10;
cr = (r*16377 + g*-13714 + b*-2663)>>10;
y_r_buf[i] = MINMAX(y - 4096, -4096, 4095);
/* 0.168935 << 5 = 101.01100111(b), 0.331665 << 5 = 1010.10011100(b), 0.50059 << 5 = 10000.00000100(b) */
cb = 0 - ((r << 2) + (r) + (r >> 2) + (r >> 3) + (r >> 5)) -
((g << 3) + (g << 1) + (g >> 1) + (g >> 4) + (g >> 5) + (g >> 6)) +
((b << 4) + (b >> 6));
cb_g_buf[i] = MINMAX(cb, -4096, 4095);
/* 0.499813 << 5 = 1111.11111110(b), 0.418531 << 5 = 1101.01100100(b), 0.081282 << 5 = 10.10011001(b) */
cr = ((r << 4) - (r >> 7)) -
((g << 3) + (g << 2) + (g) + (g >> 2) + (g >> 3) + (g >> 6)) -
((b << 1) + (b >> 1) + (b >> 4) + (b >> 5) + (b >> 7));
cr_b_buf[i] = MINMAX(cr, -4096, 4095);
}
}
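
The encode hunk swaps the old shift/add approximations of the << 5 coefficients for plain multiplies with << 15 factors. Below is a standalone sketch (not part of the patch) that checks one Y factor against its floating-point value and shows why the trailing >> 10 leaves the sum on the same << 5 scale as before; the RGB sample is arbitrary.

/* Standalone sketch, not part of the patch: checks one Y coefficient and
 * shows that (factor << 15, sum >> 10) keeps the result on the <<5 scale
 * used by the old shift/add code. The RGB sample is arbitrary. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
    int32_t r = 200, g = 100, b = 50; /* arbitrary 8-bit RGB sample */

    /* the patch uses 9798, 19235, 3735 for the Y row, i.e. roughly c * 32768 */
    printf("0.299 * 32768 = %.2f (patch uses 9798)\n", 0.299 * 32768.0);

    /* reference Y on the <<5 scale, computed in floating point */
    double y_ref = (0.299 * r + 0.587 * g + 0.114 * b) * 32.0;

    /* fixed-point Y: factors carry <<15, so >>10 leaves the wanted <<5 */
    int32_t y_fix = (r * 9798 + g * 19235 + b * 3735) >> 10;

    printf("Y reference (<<5 scale) = %.2f\n", y_ref);
    printf("Y fixed point           = %d\n", y_fix);
    return 0;
}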


@@ -146,6 +146,16 @@ static void rfx_encode_rgb_to_ycbcr_sse2(sint16* y_r_buffer, sint16* cb_g_buffer
__m128i g;
__m128i b;
__m128i y_r = _mm_set1_epi16(9798); // 0.299000 << 15
__m128i y_g = _mm_set1_epi16(19235); // 0.587000 << 15
__m128i y_b = _mm_set1_epi16(3735); // 0.114000 << 15
__m128i cb_r = _mm_set1_epi16(-5535); // -0.168935 << 15
__m128i cb_g = _mm_set1_epi16(-10868); // -0.331665 << 15
__m128i cb_b = _mm_set1_epi16(16403); // 0.500590 << 15
__m128i cr_r = _mm_set1_epi16(16377); // 0.499813 << 15
__m128i cr_g = _mm_set1_epi16(-13714); // -0.418531 << 15
__m128i cr_b = _mm_set1_epi16(-2663); // -0.081282 << 15
int i;
for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i)))
@@ -156,6 +166,18 @@ static void rfx_encode_rgb_to_ycbcr_sse2(sint16* y_r_buffer, sint16* cb_g_buffer
}
for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i++)
{
/*
In order to use SSE2 signed 16-bit integer multiplication we need to convert
the floating point factors to signed int without losing information.
The result of this multiplication is 32 bit and using SSE2 we get either the
product's hi or lo word.
Thus we will multiply the factors by the highest possible 2^n and take the
upper 16 bits of the signed 32-bit result (_mm_mulhi_epi16).
Since the final result needs to be scaled by << 5, and in order to keep
the precision within the upper 16 bits, we will also have to scale the RGB
values used in the multiplication by << 5+(16-n).
*/
/* r = y_r_buf[i]; */
r = _mm_load_si128(&y_r_buf[i]);
@@ -165,64 +187,33 @@ static void rfx_encode_rgb_to_ycbcr_sse2(sint16* y_r_buffer, sint16* cb_g_buffer
/* b = cr_b_buf[i]; */
b = _mm_load_si128(&cr_b_buf[i]);
/* y = ((r << 3) + (r) + (r >> 1) + (r >> 4) + (r >> 7)) +
((g << 4) + (g << 1) + (g >> 1) + (g >> 2) + (g >> 5)) +
((b << 1) + (b) + (b >> 1) + (b >> 3) + (b >> 6) + (b >> 7)); */
/* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
y = _mm_add_epi16(_mm_slli_epi16(r, 3), r);
y = _mm_add_epi16(y, _mm_srai_epi16(r, 1));
y = _mm_add_epi16(y, _mm_srai_epi16(r, 4));
y = _mm_add_epi16(y, _mm_srai_epi16(r, 7));
y = _mm_add_epi16(y, _mm_slli_epi16(g, 4));
y = _mm_add_epi16(y, _mm_slli_epi16(g, 1));
y = _mm_add_epi16(y, _mm_srai_epi16(g, 1));
y = _mm_add_epi16(y, _mm_srai_epi16(g, 2));
y = _mm_add_epi16(y, _mm_srai_epi16(g, 5));
y = _mm_add_epi16(y, _mm_slli_epi16(b, 1));
y = _mm_add_epi16(y, b);
y = _mm_add_epi16(y, _mm_srai_epi16(b, 1));
y = _mm_add_epi16(y, _mm_srai_epi16(b, 3));
y = _mm_add_epi16(y, _mm_srai_epi16(b, 6));
y = _mm_add_epi16(y, _mm_srai_epi16(b, 7));
/* r<<6; g<<6; b<<6 */
r = _mm_slli_epi16(r, 6);
g = _mm_slli_epi16(g, 6);
b = _mm_slli_epi16(b, 6);
/* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
y = _mm_mulhi_epi16(r, y_r);
y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
y = _mm_add_epi16(y, min);
/* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
_mm_between_epi16(y, min, max);
_mm_store_si128(&y_r_buf[i], y);
/* cb = 0 - ((r << 2) + (r) + (r >> 2) + (r >> 3) + (r >> 5)) -
((g << 3) + (g << 1) + (g >> 1) + (g >> 4) + (g >> 5) + (g >> 6)) +
((b << 4) + (b >> 6)); */
/* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
cb = _mm_mulhi_epi16(r, cb_r);
cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
/* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
cb = _mm_add_epi16(_mm_slli_epi16(b, 4), _mm_srai_epi16(b, 6));
cb = _mm_sub_epi16(cb, _mm_slli_epi16(r, 2));
cb = _mm_sub_epi16(cb, r);
cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 2));
cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 3));
cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 5));
cb = _mm_sub_epi16(cb, _mm_slli_epi16(g, 3));
cb = _mm_sub_epi16(cb, _mm_slli_epi16(g, 1));
cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 1));
cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 4));
cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 5));
cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 6));
_mm_between_epi16(cb, min, max);
_mm_store_si128(&cb_g_buf[i], cb);
/* cr = ((r << 4) - (r >> 7)) -
((g << 3) + (g << 2) + (g) + (g >> 2) + (g >> 3) + (g >> 6)) -
((b << 1) + (b >> 1) + (b >> 4) + (b >> 5) + (b >> 7)); */
/* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
cr = _mm_mulhi_epi16(r, cr_r);
cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
/* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
cr = _mm_sub_epi16(_mm_slli_epi16(r, 4), _mm_srai_epi16(r, 7));
cr = _mm_sub_epi16(cr, _mm_slli_epi16(g, 3));
cr = _mm_sub_epi16(cr, _mm_slli_epi16(g, 2));
cr = _mm_sub_epi16(cr, g);
cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 2));
cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 3));
cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 6));
cr = _mm_sub_epi16(cr, _mm_slli_epi16(b, 1));
cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 1));
cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 4));
cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 5));
cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 7));
_mm_between_epi16(cr, min, max);
_mm_store_si128(&cr_b_buf[i], cr);
}
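
To make the pre-shift in the SSE2 comment concrete, here is a scalar sketch (not part of the patch) that models a single lane of _mm_mulhi_epi16: with n = 15 the factors are one bit short of the full 16-bit scale and the result still needs << 5, so the operand is shifted by 5 + (16 - 15) = 6 before the multiply. The helper name and sample value are illustrative.

/* Standalone sketch, not part of the patch: models one 16-bit lane of
 * _mm_mulhi_epi16 in plain C to show why the RGB operands are pre-shifted
 * by << 6 before the multiply. Sample value is arbitrary. */
#include <stdio.h>
#include <stdint.h>

/* one lane of _mm_mulhi_epi16: signed 16x16 multiply, keep the high 16 bits */
static int16_t mulhi_epi16(int16_t a, int16_t b)
{
    return (int16_t)(((int32_t)a * (int32_t)b) >> 16);
}

int main(void)
{
    int16_t r = 200;    /* arbitrary 8-bit red sample */
    int16_t y_r = 9798; /* 0.299 << 15, as set above with _mm_set1_epi16 */

    /* pre-shift by 6 = 5 (target <<5 scale) + (16 - 15) (factor scale shortfall) */
    int16_t r_shifted = (int16_t)(r << 6);

    printf("HIWORD((r << 6) * 9798)    = %d\n", mulhi_epi16(r_shifted, y_r));
    printf("r * 0.299 * 32 (reference) = %.2f\n", r * 0.299 * 32.0);
    return 0;
}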