Merge pull request #318 from nfedera/rgb_to_ycbcr_speedup
codec: RemoteFX YCbCr/RGB conversion optimization
This commit is contained in:
commit
f879bdc66d
@ -3,6 +3,7 @@
|
||||
* RemoteFX Codec Library - Decode
|
||||
*
|
||||
* Copyright 2011 Vic Lee
|
||||
* Copyright 2011 Norbert Federa <nfedera@thinstuff.com>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
@ -83,8 +84,9 @@ static void rfx_decode_format_rgb(sint16* r_buf, sint16* g_buf, sint16* b_buf,
|
||||
|
||||
void rfx_decode_ycbcr_to_rgb(sint16* y_r_buf, sint16* cb_g_buf, sint16* cr_b_buf)
|
||||
{
|
||||
sint16 y, cb, cr;
|
||||
sint16 r, g, b;
|
||||
/* sint32 is used intentionally because we calculate with shifted factors! */
|
||||
sint32 y, cb, cr;
|
||||
sint32 r, g, b;
|
||||
int i;
|
||||
|
||||
/**
|
||||
@ -98,10 +100,17 @@ void rfx_decode_ycbcr_to_rgb(sint16* y_r_buf, sint16* cb_g_buf, sint16* cr_b_buf
|
||||
*/
|
||||
for (i = 0; i < 4096; i++)
|
||||
{
|
||||
y = y_r_buf[i] + 4096; // 128<<5 = 4096 so that we can >> 5 over the sum
|
||||
y = y_r_buf[i];
|
||||
cb = cb_g_buf[i];
|
||||
cr = cr_b_buf[i];
|
||||
|
||||
#if 0
|
||||
/**
|
||||
* This is the slow floating point version kept here for reference
|
||||
*/
|
||||
|
||||
y = y + 4096; /* 128<<5=4096 so that we can scale the sum by >> 5 */
|
||||
|
||||
r = y + cr*1.403f;
|
||||
g = y - cb*0.344f - cr*0.714f;
|
||||
b = y + cb*1.770f;
|
||||
@ -109,6 +118,28 @@ void rfx_decode_ycbcr_to_rgb(sint16* y_r_buf, sint16* cb_g_buf, sint16* cr_b_buf
|
||||
y_r_buf[i] = MINMAX(r>>5, 0, 255);
|
||||
cb_g_buf[i] = MINMAX(g>>5, 0, 255);
|
||||
cr_b_buf[i] = MINMAX(b>>5, 0, 255);
|
||||
#else
|
||||
/**
|
||||
* We scale the factors by << 16 into 32-bit integers in order to avoid slower
|
||||
* floating point multiplications. Since the final result needs to be scaled
|
||||
* by >> 5 we will extract only the upper 11 bits (>> 21) from the final sum.
|
||||
* Hence we also have to scale the other terms of the sum by << 16.
|
||||
*
|
||||
* R: 1.403 << 16 = 91947
|
||||
* G: 0.344 << 16 = 22544, 0.714 << 16 = 46792
|
||||
* B: 1.770 << 16 = 115998
|
||||
*/
|
||||
|
||||
y = (y+4096)<<16;
|
||||
|
||||
r = y + cr*91947;
|
||||
g = y - cb*22544 - cr*46792;
|
||||
b = y + cb*115998;
|
||||
|
||||
y_r_buf[i] = MINMAX(r>>21, 0, 255);
|
||||
cb_g_buf[i] = MINMAX(g>>21, 0, 255);
|
||||
cr_b_buf[i] = MINMAX(b>>21, 0, 255);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -3,6 +3,7 @@
|
||||
* RemoteFX Codec Library - Encode
|
||||
*
|
||||
* Copyright 2011 Vic Lee
|
||||
* Copyright 2011 Norbert Federa <nfedera@thinstuff.com>
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
@ -173,8 +174,9 @@ static void rfx_encode_format_rgb(const uint8* rgb_data, int width, int height,
|
||||
|
||||
void rfx_encode_rgb_to_ycbcr(sint16* y_r_buf, sint16* cb_g_buf, sint16* cr_b_buf)
|
||||
{
|
||||
sint16 y, cb, cr;
|
||||
sint16 r, g, b;
|
||||
// sint32 is used intentionally because we calculate with shifted factors!
|
||||
sint32 y, cb, cr;
|
||||
sint32 r, g, b;
|
||||
int i;
|
||||
|
||||
/**
|
||||
@ -191,20 +193,23 @@ void rfx_encode_rgb_to_ycbcr(sint16* y_r_buf, sint16* cb_g_buf, sint16* cr_b_buf
|
||||
r = y_r_buf[i];
|
||||
g = cb_g_buf[i];
|
||||
b = cr_b_buf[i];
|
||||
/* 0.299 << 5 = 1001.10010001(b), 0.587 << 5 = 10010.11001000(b), 0.114 << 5 = 11.10100101(b) */
|
||||
y = ((r << 3) + (r) + (r >> 1) + (r >> 4) + (r >> 7)) +
|
||||
((g << 4) + (g << 1) + (g >> 1) + (g >> 2) + (g >> 5)) +
|
||||
((b << 1) + (b) + (b >> 1) + (b >> 3) + (b >> 6) + (b >> 7));
|
||||
|
||||
/*
|
||||
* We scale the factors by << 15 into 32-bit integers in order to avoid slower
|
||||
* floating point multiplications. Since the terms need to be scaled by << 5 we
|
||||
* simply scale the final sum by >> 10
|
||||
*
|
||||
* Y: 0.299000 << 15 = 9798, 0.587000 << 15 = 19235, 0.114000 << 15 = 3735
|
||||
* Cb: 0.168935 << 15 = 5535, 0.331665 << 15 = 10868, 0.500590 << 15 = 16403
|
||||
* Cr: 0.499813 << 15 = 16377, 0.418531 << 15 = 13714, 0.081282 << 15 = 2663
|
||||
*/
|
||||
|
||||
y = (r*9798 + g*19235 + b*3735)>>10;
|
||||
cb = (r*-5535 + g*-10868 + b*16403)>>10;
|
||||
cr = (r*16377 + g*-13714 + b*-2663)>>10;
|
||||
|
||||
y_r_buf[i] = MINMAX(y - 4096, -4096, 4095);
|
||||
/* 0.168935 << 5 = 101.01100111(b), 0.331665 << 5 = 1010.10011100(b), 0.50059 << 5 = 10000.00000100(b) */
|
||||
cb = 0 - ((r << 2) + (r) + (r >> 2) + (r >> 3) + (r >> 5)) -
|
||||
((g << 3) + (g << 1) + (g >> 1) + (g >> 4) + (g >> 5) + (g >> 6)) +
|
||||
((b << 4) + (b >> 6));
|
||||
cb_g_buf[i] = MINMAX(cb, -4096, 4095);
|
||||
/* 0.499813 << 5 = 1111.11111110(b), 0.418531 << 5 = 1101.01100100(b), 0.081282 << 5 = 10.10011001(b) */
|
||||
cr = ((r << 4) - (r >> 7)) -
|
||||
((g << 3) + (g << 2) + (g) + (g >> 2) + (g >> 3) + (g >> 6)) -
|
||||
((b << 1) + (b >> 1) + (b >> 4) + (b >> 5) + (b >> 7));
|
||||
cr_b_buf[i] = MINMAX(cr, -4096, 4095);
|
||||
}
|
||||
}
|
||||
|
@ -146,6 +146,16 @@ static void rfx_encode_rgb_to_ycbcr_sse2(sint16* y_r_buffer, sint16* cb_g_buffer
|
||||
__m128i g;
|
||||
__m128i b;
|
||||
|
||||
__m128i y_r = _mm_set1_epi16(9798); // 0.299000 << 15
|
||||
__m128i y_g = _mm_set1_epi16(19235); // 0.587000 << 15
|
||||
__m128i y_b = _mm_set1_epi16(3735); // 0.114000 << 15
|
||||
__m128i cb_r = _mm_set1_epi16(-5535); // -0.168935 << 15
|
||||
__m128i cb_g = _mm_set1_epi16(-10868); // -0.331665 << 15
|
||||
__m128i cb_b = _mm_set1_epi16(16403); // 0.500590 << 15
|
||||
__m128i cr_r = _mm_set1_epi16(16377); // 0.499813 << 15
|
||||
__m128i cr_g = _mm_set1_epi16(-13714); // -0.418531 << 15
|
||||
__m128i cr_b = _mm_set1_epi16(-2663); // -0.081282 << 15
|
||||
|
||||
int i;
|
||||
|
||||
for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i += (CACHE_LINE_BYTES / sizeof(__m128i)))
|
||||
@ -156,6 +166,18 @@ static void rfx_encode_rgb_to_ycbcr_sse2(sint16* y_r_buffer, sint16* cb_g_buffer
|
||||
}
|
||||
for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i++)
|
||||
{
|
||||
/*
|
||||
In order to use SSE2 signed 16-bit integer multiplication we need to convert
|
||||
the floating point factors to signed int without losing information.
|
||||
The result of this multiplication is 32 bit and using SSE2 we get either the
|
||||
product's hi or lo word.
|
||||
Thus we will multiply the factors by the highest possible 2^n and take the
|
||||
upper 16 bits of the signed 32-bit result (_mm_mulhi_epi16).
|
||||
Since the final result needs to be scaled by << 5 and also in order to keep
|
||||
the precision within the upper 16 bits we will also have to scale the RGB
|
||||
values used in the multiplication by << 5+(16-n).
|
||||
*/
|
||||
|
||||
/* r = y_r_buf[i]; */
|
||||
r = _mm_load_si128(&y_r_buf[i]);
|
||||
|
||||
@ -165,64 +187,33 @@ static void rfx_encode_rgb_to_ycbcr_sse2(sint16* y_r_buffer, sint16* cb_g_buffer
|
||||
/* b = cr_b_buf[i]; */
|
||||
b = _mm_load_si128(&cr_b_buf[i]);
|
||||
|
||||
/* y = ((r << 3) + (r) + (r >> 1) + (r >> 4) + (r >> 7)) +
|
||||
((g << 4) + (g << 1) + (g >> 1) + (g >> 2) + (g >> 5)) +
|
||||
((b << 1) + (b) + (b >> 1) + (b >> 3) + (b >> 6) + (b >> 7)); */
|
||||
/* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
|
||||
y = _mm_add_epi16(_mm_slli_epi16(r, 3), r);
|
||||
y = _mm_add_epi16(y, _mm_srai_epi16(r, 1));
|
||||
y = _mm_add_epi16(y, _mm_srai_epi16(r, 4));
|
||||
y = _mm_add_epi16(y, _mm_srai_epi16(r, 7));
|
||||
y = _mm_add_epi16(y, _mm_slli_epi16(g, 4));
|
||||
y = _mm_add_epi16(y, _mm_slli_epi16(g, 1));
|
||||
y = _mm_add_epi16(y, _mm_srai_epi16(g, 1));
|
||||
y = _mm_add_epi16(y, _mm_srai_epi16(g, 2));
|
||||
y = _mm_add_epi16(y, _mm_srai_epi16(g, 5));
|
||||
y = _mm_add_epi16(y, _mm_slli_epi16(b, 1));
|
||||
y = _mm_add_epi16(y, b);
|
||||
y = _mm_add_epi16(y, _mm_srai_epi16(b, 1));
|
||||
y = _mm_add_epi16(y, _mm_srai_epi16(b, 3));
|
||||
y = _mm_add_epi16(y, _mm_srai_epi16(b, 6));
|
||||
y = _mm_add_epi16(y, _mm_srai_epi16(b, 7));
|
||||
/* r<<6; g<<6; b<<6 */
|
||||
r = _mm_slli_epi16(r, 6);
|
||||
g = _mm_slli_epi16(g, 6);
|
||||
b = _mm_slli_epi16(b, 6);
|
||||
|
||||
/* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
|
||||
y = _mm_mulhi_epi16(r, y_r);
|
||||
y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
|
||||
y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
|
||||
y = _mm_add_epi16(y, min);
|
||||
/* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
|
||||
_mm_between_epi16(y, min, max);
|
||||
_mm_store_si128(&y_r_buf[i], y);
|
||||
|
||||
/* cb = 0 - ((r << 2) + (r) + (r >> 2) + (r >> 3) + (r >> 5)) -
|
||||
((g << 3) + (g << 1) + (g >> 1) + (g >> 4) + (g >> 5) + (g >> 6)) +
|
||||
((b << 4) + (b >> 6)); */
|
||||
/* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
|
||||
cb = _mm_mulhi_epi16(r, cb_r);
|
||||
cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
|
||||
cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
|
||||
/* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
|
||||
cb = _mm_add_epi16(_mm_slli_epi16(b, 4), _mm_srai_epi16(b, 6));
|
||||
cb = _mm_sub_epi16(cb, _mm_slli_epi16(r, 2));
|
||||
cb = _mm_sub_epi16(cb, r);
|
||||
cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 2));
|
||||
cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 3));
|
||||
cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 5));
|
||||
cb = _mm_sub_epi16(cb, _mm_slli_epi16(g, 3));
|
||||
cb = _mm_sub_epi16(cb, _mm_slli_epi16(g, 1));
|
||||
cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 1));
|
||||
cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 4));
|
||||
cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 5));
|
||||
cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 6));
|
||||
_mm_between_epi16(cb, min, max);
|
||||
_mm_store_si128(&cb_g_buf[i], cb);
|
||||
|
||||
/* cr = ((r << 4) - (r >> 7)) -
|
||||
((g << 3) + (g << 2) + (g) + (g >> 2) + (g >> 3) + (g >> 6)) -
|
||||
((b << 1) + (b >> 1) + (b >> 4) + (b >> 5) + (b >> 7)); */
|
||||
/* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
|
||||
cr = _mm_mulhi_epi16(r, cr_r);
|
||||
cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
|
||||
cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
|
||||
/* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
|
||||
cr = _mm_sub_epi16(_mm_slli_epi16(r, 4), _mm_srai_epi16(r, 7));
|
||||
cr = _mm_sub_epi16(cr, _mm_slli_epi16(g, 3));
|
||||
cr = _mm_sub_epi16(cr, _mm_slli_epi16(g, 2));
|
||||
cr = _mm_sub_epi16(cr, g);
|
||||
cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 2));
|
||||
cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 3));
|
||||
cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 6));
|
||||
cr = _mm_sub_epi16(cr, _mm_slli_epi16(b, 1));
|
||||
cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 1));
|
||||
cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 4));
|
||||
cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 5));
|
||||
cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 7));
|
||||
_mm_between_epi16(cr, min, max);
|
||||
_mm_store_si128(&cr_b_buf[i], cr);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user