Merge branch 'master' of github.com:FreeRDP/FreeRDP-1.0
This commit is contained in:
commit
d7917f9db5
@ -86,18 +86,31 @@ void rfx_decode_ycbcr_to_rgb(sint16* y_r_buf, sint16* cb_g_buf, sint16* cr_b_buf
|
||||
{
|
||||
sint16 y, cb, cr;
|
||||
sint16 r, g, b;
|
||||
|
||||
int i;
|
||||
|
||||
/**
|
||||
* The decoded YCbCr coeffectients are represented as 11.5 fixed-point numbers:
|
||||
*
|
||||
* 1 sign bit + 10 integer bits + 5 fractional bits
|
||||
*
|
||||
* However only 7 integer bits will be actually used since the value range is [-128.0, 127.0].
|
||||
* In other words, the decoded coeffectients is scaled by << 5 when intepreted as sint16.
|
||||
* It was scaled in the quantization phase, so we must scale it back here.
|
||||
*/
|
||||
for (i = 0; i < 4096; i++)
|
||||
{
|
||||
y = y_r_buf[i] + 128;
|
||||
y = (y_r_buf[i] >> 5) + 128;
|
||||
cb = cb_g_buf[i];
|
||||
cr = cr_b_buf[i];
|
||||
r = (y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5));
|
||||
/* 1.403 >> 5 = 0.000010110011100(b) */
|
||||
r = y + ((cr >> 5) + (cr >> 7) + (cr >> 8) + (cr >> 11) + (cr >> 12) + (cr >> 13));
|
||||
y_r_buf[i] = MINMAX(r, 0, 255);
|
||||
g = (y - ((cb >> 2) + (cb >> 4) + (cb >> 5)) - ((cr >> 1) + (cr >> 3) + (cr >> 4) + (cr >> 5)));
|
||||
/* 0.344 >> 5 = 0.000000101100000(b), 0.714 >> 5 = 0.000001011011011(b) */
|
||||
g = y - ((cb >> 7) + (cb >> 9) + (cb >> 10)) -
|
||||
((cr >> 6) + (cr >> 8) + (cr >> 9) + (cr >> 11) + (cr >> 12) + (cr >> 13));
|
||||
cb_g_buf[i] = MINMAX(g, 0, 255);
|
||||
b = (y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6));
|
||||
/* 1.77 >> 5 = 0.000011100010100(b) */
|
||||
b = y + ((cb >> 5) + (cb >> 6) + (cb >> 7) + (cb >> 11) + (cb >> 13));
|
||||
cr_b_buf[i] = MINMAX(b, 0, 255);
|
||||
}
|
||||
}
|
||||
|
@ -133,7 +133,7 @@ static void rfx_dwt_2d_encode_block(sint16* buffer, sint16* dwt, int subband_wid
|
||||
src = buffer + y * total_width + x;
|
||||
|
||||
/* H */
|
||||
*h = (src[total_width] - ((src[0] + src[n < subband_width - 1 ? 2 * total_width : total_width]) >> 1)) >> 1;
|
||||
*h = (src[total_width] - ((src[0] + src[n < subband_width - 1 ? 2 * total_width : 0]) >> 1)) >> 1;
|
||||
|
||||
/* L */
|
||||
*l = src[0] + (n == 0 ? *h : (*(h - total_width) + *h) >> 1);
|
||||
|
@ -123,20 +123,37 @@ void rfx_encode_rgb_to_ycbcr(sint16* y_r_buf, sint16* cb_g_buf, sint16* cr_b_buf
|
||||
{
|
||||
sint16 y, cb, cr;
|
||||
sint16 r, g, b;
|
||||
|
||||
int i;
|
||||
|
||||
/**
|
||||
* The encoded YCbCr coeffectients are represented as 11.5 fixed-point numbers:
|
||||
*
|
||||
* 1 sign bit + 10 integer bits + 5 fractional bits
|
||||
*
|
||||
* However only 7 integer bits will be actually used since the value range is [-128.0, 127.0].
|
||||
* In other words, the encoded coeffectients is scaled by << 5 when intepreted as sint16.
|
||||
* It will be scaled down to original during the quantization phase.
|
||||
*/
|
||||
for (i = 0; i < 4096; i++)
|
||||
{
|
||||
r = y_r_buf[i];
|
||||
g = cb_g_buf[i];
|
||||
b = cr_b_buf[i];
|
||||
y = ((r >> 2) + (r >> 5) + (r >> 6)) + ((g >> 1) + (g >> 4) + (g >> 6) + (g >> 7)) +
|
||||
((b >> 4) + (b >> 5) + (b >> 6) + (b >> 7));
|
||||
y_r_buf[i] = MINMAX(y, 0, 255) - 128;
|
||||
cb = 0 - ((r >> 3) + (r >> 5) + (r >> 6)) - ((g >> 2) + (g >> 4) + (g >> 6)) + (b >> 1);
|
||||
cb_g_buf[i] = MINMAX(cb, -128, 127);
|
||||
cr = (r >> 1) - ((g >> 2) + (g >> 3) + (g >> 5) + (g >> 6)) - ((b >> 4) + (b >> 6));
|
||||
cr_b_buf[i] = MINMAX(cr, -128, 127);
|
||||
/* 0.299 << 5 = 1001.10010001(b), 0.587 << 5 = 10010.11001000(b), 0.114 << 5 = 11.10100101(b) */
|
||||
y = ((r << 3) + (r) + (r >> 1) + (r >> 4) + (r >> 7)) +
|
||||
((g << 4) + (g << 1) + (g >> 1) + (g >> 2) + (g >> 5)) +
|
||||
((b << 1) + (b) + (b >> 1) + (b >> 3) + (b >> 6) + (b >> 7));
|
||||
y_r_buf[i] = MINMAX(y - 4096, -4096, 4095);
|
||||
/* 0.168935 << 5 = 101.01100111(b), 0.331665 << 5 = 1010.10011100(b), 0.50059 << 5 = 10000.00000100(b) */
|
||||
cb = 0 - ((r << 2) + (r) + (r >> 2) + (r >> 3) + (r >> 5)) -
|
||||
((g << 3) + (g << 1) + (g >> 1) + (g >> 4) + (g >> 5) + (g >> 6)) +
|
||||
((b << 4) + (b >> 6));
|
||||
cb_g_buf[i] = MINMAX(cb, -4096, 4095);
|
||||
/* 0.499813 << 5 = 1111.11111110(b), 0.418531 << 5 = 1101.01100100(b), 0.081282 << 5 = 10.10011001(b) */
|
||||
cr = ((r << 4) - (r >> 7)) -
|
||||
((g << 3) + (g << 2) + (g) + (g >> 2) + (g >> 3) + (g >> 6)) -
|
||||
((b << 1) + (b >> 1) + (b >> 4) + (b >> 5) + (b >> 7));
|
||||
cr_b_buf[i] = MINMAX(cr, -4096, 4095);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -23,10 +23,9 @@ static void rfx_quantization_decode_block(sint16* buffer, int buffer_size, uint3
|
||||
{
|
||||
sint16* dst;
|
||||
|
||||
if (factor <= 6)
|
||||
if (factor == 0)
|
||||
return;
|
||||
|
||||
factor -= 6;
|
||||
for (dst = buffer; buffer_size > 0; dst++, buffer_size--)
|
||||
{
|
||||
*dst <<= factor;
|
||||
@ -35,26 +34,28 @@ static void rfx_quantization_decode_block(sint16* buffer, int buffer_size, uint3
|
||||
|
||||
void rfx_quantization_decode(sint16* buffer, const uint32* quantization_values)
|
||||
{
|
||||
rfx_quantization_decode_block(buffer, 1024, quantization_values[8]); /* HL1 */
|
||||
rfx_quantization_decode_block(buffer + 1024, 1024, quantization_values[7]); /* LH1 */
|
||||
rfx_quantization_decode_block(buffer + 2048, 1024, quantization_values[9]); /* HH1 */
|
||||
rfx_quantization_decode_block(buffer + 3072, 256, quantization_values[5]); /* HL2 */
|
||||
rfx_quantization_decode_block(buffer + 3328, 256, quantization_values[4]); /* LH2 */
|
||||
rfx_quantization_decode_block(buffer + 3584, 256, quantization_values[6]); /* HH2 */
|
||||
rfx_quantization_decode_block(buffer + 3840, 64, quantization_values[2]); /* HL3 */
|
||||
rfx_quantization_decode_block(buffer + 3904, 64, quantization_values[1]); /* LH3 */
|
||||
rfx_quantization_decode_block(buffer + 3868, 64, quantization_values[3]); /* HH3 */
|
||||
rfx_quantization_decode_block(buffer + 4032, 64, quantization_values[0]); /* LL3 */
|
||||
/* Scale the values so that they are represented as 11.5 fixed-point number */
|
||||
rfx_quantization_decode_block(buffer, 4096, 5);
|
||||
|
||||
rfx_quantization_decode_block(buffer, 1024, quantization_values[8] - 6); /* HL1 */
|
||||
rfx_quantization_decode_block(buffer + 1024, 1024, quantization_values[7] - 6); /* LH1 */
|
||||
rfx_quantization_decode_block(buffer + 2048, 1024, quantization_values[9] - 6); /* HH1 */
|
||||
rfx_quantization_decode_block(buffer + 3072, 256, quantization_values[5] - 6); /* HL2 */
|
||||
rfx_quantization_decode_block(buffer + 3328, 256, quantization_values[4] - 6); /* LH2 */
|
||||
rfx_quantization_decode_block(buffer + 3584, 256, quantization_values[6] - 6); /* HH2 */
|
||||
rfx_quantization_decode_block(buffer + 3840, 64, quantization_values[2] - 6); /* HL3 */
|
||||
rfx_quantization_decode_block(buffer + 3904, 64, quantization_values[1] - 6); /* LH3 */
|
||||
rfx_quantization_decode_block(buffer + 3868, 64, quantization_values[3] - 6); /* HH3 */
|
||||
rfx_quantization_decode_block(buffer + 4032, 64, quantization_values[0] - 6); /* LL3 */
|
||||
}
|
||||
|
||||
static void rfx_quantization_encode_block(sint16* buffer, int buffer_size, uint32 factor)
|
||||
{
|
||||
sint16* dst;
|
||||
|
||||
if (factor <= 6)
|
||||
if (factor == 0)
|
||||
return;
|
||||
|
||||
factor -= 6;
|
||||
for (dst = buffer; buffer_size > 0; dst++, buffer_size--)
|
||||
{
|
||||
*dst >>= factor;
|
||||
@ -63,14 +64,17 @@ static void rfx_quantization_encode_block(sint16* buffer, int buffer_size, uint3
|
||||
|
||||
void rfx_quantization_encode(sint16* buffer, const uint32* quantization_values)
|
||||
{
|
||||
rfx_quantization_encode_block(buffer, 1024, quantization_values[8]); /* HL1 */
|
||||
rfx_quantization_encode_block(buffer + 1024, 1024, quantization_values[7]); /* LH1 */
|
||||
rfx_quantization_encode_block(buffer + 2048, 1024, quantization_values[9]); /* HH1 */
|
||||
rfx_quantization_encode_block(buffer + 3072, 256, quantization_values[5]); /* HL2 */
|
||||
rfx_quantization_encode_block(buffer + 3328, 256, quantization_values[4]); /* LH2 */
|
||||
rfx_quantization_encode_block(buffer + 3584, 256, quantization_values[6]); /* HH2 */
|
||||
rfx_quantization_encode_block(buffer + 3840, 64, quantization_values[2]); /* HL3 */
|
||||
rfx_quantization_encode_block(buffer + 3904, 64, quantization_values[1]); /* LH3 */
|
||||
rfx_quantization_encode_block(buffer + 3868, 64, quantization_values[3]); /* HH3 */
|
||||
rfx_quantization_encode_block(buffer + 4032, 64, quantization_values[0]); /* LL3 */
|
||||
rfx_quantization_encode_block(buffer, 1024, quantization_values[8] - 6); /* HL1 */
|
||||
rfx_quantization_encode_block(buffer + 1024, 1024, quantization_values[7] - 6); /* LH1 */
|
||||
rfx_quantization_encode_block(buffer + 2048, 1024, quantization_values[9] - 6); /* HH1 */
|
||||
rfx_quantization_encode_block(buffer + 3072, 256, quantization_values[5] - 6); /* HL2 */
|
||||
rfx_quantization_encode_block(buffer + 3328, 256, quantization_values[4] - 6); /* LH2 */
|
||||
rfx_quantization_encode_block(buffer + 3584, 256, quantization_values[6] - 6); /* HH2 */
|
||||
rfx_quantization_encode_block(buffer + 3840, 64, quantization_values[2] - 6); /* HL3 */
|
||||
rfx_quantization_encode_block(buffer + 3904, 64, quantization_values[1] - 6); /* LH3 */
|
||||
rfx_quantization_encode_block(buffer + 3868, 64, quantization_values[3] - 6); /* HH3 */
|
||||
rfx_quantization_encode_block(buffer + 4032, 64, quantization_values[0] - 6); /* LL3 */
|
||||
|
||||
/* The coefficients are scaled by << 5 at RGB->YCbCr phase, so we round it back here */
|
||||
rfx_quantization_encode_block(buffer, 4096, 5);
|
||||
}
|
||||
|
@ -68,49 +68,59 @@ static void rfx_decode_ycbcr_to_rgb_sse2(sint16* y_r_buffer, sint16* cb_g_buffer
|
||||
}
|
||||
for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i++)
|
||||
{
|
||||
/* y = y_r_buf[i] + 128; */
|
||||
/* y = (y_r_buf[i] >> 5) + 128; */
|
||||
y = _mm_load_si128(&y_r_buf[i]);
|
||||
y = _mm_add_epi16(y, _mm_set1_epi16(128));
|
||||
y = _mm_add_epi16(_mm_srai_epi16(y, 5), _mm_set1_epi16(128));
|
||||
|
||||
/* cr = cr_b_buf[i]; */
|
||||
cr = _mm_load_si128(&cr_b_buf[i]);
|
||||
|
||||
/* r = between(y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5), 0, 255); */
|
||||
r = _mm_add_epi16(y, cr);
|
||||
r = _mm_add_epi16(r, _mm_srai_epi16(cr, 2));
|
||||
r = _mm_add_epi16(r, _mm_srai_epi16(cr, 3));
|
||||
r = _mm_add_epi16(r, _mm_srai_epi16(cr, 5));
|
||||
/* r = y + ((cr >> 5) + (cr >> 7) + (cr >> 8) + (cr >> 11) + (cr >> 12) + (cr >> 13)); */
|
||||
/* y_r_buf[i] = MINMAX(r, 0, 255); */
|
||||
r = _mm_add_epi16(y, _mm_srai_epi16(cr, 5));
|
||||
r = _mm_add_epi16(r, _mm_srai_epi16(cr, 7));
|
||||
r = _mm_add_epi16(r, _mm_srai_epi16(cr, 8));
|
||||
r = _mm_add_epi16(r, _mm_srai_epi16(cr, 11));
|
||||
r = _mm_add_epi16(r, _mm_srai_epi16(cr, 12));
|
||||
r = _mm_add_epi16(r, _mm_srai_epi16(cr, 13));
|
||||
_mm_between_epi16(r, zero, max);
|
||||
_mm_store_si128(&y_r_buf[i], r);
|
||||
|
||||
/* cb = cb_g_buf[i]; */
|
||||
cb = _mm_load_si128(&cb_g_buf[i]);
|
||||
|
||||
/* g = between(y - (cb >> 2) - (cb >> 4) - (cb >> 5) - (cr >> 1) - (cr >> 3) - (cr >> 4) - (cr >> 5), 0, 255); */
|
||||
g = _mm_sub_epi16(y, _mm_srai_epi16(cb, 2));
|
||||
g = _mm_sub_epi16(g, _mm_srai_epi16(cb, 4));
|
||||
g = _mm_sub_epi16(g, _mm_srai_epi16(cb, 5));
|
||||
g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 1));
|
||||
g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 3));
|
||||
g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 4));
|
||||
g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 5));
|
||||
/* g = y - ((cb >> 7) + (cb >> 9) + (cb >> 10)) -
|
||||
((cr >> 6) + (cr >> 8) + (cr >> 9) + (cr >> 11) + (cr >> 12) + (cr >> 13)); */
|
||||
/* cb_g_buf[i] = MINMAX(g, 0, 255); */
|
||||
g = _mm_sub_epi16(y, _mm_srai_epi16(cb, 7));
|
||||
g = _mm_sub_epi16(g, _mm_srai_epi16(cb, 9));
|
||||
g = _mm_sub_epi16(g, _mm_srai_epi16(cb, 10));
|
||||
g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 6));
|
||||
g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 8));
|
||||
g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 9));
|
||||
g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 11));
|
||||
g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 12));
|
||||
g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 13));
|
||||
_mm_between_epi16(g, zero, max);
|
||||
_mm_store_si128(&cb_g_buf[i], g);
|
||||
_mm_store_si128(&cb_g_buf[i], g);
|
||||
|
||||
/* b = between(y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6), 0, 255); */
|
||||
b = _mm_add_epi16(y, cb);
|
||||
b = _mm_add_epi16(b, _mm_srai_epi16(cb, 1));
|
||||
b = _mm_add_epi16(b, _mm_srai_epi16(cb, 2));
|
||||
/* b = y + ((cb >> 5) + (cb >> 6) + (cb >> 7) + (cb >> 11) + (cb >> 13)); */
|
||||
/* cr_b_buf[i] = MINMAX(b, 0, 255); */
|
||||
b = _mm_add_epi16(y, _mm_srai_epi16(cb, 5));
|
||||
b = _mm_add_epi16(b, _mm_srai_epi16(cb, 6));
|
||||
b = _mm_add_epi16(b, _mm_srai_epi16(cb, 7));
|
||||
b = _mm_add_epi16(b, _mm_srai_epi16(cb, 11));
|
||||
b = _mm_add_epi16(b, _mm_srai_epi16(cb, 13));
|
||||
_mm_between_epi16(b, zero, max);
|
||||
_mm_store_si128(&cr_b_buf[i], b);
|
||||
}
|
||||
}
|
||||
|
||||
/* The encodec YCbCr coeffectients are represented as 11.5 fixed-point numbers. See rfx_encode.c */
|
||||
static void rfx_encode_rgb_to_ycbcr_sse2(sint16* y_r_buffer, sint16* cb_g_buffer, sint16* cr_b_buffer)
|
||||
{
|
||||
__m128i min = _mm_set1_epi16(-128);
|
||||
__m128i max = _mm_set1_epi16(127);
|
||||
__m128i min = _mm_set1_epi16(-128 << 5);
|
||||
__m128i max = _mm_set1_epi16(127 << 5);
|
||||
|
||||
__m128i* y_r_buf = (__m128i*) y_r_buffer;
|
||||
__m128i* cb_g_buf = (__m128i*) cb_g_buffer;
|
||||
@ -142,42 +152,64 @@ static void rfx_encode_rgb_to_ycbcr_sse2(sint16* y_r_buffer, sint16* cb_g_buffer
|
||||
/* b = cr_b_buf[i]; */
|
||||
b = _mm_load_si128(&cr_b_buf[i]);
|
||||
|
||||
/* y = ((r >> 2) + (r >> 5) + (r >> 6)) + ((g >> 1) + (g >> 4) + (g >> 6) + (g >> 7)) +
|
||||
((b >> 4) + (b >> 5) + (b >> 6) + (b >> 7)); */
|
||||
/* y_r_buf[i] = MINMAX(y, 0, 255) - 128; */
|
||||
y = _mm_add_epi16(_mm_srai_epi16(r, 2), _mm_srai_epi16(r, 5));
|
||||
y = _mm_add_epi16(y, _mm_srai_epi16(r, 6));
|
||||
/* y = ((r << 3) + (r) + (r >> 1) + (r >> 4) + (r >> 7)) +
|
||||
((g << 4) + (g << 1) + (g >> 1) + (g >> 2) + (g >> 5)) +
|
||||
((b << 1) + (b) + (b >> 1) + (b >> 3) + (b >> 6) + (b >> 7)); */
|
||||
/* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
|
||||
y = _mm_add_epi16(_mm_slli_epi16(r, 3), r);
|
||||
y = _mm_add_epi16(y, _mm_srai_epi16(r, 1));
|
||||
y = _mm_add_epi16(y, _mm_srai_epi16(r, 4));
|
||||
y = _mm_add_epi16(y, _mm_srai_epi16(r, 7));
|
||||
y = _mm_add_epi16(y, _mm_slli_epi16(g, 4));
|
||||
y = _mm_add_epi16(y, _mm_slli_epi16(g, 1));
|
||||
y = _mm_add_epi16(y, _mm_srai_epi16(g, 1));
|
||||
y = _mm_add_epi16(y, _mm_srai_epi16(g, 4));
|
||||
y = _mm_add_epi16(y, _mm_srai_epi16(g, 6));
|
||||
y = _mm_add_epi16(y, _mm_srai_epi16(g, 7));
|
||||
y = _mm_add_epi16(y, _mm_srai_epi16(b, 4));
|
||||
y = _mm_add_epi16(y, _mm_srai_epi16(b, 5));
|
||||
y = _mm_add_epi16(y, _mm_srai_epi16(g, 2));
|
||||
y = _mm_add_epi16(y, _mm_srai_epi16(g, 5));
|
||||
y = _mm_add_epi16(y, _mm_slli_epi16(b, 1));
|
||||
y = _mm_add_epi16(y, b);
|
||||
y = _mm_add_epi16(y, _mm_srai_epi16(b, 1));
|
||||
y = _mm_add_epi16(y, _mm_srai_epi16(b, 3));
|
||||
y = _mm_add_epi16(y, _mm_srai_epi16(b, 6));
|
||||
y = _mm_add_epi16(y, _mm_srai_epi16(b, 7));
|
||||
y = _mm_add_epi16(y, min);
|
||||
_mm_between_epi16(y, min, max);
|
||||
_mm_store_si128(&y_r_buf[i], y);
|
||||
|
||||
/* cb = 0 - ((r >> 3) + (r >> 5) + (r >> 6)) - ((g >> 2) + (g >> 4) + (g >> 6)) + (b >> 1); */
|
||||
/* cb_g_buf[i] = MINMAX(cb, -128, 127); */
|
||||
cb = _mm_sub_epi16(_mm_srai_epi16(b, 1), _mm_srai_epi16(r, 3));
|
||||
/* cb = 0 - ((r << 2) + (r) + (r >> 2) + (r >> 3) + (r >> 5)) -
|
||||
((g << 3) + (g << 1) + (g >> 1) + (g >> 4) + (g >> 5) + (g >> 6)) +
|
||||
((b << 4) + (b >> 6)); */
|
||||
/* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
|
||||
cb = _mm_add_epi16(_mm_slli_epi16(b, 4), _mm_srai_epi16(b, 6));
|
||||
cb = _mm_sub_epi16(cb, _mm_slli_epi16(r, 2));
|
||||
cb = _mm_sub_epi16(cb, r);
|
||||
cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 2));
|
||||
cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 3));
|
||||
cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 5));
|
||||
cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 6));
|
||||
cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 2));
|
||||
cb = _mm_sub_epi16(cb, _mm_slli_epi16(g, 3));
|
||||
cb = _mm_sub_epi16(cb, _mm_slli_epi16(g, 1));
|
||||
cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 1));
|
||||
cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 4));
|
||||
cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 5));
|
||||
cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 6));
|
||||
_mm_between_epi16(cb, min, max);
|
||||
_mm_store_si128(&cb_g_buf[i], cb);
|
||||
|
||||
/* cr = (r >> 1) - ((g >> 2) + (g >> 3) + (g >> 5) + (g >> 6)) - ((b >> 4) + (b >> 6)); */
|
||||
/* cr_b_buf[i] = MINMAX(cr, -128, 127); */
|
||||
cr = _mm_sub_epi16(_mm_srai_epi16(r, 1), _mm_srai_epi16(g, 2));
|
||||
/* cr = ((r << 4) - (r >> 7)) -
|
||||
((g << 3) + (g << 2) + (g) + (g >> 2) + (g >> 3) + (g >> 6)) -
|
||||
((b << 1) + (b >> 1) + (b >> 4) + (b >> 5) + (b >> 7)); */
|
||||
/* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
|
||||
cr = _mm_sub_epi16(_mm_slli_epi16(r, 4), _mm_srai_epi16(r, 7));
|
||||
cr = _mm_sub_epi16(cr, _mm_slli_epi16(g, 3));
|
||||
cr = _mm_sub_epi16(cr, _mm_slli_epi16(g, 2));
|
||||
cr = _mm_sub_epi16(cr, g);
|
||||
cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 2));
|
||||
cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 3));
|
||||
cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 5));
|
||||
cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 6));
|
||||
cr = _mm_sub_epi16(cr, _mm_slli_epi16(b, 1));
|
||||
cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 1));
|
||||
cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 4));
|
||||
cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 6));
|
||||
cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 5));
|
||||
cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 7));
|
||||
_mm_between_epi16(cr, min, max);
|
||||
_mm_store_si128(&cr_b_buf[i], cr);
|
||||
}
|
||||
@ -186,17 +218,16 @@ static void rfx_encode_rgb_to_ycbcr_sse2(sint16* y_r_buffer, sint16* cb_g_buffer
|
||||
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
rfx_quantization_decode_block_sse2(sint16* buffer, const int buffer_size, const uint32 factor)
|
||||
{
|
||||
int shift = factor-6;
|
||||
if (shift <= 0)
|
||||
if (factor == 0)
|
||||
return;
|
||||
|
||||
|
||||
__m128i a;
|
||||
__m128i * ptr = (__m128i*) buffer;
|
||||
__m128i * buf_end = (__m128i*) (buffer + buffer_size);
|
||||
do
|
||||
{
|
||||
a = _mm_load_si128(ptr);
|
||||
a = _mm_slli_epi16(a, shift);
|
||||
a = _mm_slli_epi16(a, factor);
|
||||
_mm_store_si128(ptr, a);
|
||||
|
||||
ptr++;
|
||||
@ -207,23 +238,24 @@ static void rfx_quantization_decode_sse2(sint16* buffer, const uint32* quantizat
|
||||
{
|
||||
_mm_prefetch_buffer((char*) buffer, 4096 * sizeof(sint16));
|
||||
|
||||
rfx_quantization_decode_block_sse2(buffer, 1024, quantization_values[8]); /* HL1 */
|
||||
rfx_quantization_decode_block_sse2(buffer + 1024, 1024, quantization_values[7]); /* LH1 */
|
||||
rfx_quantization_decode_block_sse2(buffer + 2048, 1024, quantization_values[9]); /* HH1 */
|
||||
rfx_quantization_decode_block_sse2(buffer + 3072, 256, quantization_values[5]); /* HL2 */
|
||||
rfx_quantization_decode_block_sse2(buffer + 3328, 256, quantization_values[4]); /* LH2 */
|
||||
rfx_quantization_decode_block_sse2(buffer + 3584, 256, quantization_values[6]); /* HH2 */
|
||||
rfx_quantization_decode_block_sse2(buffer + 3840, 64, quantization_values[2]); /* HL3 */
|
||||
rfx_quantization_decode_block_sse2(buffer + 3904, 64, quantization_values[1]); /* LH3 */
|
||||
rfx_quantization_decode_block_sse2(buffer + 3868, 64, quantization_values[3]); /* HH3 */
|
||||
rfx_quantization_decode_block_sse2(buffer + 4032, 64, quantization_values[0]); /* LL3 */
|
||||
rfx_quantization_decode_block_sse2(buffer, 4096, 5);
|
||||
|
||||
rfx_quantization_decode_block_sse2(buffer, 1024, quantization_values[8] - 6); /* HL1 */
|
||||
rfx_quantization_decode_block_sse2(buffer + 1024, 1024, quantization_values[7] - 6); /* LH1 */
|
||||
rfx_quantization_decode_block_sse2(buffer + 2048, 1024, quantization_values[9] - 6); /* HH1 */
|
||||
rfx_quantization_decode_block_sse2(buffer + 3072, 256, quantization_values[5] - 6); /* HL2 */
|
||||
rfx_quantization_decode_block_sse2(buffer + 3328, 256, quantization_values[4] - 6); /* LH2 */
|
||||
rfx_quantization_decode_block_sse2(buffer + 3584, 256, quantization_values[6] - 6); /* HH2 */
|
||||
rfx_quantization_decode_block_sse2(buffer + 3840, 64, quantization_values[2] - 6); /* HL3 */
|
||||
rfx_quantization_decode_block_sse2(buffer + 3904, 64, quantization_values[1] - 6); /* LH3 */
|
||||
rfx_quantization_decode_block_sse2(buffer + 3868, 64, quantization_values[3] - 6); /* HH3 */
|
||||
rfx_quantization_decode_block_sse2(buffer + 4032, 64, quantization_values[0] - 6); /* LL3 */
|
||||
}
|
||||
|
||||
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
rfx_quantization_encode_block_sse2(sint16* buffer, const int buffer_size, const uint32 factor)
|
||||
{
|
||||
int shift = factor-6;
|
||||
if (shift <= 0)
|
||||
if (factor == 0)
|
||||
return;
|
||||
|
||||
__m128i a;
|
||||
@ -232,7 +264,7 @@ rfx_quantization_encode_block_sse2(sint16* buffer, const int buffer_size, const
|
||||
do
|
||||
{
|
||||
a = _mm_load_si128(ptr);
|
||||
a = _mm_srai_epi16(a, shift);
|
||||
a = _mm_srai_epi16(a, factor);
|
||||
_mm_store_si128(ptr, a);
|
||||
|
||||
ptr++;
|
||||
@ -243,16 +275,18 @@ static void rfx_quantization_encode_sse2(sint16* buffer, const uint32* quantizat
|
||||
{
|
||||
_mm_prefetch_buffer((char*) buffer, 4096 * sizeof(sint16));
|
||||
|
||||
rfx_quantization_encode_block_sse2(buffer, 1024, quantization_values[8]); /* HL1 */
|
||||
rfx_quantization_encode_block_sse2(buffer + 1024, 1024, quantization_values[7]); /* LH1 */
|
||||
rfx_quantization_encode_block_sse2(buffer + 2048, 1024, quantization_values[9]); /* HH1 */
|
||||
rfx_quantization_encode_block_sse2(buffer + 3072, 256, quantization_values[5]); /* HL2 */
|
||||
rfx_quantization_encode_block_sse2(buffer + 3328, 256, quantization_values[4]); /* LH2 */
|
||||
rfx_quantization_encode_block_sse2(buffer + 3584, 256, quantization_values[6]); /* HH2 */
|
||||
rfx_quantization_encode_block_sse2(buffer + 3840, 64, quantization_values[2]); /* HL3 */
|
||||
rfx_quantization_encode_block_sse2(buffer + 3904, 64, quantization_values[1]); /* LH3 */
|
||||
rfx_quantization_encode_block_sse2(buffer + 3868, 64, quantization_values[3]); /* HH3 */
|
||||
rfx_quantization_encode_block_sse2(buffer + 4032, 64, quantization_values[0]); /* LL3 */
|
||||
rfx_quantization_encode_block_sse2(buffer, 1024, quantization_values[8] - 6); /* HL1 */
|
||||
rfx_quantization_encode_block_sse2(buffer + 1024, 1024, quantization_values[7] - 6); /* LH1 */
|
||||
rfx_quantization_encode_block_sse2(buffer + 2048, 1024, quantization_values[9] - 6); /* HH1 */
|
||||
rfx_quantization_encode_block_sse2(buffer + 3072, 256, quantization_values[5] - 6); /* HL2 */
|
||||
rfx_quantization_encode_block_sse2(buffer + 3328, 256, quantization_values[4] - 6); /* LH2 */
|
||||
rfx_quantization_encode_block_sse2(buffer + 3584, 256, quantization_values[6] - 6); /* HH2 */
|
||||
rfx_quantization_encode_block_sse2(buffer + 3840, 64, quantization_values[2] - 6); /* HL3 */
|
||||
rfx_quantization_encode_block_sse2(buffer + 3904, 64, quantization_values[1] - 6); /* LH3 */
|
||||
rfx_quantization_encode_block_sse2(buffer + 3868, 64, quantization_values[3] - 6); /* HH3 */
|
||||
rfx_quantization_encode_block_sse2(buffer + 4032, 64, quantization_values[0] - 6); /* LL3 */
|
||||
|
||||
rfx_quantization_encode_block_sse2(buffer, 4096, 5);
|
||||
}
|
||||
|
||||
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
@ -482,7 +516,7 @@ rfx_dwt_2d_encode_block_vert_sse2(sint16* src, sint16* l, sint16* h, int subband
|
||||
if (n < subband_width - 1)
|
||||
src_2n_2 = _mm_load_si128((__m128i*) (src + 2 * total_width));
|
||||
else
|
||||
src_2n_2 = src_2n_1;
|
||||
src_2n_2 = src_2n;
|
||||
|
||||
/* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */
|
||||
|
||||
@ -534,7 +568,7 @@ rfx_dwt_2d_encode_block_horiz_sse2(sint16* src, sint16* l, sint16* h, int subban
|
||||
/* The following 3 Set operations consumes more than half of the total DWT processing time! */
|
||||
src_2n = _mm_set_epi16(src[14], src[12], src[10], src[8], src[6], src[4], src[2], src[0]);
|
||||
src_2n_1 = _mm_set_epi16(src[15], src[13], src[11], src[9], src[7], src[5], src[3], src[1]);
|
||||
src_2n_2 = _mm_set_epi16(n == subband_width - 8 ? src[15] : src[16],
|
||||
src_2n_2 = _mm_set_epi16(n == subband_width - 8 ? src[14] : src[16],
|
||||
src[14], src[12], src[10], src[8], src[6], src[4], src[2]);
|
||||
|
||||
/* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */
|
||||
|
Loading…
Reference in New Issue
Block a user