Merge branch 'master' of github.com:FreeRDP/FreeRDP-1.0

This commit is contained in:
Marc-André Moreau 2011-09-03 16:36:37 -04:00
commit d7917f9db5
5 changed files with 177 additions and 109 deletions

View File

@ -86,18 +86,31 @@ void rfx_decode_ycbcr_to_rgb(sint16* y_r_buf, sint16* cb_g_buf, sint16* cr_b_buf
{
sint16 y, cb, cr;
sint16 r, g, b;
int i;
/**
* The decoded YCbCr coefficients are represented as 11.5 fixed-point numbers:
*
* 1 sign bit + 10 integer bits + 5 fractional bits
*
* However, only 7 integer bits are actually used, since the value range is [-128.0, 127.0].
* In other words, the decoded coefficients are scaled by << 5 when interpreted as sint16.
* They were scaled in the quantization phase, so we must scale them back here.
*/
for (i = 0; i < 4096; i++)
{
y = y_r_buf[i] + 128;
y = (y_r_buf[i] >> 5) + 128;
cb = cb_g_buf[i];
cr = cr_b_buf[i];
r = (y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5));
/* 1.403 >> 5 = 0.000010110011100(b) */
r = y + ((cr >> 5) + (cr >> 7) + (cr >> 8) + (cr >> 11) + (cr >> 12) + (cr >> 13));
y_r_buf[i] = MINMAX(r, 0, 255);
g = (y - ((cb >> 2) + (cb >> 4) + (cb >> 5)) - ((cr >> 1) + (cr >> 3) + (cr >> 4) + (cr >> 5)));
/* 0.344 >> 5 = 0.000000101100000(b), 0.714 >> 5 = 0.000001011011011(b) */
g = y - ((cb >> 7) + (cb >> 9) + (cb >> 10)) -
((cr >> 6) + (cr >> 8) + (cr >> 9) + (cr >> 11) + (cr >> 12) + (cr >> 13));
cb_g_buf[i] = MINMAX(g, 0, 255);
b = (y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6));
/* 1.77 >> 5 = 0.000011100010100(b) */
b = y + ((cb >> 5) + (cb >> 6) + (cb >> 7) + (cb >> 11) + (cb >> 13));
cr_b_buf[i] = MINMAX(b, 0, 255);
}
}

View File

@ -133,7 +133,7 @@ static void rfx_dwt_2d_encode_block(sint16* buffer, sint16* dwt, int subband_wid
src = buffer + y * total_width + x;
/* H */
*h = (src[total_width] - ((src[0] + src[n < subband_width - 1 ? 2 * total_width : total_width]) >> 1)) >> 1;
*h = (src[total_width] - ((src[0] + src[n < subband_width - 1 ? 2 * total_width : 0]) >> 1)) >> 1;
/* L */
*l = src[0] + (n == 0 ? *h : (*(h - total_width) + *h) >> 1);

View File

@ -123,20 +123,37 @@ void rfx_encode_rgb_to_ycbcr(sint16* y_r_buf, sint16* cb_g_buf, sint16* cr_b_buf
{
sint16 y, cb, cr;
sint16 r, g, b;
int i;
/**
* The encoded YCbCr coefficients are represented as 11.5 fixed-point numbers:
*
* 1 sign bit + 10 integer bits + 5 fractional bits
*
* However, only 7 integer bits are actually used, since the value range is [-128.0, 127.0].
* In other words, the encoded coefficients are scaled by << 5 when interpreted as sint16.
* They will be scaled back down to the original range during the quantization phase.
*/
for (i = 0; i < 4096; i++)
{
r = y_r_buf[i];
g = cb_g_buf[i];
b = cr_b_buf[i];
y = ((r >> 2) + (r >> 5) + (r >> 6)) + ((g >> 1) + (g >> 4) + (g >> 6) + (g >> 7)) +
((b >> 4) + (b >> 5) + (b >> 6) + (b >> 7));
y_r_buf[i] = MINMAX(y, 0, 255) - 128;
cb = 0 - ((r >> 3) + (r >> 5) + (r >> 6)) - ((g >> 2) + (g >> 4) + (g >> 6)) + (b >> 1);
cb_g_buf[i] = MINMAX(cb, -128, 127);
cr = (r >> 1) - ((g >> 2) + (g >> 3) + (g >> 5) + (g >> 6)) - ((b >> 4) + (b >> 6));
cr_b_buf[i] = MINMAX(cr, -128, 127);
/* 0.299 << 5 = 1001.10010001(b), 0.587 << 5 = 10010.11001000(b), 0.114 << 5 = 11.10100101(b) */
y = ((r << 3) + (r) + (r >> 1) + (r >> 4) + (r >> 7)) +
((g << 4) + (g << 1) + (g >> 1) + (g >> 2) + (g >> 5)) +
((b << 1) + (b) + (b >> 1) + (b >> 3) + (b >> 6) + (b >> 7));
y_r_buf[i] = MINMAX(y - 4096, -4096, 4095);
/* 0.168935 << 5 = 101.01100111(b), 0.331665 << 5 = 1010.10011100(b), 0.50059 << 5 = 10000.00000100(b) */
cb = 0 - ((r << 2) + (r) + (r >> 2) + (r >> 3) + (r >> 5)) -
((g << 3) + (g << 1) + (g >> 1) + (g >> 4) + (g >> 5) + (g >> 6)) +
((b << 4) + (b >> 6));
cb_g_buf[i] = MINMAX(cb, -4096, 4095);
/* 0.499813 << 5 = 1111.11111110(b), 0.418531 << 5 = 1101.01100100(b), 0.081282 << 5 = 10.10011001(b) */
cr = ((r << 4) - (r >> 7)) -
((g << 3) + (g << 2) + (g) + (g >> 2) + (g >> 3) + (g >> 6)) -
((b << 1) + (b >> 1) + (b >> 4) + (b >> 5) + (b >> 7));
cr_b_buf[i] = MINMAX(cr, -4096, 4095);
}
}

View File

@ -23,10 +23,9 @@ static void rfx_quantization_decode_block(sint16* buffer, int buffer_size, uint3
{
sint16* dst;
if (factor <= 6)
if (factor == 0)
return;
factor -= 6;
for (dst = buffer; buffer_size > 0; dst++, buffer_size--)
{
*dst <<= factor;
@ -35,26 +34,28 @@ static void rfx_quantization_decode_block(sint16* buffer, int buffer_size, uint3
/**
 * Dequantize a buffer of 4096 DWT coefficients in place.
 *
 * The whole buffer is first shifted left by 5 so that the values are
 * represented as 11.5 fixed-point numbers, then every sub-band is shifted
 * left by (its quantization value - 6).
 *
 * @param buffer              4096 coefficients laid out as
 *                            HL1, LH1, HH1, HL2, LH2, HH2, HL3, LH3, HH3, LL3
 * @param quantization_values at least 10 entries, indexed as
 *                            LL3, LH3, HL3, HH3, LH2, HL2, HH2, LH1, HL1, HH1
 */
void rfx_quantization_decode(sint16* buffer, const uint32* quantization_values)
{
	/* Start offset, coefficient count and quantization_values index of each sub-band. */
	static const struct
	{
		int offset;
		int size;
		int quant_index;
	} subbands[] =
	{
		{    0, 1024, 8 }, /* HL1 */
		{ 1024, 1024, 7 }, /* LH1 */
		{ 2048, 1024, 9 }, /* HH1 */
		{ 3072,  256, 5 }, /* HL2 */
		{ 3328,  256, 4 }, /* LH2 */
		{ 3584,  256, 6 }, /* HH2 */
		{ 3840,   64, 2 }, /* HL3 */
		{ 3904,   64, 1 }, /* LH3 */
		{ 3968,   64, 3 }, /* HH3: 3904 + 64, not 3868 (which overlapped HL3/LH3) */
		{ 4032,   64, 0 }  /* LL3 */
	};
	uint32 factor;
	int i;

	/* Scale the values so that they are represented as 11.5 fixed-point number */
	rfx_quantization_decode_block(buffer, 4096, 5);

	for (i = 0; i < (int) (sizeof(subbands) / sizeof(subbands[0])); i++)
	{
		factor = quantization_values[subbands[i].quant_index];

		/*
		 * The operand is unsigned: a value below 6 must not wrap around into
		 * a huge shift count. A factor of 0 makes the block helper return
		 * without touching the data.
		 */
		factor = (factor > 6) ? factor - 6 : 0;

		rfx_quantization_decode_block(buffer + subbands[i].offset, subbands[i].size, factor);
	}
}
static void rfx_quantization_encode_block(sint16* buffer, int buffer_size, uint32 factor)
{
sint16* dst;
if (factor <= 6)
if (factor == 0)
return;
factor -= 6;
for (dst = buffer; buffer_size > 0; dst++, buffer_size--)
{
*dst >>= factor;
@ -63,14 +64,17 @@ static void rfx_quantization_encode_block(sint16* buffer, int buffer_size, uint3
/**
 * Quantize a buffer of 4096 DWT coefficients in place.
 *
 * Every sub-band is shifted right by (its quantization value - 6), then the
 * whole buffer is shifted right by 5 to undo the << 5 scaling applied in the
 * RGB -> YCbCr phase.
 *
 * @param buffer              4096 coefficients laid out as
 *                            HL1, LH1, HH1, HL2, LH2, HH2, HL3, LH3, HH3, LL3
 * @param quantization_values at least 10 entries, indexed as
 *                            LL3, LH3, HL3, HH3, LH2, HL2, HH2, LH1, HL1, HH1
 */
void rfx_quantization_encode(sint16* buffer, const uint32* quantization_values)
{
	/* Start offset, coefficient count and quantization_values index of each sub-band. */
	static const struct
	{
		int offset;
		int size;
		int quant_index;
	} subbands[] =
	{
		{    0, 1024, 8 }, /* HL1 */
		{ 1024, 1024, 7 }, /* LH1 */
		{ 2048, 1024, 9 }, /* HH1 */
		{ 3072,  256, 5 }, /* HL2 */
		{ 3328,  256, 4 }, /* LH2 */
		{ 3584,  256, 6 }, /* HH2 */
		{ 3840,   64, 2 }, /* HL3 */
		{ 3904,   64, 1 }, /* LH3 */
		{ 3968,   64, 3 }, /* HH3: 3904 + 64, not 3868 (which overlapped HL3/LH3) */
		{ 4032,   64, 0 }  /* LL3 */
	};
	uint32 factor;
	int i;

	for (i = 0; i < (int) (sizeof(subbands) / sizeof(subbands[0])); i++)
	{
		factor = quantization_values[subbands[i].quant_index];

		/*
		 * The operand is unsigned: a value below 6 must not wrap around into
		 * a huge shift count. A factor of 0 makes the block helper return
		 * without touching the data.
		 */
		factor = (factor > 6) ? factor - 6 : 0;

		rfx_quantization_encode_block(buffer + subbands[i].offset, subbands[i].size, factor);
	}

	/* The coefficients are scaled by << 5 at RGB->YCbCr phase, so we round it back here */
	rfx_quantization_encode_block(buffer, 4096, 5);
}

View File

@ -68,49 +68,59 @@ static void rfx_decode_ycbcr_to_rgb_sse2(sint16* y_r_buffer, sint16* cb_g_buffer
}
for (i = 0; i < (4096 * sizeof(sint16) / sizeof(__m128i)); i++)
{
/* y = y_r_buf[i] + 128; */
/* y = (y_r_buf[i] >> 5) + 128; */
y = _mm_load_si128(&y_r_buf[i]);
y = _mm_add_epi16(y, _mm_set1_epi16(128));
y = _mm_add_epi16(_mm_srai_epi16(y, 5), _mm_set1_epi16(128));
/* cr = cr_b_buf[i]; */
cr = _mm_load_si128(&cr_b_buf[i]);
/* r = between(y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5), 0, 255); */
r = _mm_add_epi16(y, cr);
r = _mm_add_epi16(r, _mm_srai_epi16(cr, 2));
r = _mm_add_epi16(r, _mm_srai_epi16(cr, 3));
r = _mm_add_epi16(r, _mm_srai_epi16(cr, 5));
/* r = y + ((cr >> 5) + (cr >> 7) + (cr >> 8) + (cr >> 11) + (cr >> 12) + (cr >> 13)); */
/* y_r_buf[i] = MINMAX(r, 0, 255); */
r = _mm_add_epi16(y, _mm_srai_epi16(cr, 5));
r = _mm_add_epi16(r, _mm_srai_epi16(cr, 7));
r = _mm_add_epi16(r, _mm_srai_epi16(cr, 8));
r = _mm_add_epi16(r, _mm_srai_epi16(cr, 11));
r = _mm_add_epi16(r, _mm_srai_epi16(cr, 12));
r = _mm_add_epi16(r, _mm_srai_epi16(cr, 13));
_mm_between_epi16(r, zero, max);
_mm_store_si128(&y_r_buf[i], r);
/* cb = cb_g_buf[i]; */
cb = _mm_load_si128(&cb_g_buf[i]);
/* g = between(y - (cb >> 2) - (cb >> 4) - (cb >> 5) - (cr >> 1) - (cr >> 3) - (cr >> 4) - (cr >> 5), 0, 255); */
g = _mm_sub_epi16(y, _mm_srai_epi16(cb, 2));
g = _mm_sub_epi16(g, _mm_srai_epi16(cb, 4));
g = _mm_sub_epi16(g, _mm_srai_epi16(cb, 5));
g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 1));
g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 3));
g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 4));
g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 5));
/* g = y - ((cb >> 7) + (cb >> 9) + (cb >> 10)) -
((cr >> 6) + (cr >> 8) + (cr >> 9) + (cr >> 11) + (cr >> 12) + (cr >> 13)); */
/* cb_g_buf[i] = MINMAX(g, 0, 255); */
g = _mm_sub_epi16(y, _mm_srai_epi16(cb, 7));
g = _mm_sub_epi16(g, _mm_srai_epi16(cb, 9));
g = _mm_sub_epi16(g, _mm_srai_epi16(cb, 10));
g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 6));
g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 8));
g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 9));
g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 11));
g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 12));
g = _mm_sub_epi16(g, _mm_srai_epi16(cr, 13));
_mm_between_epi16(g, zero, max);
_mm_store_si128(&cb_g_buf[i], g);
_mm_store_si128(&cb_g_buf[i], g);
/* b = between(y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6), 0, 255); */
b = _mm_add_epi16(y, cb);
b = _mm_add_epi16(b, _mm_srai_epi16(cb, 1));
b = _mm_add_epi16(b, _mm_srai_epi16(cb, 2));
/* b = y + ((cb >> 5) + (cb >> 6) + (cb >> 7) + (cb >> 11) + (cb >> 13)); */
/* cr_b_buf[i] = MINMAX(b, 0, 255); */
b = _mm_add_epi16(y, _mm_srai_epi16(cb, 5));
b = _mm_add_epi16(b, _mm_srai_epi16(cb, 6));
b = _mm_add_epi16(b, _mm_srai_epi16(cb, 7));
b = _mm_add_epi16(b, _mm_srai_epi16(cb, 11));
b = _mm_add_epi16(b, _mm_srai_epi16(cb, 13));
_mm_between_epi16(b, zero, max);
_mm_store_si128(&cr_b_buf[i], b);
}
}
/* The encoded YCbCr coefficients are represented as 11.5 fixed-point numbers. See rfx_encode.c */
static void rfx_encode_rgb_to_ycbcr_sse2(sint16* y_r_buffer, sint16* cb_g_buffer, sint16* cr_b_buffer)
{
__m128i min = _mm_set1_epi16(-128);
__m128i max = _mm_set1_epi16(127);
__m128i min = _mm_set1_epi16(-128 << 5);
__m128i max = _mm_set1_epi16(127 << 5);
__m128i* y_r_buf = (__m128i*) y_r_buffer;
__m128i* cb_g_buf = (__m128i*) cb_g_buffer;
@ -142,42 +152,64 @@ static void rfx_encode_rgb_to_ycbcr_sse2(sint16* y_r_buffer, sint16* cb_g_buffer
/* b = cr_b_buf[i]; */
b = _mm_load_si128(&cr_b_buf[i]);
/* y = ((r >> 2) + (r >> 5) + (r >> 6)) + ((g >> 1) + (g >> 4) + (g >> 6) + (g >> 7)) +
((b >> 4) + (b >> 5) + (b >> 6) + (b >> 7)); */
/* y_r_buf[i] = MINMAX(y, 0, 255) - 128; */
y = _mm_add_epi16(_mm_srai_epi16(r, 2), _mm_srai_epi16(r, 5));
y = _mm_add_epi16(y, _mm_srai_epi16(r, 6));
/* y = ((r << 3) + (r) + (r >> 1) + (r >> 4) + (r >> 7)) +
((g << 4) + (g << 1) + (g >> 1) + (g >> 2) + (g >> 5)) +
((b << 1) + (b) + (b >> 1) + (b >> 3) + (b >> 6) + (b >> 7)); */
/* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
y = _mm_add_epi16(_mm_slli_epi16(r, 3), r);
y = _mm_add_epi16(y, _mm_srai_epi16(r, 1));
y = _mm_add_epi16(y, _mm_srai_epi16(r, 4));
y = _mm_add_epi16(y, _mm_srai_epi16(r, 7));
y = _mm_add_epi16(y, _mm_slli_epi16(g, 4));
y = _mm_add_epi16(y, _mm_slli_epi16(g, 1));
y = _mm_add_epi16(y, _mm_srai_epi16(g, 1));
y = _mm_add_epi16(y, _mm_srai_epi16(g, 4));
y = _mm_add_epi16(y, _mm_srai_epi16(g, 6));
y = _mm_add_epi16(y, _mm_srai_epi16(g, 7));
y = _mm_add_epi16(y, _mm_srai_epi16(b, 4));
y = _mm_add_epi16(y, _mm_srai_epi16(b, 5));
y = _mm_add_epi16(y, _mm_srai_epi16(g, 2));
y = _mm_add_epi16(y, _mm_srai_epi16(g, 5));
y = _mm_add_epi16(y, _mm_slli_epi16(b, 1));
y = _mm_add_epi16(y, b);
y = _mm_add_epi16(y, _mm_srai_epi16(b, 1));
y = _mm_add_epi16(y, _mm_srai_epi16(b, 3));
y = _mm_add_epi16(y, _mm_srai_epi16(b, 6));
y = _mm_add_epi16(y, _mm_srai_epi16(b, 7));
y = _mm_add_epi16(y, min);
_mm_between_epi16(y, min, max);
_mm_store_si128(&y_r_buf[i], y);
/* cb = 0 - ((r >> 3) + (r >> 5) + (r >> 6)) - ((g >> 2) + (g >> 4) + (g >> 6)) + (b >> 1); */
/* cb_g_buf[i] = MINMAX(cb, -128, 127); */
cb = _mm_sub_epi16(_mm_srai_epi16(b, 1), _mm_srai_epi16(r, 3));
/* cb = 0 - ((r << 2) + (r) + (r >> 2) + (r >> 3) + (r >> 5)) -
((g << 3) + (g << 1) + (g >> 1) + (g >> 4) + (g >> 5) + (g >> 6)) +
((b << 4) + (b >> 6)); */
/* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
cb = _mm_add_epi16(_mm_slli_epi16(b, 4), _mm_srai_epi16(b, 6));
cb = _mm_sub_epi16(cb, _mm_slli_epi16(r, 2));
cb = _mm_sub_epi16(cb, r);
cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 2));
cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 3));
cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 5));
cb = _mm_sub_epi16(cb, _mm_srai_epi16(r, 6));
cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 2));
cb = _mm_sub_epi16(cb, _mm_slli_epi16(g, 3));
cb = _mm_sub_epi16(cb, _mm_slli_epi16(g, 1));
cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 1));
cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 4));
cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 5));
cb = _mm_sub_epi16(cb, _mm_srai_epi16(g, 6));
_mm_between_epi16(cb, min, max);
_mm_store_si128(&cb_g_buf[i], cb);
/* cr = (r >> 1) - ((g >> 2) + (g >> 3) + (g >> 5) + (g >> 6)) - ((b >> 4) + (b >> 6)); */
/* cr_b_buf[i] = MINMAX(cr, -128, 127); */
cr = _mm_sub_epi16(_mm_srai_epi16(r, 1), _mm_srai_epi16(g, 2));
/* cr = ((r << 4) - (r >> 7)) -
((g << 3) + (g << 2) + (g) + (g >> 2) + (g >> 3) + (g >> 6)) -
((b << 1) + (b >> 1) + (b >> 4) + (b >> 5) + (b >> 7)); */
/* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
cr = _mm_sub_epi16(_mm_slli_epi16(r, 4), _mm_srai_epi16(r, 7));
cr = _mm_sub_epi16(cr, _mm_slli_epi16(g, 3));
cr = _mm_sub_epi16(cr, _mm_slli_epi16(g, 2));
cr = _mm_sub_epi16(cr, g);
cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 2));
cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 3));
cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 5));
cr = _mm_sub_epi16(cr, _mm_srai_epi16(g, 6));
cr = _mm_sub_epi16(cr, _mm_slli_epi16(b, 1));
cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 1));
cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 4));
cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 6));
cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 5));
cr = _mm_sub_epi16(cr, _mm_srai_epi16(b, 7));
_mm_between_epi16(cr, min, max);
_mm_store_si128(&cr_b_buf[i], cr);
}
@ -186,17 +218,16 @@ static void rfx_encode_rgb_to_ycbcr_sse2(sint16* y_r_buffer, sint16* cb_g_buffer
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_quantization_decode_block_sse2(sint16* buffer, const int buffer_size, const uint32 factor)
{
int shift = factor-6;
if (shift <= 0)
if (factor == 0)
return;
__m128i a;
__m128i * ptr = (__m128i*) buffer;
__m128i * buf_end = (__m128i*) (buffer + buffer_size);
do
{
a = _mm_load_si128(ptr);
a = _mm_slli_epi16(a, shift);
a = _mm_slli_epi16(a, factor);
_mm_store_si128(ptr, a);
ptr++;
@ -207,23 +238,24 @@ static void rfx_quantization_decode_sse2(sint16* buffer, const uint32* quantizat
{
_mm_prefetch_buffer((char*) buffer, 4096 * sizeof(sint16));
rfx_quantization_decode_block_sse2(buffer, 1024, quantization_values[8]); /* HL1 */
rfx_quantization_decode_block_sse2(buffer + 1024, 1024, quantization_values[7]); /* LH1 */
rfx_quantization_decode_block_sse2(buffer + 2048, 1024, quantization_values[9]); /* HH1 */
rfx_quantization_decode_block_sse2(buffer + 3072, 256, quantization_values[5]); /* HL2 */
rfx_quantization_decode_block_sse2(buffer + 3328, 256, quantization_values[4]); /* LH2 */
rfx_quantization_decode_block_sse2(buffer + 3584, 256, quantization_values[6]); /* HH2 */
rfx_quantization_decode_block_sse2(buffer + 3840, 64, quantization_values[2]); /* HL3 */
rfx_quantization_decode_block_sse2(buffer + 3904, 64, quantization_values[1]); /* LH3 */
rfx_quantization_decode_block_sse2(buffer + 3868, 64, quantization_values[3]); /* HH3 */
rfx_quantization_decode_block_sse2(buffer + 4032, 64, quantization_values[0]); /* LL3 */
rfx_quantization_decode_block_sse2(buffer, 4096, 5);
rfx_quantization_decode_block_sse2(buffer, 1024, quantization_values[8] - 6); /* HL1 */
rfx_quantization_decode_block_sse2(buffer + 1024, 1024, quantization_values[7] - 6); /* LH1 */
rfx_quantization_decode_block_sse2(buffer + 2048, 1024, quantization_values[9] - 6); /* HH1 */
rfx_quantization_decode_block_sse2(buffer + 3072, 256, quantization_values[5] - 6); /* HL2 */
rfx_quantization_decode_block_sse2(buffer + 3328, 256, quantization_values[4] - 6); /* LH2 */
rfx_quantization_decode_block_sse2(buffer + 3584, 256, quantization_values[6] - 6); /* HH2 */
rfx_quantization_decode_block_sse2(buffer + 3840, 64, quantization_values[2] - 6); /* HL3 */
rfx_quantization_decode_block_sse2(buffer + 3904, 64, quantization_values[1] - 6); /* LH3 */
rfx_quantization_decode_block_sse2(buffer + 3868, 64, quantization_values[3] - 6); /* HH3 */
rfx_quantization_decode_block_sse2(buffer + 4032, 64, quantization_values[0] - 6); /* LL3 */
}
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_quantization_encode_block_sse2(sint16* buffer, const int buffer_size, const uint32 factor)
{
int shift = factor-6;
if (shift <= 0)
if (factor == 0)
return;
__m128i a;
@ -232,7 +264,7 @@ rfx_quantization_encode_block_sse2(sint16* buffer, const int buffer_size, const
do
{
a = _mm_load_si128(ptr);
a = _mm_srai_epi16(a, shift);
a = _mm_srai_epi16(a, factor);
_mm_store_si128(ptr, a);
ptr++;
@ -243,16 +275,18 @@ static void rfx_quantization_encode_sse2(sint16* buffer, const uint32* quantizat
{
_mm_prefetch_buffer((char*) buffer, 4096 * sizeof(sint16));
rfx_quantization_encode_block_sse2(buffer, 1024, quantization_values[8]); /* HL1 */
rfx_quantization_encode_block_sse2(buffer + 1024, 1024, quantization_values[7]); /* LH1 */
rfx_quantization_encode_block_sse2(buffer + 2048, 1024, quantization_values[9]); /* HH1 */
rfx_quantization_encode_block_sse2(buffer + 3072, 256, quantization_values[5]); /* HL2 */
rfx_quantization_encode_block_sse2(buffer + 3328, 256, quantization_values[4]); /* LH2 */
rfx_quantization_encode_block_sse2(buffer + 3584, 256, quantization_values[6]); /* HH2 */
rfx_quantization_encode_block_sse2(buffer + 3840, 64, quantization_values[2]); /* HL3 */
rfx_quantization_encode_block_sse2(buffer + 3904, 64, quantization_values[1]); /* LH3 */
rfx_quantization_encode_block_sse2(buffer + 3868, 64, quantization_values[3]); /* HH3 */
rfx_quantization_encode_block_sse2(buffer + 4032, 64, quantization_values[0]); /* LL3 */
rfx_quantization_encode_block_sse2(buffer, 1024, quantization_values[8] - 6); /* HL1 */
rfx_quantization_encode_block_sse2(buffer + 1024, 1024, quantization_values[7] - 6); /* LH1 */
rfx_quantization_encode_block_sse2(buffer + 2048, 1024, quantization_values[9] - 6); /* HH1 */
rfx_quantization_encode_block_sse2(buffer + 3072, 256, quantization_values[5] - 6); /* HL2 */
rfx_quantization_encode_block_sse2(buffer + 3328, 256, quantization_values[4] - 6); /* LH2 */
rfx_quantization_encode_block_sse2(buffer + 3584, 256, quantization_values[6] - 6); /* HH2 */
rfx_quantization_encode_block_sse2(buffer + 3840, 64, quantization_values[2] - 6); /* HL3 */
rfx_quantization_encode_block_sse2(buffer + 3904, 64, quantization_values[1] - 6); /* LH3 */
rfx_quantization_encode_block_sse2(buffer + 3868, 64, quantization_values[3] - 6); /* HH3 */
rfx_quantization_encode_block_sse2(buffer + 4032, 64, quantization_values[0] - 6); /* LL3 */
rfx_quantization_encode_block_sse2(buffer, 4096, 5);
}
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@ -482,7 +516,7 @@ rfx_dwt_2d_encode_block_vert_sse2(sint16* src, sint16* l, sint16* h, int subband
if (n < subband_width - 1)
src_2n_2 = _mm_load_si128((__m128i*) (src + 2 * total_width));
else
src_2n_2 = src_2n_1;
src_2n_2 = src_2n;
/* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */
@ -534,7 +568,7 @@ rfx_dwt_2d_encode_block_horiz_sse2(sint16* src, sint16* l, sint16* h, int subban
/* The following 3 Set operations consumes more than half of the total DWT processing time! */
src_2n = _mm_set_epi16(src[14], src[12], src[10], src[8], src[6], src[4], src[2], src[0]);
src_2n_1 = _mm_set_epi16(src[15], src[13], src[11], src[9], src[7], src[5], src[3], src[1]);
src_2n_2 = _mm_set_epi16(n == subband_width - 8 ? src[15] : src[16],
src_2n_2 = _mm_set_epi16(n == subband_width - 8 ? src[14] : src[16],
src[14], src[12], src[10], src[8], src[6], src[4], src[2]);
/* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */