From febff86af03d2ccbdd4826c9555336916eb6ecb3 Mon Sep 17 00:00:00 2001 From: Martijn van Beurden Date: Thu, 12 May 2022 14:28:05 +0200 Subject: [PATCH] Remove all assembler and intrinsics from decoder This commit drops all use of assembler and intrinsics from the libFLAC decoder. This is because they are only for 32-bit x86, hard to debug, maintain and fuzz properly, and because the decoder has much greater security risks than the encoder. --- src/libFLAC/ia32/lpc_asm.nasm | 655 ------------------------------ src/libFLAC/include/private/lpc.h | 16 - src/libFLAC/lpc_intrin_sse41.c | 544 ------------------------- src/libFLAC/stream_decoder.c | 57 +-- 4 files changed, 2 insertions(+), 1270 deletions(-) diff --git a/src/libFLAC/ia32/lpc_asm.nasm b/src/libFLAC/ia32/lpc_asm.nasm index b6117605..af5fc309 100644 --- a/src/libFLAC/ia32/lpc_asm.nasm +++ b/src/libFLAC/ia32/lpc_asm.nasm @@ -38,9 +38,6 @@ cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32 -cglobal FLAC__lpc_restore_signal_asm_ia32 -cglobal FLAC__lpc_restore_signal_asm_ia32_mmx -cglobal FLAC__lpc_restore_signal_wide_asm_ia32 code_section @@ -446,377 +443,6 @@ cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx pop ebp ret -; ********************************************************************** -; -; void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]) -; { -; unsigned i, j; -; FLAC__int32 sum; -; -; FLAC__ASSERT(order > 0); -; -; for(i = 0; i < data_len; i++) { -; sum = 0; -; for(j = 0; j < order; j++) -; sum += qlp_coeff[j] * data[i-j-1]; -; data[i] = residual[i] + (sum >> lp_quantization); -; } -; } - ALIGN 16 -cident FLAC__lpc_restore_signal_asm_ia32 - ;[esp + 40] data[] - ;[esp + 36] lp_quantization - ;[esp + 32] order - ;[esp + 28] qlp_coeff[] - ;[esp + 24] data_len - ;[esp + 20] residual[] - - ;ASSERT(order > 0) - - push ebp - push ebx - push esi - push edi - - mov esi, [esp + 20] ; esi = residual[] - mov edi, [esp + 40] ; edi = data[] - mov eax, [esp + 32] ; eax = order - mov ebx, [esp + 24] ; ebx = data_len - - test ebx, ebx - jz near .end ; do nothing if data_len == 0 - -.begin: - cmp eax, byte 1 - jg short .x87_1more - - mov ecx, [esp + 28] - mov edx, [ecx] - mov eax, [edi - 4] - mov ecx, [esp + 36] - ALIGN 16 -.x87_1_loop_i: - imul eax, edx - sar eax, cl - add eax, [esi] - mov [edi], eax - add esi, byte 4 - add edi, byte 4 - dec ebx - jnz .x87_1_loop_i - - jmp .end - -.x87_1more: - cmp eax, byte 32 ; for order <= 32 there is a faster routine - jbe short .x87_32 - - ; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32 - ALIGN 16 -.x87_32more_loop_i: - xor ebp, ebp - mov ecx, [esp + 32] - mov edx, ecx - shl edx, 2 - add edx, [esp + 28] - neg ecx - ALIGN 16 -.x87_32more_loop_j: - sub edx, byte 4 - mov eax, [edx] - imul eax, [edi + 4 * ecx] - add ebp, eax - inc ecx - jnz short .x87_32more_loop_j - - mov ecx, [esp + 36] - sar ebp, cl - add ebp, [esi] - mov [edi], ebp - add edi, byte 4 - add esi, byte 4 - - dec ebx - jnz .x87_32more_loop_i - - jmp .end - -.mov_eip_to_eax: - mov eax, [esp] - ret - -.x87_32: - sub esi, edi - neg eax - lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0] - call .mov_eip_to_eax -.get_eip0: - add edx, eax - inc edx ; compensate for the shorter opcode on the last iteration - mov eax, [esp + 28] ; eax 
= qlp_coeff[] - xor ebp, ebp - jmp edx - - mov ecx, [eax + 124] ; ecx = qlp_coeff[31] - imul ecx, [edi - 128] ; ecx = qlp_coeff[31] * data[i-32] - add ebp, ecx ; sum += qlp_coeff[31] * data[i-32] - mov ecx, [eax + 120] ; ecx = qlp_coeff[30] - imul ecx, [edi - 124] ; ecx = qlp_coeff[30] * data[i-31] - add ebp, ecx ; sum += qlp_coeff[30] * data[i-31] - mov ecx, [eax + 116] ; ecx = qlp_coeff[29] - imul ecx, [edi - 120] ; ecx = qlp_coeff[29] * data[i-30] - add ebp, ecx ; sum += qlp_coeff[29] * data[i-30] - mov ecx, [eax + 112] ; ecx = qlp_coeff[28] - imul ecx, [edi - 116] ; ecx = qlp_coeff[28] * data[i-29] - add ebp, ecx ; sum += qlp_coeff[28] * data[i-29] - mov ecx, [eax + 108] ; ecx = qlp_coeff[27] - imul ecx, [edi - 112] ; ecx = qlp_coeff[27] * data[i-28] - add ebp, ecx ; sum += qlp_coeff[27] * data[i-28] - mov ecx, [eax + 104] ; ecx = qlp_coeff[26] - imul ecx, [edi - 108] ; ecx = qlp_coeff[26] * data[i-27] - add ebp, ecx ; sum += qlp_coeff[26] * data[i-27] - mov ecx, [eax + 100] ; ecx = qlp_coeff[25] - imul ecx, [edi - 104] ; ecx = qlp_coeff[25] * data[i-26] - add ebp, ecx ; sum += qlp_coeff[25] * data[i-26] - mov ecx, [eax + 96] ; ecx = qlp_coeff[24] - imul ecx, [edi - 100] ; ecx = qlp_coeff[24] * data[i-25] - add ebp, ecx ; sum += qlp_coeff[24] * data[i-25] - mov ecx, [eax + 92] ; ecx = qlp_coeff[23] - imul ecx, [edi - 96] ; ecx = qlp_coeff[23] * data[i-24] - add ebp, ecx ; sum += qlp_coeff[23] * data[i-24] - mov ecx, [eax + 88] ; ecx = qlp_coeff[22] - imul ecx, [edi - 92] ; ecx = qlp_coeff[22] * data[i-23] - add ebp, ecx ; sum += qlp_coeff[22] * data[i-23] - mov ecx, [eax + 84] ; ecx = qlp_coeff[21] - imul ecx, [edi - 88] ; ecx = qlp_coeff[21] * data[i-22] - add ebp, ecx ; sum += qlp_coeff[21] * data[i-22] - mov ecx, [eax + 80] ; ecx = qlp_coeff[20] - imul ecx, [edi - 84] ; ecx = qlp_coeff[20] * data[i-21] - add ebp, ecx ; sum += qlp_coeff[20] * data[i-21] - mov ecx, [eax + 76] ; ecx = qlp_coeff[19] - imul ecx, [edi - 80] ; ecx = qlp_coeff[19] * data[i-20] - add ebp, ecx ; sum += qlp_coeff[19] * data[i-20] - mov ecx, [eax + 72] ; ecx = qlp_coeff[18] - imul ecx, [edi - 76] ; ecx = qlp_coeff[18] * data[i-19] - add ebp, ecx ; sum += qlp_coeff[18] * data[i-19] - mov ecx, [eax + 68] ; ecx = qlp_coeff[17] - imul ecx, [edi - 72] ; ecx = qlp_coeff[17] * data[i-18] - add ebp, ecx ; sum += qlp_coeff[17] * data[i-18] - mov ecx, [eax + 64] ; ecx = qlp_coeff[16] - imul ecx, [edi - 68] ; ecx = qlp_coeff[16] * data[i-17] - add ebp, ecx ; sum += qlp_coeff[16] * data[i-17] - mov ecx, [eax + 60] ; ecx = qlp_coeff[15] - imul ecx, [edi - 64] ; ecx = qlp_coeff[15] * data[i-16] - add ebp, ecx ; sum += qlp_coeff[15] * data[i-16] - mov ecx, [eax + 56] ; ecx = qlp_coeff[14] - imul ecx, [edi - 60] ; ecx = qlp_coeff[14] * data[i-15] - add ebp, ecx ; sum += qlp_coeff[14] * data[i-15] - mov ecx, [eax + 52] ; ecx = qlp_coeff[13] - imul ecx, [edi - 56] ; ecx = qlp_coeff[13] * data[i-14] - add ebp, ecx ; sum += qlp_coeff[13] * data[i-14] - mov ecx, [eax + 48] ; ecx = qlp_coeff[12] - imul ecx, [edi - 52] ; ecx = qlp_coeff[12] * data[i-13] - add ebp, ecx ; sum += qlp_coeff[12] * data[i-13] - mov ecx, [eax + 44] ; ecx = qlp_coeff[11] - imul ecx, [edi - 48] ; ecx = qlp_coeff[11] * data[i-12] - add ebp, ecx ; sum += qlp_coeff[11] * data[i-12] - mov ecx, [eax + 40] ; ecx = qlp_coeff[10] - imul ecx, [edi - 44] ; ecx = qlp_coeff[10] * data[i-11] - add ebp, ecx ; sum += qlp_coeff[10] * data[i-11] - mov ecx, [eax + 36] ; ecx = qlp_coeff[ 9] - imul ecx, [edi - 40] ; ecx = qlp_coeff[ 9] * data[i-10] - add ebp, ecx ; sum += 
qlp_coeff[ 9] * data[i-10] - mov ecx, [eax + 32] ; ecx = qlp_coeff[ 8] - imul ecx, [edi - 36] ; ecx = qlp_coeff[ 8] * data[i- 9] - add ebp, ecx ; sum += qlp_coeff[ 8] * data[i- 9] - mov ecx, [eax + 28] ; ecx = qlp_coeff[ 7] - imul ecx, [edi - 32] ; ecx = qlp_coeff[ 7] * data[i- 8] - add ebp, ecx ; sum += qlp_coeff[ 7] * data[i- 8] - mov ecx, [eax + 24] ; ecx = qlp_coeff[ 6] - imul ecx, [edi - 28] ; ecx = qlp_coeff[ 6] * data[i- 7] - add ebp, ecx ; sum += qlp_coeff[ 6] * data[i- 7] - mov ecx, [eax + 20] ; ecx = qlp_coeff[ 5] - imul ecx, [edi - 24] ; ecx = qlp_coeff[ 5] * data[i- 6] - add ebp, ecx ; sum += qlp_coeff[ 5] * data[i- 6] - mov ecx, [eax + 16] ; ecx = qlp_coeff[ 4] - imul ecx, [edi - 20] ; ecx = qlp_coeff[ 4] * data[i- 5] - add ebp, ecx ; sum += qlp_coeff[ 4] * data[i- 5] - mov ecx, [eax + 12] ; ecx = qlp_coeff[ 3] - imul ecx, [edi - 16] ; ecx = qlp_coeff[ 3] * data[i- 4] - add ebp, ecx ; sum += qlp_coeff[ 3] * data[i- 4] - mov ecx, [eax + 8] ; ecx = qlp_coeff[ 2] - imul ecx, [edi - 12] ; ecx = qlp_coeff[ 2] * data[i- 3] - add ebp, ecx ; sum += qlp_coeff[ 2] * data[i- 3] - mov ecx, [eax + 4] ; ecx = qlp_coeff[ 1] - imul ecx, [edi - 8] ; ecx = qlp_coeff[ 1] * data[i- 2] - add ebp, ecx ; sum += qlp_coeff[ 1] * data[i- 2] - mov ecx, [eax] ; ecx = qlp_coeff[ 0] (NOTE: one byte missing from instruction) - imul ecx, [edi - 4] ; ecx = qlp_coeff[ 0] * data[i- 1] - add ebp, ecx ; sum += qlp_coeff[ 0] * data[i- 1] -.jumper_0: - - mov ecx, [esp + 36] - sar ebp, cl ; ebp = (sum >> lp_quantization) - add ebp, [esi + edi] ; ebp = residual[i] + (sum >> lp_quantization) - mov [edi], ebp ; data[i] = residual[i] + (sum >> lp_quantization) - add edi, byte 4 - - dec ebx - jz short .end - xor ebp, ebp - jmp edx - -.end: - pop edi - pop esi - pop ebx - pop ebp - ret - -; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for -; the channel and qlp_coeffs must be <= 16. Especially note that this routine -; cannot be used for side-channel coded 16bps channels since the effective bps -; is 17. -; WATCHOUT: this routine requires that each data array have a buffer of up to -; 3 zeroes in front (at negative indices) for alignment purposes, i.e. for each -; channel n, data[n][-1] through data[n][-3] should be accessible and zero. 
- ALIGN 16 -cident FLAC__lpc_restore_signal_asm_ia32_mmx - ;[esp + 40] data[] - ;[esp + 36] lp_quantization - ;[esp + 32] order - ;[esp + 28] qlp_coeff[] - ;[esp + 24] data_len - ;[esp + 20] residual[] - - ;ASSERT(order > 0) - - push ebp - push ebx - push esi - push edi - - mov esi, [esp + 20] - mov edi, [esp + 40] - mov eax, [esp + 32] - mov ebx, [esp + 24] - - test ebx, ebx - jz near .end ; do nothing if data_len == 0 - cmp eax, byte 4 - jb near FLAC__lpc_restore_signal_asm_ia32.begin - - mov edx, [esp + 28] - movd mm6, [esp + 36] - mov ebp, esp - - and esp, 0xfffffff8 - - xor ecx, ecx -.copy_qlp_loop: - push word [edx + 4 * ecx] - inc ecx - cmp ecx, eax - jnz short .copy_qlp_loop - - and ecx, 0x3 - test ecx, ecx - je short .za_end - sub ecx, byte 4 -.za_loop: - push word 0 - inc eax - inc ecx - jnz short .za_loop -.za_end: - - movq mm5, [esp + 2 * eax - 8] - movd mm4, [edi - 16] - punpckldq mm4, [edi - 12] - movd mm0, [edi - 8] - punpckldq mm0, [edi - 4] - packssdw mm4, mm0 - - cmp eax, byte 4 - jnbe short .mmx_4more - - ALIGN 16 -.mmx_4_loop_i: - movq mm7, mm4 - pmaddwd mm7, mm5 - movq mm0, mm7 - punpckhdq mm7, mm7 - paddd mm7, mm0 - psrad mm7, mm6 - movd mm1, [esi] - paddd mm7, mm1 - movd [edi], mm7 - psllq mm7, 48 - psrlq mm4, 16 - por mm4, mm7 - - add esi, byte 4 - add edi, byte 4 - - dec ebx - jnz .mmx_4_loop_i - jmp .mmx_end -.mmx_4more: - shl eax, 2 - neg eax - add eax, byte 16 - ALIGN 16 -.mmx_4more_loop_i: - mov ecx, edi - add ecx, eax - mov edx, esp - - movq mm7, mm4 - pmaddwd mm7, mm5 - - ALIGN 16 -.mmx_4more_loop_j: - movd mm0, [ecx - 16] - punpckldq mm0, [ecx - 12] - movd mm1, [ecx - 8] - punpckldq mm1, [ecx - 4] - packssdw mm0, mm1 - pmaddwd mm0, [edx] - paddd mm7, mm0 - - add edx, byte 8 - add ecx, byte 16 - cmp ecx, edi - jnz .mmx_4more_loop_j - - movq mm0, mm7 - punpckhdq mm7, mm7 - paddd mm7, mm0 - psrad mm7, mm6 - movd mm1, [esi] - paddd mm7, mm1 - movd [edi], mm7 - psllq mm7, 48 - psrlq mm4, 16 - por mm4, mm7 - - add esi, byte 4 - add edi, byte 4 - - dec ebx - jnz short .mmx_4more_loop_i -.mmx_end: - emms - mov esp, ebp - -.end: - pop edi - pop esi - pop ebx - pop ebp - ret - - ; ********************************************************************** ; ;void FLAC__lpc_compute_residual_from_qlp_coefficients_wide(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) @@ -1098,285 +724,4 @@ cident FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32 pop ebp ret -; ********************************************************************** -; -; void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]) -; { -; unsigned i, j; -; FLAC__int64 sum; -; -; FLAC__ASSERT(order > 0); -; -; for(i = 0; i < data_len; i++) { -; sum = 0; -; for(j = 0; j < order; j++) -; sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1]; -; data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization); -; } -; } - ALIGN 16 -cident FLAC__lpc_restore_signal_wide_asm_ia32 - ;[esp + 40] data[] - ;[esp + 36] lp_quantization - ;[esp + 32] order - ;[esp + 28] qlp_coeff[] - ;[esp + 24] data_len - ;[esp + 20] residual[] - - ;ASSERT(order > 0) - ;ASSERT(order <= 32) - ;ASSERT(lp_quantization <= 31) - - push ebp - push ebx - push esi - push edi - - mov ebx, [esp + 24] ; ebx = data_len - test ebx, ebx - jz near .end ; do nothing if data_len == 0 - -.begin: - mov eax, [esp + 32] ; eax = order - cmp eax, 1 - jg short 
.x87_32 - - mov esi, [esp + 20] ; esi = residual[] - mov edi, [esp + 40] ; edi = data[] - mov ecx, [esp + 28] ; ecx = qlp_coeff[] - mov ebp, [ecx] ; ebp = qlp_coeff[0] - mov eax, [edi - 4] ; eax = data[-1] - mov ecx, [esp + 36] ; cl = lp_quantization - ALIGN 16 -.x87_1_loop_i: - imul ebp ; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1] - shrd eax, edx, cl ; 0 <= lp_quantization <= 15 -; - add eax, [esi] - mov [edi], eax -; - add esi, 4 - add edi, 4 - dec ebx - jnz .x87_1_loop_i - jmp .end - -.mov_eip_to_eax: - mov eax, [esp] - ret - -.x87_32: ; eax = order - neg eax - add eax, eax - lea ebp, [eax + eax * 4 + .jumper_0 - .get_eip0] - call .mov_eip_to_eax -.get_eip0: - add ebp, eax - inc ebp ; compensate for the shorter opcode on the last iteration - - mov ebx, [esp + 28] ; ebx = qlp_coeff[] - mov edi, [esp + 40] ; esi = data[] - sub [esp + 20], edi ; residual[] -= data[] - - xor ecx, ecx - xor esi, esi - jmp ebp - -;eax = -- -;edx = -- -;ecx = 0 -;esi = 0 -; -;ebx = qlp_coeff[] -;edi = data[] -;ebp = @address - - mov eax, [ebx + 124] ; eax = qlp_coeff[31] - imul dword [edi - 128] ; edx:eax = qlp_coeff[31] * data[i-32] - add ecx, eax - adc esi, edx ; sum += qlp_coeff[31] * data[i-32] - - mov eax, [ebx + 120] ; eax = qlp_coeff[30] - imul dword [edi - 124] ; edx:eax = qlp_coeff[30] * data[i-31] - add ecx, eax - adc esi, edx ; sum += qlp_coeff[30] * data[i-31] - - mov eax, [ebx + 116] - imul dword [edi - 120] - add ecx, eax - adc esi, edx - - mov eax, [ebx + 112] - imul dword [edi - 116] - add ecx, eax - adc esi, edx - - mov eax, [ebx + 108] - imul dword [edi - 112] - add ecx, eax - adc esi, edx - - mov eax, [ebx + 104] - imul dword [edi - 108] - add ecx, eax - adc esi, edx - - mov eax, [ebx + 100] - imul dword [edi - 104] - add ecx, eax - adc esi, edx - - mov eax, [ebx + 96] - imul dword [edi - 100] - add ecx, eax - adc esi, edx - - mov eax, [ebx + 92] - imul dword [edi - 96] - add ecx, eax - adc esi, edx - - mov eax, [ebx + 88] - imul dword [edi - 92] - add ecx, eax - adc esi, edx - - mov eax, [ebx + 84] - imul dword [edi - 88] - add ecx, eax - adc esi, edx - - mov eax, [ebx + 80] - imul dword [edi - 84] - add ecx, eax - adc esi, edx - - mov eax, [ebx + 76] - imul dword [edi - 80] - add ecx, eax - adc esi, edx - - mov eax, [ebx + 72] - imul dword [edi - 76] - add ecx, eax - adc esi, edx - - mov eax, [ebx + 68] - imul dword [edi - 72] - add ecx, eax - adc esi, edx - - mov eax, [ebx + 64] - imul dword [edi - 68] - add ecx, eax - adc esi, edx - - mov eax, [ebx + 60] - imul dword [edi - 64] - add ecx, eax - adc esi, edx - - mov eax, [ebx + 56] - imul dword [edi - 60] - add ecx, eax - adc esi, edx - - mov eax, [ebx + 52] - imul dword [edi - 56] - add ecx, eax - adc esi, edx - - mov eax, [ebx + 48] - imul dword [edi - 52] - add ecx, eax - adc esi, edx - - mov eax, [ebx + 44] - imul dword [edi - 48] - add ecx, eax - adc esi, edx - - mov eax, [ebx + 40] - imul dword [edi - 44] - add ecx, eax - adc esi, edx - - mov eax, [ebx + 36] - imul dword [edi - 40] - add ecx, eax - adc esi, edx - - mov eax, [ebx + 32] - imul dword [edi - 36] - add ecx, eax - adc esi, edx - - mov eax, [ebx + 28] - imul dword [edi - 32] - add ecx, eax - adc esi, edx - - mov eax, [ebx + 24] - imul dword [edi - 28] - add ecx, eax - adc esi, edx - - mov eax, [ebx + 20] - imul dword [edi - 24] - add ecx, eax - adc esi, edx - - mov eax, [ebx + 16] - imul dword [edi - 20] - add ecx, eax - adc esi, edx - - mov eax, [ebx + 12] - imul dword [edi - 16] - add ecx, eax - adc esi, edx - - mov eax, [ebx + 8] - imul dword [edi - 12] - add ecx, 
eax - adc esi, edx - - mov eax, [ebx + 4] - imul dword [edi - 8] - add ecx, eax - adc esi, edx - - mov eax, [ebx] ; eax = qlp_coeff[ 0] (NOTE: one byte missing from instruction) - imul dword [edi - 4] ; edx:eax = qlp_coeff[ 0] * data[i- 1] - add ecx, eax - adc esi, edx ; sum += qlp_coeff[ 0] * data[i- 1] - -.jumper_0: - mov edx, ecx -;esi:edx = sum - mov ecx, [esp + 36] ; cl = lp_quantization - shrd edx, esi, cl ; edx = (sum >> lp_quantization) -;eax = -- -;ecx = -- -;edx = sum >> lp_q -;esi = -- -; - mov eax, [esp + 20] ; residual[] - data[] - add edx, [edi + eax] ; edx = residual[i] + (sum >> lp_quantization) - mov [edi], edx ; data[i] = residual[i] + (sum >> lp_quantization) - add edi, 4 - - dec dword [esp + 24] - jz short .end - xor ecx, ecx - xor esi, esi - jmp ebp - -.end: - pop edi - pop esi - pop ebx - pop ebp - ret - ; end diff --git a/src/libFLAC/include/private/lpc.h b/src/libFLAC/include/private/lpc.h index 0e619c1d..f538e645 100644 --- a/src/libFLAC/include/private/lpc.h +++ b/src/libFLAC/include/private/lpc.h @@ -206,22 +206,6 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLA */ void FLAC__lpc_restore_signal(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]); void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]); -#ifndef FLAC__NO_ASM -# ifdef FLAC__CPU_IA32 -# ifdef FLAC__HAS_NASM -void FLAC__lpc_restore_signal_asm_ia32(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]); -void FLAC__lpc_restore_signal_asm_ia32_mmx(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]); -void FLAC__lpc_restore_signal_wide_asm_ia32(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]); -# endif /* FLAC__HAS_NASM */ -# endif /* FLAC__CPU_IA32 */ -# if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN -# ifdef FLAC__SSE4_1_SUPPORTED -void FLAC__lpc_restore_signal_intrin_sse41(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]); -void FLAC__lpc_restore_signal_16_intrin_sse41(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]); -void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]); -# endif -# endif -#endif /* FLAC__NO_ASM */ #ifndef FLAC__INTEGER_ONLY_LIBRARY diff --git a/src/libFLAC/lpc_intrin_sse41.c b/src/libFLAC/lpc_intrin_sse41.c index c37399be..74e7e956 100644 --- a/src/libFLAC/lpc_intrin_sse41.c +++ b/src/libFLAC/lpc_intrin_sse41.c @@ -588,550 +588,6 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL } } -FLAC__SSE_TARGET("sse4.1") -void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]) -{ - int i; - const __m128i cnt = _mm_cvtsi32_si128(lp_quantization); - - if (!data_len) - return; - - FLAC__ASSERT(order > 0); - 
FLAC__ASSERT(order <= 32); - FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm_sra_epi64() so we have to use _mm_srl_epi64() */ - - if(order <= 12) { - if(order > 8) { /* order == 9, 10, 11, 12 */ - if(order > 10) { /* order == 11, 12 */ - __m128i qlp[6], dat[6]; - __m128i summ, temp; - qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0))); // 0 q[1] 0 q[0] - qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2))); // 0 q[3] 0 q[2] - qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4))); // 0 q[5] 0 q[4] - qlp[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6))); // 0 q[7] 0 q[6] - qlp[4] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8))); // 0 q[9] 0 q[8] - if (order == 12) - qlp[5] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+10))); // 0 q[11] 0 q[10] - else - qlp[5] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[10])); // 0 0 0 q[10] - - dat[5] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-12)), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-12] 0 d[i-11] - dat[4] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-10)), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-10] 0 d[i-9] - dat[3] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-8 )), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-8] 0 d[i-7] - dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-6 )), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-6] 0 d[i-5] - dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-4 )), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-4] 0 d[i-3] - dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-2] 0 d[i-1] - - summ = _mm_mul_epi32(dat[5], qlp[5]) ; - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[4], qlp[4])); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3])); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2])); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1])); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0])); - - summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); // ?_64 sum_64 - summ = _mm_srl_epi64(summ, cnt); // ?_64 (sum >> lp_quantization)_64 == ?_32 ?_32 ?_32 (sum >> lp_quantization)_32 - temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ); // ? ? ? d[i] - data[0] = _mm_cvtsi128_si32(temp); - - for(i = 1; i < (int)data_len; i++) { - temp = _mm_slli_si128(temp, 8); - dat[5] = _mm_alignr_epi8(dat[5], dat[4], 8); // ? d[i-11] ? d[i-10] - dat[4] = _mm_alignr_epi8(dat[4], dat[3], 8); // ? d[i-9] ? d[i-8] - dat[3] = _mm_alignr_epi8(dat[3], dat[2], 8); // ? d[i-7] ? d[i-6] - dat[2] = _mm_alignr_epi8(dat[2], dat[1], 8); // ? d[i-5] ? d[i-4] - dat[1] = _mm_alignr_epi8(dat[1], dat[0], 8); // ? d[i-3] ? d[i-2] - dat[0] = _mm_alignr_epi8(dat[0], temp, 8); // ? d[i-1] ? 
d[i ] - - summ = _mm_mul_epi32(dat[5], qlp[5]) ; - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[4], qlp[4])); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3])); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2])); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1])); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0])); - - summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); // ?_64 sum_64 - summ = _mm_srl_epi64(summ, cnt); // ?_64 (sum >> lp_quantization)_64 == ?_32 ?_32 ?_32 (sum >> lp_quantization)_32 - temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ); // ? ? ? d[i] - data[i] = _mm_cvtsi128_si32(temp); - } - } - else { /* order == 9, 10 */ - __m128i qlp[5], dat[5]; - __m128i summ, temp; - qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0))); - qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2))); - qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4))); - qlp[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6))); - if (order == 10) - qlp[4] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8))); - else - qlp[4] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[8])); - - dat[4] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-10)), _MM_SHUFFLE(2,0,3,1)); - dat[3] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-8 )), _MM_SHUFFLE(2,0,3,1)); - dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-6 )), _MM_SHUFFLE(2,0,3,1)); - dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-4 )), _MM_SHUFFLE(2,0,3,1)); - dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1)); - - summ = _mm_mul_epi32(dat[4], qlp[4]) ; - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3])); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2])); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1])); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0])); - - summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); - summ = _mm_srl_epi64(summ, cnt); - temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ); - data[0] = _mm_cvtsi128_si32(temp); - - for(i = 1; i < (int)data_len; i++) { - temp = _mm_slli_si128(temp, 8); - dat[4] = _mm_alignr_epi8(dat[4], dat[3], 8); - dat[3] = _mm_alignr_epi8(dat[3], dat[2], 8); - dat[2] = _mm_alignr_epi8(dat[2], dat[1], 8); - dat[1] = _mm_alignr_epi8(dat[1], dat[0], 8); - dat[0] = _mm_alignr_epi8(dat[0], temp, 8); - - summ = _mm_mul_epi32(dat[4], qlp[4]) ; - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3])); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2])); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1])); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0])); - - summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); - summ = _mm_srl_epi64(summ, cnt); - temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ); - data[i] = _mm_cvtsi128_si32(temp); - } - } - } - else if(order > 4) { /* order == 5, 6, 7, 8 */ - if(order > 6) { /* order == 7, 8 */ - __m128i qlp[4], dat[4]; - __m128i summ, temp; - qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0))); - qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2))); - qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4))); - if (order == 8) - qlp[3] = 
_mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6))); - else - qlp[3] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[6])); - - dat[3] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-8 )), _MM_SHUFFLE(2,0,3,1)); - dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-6 )), _MM_SHUFFLE(2,0,3,1)); - dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-4 )), _MM_SHUFFLE(2,0,3,1)); - dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1)); - - summ = _mm_mul_epi32(dat[3], qlp[3]) ; - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2])); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1])); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0])); - - summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); - summ = _mm_srl_epi64(summ, cnt); - temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ); - data[0] = _mm_cvtsi128_si32(temp); - - for(i = 1; i < (int)data_len; i++) { - temp = _mm_slli_si128(temp, 8); - dat[3] = _mm_alignr_epi8(dat[3], dat[2], 8); - dat[2] = _mm_alignr_epi8(dat[2], dat[1], 8); - dat[1] = _mm_alignr_epi8(dat[1], dat[0], 8); - dat[0] = _mm_alignr_epi8(dat[0], temp, 8); - - summ = _mm_mul_epi32(dat[3], qlp[3]) ; - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2])); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1])); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0])); - - summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); - summ = _mm_srl_epi64(summ, cnt); - temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ); - data[i] = _mm_cvtsi128_si32(temp); - } - } - else { /* order == 5, 6 */ - __m128i qlp[3], dat[3]; - __m128i summ, temp; - qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0))); - qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2))); - if (order == 6) - qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4))); - else - qlp[2] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[4])); - - dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-6 )), _MM_SHUFFLE(2,0,3,1)); - dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-4 )), _MM_SHUFFLE(2,0,3,1)); - dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1)); - - summ = _mm_mul_epi32(dat[2], qlp[2]) ; - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1])); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0])); - - summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); - summ = _mm_srl_epi64(summ, cnt); - temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ); - data[0] = _mm_cvtsi128_si32(temp); - - for(i = 1; i < (int)data_len; i++) { - temp = _mm_slli_si128(temp, 8); - dat[2] = _mm_alignr_epi8(dat[2], dat[1], 8); - dat[1] = _mm_alignr_epi8(dat[1], dat[0], 8); - dat[0] = _mm_alignr_epi8(dat[0], temp, 8); - - summ = _mm_mul_epi32(dat[2], qlp[2]) ; - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1])); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0])); - - summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); - summ = _mm_srl_epi64(summ, cnt); - temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ); - data[i] = _mm_cvtsi128_si32(temp); - } - } - } - else { /* order == 1, 2, 3, 4 */ - if(order > 2) { /* order == 3, 4 */ - __m128i qlp[2], dat[2]; - __m128i summ, temp; - qlp[0] = 
_mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0))); - if (order == 4) - qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2))); - else - qlp[1] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[2])); - - dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-4 )), _MM_SHUFFLE(2,0,3,1)); - dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1)); - - summ = _mm_mul_epi32(dat[1], qlp[1]) ; - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0])); - - summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); - summ = _mm_srl_epi64(summ, cnt); - temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ); - data[0] = _mm_cvtsi128_si32(temp); - - for(i = 1; i < (int)data_len; i++) { - temp = _mm_slli_si128(temp, 8); - dat[1] = _mm_alignr_epi8(dat[1], dat[0], 8); - dat[0] = _mm_alignr_epi8(dat[0], temp, 8); - - summ = _mm_mul_epi32(dat[1], qlp[1]) ; - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0])); - - summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); - summ = _mm_srl_epi64(summ, cnt); - temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ); - data[i] = _mm_cvtsi128_si32(temp); - } - } - else { /* order == 1, 2 */ - if(order == 2) { - __m128i qlp0, dat0; - __m128i summ, temp; - qlp0 = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff))); - - dat0 = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1)); - - summ = _mm_mul_epi32(dat0, qlp0); - - summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); - summ = _mm_srl_epi64(summ, cnt); - temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ); - data[0] = _mm_cvtsi128_si32(temp); - - for(i = 1; i < (int)data_len; i++) { - dat0 = _mm_alignr_epi8(dat0, _mm_slli_si128(temp, 8), 8); - - summ = _mm_mul_epi32(dat0, qlp0); - - summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); - summ = _mm_srl_epi64(summ, cnt); - temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ); - data[i] = _mm_cvtsi128_si32(temp); - } - } - else { /* order == 1 */ - __m128i qlp0; - __m128i summ, temp; - qlp0 = _mm_cvtsi32_si128(qlp_coeff[0]); - temp = _mm_cvtsi32_si128(data[-1]); - - summ = _mm_mul_epi32(temp, qlp0); - summ = _mm_srl_epi64(summ, cnt); - temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ); - data[0] = _mm_cvtsi128_si32(temp); - - for(i = 1; i < (int)data_len; i++) { - summ = _mm_mul_epi32(temp, qlp0); - summ = _mm_srl_epi64(summ, cnt); - temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ); - data[i] = _mm_cvtsi128_si32(temp); - } - } - } - } - } - else { /* order > 12 */ - __m128i qlp[16]; - - for(i = 0; i < (int)order/2; i++) - qlp[i] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+i*2)), _MM_SHUFFLE(2,0,3,1)); // 0 q[2*i] 0 q[2*i+1] - if(order & 1) - qlp[i] = _mm_shuffle_epi32(_mm_cvtsi32_si128(qlp_coeff[i*2]), _MM_SHUFFLE(2,0,3,1)); - - for(i = 0; i < (int)data_len; i++) { - __m128i summ = _mm_setzero_si128(), dat; - FLAC__int32 * const datai = &data[i]; - - switch((order+1) / 2) { - case 16: /* order == 31, 32 */ - dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-32))); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[15])); /* Falls through. */ - case 15: - dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-30))); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[14])); /* Falls through. 
*/ - case 14: - dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-28))); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[13])); /* Falls through. */ - case 13: - dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-26))); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[12])); /* Falls through. */ - case 12: - dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-24))); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[11])); /* Falls through. */ - case 11: - dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-22))); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[10])); /* Falls through. */ - case 10: - dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-20))); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[9])); /* Falls through. */ - case 9: - dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-18))); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[8])); /* Falls through. */ - case 8: - dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-16))); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[7])); /* Falls through. */ - case 7: /* order == 13, 14 */ - dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-14))); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[6])); - dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-12))); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[5])); - dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-10))); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[4])); - dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-8))); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[3])); - dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-6))); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[2])); - dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-4))); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[1])); - dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-2))); - summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[0])); - } - summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); - summ = _mm_srl_epi64(summ, cnt); - summ = _mm_add_epi32(summ, _mm_cvtsi32_si128(residual[i])); - data[i] = _mm_cvtsi128_si32(summ); - } - } -} - -FLAC__SSE_TARGET("sse4.1") -void FLAC__lpc_restore_signal_intrin_sse41(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]) -{ - if(order < 8) { - FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data); - return; - } - - FLAC__ASSERT(order >= 8); - FLAC__ASSERT(order <= 32); - - if(order <= 12) { - int i; - const __m128i cnt = _mm_cvtsi32_si128(lp_quantization); - - if(order > 8) /* order == 9, 10, 11, 12 */ - { - __m128i qlp[3], dat[3]; - __m128i summ, temp; - - qlp[0] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff + 0)); // q[3] q[2] q[1] q[0] - qlp[1] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff + 4)); // q[7] q[6] q[5] q[4] - qlp[2] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff + 8)); // q[11] q[10] q[9] q[8] - switch (order) - { - case 9: - qlp[2] = _mm_slli_si128(qlp[2], 12); qlp[2] = _mm_srli_si128(qlp[2], 12); break; // 0 0 0 q[8] - case 10: - qlp[2] = _mm_slli_si128(qlp[2], 8); qlp[2] = 
_mm_srli_si128(qlp[2], 8); break; // 0 0 q[9] q[8] - case 11: - qlp[2] = _mm_slli_si128(qlp[2], 4); qlp[2] = _mm_srli_si128(qlp[2], 4); break; // 0 q[10] q[9] q[8] - } - - dat[2] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data - 12)), _MM_SHUFFLE(0, 1, 2, 3)); // d[i-12] d[i-11] d[i-10] d[i-9] - dat[1] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data - 8)), _MM_SHUFFLE(0, 1, 2, 3)); // d[i-8] d[i-7] d[i-6] d[i-5] - dat[0] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data - 4)), _MM_SHUFFLE(0, 1, 2, 3)); // d[i-4] d[i-3] d[i-2] d[i-1] - - for (i = 0;;) { - summ = _mm_mullo_epi32(dat[2], qlp[2]); - summ = _mm_add_epi32(summ, _mm_mullo_epi32(dat[1], qlp[1])); - summ = _mm_add_epi32(summ, _mm_mullo_epi32(dat[0], qlp[0])); - - summ = _mm_add_epi32(summ, _mm_shuffle_epi32(summ, _MM_SHUFFLE(1,0,3,2))); - summ = _mm_add_epi32(summ, _mm_shufflelo_epi16(summ, _MM_SHUFFLE(1,0,3,2))); - - summ = _mm_sra_epi32(summ, cnt); - temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ); - data[i] = _mm_cvtsi128_si32(temp); - - if(++i >= (int)data_len) break; - - temp = _mm_slli_si128(temp, 12); - dat[2] = _mm_alignr_epi8(dat[2], dat[1], 12); - dat[1] = _mm_alignr_epi8(dat[1], dat[0], 12); - dat[0] = _mm_alignr_epi8(dat[0], temp, 12); - } - } - else /* order == 8 */ - { - __m128i qlp[2], dat[2]; - __m128i summ, temp; - - qlp[0] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff + 0)); - qlp[1] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff + 4)); - - dat[1] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data - 8)), _MM_SHUFFLE(0, 1, 2, 3)); - dat[0] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data - 4)), _MM_SHUFFLE(0, 1, 2, 3)); - - for (i = 0;;) { - summ = _mm_add_epi32(_mm_mullo_epi32(dat[1], qlp[1]), _mm_mullo_epi32(dat[0], qlp[0])); - - summ = _mm_add_epi32(summ, _mm_shuffle_epi32(summ, _MM_SHUFFLE(1,0,3,2))); - summ = _mm_add_epi32(summ, _mm_shufflelo_epi16(summ, _MM_SHUFFLE(1,0,3,2))); - - summ = _mm_sra_epi32(summ, cnt); - temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ); - data[i] = _mm_cvtsi128_si32(temp); - - if(++i >= (int)data_len) break; - - temp = _mm_slli_si128(temp, 12); - dat[1] = _mm_alignr_epi8(dat[1], dat[0], 12); - dat[0] = _mm_alignr_epi8(dat[0], temp, 12); - } - } - } - else { /* order > 12 */ -#ifdef FLAC__HAS_NASM - FLAC__lpc_restore_signal_asm_ia32(residual, data_len, qlp_coeff, order, lp_quantization, data); -#else - FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data); -#endif - } -} - -FLAC__SSE_TARGET("ssse3") -void FLAC__lpc_restore_signal_16_intrin_sse41(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]) -{ - if(order < 8) { - FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data); - return; - } - - FLAC__ASSERT(order >= 8); - FLAC__ASSERT(order <= 32); - - if(order <= 12) { - int i; - const __m128i cnt = _mm_cvtsi32_si128(lp_quantization); - - if(order > 8) /* order == 9, 10, 11, 12 */ - { - __m128i qlp[2], dat[2]; - __m128i summ, temp; - - qlp[0] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff+0)); // q[3] q[2] q[1] q[0] - temp = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff+4)); // q[7] q[6] q[5] q[4] - qlp[1] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff+8)); // q[11] q[10] q[9] q[8] - switch(order) - { - case 9: - qlp[1] = _mm_slli_si128(qlp[1], 12); qlp[1] = 
_mm_srli_si128(qlp[1], 12); break; // 0 0 0 q[8] - case 10: - qlp[1] = _mm_slli_si128(qlp[1], 8); qlp[1] = _mm_srli_si128(qlp[1], 8); break; // 0 0 q[9] q[8] - case 11: - qlp[1] = _mm_slli_si128(qlp[1], 4); qlp[1] = _mm_srli_si128(qlp[1], 4); break; // 0 q[10] q[9] q[8] - } - qlp[0] = _mm_packs_epi32(qlp[0], temp); // q[7] q[6] q[5] q[4] q[3] q[2] q[1] q[0] - qlp[1] = _mm_packs_epi32(qlp[1], _mm_setzero_si128()); // 0 0 0 0 q[11] q[10] q[9] q[8] - - dat[1] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data-12)), _MM_SHUFFLE(0,1,2,3)); // d[i-12] d[i-11] d[i-10] d[i-9] - temp = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data-8)), _MM_SHUFFLE(0,1,2,3)); // d[i-8] d[i-7] d[i-6] d[i-5] - dat[0] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data-4)), _MM_SHUFFLE(0,1,2,3)); // d[i-4] d[i-3] d[i-2] d[i-1] - - dat[1] = _mm_packs_epi32(dat[1], _mm_setzero_si128()); // 0 0 0 0 d[i-12] d[i-11] d[i-10] d[i-9] - dat[0] = _mm_packs_epi32(dat[0], temp); // d[i-8] d[i-7] d[i-6] d[i-5] d[i-4] d[i-3] d[i-2] d[i-1] - - for(i = 0;;) { - summ = _mm_madd_epi16(dat[1], qlp[1]); - summ = _mm_add_epi32(summ, _mm_madd_epi16(dat[0], qlp[0])); - - summ = _mm_add_epi32(summ, _mm_shuffle_epi32(summ, _MM_SHUFFLE(1,0,3,2))); - summ = _mm_add_epi32(summ, _mm_shufflelo_epi16(summ, _MM_SHUFFLE(1,0,3,2))); - - summ = _mm_sra_epi32(summ, cnt); - temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ); - data[i] = _mm_cvtsi128_si32(temp); - - if(++i >= (int)data_len) break; - - temp = _mm_slli_si128(temp, 14); - dat[1] = _mm_alignr_epi8(dat[1], dat[0], 14); // 0 0 0 d[i-12] d[i-11] d[i-10] d[i-9] d[i-8] - dat[0] = _mm_alignr_epi8(dat[0], temp, 14); // d[i-7] d[i-6] d[i-5] d[i-4] d[i-3] d[i-2] d[i-1] d[i] - } - } - else /* order == 8 */ - { - __m128i qlp0, dat0; - __m128i summ, temp; - - qlp0 = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff+0)); // q[3] q[2] q[1] q[0] - temp = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff+4)); // q[7] q[6] q[5] q[4] - qlp0 = _mm_packs_epi32(qlp0, temp); // q[7] q[6] q[5] q[4] q[3] q[2] q[1] q[0] - - temp = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data-8)), _MM_SHUFFLE(0,1,2,3)); - dat0 = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data-4)), _MM_SHUFFLE(0,1,2,3)); - dat0 = _mm_packs_epi32(dat0, temp); // d[i-8] d[i-7] d[i-6] d[i-5] d[i-4] d[i-3] d[i-2] d[i-1] - - for(i = 0;;) { - summ = _mm_madd_epi16(dat0, qlp0); - - summ = _mm_add_epi32(summ, _mm_shuffle_epi32(summ, _MM_SHUFFLE(1,0,3,2))); - summ = _mm_add_epi32(summ, _mm_shufflelo_epi16(summ, _MM_SHUFFLE(1,0,3,2))); - - summ = _mm_sra_epi32(summ, cnt); - temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ); - data[i] = _mm_cvtsi128_si32(temp); - - if(++i >= (int)data_len) break; - - temp = _mm_slli_si128(temp, 14); - dat0 = _mm_alignr_epi8(dat0, temp, 14); // d[i-7] d[i-6] d[i-5] d[i-4] d[i-3] d[i-2] d[i-1] d[i] - } - } - } - else { /* order > 12 */ -#ifdef FLAC__HAS_NASM - FLAC__lpc_restore_signal_asm_ia32_mmx(residual, data_len, qlp_coeff, order, lp_quantization, data); -#else - FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data); -#endif - } -} - #endif /* defined FLAC__CPU_IA32 */ FLAC__SSE_TARGET("sse4.1") diff --git a/src/libFLAC/stream_decoder.c b/src/libFLAC/stream_decoder.c index eab9e8b4..cc583d7d 100644 --- a/src/libFLAC/stream_decoder.c +++ b/src/libFLAC/stream_decoder.c @@ -45,7 +45,6 @@ #include "protected/stream_decoder.h" #include "private/bitreader.h" #include 
"private/bitmath.h" -#include "private/cpu.h" #include "private/crc.h" #include "private/fixed.h" #include "private/format.h" @@ -129,12 +128,6 @@ typedef struct FLAC__StreamDecoderPrivate { FLAC__StreamDecoderWriteCallback write_callback; FLAC__StreamDecoderMetadataCallback metadata_callback; FLAC__StreamDecoderErrorCallback error_callback; - /* generic 32-bit datapath: */ - void (*local_lpc_restore_signal)(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]); - /* generic 64-bit datapath: */ - void (*local_lpc_restore_signal_64bit)(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]); - /* for use when the signal is <= 16 bits-per-sample, or <= 15 bits-per-sample on a side channel (which requires 1 extra bit): */ - void (*local_lpc_restore_signal_16bit)(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]); void *client_data; FILE *file; /* only used if FLAC__stream_decoder_init_file()/FLAC__stream_decoder_init_file() called, else NULL */ FLAC__BitReader *input; @@ -152,7 +145,6 @@ typedef struct FLAC__StreamDecoderPrivate { size_t metadata_filter_ids_count, metadata_filter_ids_capacity; /* units for both are IDs, not bytes */ FLAC__Frame frame; FLAC__bool cached; /* true if there is a byte in lookahead */ - FLAC__CPUInfo cpuinfo; FLAC__byte header_warmup[2]; /* contains the sync code and reserved bits */ FLAC__byte lookahead; /* temp storage when we need to look ahead one byte in the stream */ /* unaligned (original) pointers to allocated data */ @@ -373,48 +365,6 @@ static FLAC__StreamDecoderInitStatus init_stream_internal_( return decoder->protected_->initstate = FLAC__STREAM_DECODER_INIT_STATUS_ERROR_OPENING_FILE; #endif - /* - * get the CPU info and set the function pointers - */ - FLAC__cpu_info(&decoder->private_->cpuinfo); - /* first default to the non-asm routines */ - decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal; - decoder->private_->local_lpc_restore_signal_64bit = FLAC__lpc_restore_signal_wide; - decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal; - /* now override with asm where appropriate */ -#ifndef FLAC__NO_ASM - if(decoder->private_->cpuinfo.use_asm) { -#ifdef FLAC__CPU_IA32 - FLAC__ASSERT(decoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_IA32); -#ifdef FLAC__HAS_NASM - decoder->private_->local_lpc_restore_signal_64bit = FLAC__lpc_restore_signal_wide_asm_ia32; /* OPT_IA32: was really necessary for GCC < 4.9 */ - if (decoder->private_->cpuinfo.x86.mmx) { - decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal_asm_ia32; - decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal_asm_ia32_mmx; - } - else { - decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal_asm_ia32; - decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal_asm_ia32; - } -#endif -#if FLAC__HAS_X86INTRIN && ! 
defined FLAC__INTEGER_ONLY_LIBRARY -# if defined FLAC__SSE4_1_SUPPORTED - if (decoder->private_->cpuinfo.x86.sse41) { -# if !defined FLAC__HAS_NASM /* these are not undoubtedly faster than their MMX ASM counterparts */ - decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal_intrin_sse41; - decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal_16_intrin_sse41; -# endif - decoder->private_->local_lpc_restore_signal_64bit = FLAC__lpc_restore_signal_wide_intrin_sse41; - } -# endif -#endif -#elif defined FLAC__CPU_X86_64 - FLAC__ASSERT(decoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_X86_64); - /* No useful SSE optimizations yet */ -#endif - } -#endif - /* from here on, errors are fatal */ if(!FLAC__bitreader_init(decoder->private_->input, read_callback_, decoder)) { @@ -2848,12 +2798,9 @@ FLAC__bool read_subframe_lpc_(FLAC__StreamDecoder *decoder, uint32_t channel, ui if(do_full_decode) { memcpy(decoder->private_->output[channel], subframe->warmup, sizeof(FLAC__int32) * order); if(bps + subframe->qlp_coeff_precision + FLAC__bitmath_ilog2(order) <= 32) - if(bps <= 16 && subframe->qlp_coeff_precision <= 16) - decoder->private_->local_lpc_restore_signal_16bit(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order); - else - decoder->private_->local_lpc_restore_signal(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order); + FLAC__lpc_restore_signal(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order); else - decoder->private_->local_lpc_restore_signal_64bit(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order); + FLAC__lpc_restore_signal_wide(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order); } return true;
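
For reference, the portable code paths that remain after this patch follow the pseudocode quoted in the comments of the removed assembly, and the new call sites in read_subframe_lpc_() pick between them with the worst-case bit-width test visible in the last hunk. Below is a minimal, self-contained C sketch of that behaviour — not the library's actual implementation: it uses <stdint.h> types and hypothetical lower-case names (lpc_restore_signal, lpc_restore_signal_wide, ilog2) in place of FLAC__int32/FLAC__int64, FLAC__lpc_restore_signal()/FLAC__lpc_restore_signal_wide() and FLAC__bitmath_ilog2(); it assumes, as the real routines do, that data[] points at least `order` valid warmup samples into its buffer, and, like the real code, it relies on arithmetic right shift of signed values.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* 32-bit accumulator variant (cf. the pseudocode above
       FLAC__lpc_restore_signal_asm_ia32 in the removed lpc_asm.nasm). */
    static void lpc_restore_signal(const int32_t residual[], uint32_t data_len,
                                   const int32_t qlp_coeff[], uint32_t order,
                                   int lp_quantization, int32_t data[])
    {
        assert(order > 0);
        for(int32_t i = 0; i < (int32_t)data_len; i++) {
            int32_t sum = 0;
            for(int32_t j = 0; j < (int32_t)order; j++)
                sum += qlp_coeff[j] * data[i - j - 1]; /* warmup history sits at negative offsets */
            data[i] = residual[i] + (sum >> lp_quantization);
        }
    }

    /* 64-bit accumulator ("wide") variant: each product is widened so the
       running sum cannot overflow 32 bits. */
    static void lpc_restore_signal_wide(const int32_t residual[], uint32_t data_len,
                                        const int32_t qlp_coeff[], uint32_t order,
                                        int lp_quantization, int32_t data[])
    {
        assert(order > 0);
        for(int32_t i = 0; i < (int32_t)data_len; i++) {
            int64_t sum = 0;
            for(int32_t j = 0; j < (int32_t)order; j++)
                sum += qlp_coeff[j] * (int64_t)data[i - j - 1];
            data[i] = residual[i] + (int32_t)(sum >> lp_quantization);
        }
    }

    /* Stand-in for FLAC__bitmath_ilog2(): floor(log2(v)) for v > 0. */
    static uint32_t ilog2(uint32_t v) { uint32_t l = 0; while(v >>= 1) l++; return l; }

    int main(void)
    {
        /* One warmup sample (100) followed by four samples to reconstruct.
           qlp_coeff[0] = 16 with lp_quantization = 4 predicts data[i-1] exactly. */
        int32_t data[5] = { 100, 0, 0, 0, 0 };
        const int32_t residual[4] = { 3, -2, 5, 1 };
        const int32_t qlp_coeff[1] = { 16 };
        const uint32_t order = 1, bps = 16, qlp_precision = 5;

        /* The post-patch dispatch in read_subframe_lpc_(): the 32-bit sum is
           safe only if bps + coefficient precision + ilog2(order) <= 32. */
        if(bps + qlp_precision + ilog2(order) <= 32)
            lpc_restore_signal(residual, 4, qlp_coeff, order, 4, data + order);
        else
            lpc_restore_signal_wide(residual, 4, qlp_coeff, order, 4, data + order);

        for(int i = 1; i < 5; i++)
            printf("%d\n", (int)data[i]); /* prints 103 101 106 107 */
        return 0;
    }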