Remove all assembler and intrinsics from decoder

This commit drops all use of assembler and intrinsics from the libFLAC
decoder. This is because they are only for 32-bit x86, are hard to debug,
maintain, and fuzz properly, and because the decoder carries much greater
security risks than the encoder.
This commit is contained in:
Martijn van Beurden 2022-05-12 14:28:05 +02:00
parent a67102694d
commit febff86af0
4 changed files with 2 additions and 1270 deletions

View File

@ -38,9 +38,6 @@
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32
cglobal FLAC__lpc_restore_signal_asm_ia32
cglobal FLAC__lpc_restore_signal_asm_ia32_mmx
cglobal FLAC__lpc_restore_signal_wide_asm_ia32
code_section
@ -446,377 +443,6 @@ cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
pop ebp
ret
; **********************************************************************
;
; void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
; {
; unsigned i, j;
; FLAC__int32 sum;
;
; FLAC__ASSERT(order > 0);
;
; for(i = 0; i < data_len; i++) {
; sum = 0;
; for(j = 0; j < order; j++)
; sum += qlp_coeff[j] * data[i-j-1];
; data[i] = residual[i] + (sum >> lp_quantization);
; }
; }
ALIGN 16
cident FLAC__lpc_restore_signal_asm_ia32
;[esp + 40] data[]
;[esp + 36] lp_quantization
;[esp + 32] order
;[esp + 28] qlp_coeff[]
;[esp + 24] data_len
;[esp + 20] residual[]
;ASSERT(order > 0)
push ebp
push ebx
push esi
push edi
mov esi, [esp + 20] ; esi = residual[]
mov edi, [esp + 40] ; edi = data[]
mov eax, [esp + 32] ; eax = order
mov ebx, [esp + 24] ; ebx = data_len
test ebx, ebx
jz near .end ; do nothing if data_len == 0
.begin:
cmp eax, byte 1
jg short .x87_1more
mov ecx, [esp + 28]
mov edx, [ecx]
mov eax, [edi - 4]
mov ecx, [esp + 36]
ALIGN 16
.x87_1_loop_i:
imul eax, edx
sar eax, cl
add eax, [esi]
mov [edi], eax
add esi, byte 4
add edi, byte 4
dec ebx
jnz .x87_1_loop_i
jmp .end
.x87_1more:
cmp eax, byte 32 ; for order <= 32 there is a faster routine
jbe short .x87_32
; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
ALIGN 16
.x87_32more_loop_i:
xor ebp, ebp
mov ecx, [esp + 32]
mov edx, ecx
shl edx, 2
add edx, [esp + 28]
neg ecx
ALIGN 16
.x87_32more_loop_j:
sub edx, byte 4
mov eax, [edx]
imul eax, [edi + 4 * ecx]
add ebp, eax
inc ecx
jnz short .x87_32more_loop_j
mov ecx, [esp + 36]
sar ebp, cl
add ebp, [esi]
mov [edi], ebp
add edi, byte 4
add esi, byte 4
dec ebx
jnz .x87_32more_loop_i
jmp .end
.mov_eip_to_eax:
mov eax, [esp]
ret
.x87_32:
sub esi, edi
neg eax
lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
call .mov_eip_to_eax
.get_eip0:
add edx, eax
inc edx ; compensate for the shorter opcode on the last iteration
mov eax, [esp + 28] ; eax = qlp_coeff[]
xor ebp, ebp
jmp edx
mov ecx, [eax + 124] ; ecx = qlp_coeff[31]
imul ecx, [edi - 128] ; ecx = qlp_coeff[31] * data[i-32]
add ebp, ecx ; sum += qlp_coeff[31] * data[i-32]
mov ecx, [eax + 120] ; ecx = qlp_coeff[30]
imul ecx, [edi - 124] ; ecx = qlp_coeff[30] * data[i-31]
add ebp, ecx ; sum += qlp_coeff[30] * data[i-31]
mov ecx, [eax + 116] ; ecx = qlp_coeff[29]
imul ecx, [edi - 120] ; ecx = qlp_coeff[29] * data[i-30]
add ebp, ecx ; sum += qlp_coeff[29] * data[i-30]
mov ecx, [eax + 112] ; ecx = qlp_coeff[28]
imul ecx, [edi - 116] ; ecx = qlp_coeff[28] * data[i-29]
add ebp, ecx ; sum += qlp_coeff[28] * data[i-29]
mov ecx, [eax + 108] ; ecx = qlp_coeff[27]
imul ecx, [edi - 112] ; ecx = qlp_coeff[27] * data[i-28]
add ebp, ecx ; sum += qlp_coeff[27] * data[i-28]
mov ecx, [eax + 104] ; ecx = qlp_coeff[26]
imul ecx, [edi - 108] ; ecx = qlp_coeff[26] * data[i-27]
add ebp, ecx ; sum += qlp_coeff[26] * data[i-27]
mov ecx, [eax + 100] ; ecx = qlp_coeff[25]
imul ecx, [edi - 104] ; ecx = qlp_coeff[25] * data[i-26]
add ebp, ecx ; sum += qlp_coeff[25] * data[i-26]
mov ecx, [eax + 96] ; ecx = qlp_coeff[24]
imul ecx, [edi - 100] ; ecx = qlp_coeff[24] * data[i-25]
add ebp, ecx ; sum += qlp_coeff[24] * data[i-25]
mov ecx, [eax + 92] ; ecx = qlp_coeff[23]
imul ecx, [edi - 96] ; ecx = qlp_coeff[23] * data[i-24]
add ebp, ecx ; sum += qlp_coeff[23] * data[i-24]
mov ecx, [eax + 88] ; ecx = qlp_coeff[22]
imul ecx, [edi - 92] ; ecx = qlp_coeff[22] * data[i-23]
add ebp, ecx ; sum += qlp_coeff[22] * data[i-23]
mov ecx, [eax + 84] ; ecx = qlp_coeff[21]
imul ecx, [edi - 88] ; ecx = qlp_coeff[21] * data[i-22]
add ebp, ecx ; sum += qlp_coeff[21] * data[i-22]
mov ecx, [eax + 80] ; ecx = qlp_coeff[20]
imul ecx, [edi - 84] ; ecx = qlp_coeff[20] * data[i-21]
add ebp, ecx ; sum += qlp_coeff[20] * data[i-21]
mov ecx, [eax + 76] ; ecx = qlp_coeff[19]
imul ecx, [edi - 80] ; ecx = qlp_coeff[19] * data[i-20]
add ebp, ecx ; sum += qlp_coeff[19] * data[i-20]
mov ecx, [eax + 72] ; ecx = qlp_coeff[18]
imul ecx, [edi - 76] ; ecx = qlp_coeff[18] * data[i-19]
add ebp, ecx ; sum += qlp_coeff[18] * data[i-19]
mov ecx, [eax + 68] ; ecx = qlp_coeff[17]
imul ecx, [edi - 72] ; ecx = qlp_coeff[17] * data[i-18]
add ebp, ecx ; sum += qlp_coeff[17] * data[i-18]
mov ecx, [eax + 64] ; ecx = qlp_coeff[16]
imul ecx, [edi - 68] ; ecx = qlp_coeff[16] * data[i-17]
add ebp, ecx ; sum += qlp_coeff[16] * data[i-17]
mov ecx, [eax + 60] ; ecx = qlp_coeff[15]
imul ecx, [edi - 64] ; ecx = qlp_coeff[15] * data[i-16]
add ebp, ecx ; sum += qlp_coeff[15] * data[i-16]
mov ecx, [eax + 56] ; ecx = qlp_coeff[14]
imul ecx, [edi - 60] ; ecx = qlp_coeff[14] * data[i-15]
add ebp, ecx ; sum += qlp_coeff[14] * data[i-15]
mov ecx, [eax + 52] ; ecx = qlp_coeff[13]
imul ecx, [edi - 56] ; ecx = qlp_coeff[13] * data[i-14]
add ebp, ecx ; sum += qlp_coeff[13] * data[i-14]
mov ecx, [eax + 48] ; ecx = qlp_coeff[12]
imul ecx, [edi - 52] ; ecx = qlp_coeff[12] * data[i-13]
add ebp, ecx ; sum += qlp_coeff[12] * data[i-13]
mov ecx, [eax + 44] ; ecx = qlp_coeff[11]
imul ecx, [edi - 48] ; ecx = qlp_coeff[11] * data[i-12]
add ebp, ecx ; sum += qlp_coeff[11] * data[i-12]
mov ecx, [eax + 40] ; ecx = qlp_coeff[10]
imul ecx, [edi - 44] ; ecx = qlp_coeff[10] * data[i-11]
add ebp, ecx ; sum += qlp_coeff[10] * data[i-11]
mov ecx, [eax + 36] ; ecx = qlp_coeff[ 9]
imul ecx, [edi - 40] ; ecx = qlp_coeff[ 9] * data[i-10]
add ebp, ecx ; sum += qlp_coeff[ 9] * data[i-10]
mov ecx, [eax + 32] ; ecx = qlp_coeff[ 8]
imul ecx, [edi - 36] ; ecx = qlp_coeff[ 8] * data[i- 9]
add ebp, ecx ; sum += qlp_coeff[ 8] * data[i- 9]
mov ecx, [eax + 28] ; ecx = qlp_coeff[ 7]
imul ecx, [edi - 32] ; ecx = qlp_coeff[ 7] * data[i- 8]
add ebp, ecx ; sum += qlp_coeff[ 7] * data[i- 8]
mov ecx, [eax + 24] ; ecx = qlp_coeff[ 6]
imul ecx, [edi - 28] ; ecx = qlp_coeff[ 6] * data[i- 7]
add ebp, ecx ; sum += qlp_coeff[ 6] * data[i- 7]
mov ecx, [eax + 20] ; ecx = qlp_coeff[ 5]
imul ecx, [edi - 24] ; ecx = qlp_coeff[ 5] * data[i- 6]
add ebp, ecx ; sum += qlp_coeff[ 5] * data[i- 6]
mov ecx, [eax + 16] ; ecx = qlp_coeff[ 4]
imul ecx, [edi - 20] ; ecx = qlp_coeff[ 4] * data[i- 5]
add ebp, ecx ; sum += qlp_coeff[ 4] * data[i- 5]
mov ecx, [eax + 12] ; ecx = qlp_coeff[ 3]
imul ecx, [edi - 16] ; ecx = qlp_coeff[ 3] * data[i- 4]
add ebp, ecx ; sum += qlp_coeff[ 3] * data[i- 4]
mov ecx, [eax + 8] ; ecx = qlp_coeff[ 2]
imul ecx, [edi - 12] ; ecx = qlp_coeff[ 2] * data[i- 3]
add ebp, ecx ; sum += qlp_coeff[ 2] * data[i- 3]
mov ecx, [eax + 4] ; ecx = qlp_coeff[ 1]
imul ecx, [edi - 8] ; ecx = qlp_coeff[ 1] * data[i- 2]
add ebp, ecx ; sum += qlp_coeff[ 1] * data[i- 2]
mov ecx, [eax] ; ecx = qlp_coeff[ 0] (NOTE: one byte missing from instruction)
imul ecx, [edi - 4] ; ecx = qlp_coeff[ 0] * data[i- 1]
add ebp, ecx ; sum += qlp_coeff[ 0] * data[i- 1]
.jumper_0:
mov ecx, [esp + 36]
sar ebp, cl ; ebp = (sum >> lp_quantization)
add ebp, [esi + edi] ; ebp = residual[i] + (sum >> lp_quantization)
mov [edi], ebp ; data[i] = residual[i] + (sum >> lp_quantization)
add edi, byte 4
dec ebx
jz short .end
xor ebp, ebp
jmp edx
.end:
pop edi
pop esi
pop ebx
pop ebp
ret
; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
; the channel and qlp_coeffs must be <= 16. Especially note that this routine
; cannot be used for side-channel coded 16bps channels since the effective bps
; is 17.
; WATCHOUT: this routine requires that each data array have a buffer of up to
; 3 zeroes in front (at negative indices) for alignment purposes, i.e. for each
; channel n, data[n][-1] through data[n][-3] should be accessible and zero.
ALIGN 16
cident FLAC__lpc_restore_signal_asm_ia32_mmx
;[esp + 40] data[]
;[esp + 36] lp_quantization
;[esp + 32] order
;[esp + 28] qlp_coeff[]
;[esp + 24] data_len
;[esp + 20] residual[]
;ASSERT(order > 0)
push ebp
push ebx
push esi
push edi
mov esi, [esp + 20]
mov edi, [esp + 40]
mov eax, [esp + 32]
mov ebx, [esp + 24]
test ebx, ebx
jz near .end ; do nothing if data_len == 0
cmp eax, byte 4
jb near FLAC__lpc_restore_signal_asm_ia32.begin
mov edx, [esp + 28]
movd mm6, [esp + 36]
mov ebp, esp
and esp, 0xfffffff8
xor ecx, ecx
.copy_qlp_loop:
push word [edx + 4 * ecx]
inc ecx
cmp ecx, eax
jnz short .copy_qlp_loop
and ecx, 0x3
test ecx, ecx
je short .za_end
sub ecx, byte 4
.za_loop:
push word 0
inc eax
inc ecx
jnz short .za_loop
.za_end:
movq mm5, [esp + 2 * eax - 8]
movd mm4, [edi - 16]
punpckldq mm4, [edi - 12]
movd mm0, [edi - 8]
punpckldq mm0, [edi - 4]
packssdw mm4, mm0
cmp eax, byte 4
jnbe short .mmx_4more
ALIGN 16
.mmx_4_loop_i:
movq mm7, mm4
pmaddwd mm7, mm5
movq mm0, mm7
punpckhdq mm7, mm7
paddd mm7, mm0
psrad mm7, mm6
movd mm1, [esi]
paddd mm7, mm1
movd [edi], mm7
psllq mm7, 48
psrlq mm4, 16
por mm4, mm7
add esi, byte 4
add edi, byte 4
dec ebx
jnz .mmx_4_loop_i
jmp .mmx_end
.mmx_4more:
shl eax, 2
neg eax
add eax, byte 16
ALIGN 16
.mmx_4more_loop_i:
mov ecx, edi
add ecx, eax
mov edx, esp
movq mm7, mm4
pmaddwd mm7, mm5
ALIGN 16
.mmx_4more_loop_j:
movd mm0, [ecx - 16]
punpckldq mm0, [ecx - 12]
movd mm1, [ecx - 8]
punpckldq mm1, [ecx - 4]
packssdw mm0, mm1
pmaddwd mm0, [edx]
paddd mm7, mm0
add edx, byte 8
add ecx, byte 16
cmp ecx, edi
jnz .mmx_4more_loop_j
movq mm0, mm7
punpckhdq mm7, mm7
paddd mm7, mm0
psrad mm7, mm6
movd mm1, [esi]
paddd mm7, mm1
movd [edi], mm7
psllq mm7, 48
psrlq mm4, 16
por mm4, mm7
add esi, byte 4
add edi, byte 4
dec ebx
jnz short .mmx_4more_loop_i
.mmx_end:
emms
mov esp, ebp
.end:
pop edi
pop esi
pop ebx
pop ebp
ret
; **********************************************************************
;
;void FLAC__lpc_compute_residual_from_qlp_coefficients_wide(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
@ -1098,285 +724,4 @@ cident FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32
pop ebp
ret
; **********************************************************************
;
; void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
; {
; unsigned i, j;
; FLAC__int64 sum;
;
; FLAC__ASSERT(order > 0);
;
; for(i = 0; i < data_len; i++) {
; sum = 0;
; for(j = 0; j < order; j++)
; sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1];
; data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization);
; }
; }
ALIGN 16
cident FLAC__lpc_restore_signal_wide_asm_ia32
;[esp + 40] data[]
;[esp + 36] lp_quantization
;[esp + 32] order
;[esp + 28] qlp_coeff[]
;[esp + 24] data_len
;[esp + 20] residual[]
;ASSERT(order > 0)
;ASSERT(order <= 32)
;ASSERT(lp_quantization <= 31)
push ebp
push ebx
push esi
push edi
mov ebx, [esp + 24] ; ebx = data_len
test ebx, ebx
jz near .end ; do nothing if data_len == 0
.begin:
mov eax, [esp + 32] ; eax = order
cmp eax, 1
jg short .x87_32
mov esi, [esp + 20] ; esi = residual[]
mov edi, [esp + 40] ; edi = data[]
mov ecx, [esp + 28] ; ecx = qlp_coeff[]
mov ebp, [ecx] ; ebp = qlp_coeff[0]
mov eax, [edi - 4] ; eax = data[-1]
mov ecx, [esp + 36] ; cl = lp_quantization
ALIGN 16
.x87_1_loop_i:
imul ebp ; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1]
shrd eax, edx, cl ; 0 <= lp_quantization <= 15
;
add eax, [esi]
mov [edi], eax
;
add esi, 4
add edi, 4
dec ebx
jnz .x87_1_loop_i
jmp .end
.mov_eip_to_eax:
mov eax, [esp]
ret
.x87_32: ; eax = order
neg eax
add eax, eax
lea ebp, [eax + eax * 4 + .jumper_0 - .get_eip0]
call .mov_eip_to_eax
.get_eip0:
add ebp, eax
inc ebp ; compensate for the shorter opcode on the last iteration
mov ebx, [esp + 28] ; ebx = qlp_coeff[]
mov edi, [esp + 40] ; esi = data[]
sub [esp + 20], edi ; residual[] -= data[]
xor ecx, ecx
xor esi, esi
jmp ebp
;eax = --
;edx = --
;ecx = 0
;esi = 0
;
;ebx = qlp_coeff[]
;edi = data[]
;ebp = @address
mov eax, [ebx + 124] ; eax = qlp_coeff[31]
imul dword [edi - 128] ; edx:eax = qlp_coeff[31] * data[i-32]
add ecx, eax
adc esi, edx ; sum += qlp_coeff[31] * data[i-32]
mov eax, [ebx + 120] ; eax = qlp_coeff[30]
imul dword [edi - 124] ; edx:eax = qlp_coeff[30] * data[i-31]
add ecx, eax
adc esi, edx ; sum += qlp_coeff[30] * data[i-31]
mov eax, [ebx + 116]
imul dword [edi - 120]
add ecx, eax
adc esi, edx
mov eax, [ebx + 112]
imul dword [edi - 116]
add ecx, eax
adc esi, edx
mov eax, [ebx + 108]
imul dword [edi - 112]
add ecx, eax
adc esi, edx
mov eax, [ebx + 104]
imul dword [edi - 108]
add ecx, eax
adc esi, edx
mov eax, [ebx + 100]
imul dword [edi - 104]
add ecx, eax
adc esi, edx
mov eax, [ebx + 96]
imul dword [edi - 100]
add ecx, eax
adc esi, edx
mov eax, [ebx + 92]
imul dword [edi - 96]
add ecx, eax
adc esi, edx
mov eax, [ebx + 88]
imul dword [edi - 92]
add ecx, eax
adc esi, edx
mov eax, [ebx + 84]
imul dword [edi - 88]
add ecx, eax
adc esi, edx
mov eax, [ebx + 80]
imul dword [edi - 84]
add ecx, eax
adc esi, edx
mov eax, [ebx + 76]
imul dword [edi - 80]
add ecx, eax
adc esi, edx
mov eax, [ebx + 72]
imul dword [edi - 76]
add ecx, eax
adc esi, edx
mov eax, [ebx + 68]
imul dword [edi - 72]
add ecx, eax
adc esi, edx
mov eax, [ebx + 64]
imul dword [edi - 68]
add ecx, eax
adc esi, edx
mov eax, [ebx + 60]
imul dword [edi - 64]
add ecx, eax
adc esi, edx
mov eax, [ebx + 56]
imul dword [edi - 60]
add ecx, eax
adc esi, edx
mov eax, [ebx + 52]
imul dword [edi - 56]
add ecx, eax
adc esi, edx
mov eax, [ebx + 48]
imul dword [edi - 52]
add ecx, eax
adc esi, edx
mov eax, [ebx + 44]
imul dword [edi - 48]
add ecx, eax
adc esi, edx
mov eax, [ebx + 40]
imul dword [edi - 44]
add ecx, eax
adc esi, edx
mov eax, [ebx + 36]
imul dword [edi - 40]
add ecx, eax
adc esi, edx
mov eax, [ebx + 32]
imul dword [edi - 36]
add ecx, eax
adc esi, edx
mov eax, [ebx + 28]
imul dword [edi - 32]
add ecx, eax
adc esi, edx
mov eax, [ebx + 24]
imul dword [edi - 28]
add ecx, eax
adc esi, edx
mov eax, [ebx + 20]
imul dword [edi - 24]
add ecx, eax
adc esi, edx
mov eax, [ebx + 16]
imul dword [edi - 20]
add ecx, eax
adc esi, edx
mov eax, [ebx + 12]
imul dword [edi - 16]
add ecx, eax
adc esi, edx
mov eax, [ebx + 8]
imul dword [edi - 12]
add ecx, eax
adc esi, edx
mov eax, [ebx + 4]
imul dword [edi - 8]
add ecx, eax
adc esi, edx
mov eax, [ebx] ; eax = qlp_coeff[ 0] (NOTE: one byte missing from instruction)
imul dword [edi - 4] ; edx:eax = qlp_coeff[ 0] * data[i- 1]
add ecx, eax
adc esi, edx ; sum += qlp_coeff[ 0] * data[i- 1]
.jumper_0:
mov edx, ecx
;esi:edx = sum
mov ecx, [esp + 36] ; cl = lp_quantization
shrd edx, esi, cl ; edx = (sum >> lp_quantization)
;eax = --
;ecx = --
;edx = sum >> lp_q
;esi = --
;
mov eax, [esp + 20] ; residual[] - data[]
add edx, [edi + eax] ; edx = residual[i] + (sum >> lp_quantization)
mov [edi], edx ; data[i] = residual[i] + (sum >> lp_quantization)
add edi, 4
dec dword [esp + 24]
jz short .end
xor ecx, ecx
xor esi, esi
jmp ebp
.end:
pop edi
pop esi
pop ebx
pop ebp
ret
; end

View File

@ -206,22 +206,6 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLA
*/
void FLAC__lpc_restore_signal(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
#ifndef FLAC__NO_ASM
# ifdef FLAC__CPU_IA32
# ifdef FLAC__HAS_NASM
void FLAC__lpc_restore_signal_asm_ia32(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
void FLAC__lpc_restore_signal_asm_ia32_mmx(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
void FLAC__lpc_restore_signal_wide_asm_ia32(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
# endif /* FLAC__HAS_NASM */
# endif /* FLAC__CPU_IA32 */
# if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
# ifdef FLAC__SSE4_1_SUPPORTED
void FLAC__lpc_restore_signal_intrin_sse41(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
void FLAC__lpc_restore_signal_16_intrin_sse41(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
# endif
# endif
#endif /* FLAC__NO_ASM */
#ifndef FLAC__INTEGER_ONLY_LIBRARY

View File

@ -588,550 +588,6 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
}
}
FLAC__SSE_TARGET("sse4.1")
void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[])
{
int i;
const __m128i cnt = _mm_cvtsi32_si128(lp_quantization);
if (!data_len)
return;
FLAC__ASSERT(order > 0);
FLAC__ASSERT(order <= 32);
FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm_sra_epi64() so we have to use _mm_srl_epi64() */
if(order <= 12) {
if(order > 8) { /* order == 9, 10, 11, 12 */
if(order > 10) { /* order == 11, 12 */
__m128i qlp[6], dat[6];
__m128i summ, temp;
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0))); // 0 q[1] 0 q[0]
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2))); // 0 q[3] 0 q[2]
qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4))); // 0 q[5] 0 q[4]
qlp[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6))); // 0 q[7] 0 q[6]
qlp[4] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8))); // 0 q[9] 0 q[8]
if (order == 12)
qlp[5] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+10))); // 0 q[11] 0 q[10]
else
qlp[5] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[10])); // 0 0 0 q[10]
dat[5] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-12)), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-12] 0 d[i-11]
dat[4] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-10)), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-10] 0 d[i-9]
dat[3] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-8 )), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-8] 0 d[i-7]
dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-6 )), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-6] 0 d[i-5]
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-4 )), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-4] 0 d[i-3]
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-2] 0 d[i-1]
summ = _mm_mul_epi32(dat[5], qlp[5]) ;
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[4], qlp[4]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); // ?_64 sum_64
summ = _mm_srl_epi64(summ, cnt); // ?_64 (sum >> lp_quantization)_64 == ?_32 ?_32 ?_32 (sum >> lp_quantization)_32
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ); // ? ? ? d[i]
data[0] = _mm_cvtsi128_si32(temp);
for(i = 1; i < (int)data_len; i++) {
temp = _mm_slli_si128(temp, 8);
dat[5] = _mm_alignr_epi8(dat[5], dat[4], 8); // ? d[i-11] ? d[i-10]
dat[4] = _mm_alignr_epi8(dat[4], dat[3], 8); // ? d[i-9] ? d[i-8]
dat[3] = _mm_alignr_epi8(dat[3], dat[2], 8); // ? d[i-7] ? d[i-6]
dat[2] = _mm_alignr_epi8(dat[2], dat[1], 8); // ? d[i-5] ? d[i-4]
dat[1] = _mm_alignr_epi8(dat[1], dat[0], 8); // ? d[i-3] ? d[i-2]
dat[0] = _mm_alignr_epi8(dat[0], temp, 8); // ? d[i-1] ? d[i ]
summ = _mm_mul_epi32(dat[5], qlp[5]) ;
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[4], qlp[4]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); // ?_64 sum_64
summ = _mm_srl_epi64(summ, cnt); // ?_64 (sum >> lp_quantization)_64 == ?_32 ?_32 ?_32 (sum >> lp_quantization)_32
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ); // ? ? ? d[i]
data[i] = _mm_cvtsi128_si32(temp);
}
}
else { /* order == 9, 10 */
__m128i qlp[5], dat[5];
__m128i summ, temp;
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0)));
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2)));
qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4)));
qlp[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6)));
if (order == 10)
qlp[4] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8)));
else
qlp[4] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[8]));
dat[4] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-10)), _MM_SHUFFLE(2,0,3,1));
dat[3] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-8 )), _MM_SHUFFLE(2,0,3,1));
dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-6 )), _MM_SHUFFLE(2,0,3,1));
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-4 )), _MM_SHUFFLE(2,0,3,1));
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
summ = _mm_mul_epi32(dat[4], qlp[4]) ;
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
summ = _mm_srl_epi64(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ);
data[0] = _mm_cvtsi128_si32(temp);
for(i = 1; i < (int)data_len; i++) {
temp = _mm_slli_si128(temp, 8);
dat[4] = _mm_alignr_epi8(dat[4], dat[3], 8);
dat[3] = _mm_alignr_epi8(dat[3], dat[2], 8);
dat[2] = _mm_alignr_epi8(dat[2], dat[1], 8);
dat[1] = _mm_alignr_epi8(dat[1], dat[0], 8);
dat[0] = _mm_alignr_epi8(dat[0], temp, 8);
summ = _mm_mul_epi32(dat[4], qlp[4]) ;
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
summ = _mm_srl_epi64(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
data[i] = _mm_cvtsi128_si32(temp);
}
}
}
else if(order > 4) { /* order == 5, 6, 7, 8 */
if(order > 6) { /* order == 7, 8 */
__m128i qlp[4], dat[4];
__m128i summ, temp;
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0)));
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2)));
qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4)));
if (order == 8)
qlp[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6)));
else
qlp[3] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[6]));
dat[3] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-8 )), _MM_SHUFFLE(2,0,3,1));
dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-6 )), _MM_SHUFFLE(2,0,3,1));
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-4 )), _MM_SHUFFLE(2,0,3,1));
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
summ = _mm_mul_epi32(dat[3], qlp[3]) ;
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
summ = _mm_srl_epi64(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ);
data[0] = _mm_cvtsi128_si32(temp);
for(i = 1; i < (int)data_len; i++) {
temp = _mm_slli_si128(temp, 8);
dat[3] = _mm_alignr_epi8(dat[3], dat[2], 8);
dat[2] = _mm_alignr_epi8(dat[2], dat[1], 8);
dat[1] = _mm_alignr_epi8(dat[1], dat[0], 8);
dat[0] = _mm_alignr_epi8(dat[0], temp, 8);
summ = _mm_mul_epi32(dat[3], qlp[3]) ;
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
summ = _mm_srl_epi64(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
data[i] = _mm_cvtsi128_si32(temp);
}
}
else { /* order == 5, 6 */
__m128i qlp[3], dat[3];
__m128i summ, temp;
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0)));
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2)));
if (order == 6)
qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4)));
else
qlp[2] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[4]));
dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-6 )), _MM_SHUFFLE(2,0,3,1));
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-4 )), _MM_SHUFFLE(2,0,3,1));
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
summ = _mm_mul_epi32(dat[2], qlp[2]) ;
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
summ = _mm_srl_epi64(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ);
data[0] = _mm_cvtsi128_si32(temp);
for(i = 1; i < (int)data_len; i++) {
temp = _mm_slli_si128(temp, 8);
dat[2] = _mm_alignr_epi8(dat[2], dat[1], 8);
dat[1] = _mm_alignr_epi8(dat[1], dat[0], 8);
dat[0] = _mm_alignr_epi8(dat[0], temp, 8);
summ = _mm_mul_epi32(dat[2], qlp[2]) ;
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
summ = _mm_srl_epi64(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
data[i] = _mm_cvtsi128_si32(temp);
}
}
}
else { /* order == 1, 2, 3, 4 */
if(order > 2) { /* order == 3, 4 */
__m128i qlp[2], dat[2];
__m128i summ, temp;
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0)));
if (order == 4)
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2)));
else
qlp[1] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[2]));
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-4 )), _MM_SHUFFLE(2,0,3,1));
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
summ = _mm_mul_epi32(dat[1], qlp[1]) ;
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
summ = _mm_srl_epi64(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ);
data[0] = _mm_cvtsi128_si32(temp);
for(i = 1; i < (int)data_len; i++) {
temp = _mm_slli_si128(temp, 8);
dat[1] = _mm_alignr_epi8(dat[1], dat[0], 8);
dat[0] = _mm_alignr_epi8(dat[0], temp, 8);
summ = _mm_mul_epi32(dat[1], qlp[1]) ;
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
summ = _mm_srl_epi64(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
data[i] = _mm_cvtsi128_si32(temp);
}
}
else { /* order == 1, 2 */
if(order == 2) {
__m128i qlp0, dat0;
__m128i summ, temp;
qlp0 = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff)));
dat0 = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
summ = _mm_mul_epi32(dat0, qlp0);
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
summ = _mm_srl_epi64(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ);
data[0] = _mm_cvtsi128_si32(temp);
for(i = 1; i < (int)data_len; i++) {
dat0 = _mm_alignr_epi8(dat0, _mm_slli_si128(temp, 8), 8);
summ = _mm_mul_epi32(dat0, qlp0);
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
summ = _mm_srl_epi64(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
data[i] = _mm_cvtsi128_si32(temp);
}
}
else { /* order == 1 */
__m128i qlp0;
__m128i summ, temp;
qlp0 = _mm_cvtsi32_si128(qlp_coeff[0]);
temp = _mm_cvtsi32_si128(data[-1]);
summ = _mm_mul_epi32(temp, qlp0);
summ = _mm_srl_epi64(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ);
data[0] = _mm_cvtsi128_si32(temp);
for(i = 1; i < (int)data_len; i++) {
summ = _mm_mul_epi32(temp, qlp0);
summ = _mm_srl_epi64(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
data[i] = _mm_cvtsi128_si32(temp);
}
}
}
}
}
else { /* order > 12 */
__m128i qlp[16];
for(i = 0; i < (int)order/2; i++)
qlp[i] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+i*2)), _MM_SHUFFLE(2,0,3,1)); // 0 q[2*i] 0 q[2*i+1]
if(order & 1)
qlp[i] = _mm_shuffle_epi32(_mm_cvtsi32_si128(qlp_coeff[i*2]), _MM_SHUFFLE(2,0,3,1));
for(i = 0; i < (int)data_len; i++) {
__m128i summ = _mm_setzero_si128(), dat;
FLAC__int32 * const datai = &data[i];
switch((order+1) / 2) {
case 16: /* order == 31, 32 */
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-32)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[15])); /* Falls through. */
case 15:
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-30)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[14])); /* Falls through. */
case 14:
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-28)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[13])); /* Falls through. */
case 13:
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-26)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[12])); /* Falls through. */
case 12:
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-24)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[11])); /* Falls through. */
case 11:
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-22)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[10])); /* Falls through. */
case 10:
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-20)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[9])); /* Falls through. */
case 9:
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-18)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[8])); /* Falls through. */
case 8:
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-16)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[7])); /* Falls through. */
case 7: /* order == 13, 14 */
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-14)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[6]));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-12)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[5]));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-10)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[4]));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-8)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[3]));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-6)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[2]));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-4)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[1]));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-2)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[0]));
}
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
summ = _mm_srl_epi64(summ, cnt);
summ = _mm_add_epi32(summ, _mm_cvtsi32_si128(residual[i]));
data[i] = _mm_cvtsi128_si32(summ);
}
}
}
FLAC__SSE_TARGET("sse4.1")
/*
 * SSE4.1 implementation of FLAC__lpc_restore_signal() for LPC orders 8..12
 * with 32-bit accumulation:
 *     data[i] = residual[i] + ((sum_{j<order} qlp_coeff[j]*data[i-j-1]) >> lp_quantization)
 * Orders < 8 delegate to the plain C routine; orders > 12 fall back to the
 * ia32 asm version (if built with NASM) or the plain C routine.
 * NOTE(review): this path keeps partial sums in 32-bit lanes (_mm_mullo_epi32 /
 * _mm_add_epi32), so it is only valid when the full sum fits in 32 bits —
 * presumably guaranteed by the caller's bps/precision check; confirm.
 */
void FLAC__lpc_restore_signal_intrin_sse41(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[])
{
/* orders below 8 are not vectorized here; use the scalar reference routine */
if(order < 8) {
FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data);
return;
}
FLAC__ASSERT(order >= 8);
FLAC__ASSERT(order <= 32);
if(order <= 12) {
int i;
/* shift count for the quantization right-shift, kept in an XMM register */
const __m128i cnt = _mm_cvtsi32_si128(lp_quantization);
if(order > 8) /* order == 9, 10, 11, 12 */
{
__m128i qlp[3], dat[3];
__m128i summ, temp;
qlp[0] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff + 0)); // q[3] q[2] q[1] q[0]
qlp[1] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff + 4)); // q[7] q[6] q[5] q[4]
qlp[2] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff + 8)); // q[11] q[10] q[9] q[8]
/* zero the coefficient lanes beyond `order` so they contribute nothing
   to the dot product (slli/srli pair shifts whole bytes in and out) */
switch (order)
{
case 9:
qlp[2] = _mm_slli_si128(qlp[2], 12); qlp[2] = _mm_srli_si128(qlp[2], 12); break; // 0 0 0 q[8]
case 10:
qlp[2] = _mm_slli_si128(qlp[2], 8); qlp[2] = _mm_srli_si128(qlp[2], 8); break; // 0 0 q[9] q[8]
case 11:
qlp[2] = _mm_slli_si128(qlp[2], 4); qlp[2] = _mm_srli_si128(qlp[2], 4); break; // 0 q[10] q[9] q[8]
}
/* load the 12 preceding samples, reversed so that lane k of dat[]
   lines up with coefficient q[...] in the same lane of qlp[].
   NOTE(review): always reads 12 samples before data[0] even when
   order < 12 (products are zero, but the loads still happen); assumes
   those reads land in the warm-up area — confirm with caller. */
dat[2] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data - 12)), _MM_SHUFFLE(0, 1, 2, 3)); // d[i-12] d[i-11] d[i-10] d[i-9]
dat[1] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data - 8)), _MM_SHUFFLE(0, 1, 2, 3)); // d[i-8] d[i-7] d[i-6] d[i-5]
dat[0] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data - 4)), _MM_SHUFFLE(0, 1, 2, 3)); // d[i-4] d[i-3] d[i-2] d[i-1]
for (i = 0;;) {
/* per-lane products, then a horizontal add of the 4 lanes */
summ = _mm_mullo_epi32(dat[2], qlp[2]);
summ = _mm_add_epi32(summ, _mm_mullo_epi32(dat[1], qlp[1]));
summ = _mm_add_epi32(summ, _mm_mullo_epi32(dat[0], qlp[0]));
summ = _mm_add_epi32(summ, _mm_shuffle_epi32(summ, _MM_SHUFFLE(1,0,3,2)));
summ = _mm_add_epi32(summ, _mm_shufflelo_epi16(summ, _MM_SHUFFLE(1,0,3,2)));
/* arithmetic shift by lp_quantization, then add the residual */
summ = _mm_sra_epi32(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
data[i] = _mm_cvtsi128_si32(temp);
if(++i >= (int)data_len) break;
/* shift the freshly restored sample into lane 0 of the history:
   move temp's low dword to the top, then alignr drags each vector's
   bytes down by 12, pulling 4 bytes in from its neighbour */
temp = _mm_slli_si128(temp, 12);
dat[2] = _mm_alignr_epi8(dat[2], dat[1], 12);
dat[1] = _mm_alignr_epi8(dat[1], dat[0], 12);
dat[0] = _mm_alignr_epi8(dat[0], temp, 12);
}
}
else /* order == 8 */
{
/* same scheme with only two coefficient/history vectors (8 taps) */
__m128i qlp[2], dat[2];
__m128i summ, temp;
qlp[0] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff + 0));
qlp[1] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff + 4));
dat[1] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data - 8)), _MM_SHUFFLE(0, 1, 2, 3));
dat[0] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data - 4)), _MM_SHUFFLE(0, 1, 2, 3));
for (i = 0;;) {
summ = _mm_add_epi32(_mm_mullo_epi32(dat[1], qlp[1]), _mm_mullo_epi32(dat[0], qlp[0]));
summ = _mm_add_epi32(summ, _mm_shuffle_epi32(summ, _MM_SHUFFLE(1,0,3,2)));
summ = _mm_add_epi32(summ, _mm_shufflelo_epi16(summ, _MM_SHUFFLE(1,0,3,2)));
summ = _mm_sra_epi32(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
data[i] = _mm_cvtsi128_si32(temp);
if(++i >= (int)data_len) break;
temp = _mm_slli_si128(temp, 12);
dat[1] = _mm_alignr_epi8(dat[1], dat[0], 12);
dat[0] = _mm_alignr_epi8(dat[0], temp, 12);
}
}
}
else { /* order > 12 */
/* no SIMD path for long predictors: use asm (NASM builds) or plain C */
#ifdef FLAC__HAS_NASM
FLAC__lpc_restore_signal_asm_ia32(residual, data_len, qlp_coeff, order, lp_quantization, data);
#else
FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data);
#endif
}
}
FLAC__SSE_TARGET("ssse3")
/*
 * 16-bit variant of FLAC__lpc_restore_signal() for LPC orders 8..12: packs
 * coefficients and sample history into saturated int16 lanes and uses
 * _mm_madd_epi16 (pairwise 16x16->32 multiply-add) for the dot product.
 * Only valid when samples and coefficients fit in 16 bits (per the
 * "_16" naming convention); _mm_packs_epi32 would saturate otherwise.
 * Orders < 8 delegate to plain C; orders > 12 fall back to MMX asm or C.
 * NOTE(review): target attribute says "ssse3" while the name says sse41;
 * the body uses only SSE2/SSSE3 intrinsics so "ssse3" compiles — verify
 * this mismatch is intentional and consistent with the dispatcher.
 */
void FLAC__lpc_restore_signal_16_intrin_sse41(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[])
{
/* orders below 8 are not vectorized here; use the scalar reference routine */
if(order < 8) {
FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data);
return;
}
FLAC__ASSERT(order >= 8);
FLAC__ASSERT(order <= 32);
if(order <= 12) {
int i;
/* shift count for the quantization right-shift, kept in an XMM register */
const __m128i cnt = _mm_cvtsi32_si128(lp_quantization);
if(order > 8) /* order == 9, 10, 11, 12 */
{
__m128i qlp[2], dat[2];
__m128i summ, temp;
qlp[0] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff+0)); // q[3] q[2] q[1] q[0]
temp = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff+4)); // q[7] q[6] q[5] q[4]
qlp[1] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff+8)); // q[11] q[10] q[9] q[8]
/* zero the coefficient lanes beyond `order` before packing so the
   unused taps contribute nothing to the madd */
switch(order)
{
case 9:
qlp[1] = _mm_slli_si128(qlp[1], 12); qlp[1] = _mm_srli_si128(qlp[1], 12); break; // 0 0 0 q[8]
case 10:
qlp[1] = _mm_slli_si128(qlp[1], 8); qlp[1] = _mm_srli_si128(qlp[1], 8); break; // 0 0 q[9] q[8]
case 11:
qlp[1] = _mm_slli_si128(qlp[1], 4); qlp[1] = _mm_srli_si128(qlp[1], 4); break; // 0 q[10] q[9] q[8]
}
/* saturating-pack 32-bit coefficients down to 8 x int16 per vector */
qlp[0] = _mm_packs_epi32(qlp[0], temp); // q[7] q[6] q[5] q[4] q[3] q[2] q[1] q[0]
qlp[1] = _mm_packs_epi32(qlp[1], _mm_setzero_si128()); // 0 0 0 0 q[11] q[10] q[9] q[8]
/* load the 12 preceding samples reversed, then pack to int16 so lane k
   lines up with coefficient lane k.
   NOTE(review): always reads 12 samples before data[0] even when
   order < 12; assumes those reads stay in the warm-up area — confirm. */
dat[1] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data-12)), _MM_SHUFFLE(0,1,2,3)); // d[i-12] d[i-11] d[i-10] d[i-9]
temp = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data-8)), _MM_SHUFFLE(0,1,2,3)); // d[i-8] d[i-7] d[i-6] d[i-5]
dat[0] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data-4)), _MM_SHUFFLE(0,1,2,3)); // d[i-4] d[i-3] d[i-2] d[i-1]
dat[1] = _mm_packs_epi32(dat[1], _mm_setzero_si128()); // 0 0 0 0 d[i-12] d[i-11] d[i-10] d[i-9]
dat[0] = _mm_packs_epi32(dat[0], temp); // d[i-8] d[i-7] d[i-6] d[i-5] d[i-4] d[i-3] d[i-2] d[i-1]
for(i = 0;;) {
/* madd gives 4 pairwise 32-bit sums per vector; fold them down */
summ = _mm_madd_epi16(dat[1], qlp[1]);
summ = _mm_add_epi32(summ, _mm_madd_epi16(dat[0], qlp[0]));
summ = _mm_add_epi32(summ, _mm_shuffle_epi32(summ, _MM_SHUFFLE(1,0,3,2)));
summ = _mm_add_epi32(summ, _mm_shufflelo_epi16(summ, _MM_SHUFFLE(1,0,3,2)));
/* arithmetic shift by lp_quantization, then add the residual */
summ = _mm_sra_epi32(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
data[i] = _mm_cvtsi128_si32(temp);
if(++i >= (int)data_len) break;
/* shift the restored sample (as int16) into the history: alignr by 14
   bytes drags each vector down one 16-bit lane, pulling one lane in
   from its neighbour; temp's low word is moved to the top first */
temp = _mm_slli_si128(temp, 14);
dat[1] = _mm_alignr_epi8(dat[1], dat[0], 14); // 0 0 0 d[i-12] d[i-11] d[i-10] d[i-9] d[i-8]
dat[0] = _mm_alignr_epi8(dat[0], temp, 14); // d[i-7] d[i-6] d[i-5] d[i-4] d[i-3] d[i-2] d[i-1] d[i]
}
}
else /* order == 8 */
{
/* same scheme with a single 8 x int16 coefficient/history vector */
__m128i qlp0, dat0;
__m128i summ, temp;
qlp0 = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff+0)); // q[3] q[2] q[1] q[0]
temp = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff+4)); // q[7] q[6] q[5] q[4]
qlp0 = _mm_packs_epi32(qlp0, temp); // q[7] q[6] q[5] q[4] q[3] q[2] q[1] q[0]
temp = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data-8)), _MM_SHUFFLE(0,1,2,3));
dat0 = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data-4)), _MM_SHUFFLE(0,1,2,3));
dat0 = _mm_packs_epi32(dat0, temp); // d[i-8] d[i-7] d[i-6] d[i-5] d[i-4] d[i-3] d[i-2] d[i-1]
for(i = 0;;) {
summ = _mm_madd_epi16(dat0, qlp0);
summ = _mm_add_epi32(summ, _mm_shuffle_epi32(summ, _MM_SHUFFLE(1,0,3,2)));
summ = _mm_add_epi32(summ, _mm_shufflelo_epi16(summ, _MM_SHUFFLE(1,0,3,2)));
summ = _mm_sra_epi32(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
data[i] = _mm_cvtsi128_si32(temp);
if(++i >= (int)data_len) break;
temp = _mm_slli_si128(temp, 14);
dat0 = _mm_alignr_epi8(dat0, temp, 14); // d[i-7] d[i-6] d[i-5] d[i-4] d[i-3] d[i-2] d[i-1] d[i]
}
}
}
else { /* order > 12 */
/* no 16-bit SIMD path for long predictors: MMX asm (NASM builds) or C */
#ifdef FLAC__HAS_NASM
FLAC__lpc_restore_signal_asm_ia32_mmx(residual, data_len, qlp_coeff, order, lp_quantization, data);
#else
FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data);
#endif
}
}
#endif /* defined FLAC__CPU_IA32 */
FLAC__SSE_TARGET("sse4.1")

View File

@ -45,7 +45,6 @@
#include "protected/stream_decoder.h"
#include "private/bitreader.h"
#include "private/bitmath.h"
#include "private/cpu.h"
#include "private/crc.h"
#include "private/fixed.h"
#include "private/format.h"
@ -129,12 +128,6 @@ typedef struct FLAC__StreamDecoderPrivate {
FLAC__StreamDecoderWriteCallback write_callback;
FLAC__StreamDecoderMetadataCallback metadata_callback;
FLAC__StreamDecoderErrorCallback error_callback;
/* generic 32-bit datapath: */
void (*local_lpc_restore_signal)(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
/* generic 64-bit datapath: */
void (*local_lpc_restore_signal_64bit)(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
/* for use when the signal is <= 16 bits-per-sample, or <= 15 bits-per-sample on a side channel (which requires 1 extra bit): */
void (*local_lpc_restore_signal_16bit)(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
void *client_data;
FILE *file; /* only used if FLAC__stream_decoder_init_file()/FLAC__stream_decoder_init_file() called, else NULL */
FLAC__BitReader *input;
@ -152,7 +145,6 @@ typedef struct FLAC__StreamDecoderPrivate {
size_t metadata_filter_ids_count, metadata_filter_ids_capacity; /* units for both are IDs, not bytes */
FLAC__Frame frame;
FLAC__bool cached; /* true if there is a byte in lookahead */
FLAC__CPUInfo cpuinfo;
FLAC__byte header_warmup[2]; /* contains the sync code and reserved bits */
FLAC__byte lookahead; /* temp storage when we need to look ahead one byte in the stream */
/* unaligned (original) pointers to allocated data */
@ -373,48 +365,6 @@ static FLAC__StreamDecoderInitStatus init_stream_internal_(
return decoder->protected_->initstate = FLAC__STREAM_DECODER_INIT_STATUS_ERROR_OPENING_FILE;
#endif
/*
* get the CPU info and set the function pointers
*/
FLAC__cpu_info(&decoder->private_->cpuinfo);
/* first default to the non-asm routines */
decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal;
decoder->private_->local_lpc_restore_signal_64bit = FLAC__lpc_restore_signal_wide;
decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal;
/* now override with asm where appropriate */
#ifndef FLAC__NO_ASM
if(decoder->private_->cpuinfo.use_asm) {
#ifdef FLAC__CPU_IA32
FLAC__ASSERT(decoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_IA32);
#ifdef FLAC__HAS_NASM
decoder->private_->local_lpc_restore_signal_64bit = FLAC__lpc_restore_signal_wide_asm_ia32; /* OPT_IA32: was really necessary for GCC < 4.9 */
if (decoder->private_->cpuinfo.x86.mmx) {
decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal_asm_ia32;
decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal_asm_ia32_mmx;
}
else {
decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal_asm_ia32;
decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal_asm_ia32;
}
#endif
#if FLAC__HAS_X86INTRIN && ! defined FLAC__INTEGER_ONLY_LIBRARY
# if defined FLAC__SSE4_1_SUPPORTED
if (decoder->private_->cpuinfo.x86.sse41) {
# if !defined FLAC__HAS_NASM /* these are not undoubtedly faster than their MMX ASM counterparts */
decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal_intrin_sse41;
decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal_16_intrin_sse41;
# endif
decoder->private_->local_lpc_restore_signal_64bit = FLAC__lpc_restore_signal_wide_intrin_sse41;
}
# endif
#endif
#elif defined FLAC__CPU_X86_64
FLAC__ASSERT(decoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_X86_64);
/* No useful SSE optimizations yet */
#endif
}
#endif
/* from here on, errors are fatal */
if(!FLAC__bitreader_init(decoder->private_->input, read_callback_, decoder)) {
@ -2848,12 +2798,9 @@ FLAC__bool read_subframe_lpc_(FLAC__StreamDecoder *decoder, uint32_t channel, ui
if(do_full_decode) {
memcpy(decoder->private_->output[channel], subframe->warmup, sizeof(FLAC__int32) * order);
if(bps + subframe->qlp_coeff_precision + FLAC__bitmath_ilog2(order) <= 32)
if(bps <= 16 && subframe->qlp_coeff_precision <= 16)
decoder->private_->local_lpc_restore_signal_16bit(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
else
decoder->private_->local_lpc_restore_signal(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
FLAC__lpc_restore_signal(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
else
decoder->private_->local_lpc_restore_signal_64bit(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
FLAC__lpc_restore_signal_wide(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
}
return true;