Remove all assembler and intrinsics from decoder

This commit drops all use of assembler and intrinsics from the libFLAC
decoder. This is because they are only for 32-bit x86, are hard to debug,
maintain, and fuzz properly, and because the decoder carries much greater
security risks than the encoder.
This commit is contained in:
Martijn van Beurden 2022-05-12 14:28:05 +02:00
parent a67102694d
commit febff86af0
4 changed files with 2 additions and 1270 deletions

View File

@ -38,9 +38,6 @@
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32
cglobal FLAC__lpc_restore_signal_asm_ia32
cglobal FLAC__lpc_restore_signal_asm_ia32_mmx
cglobal FLAC__lpc_restore_signal_wide_asm_ia32
code_section
@ -446,377 +443,6 @@ cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
pop ebp
ret
; **********************************************************************
;
; void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
; {
; unsigned i, j;
; FLAC__int32 sum;
;
; FLAC__ASSERT(order > 0);
;
; for(i = 0; i < data_len; i++) {
; sum = 0;
; for(j = 0; j < order; j++)
; sum += qlp_coeff[j] * data[i-j-1];
; data[i] = residual[i] + (sum >> lp_quantization);
; }
; }
ALIGN 16
cident FLAC__lpc_restore_signal_asm_ia32
;[esp + 40] data[]
;[esp + 36] lp_quantization
;[esp + 32] order
;[esp + 28] qlp_coeff[]
;[esp + 24] data_len
;[esp + 20] residual[]
;ASSERT(order > 0)
push ebp
push ebx
push esi
push edi
mov esi, [esp + 20] ; esi = residual[]
mov edi, [esp + 40] ; edi = data[]
mov eax, [esp + 32] ; eax = order
mov ebx, [esp + 24] ; ebx = data_len
test ebx, ebx
jz near .end ; do nothing if data_len == 0
.begin:
cmp eax, byte 1
jg short .x87_1more
mov ecx, [esp + 28]
mov edx, [ecx]
mov eax, [edi - 4]
mov ecx, [esp + 36]
ALIGN 16
.x87_1_loop_i:
imul eax, edx
sar eax, cl
add eax, [esi]
mov [edi], eax
add esi, byte 4
add edi, byte 4
dec ebx
jnz .x87_1_loop_i
jmp .end
.x87_1more:
cmp eax, byte 32 ; for order <= 32 there is a faster routine
jbe short .x87_32
; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
ALIGN 16
.x87_32more_loop_i:
xor ebp, ebp
mov ecx, [esp + 32]
mov edx, ecx
shl edx, 2
add edx, [esp + 28]
neg ecx
ALIGN 16
.x87_32more_loop_j:
sub edx, byte 4
mov eax, [edx]
imul eax, [edi + 4 * ecx]
add ebp, eax
inc ecx
jnz short .x87_32more_loop_j
mov ecx, [esp + 36]
sar ebp, cl
add ebp, [esi]
mov [edi], ebp
add edi, byte 4
add esi, byte 4
dec ebx
jnz .x87_32more_loop_i
jmp .end
.mov_eip_to_eax:
mov eax, [esp]
ret
.x87_32:
sub esi, edi
neg eax
lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
call .mov_eip_to_eax
.get_eip0:
add edx, eax
inc edx ; compensate for the shorter opcode on the last iteration
mov eax, [esp + 28] ; eax = qlp_coeff[]
xor ebp, ebp
jmp edx
mov ecx, [eax + 124] ; ecx = qlp_coeff[31]
imul ecx, [edi - 128] ; ecx = qlp_coeff[31] * data[i-32]
add ebp, ecx ; sum += qlp_coeff[31] * data[i-32]
mov ecx, [eax + 120] ; ecx = qlp_coeff[30]
imul ecx, [edi - 124] ; ecx = qlp_coeff[30] * data[i-31]
add ebp, ecx ; sum += qlp_coeff[30] * data[i-31]
mov ecx, [eax + 116] ; ecx = qlp_coeff[29]
imul ecx, [edi - 120] ; ecx = qlp_coeff[29] * data[i-30]
add ebp, ecx ; sum += qlp_coeff[29] * data[i-30]
mov ecx, [eax + 112] ; ecx = qlp_coeff[28]
imul ecx, [edi - 116] ; ecx = qlp_coeff[28] * data[i-29]
add ebp, ecx ; sum += qlp_coeff[28] * data[i-29]
mov ecx, [eax + 108] ; ecx = qlp_coeff[27]
imul ecx, [edi - 112] ; ecx = qlp_coeff[27] * data[i-28]
add ebp, ecx ; sum += qlp_coeff[27] * data[i-28]
mov ecx, [eax + 104] ; ecx = qlp_coeff[26]
imul ecx, [edi - 108] ; ecx = qlp_coeff[26] * data[i-27]
add ebp, ecx ; sum += qlp_coeff[26] * data[i-27]
mov ecx, [eax + 100] ; ecx = qlp_coeff[25]
imul ecx, [edi - 104] ; ecx = qlp_coeff[25] * data[i-26]
add ebp, ecx ; sum += qlp_coeff[25] * data[i-26]
mov ecx, [eax + 96] ; ecx = qlp_coeff[24]
imul ecx, [edi - 100] ; ecx = qlp_coeff[24] * data[i-25]
add ebp, ecx ; sum += qlp_coeff[24] * data[i-25]
mov ecx, [eax + 92] ; ecx = qlp_coeff[23]
imul ecx, [edi - 96] ; ecx = qlp_coeff[23] * data[i-24]
add ebp, ecx ; sum += qlp_coeff[23] * data[i-24]
mov ecx, [eax + 88] ; ecx = qlp_coeff[22]
imul ecx, [edi - 92] ; ecx = qlp_coeff[22] * data[i-23]
add ebp, ecx ; sum += qlp_coeff[22] * data[i-23]
mov ecx, [eax + 84] ; ecx = qlp_coeff[21]
imul ecx, [edi - 88] ; ecx = qlp_coeff[21] * data[i-22]
add ebp, ecx ; sum += qlp_coeff[21] * data[i-22]
mov ecx, [eax + 80] ; ecx = qlp_coeff[20]
imul ecx, [edi - 84] ; ecx = qlp_coeff[20] * data[i-21]
add ebp, ecx ; sum += qlp_coeff[20] * data[i-21]
mov ecx, [eax + 76] ; ecx = qlp_coeff[19]
imul ecx, [edi - 80] ; ecx = qlp_coeff[19] * data[i-20]
add ebp, ecx ; sum += qlp_coeff[19] * data[i-20]
mov ecx, [eax + 72] ; ecx = qlp_coeff[18]
imul ecx, [edi - 76] ; ecx = qlp_coeff[18] * data[i-19]
add ebp, ecx ; sum += qlp_coeff[18] * data[i-19]
mov ecx, [eax + 68] ; ecx = qlp_coeff[17]
imul ecx, [edi - 72] ; ecx = qlp_coeff[17] * data[i-18]
add ebp, ecx ; sum += qlp_coeff[17] * data[i-18]
mov ecx, [eax + 64] ; ecx = qlp_coeff[16]
imul ecx, [edi - 68] ; ecx = qlp_coeff[16] * data[i-17]
add ebp, ecx ; sum += qlp_coeff[16] * data[i-17]
mov ecx, [eax + 60] ; ecx = qlp_coeff[15]
imul ecx, [edi - 64] ; ecx = qlp_coeff[15] * data[i-16]
add ebp, ecx ; sum += qlp_coeff[15] * data[i-16]
mov ecx, [eax + 56] ; ecx = qlp_coeff[14]
imul ecx, [edi - 60] ; ecx = qlp_coeff[14] * data[i-15]
add ebp, ecx ; sum += qlp_coeff[14] * data[i-15]
mov ecx, [eax + 52] ; ecx = qlp_coeff[13]
imul ecx, [edi - 56] ; ecx = qlp_coeff[13] * data[i-14]
add ebp, ecx ; sum += qlp_coeff[13] * data[i-14]
mov ecx, [eax + 48] ; ecx = qlp_coeff[12]
imul ecx, [edi - 52] ; ecx = qlp_coeff[12] * data[i-13]
add ebp, ecx ; sum += qlp_coeff[12] * data[i-13]
mov ecx, [eax + 44] ; ecx = qlp_coeff[11]
imul ecx, [edi - 48] ; ecx = qlp_coeff[11] * data[i-12]
add ebp, ecx ; sum += qlp_coeff[11] * data[i-12]
mov ecx, [eax + 40] ; ecx = qlp_coeff[10]
imul ecx, [edi - 44] ; ecx = qlp_coeff[10] * data[i-11]
add ebp, ecx ; sum += qlp_coeff[10] * data[i-11]
mov ecx, [eax + 36] ; ecx = qlp_coeff[ 9]
imul ecx, [edi - 40] ; ecx = qlp_coeff[ 9] * data[i-10]
add ebp, ecx ; sum += qlp_coeff[ 9] * data[i-10]
mov ecx, [eax + 32] ; ecx = qlp_coeff[ 8]
imul ecx, [edi - 36] ; ecx = qlp_coeff[ 8] * data[i- 9]
add ebp, ecx ; sum += qlp_coeff[ 8] * data[i- 9]
mov ecx, [eax + 28] ; ecx = qlp_coeff[ 7]
imul ecx, [edi - 32] ; ecx = qlp_coeff[ 7] * data[i- 8]
add ebp, ecx ; sum += qlp_coeff[ 7] * data[i- 8]
mov ecx, [eax + 24] ; ecx = qlp_coeff[ 6]
imul ecx, [edi - 28] ; ecx = qlp_coeff[ 6] * data[i- 7]
add ebp, ecx ; sum += qlp_coeff[ 6] * data[i- 7]
mov ecx, [eax + 20] ; ecx = qlp_coeff[ 5]
imul ecx, [edi - 24] ; ecx = qlp_coeff[ 5] * data[i- 6]
add ebp, ecx ; sum += qlp_coeff[ 5] * data[i- 6]
mov ecx, [eax + 16] ; ecx = qlp_coeff[ 4]
imul ecx, [edi - 20] ; ecx = qlp_coeff[ 4] * data[i- 5]
add ebp, ecx ; sum += qlp_coeff[ 4] * data[i- 5]
mov ecx, [eax + 12] ; ecx = qlp_coeff[ 3]
imul ecx, [edi - 16] ; ecx = qlp_coeff[ 3] * data[i- 4]
add ebp, ecx ; sum += qlp_coeff[ 3] * data[i- 4]
mov ecx, [eax + 8] ; ecx = qlp_coeff[ 2]
imul ecx, [edi - 12] ; ecx = qlp_coeff[ 2] * data[i- 3]
add ebp, ecx ; sum += qlp_coeff[ 2] * data[i- 3]
mov ecx, [eax + 4] ; ecx = qlp_coeff[ 1]
imul ecx, [edi - 8] ; ecx = qlp_coeff[ 1] * data[i- 2]
add ebp, ecx ; sum += qlp_coeff[ 1] * data[i- 2]
mov ecx, [eax] ; ecx = qlp_coeff[ 0] (NOTE: one byte missing from instruction)
imul ecx, [edi - 4] ; ecx = qlp_coeff[ 0] * data[i- 1]
add ebp, ecx ; sum += qlp_coeff[ 0] * data[i- 1]
.jumper_0:
mov ecx, [esp + 36]
sar ebp, cl ; ebp = (sum >> lp_quantization)
add ebp, [esi + edi] ; ebp = residual[i] + (sum >> lp_quantization)
mov [edi], ebp ; data[i] = residual[i] + (sum >> lp_quantization)
add edi, byte 4
dec ebx
jz short .end
xor ebp, ebp
jmp edx
.end:
pop edi
pop esi
pop ebx
pop ebp
ret
; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
; the channel and qlp_coeffs must be <= 16. Especially note that this routine
; cannot be used for side-channel coded 16bps channels since the effective bps
; is 17.
; WATCHOUT: this routine requires that each data array have a buffer of up to
; 3 zeroes in front (at negative indices) for alignment purposes, i.e. for each
; channel n, data[n][-1] through data[n][-3] should be accessible and zero.
ALIGN 16
cident FLAC__lpc_restore_signal_asm_ia32_mmx
;[esp + 40] data[]
;[esp + 36] lp_quantization
;[esp + 32] order
;[esp + 28] qlp_coeff[]
;[esp + 24] data_len
;[esp + 20] residual[]
;ASSERT(order > 0)
push ebp
push ebx
push esi
push edi
mov esi, [esp + 20]
mov edi, [esp + 40]
mov eax, [esp + 32]
mov ebx, [esp + 24]
test ebx, ebx
jz near .end ; do nothing if data_len == 0
cmp eax, byte 4
jb near FLAC__lpc_restore_signal_asm_ia32.begin
mov edx, [esp + 28]
movd mm6, [esp + 36]
mov ebp, esp
and esp, 0xfffffff8
xor ecx, ecx
.copy_qlp_loop:
push word [edx + 4 * ecx]
inc ecx
cmp ecx, eax
jnz short .copy_qlp_loop
and ecx, 0x3
test ecx, ecx
je short .za_end
sub ecx, byte 4
.za_loop:
push word 0
inc eax
inc ecx
jnz short .za_loop
.za_end:
movq mm5, [esp + 2 * eax - 8]
movd mm4, [edi - 16]
punpckldq mm4, [edi - 12]
movd mm0, [edi - 8]
punpckldq mm0, [edi - 4]
packssdw mm4, mm0
cmp eax, byte 4
jnbe short .mmx_4more
ALIGN 16
.mmx_4_loop_i:
movq mm7, mm4
pmaddwd mm7, mm5
movq mm0, mm7
punpckhdq mm7, mm7
paddd mm7, mm0
psrad mm7, mm6
movd mm1, [esi]
paddd mm7, mm1
movd [edi], mm7
psllq mm7, 48
psrlq mm4, 16
por mm4, mm7
add esi, byte 4
add edi, byte 4
dec ebx
jnz .mmx_4_loop_i
jmp .mmx_end
.mmx_4more:
shl eax, 2
neg eax
add eax, byte 16
ALIGN 16
.mmx_4more_loop_i:
mov ecx, edi
add ecx, eax
mov edx, esp
movq mm7, mm4
pmaddwd mm7, mm5
ALIGN 16
.mmx_4more_loop_j:
movd mm0, [ecx - 16]
punpckldq mm0, [ecx - 12]
movd mm1, [ecx - 8]
punpckldq mm1, [ecx - 4]
packssdw mm0, mm1
pmaddwd mm0, [edx]
paddd mm7, mm0
add edx, byte 8
add ecx, byte 16
cmp ecx, edi
jnz .mmx_4more_loop_j
movq mm0, mm7
punpckhdq mm7, mm7
paddd mm7, mm0
psrad mm7, mm6
movd mm1, [esi]
paddd mm7, mm1
movd [edi], mm7
psllq mm7, 48
psrlq mm4, 16
por mm4, mm7
add esi, byte 4
add edi, byte 4
dec ebx
jnz short .mmx_4more_loop_i
.mmx_end:
emms
mov esp, ebp
.end:
pop edi
pop esi
pop ebx
pop ebp
ret
; **********************************************************************
;
;void FLAC__lpc_compute_residual_from_qlp_coefficients_wide(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
@ -1098,285 +724,4 @@ cident FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32
pop ebp
ret
; **********************************************************************
;
; void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
; {
; unsigned i, j;
; FLAC__int64 sum;
;
; FLAC__ASSERT(order > 0);
;
; for(i = 0; i < data_len; i++) {
; sum = 0;
; for(j = 0; j < order; j++)
; sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1];
; data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization);
; }
; }
ALIGN 16
cident FLAC__lpc_restore_signal_wide_asm_ia32
;[esp + 40] data[]
;[esp + 36] lp_quantization
;[esp + 32] order
;[esp + 28] qlp_coeff[]
;[esp + 24] data_len
;[esp + 20] residual[]
;ASSERT(order > 0)
;ASSERT(order <= 32)
;ASSERT(lp_quantization <= 31)
push ebp
push ebx
push esi
push edi
mov ebx, [esp + 24] ; ebx = data_len
test ebx, ebx
jz near .end ; do nothing if data_len == 0
.begin:
mov eax, [esp + 32] ; eax = order
cmp eax, 1
jg short .x87_32
mov esi, [esp + 20] ; esi = residual[]
mov edi, [esp + 40] ; edi = data[]
mov ecx, [esp + 28] ; ecx = qlp_coeff[]
mov ebp, [ecx] ; ebp = qlp_coeff[0]
mov eax, [edi - 4] ; eax = data[-1]
mov ecx, [esp + 36] ; cl = lp_quantization
ALIGN 16
.x87_1_loop_i:
imul ebp ; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1]
shrd eax, edx, cl ; 0 <= lp_quantization <= 15
;
add eax, [esi]
mov [edi], eax
;
add esi, 4
add edi, 4
dec ebx
jnz .x87_1_loop_i
jmp .end
.mov_eip_to_eax:
mov eax, [esp]
ret
.x87_32: ; eax = order
neg eax
add eax, eax
lea ebp, [eax + eax * 4 + .jumper_0 - .get_eip0]
call .mov_eip_to_eax
.get_eip0:
add ebp, eax
inc ebp ; compensate for the shorter opcode on the last iteration
mov ebx, [esp + 28] ; ebx = qlp_coeff[]
mov edi, [esp + 40] ; esi = data[]
sub [esp + 20], edi ; residual[] -= data[]
xor ecx, ecx
xor esi, esi
jmp ebp
;eax = --
;edx = --
;ecx = 0
;esi = 0
;
;ebx = qlp_coeff[]
;edi = data[]
;ebp = @address
mov eax, [ebx + 124] ; eax = qlp_coeff[31]
imul dword [edi - 128] ; edx:eax = qlp_coeff[31] * data[i-32]
add ecx, eax
adc esi, edx ; sum += qlp_coeff[31] * data[i-32]
mov eax, [ebx + 120] ; eax = qlp_coeff[30]
imul dword [edi - 124] ; edx:eax = qlp_coeff[30] * data[i-31]
add ecx, eax
adc esi, edx ; sum += qlp_coeff[30] * data[i-31]
mov eax, [ebx + 116]
imul dword [edi - 120]
add ecx, eax
adc esi, edx
mov eax, [ebx + 112]
imul dword [edi - 116]
add ecx, eax
adc esi, edx
mov eax, [ebx + 108]
imul dword [edi - 112]
add ecx, eax
adc esi, edx
mov eax, [ebx + 104]
imul dword [edi - 108]
add ecx, eax
adc esi, edx
mov eax, [ebx + 100]
imul dword [edi - 104]
add ecx, eax
adc esi, edx
mov eax, [ebx + 96]
imul dword [edi - 100]
add ecx, eax
adc esi, edx
mov eax, [ebx + 92]
imul dword [edi - 96]
add ecx, eax
adc esi, edx
mov eax, [ebx + 88]
imul dword [edi - 92]
add ecx, eax
adc esi, edx
mov eax, [ebx + 84]
imul dword [edi - 88]
add ecx, eax
adc esi, edx
mov eax, [ebx + 80]
imul dword [edi - 84]
add ecx, eax
adc esi, edx
mov eax, [ebx + 76]
imul dword [edi - 80]
add ecx, eax
adc esi, edx
mov eax, [ebx + 72]
imul dword [edi - 76]
add ecx, eax
adc esi, edx
mov eax, [ebx + 68]
imul dword [edi - 72]
add ecx, eax
adc esi, edx
mov eax, [ebx + 64]
imul dword [edi - 68]
add ecx, eax
adc esi, edx
mov eax, [ebx + 60]
imul dword [edi - 64]
add ecx, eax
adc esi, edx
mov eax, [ebx + 56]
imul dword [edi - 60]
add ecx, eax
adc esi, edx
mov eax, [ebx + 52]
imul dword [edi - 56]
add ecx, eax
adc esi, edx
mov eax, [ebx + 48]
imul dword [edi - 52]
add ecx, eax
adc esi, edx
mov eax, [ebx + 44]
imul dword [edi - 48]
add ecx, eax
adc esi, edx
mov eax, [ebx + 40]
imul dword [edi - 44]
add ecx, eax
adc esi, edx
mov eax, [ebx + 36]
imul dword [edi - 40]
add ecx, eax
adc esi, edx
mov eax, [ebx + 32]
imul dword [edi - 36]
add ecx, eax
adc esi, edx
mov eax, [ebx + 28]
imul dword [edi - 32]
add ecx, eax
adc esi, edx
mov eax, [ebx + 24]
imul dword [edi - 28]
add ecx, eax
adc esi, edx
mov eax, [ebx + 20]
imul dword [edi - 24]
add ecx, eax
adc esi, edx
mov eax, [ebx + 16]
imul dword [edi - 20]
add ecx, eax
adc esi, edx
mov eax, [ebx + 12]
imul dword [edi - 16]
add ecx, eax
adc esi, edx
mov eax, [ebx + 8]
imul dword [edi - 12]
add ecx, eax
adc esi, edx
mov eax, [ebx + 4]
imul dword [edi - 8]
add ecx, eax
adc esi, edx
mov eax, [ebx] ; eax = qlp_coeff[ 0] (NOTE: one byte missing from instruction)
imul dword [edi - 4] ; edx:eax = qlp_coeff[ 0] * data[i- 1]
add ecx, eax
adc esi, edx ; sum += qlp_coeff[ 0] * data[i- 1]
.jumper_0:
mov edx, ecx
;esi:edx = sum
mov ecx, [esp + 36] ; cl = lp_quantization
shrd edx, esi, cl ; edx = (sum >> lp_quantization)
;eax = --
;ecx = --
;edx = sum >> lp_q
;esi = --
;
mov eax, [esp + 20] ; residual[] - data[]
add edx, [edi + eax] ; edx = residual[i] + (sum >> lp_quantization)
mov [edi], edx ; data[i] = residual[i] + (sum >> lp_quantization)
add edi, 4
dec dword [esp + 24]
jz short .end
xor ecx, ecx
xor esi, esi
jmp ebp
.end:
pop edi
pop esi
pop ebx
pop ebp
ret
; end

View File

@ -206,22 +206,6 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLA
*/
void FLAC__lpc_restore_signal(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
#ifndef FLAC__NO_ASM
# ifdef FLAC__CPU_IA32
# ifdef FLAC__HAS_NASM
void FLAC__lpc_restore_signal_asm_ia32(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
void FLAC__lpc_restore_signal_asm_ia32_mmx(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
void FLAC__lpc_restore_signal_wide_asm_ia32(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
# endif /* FLAC__HAS_NASM */
# endif /* FLAC__CPU_IA32 */
# if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
# ifdef FLAC__SSE4_1_SUPPORTED
void FLAC__lpc_restore_signal_intrin_sse41(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
void FLAC__lpc_restore_signal_16_intrin_sse41(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
# endif
# endif
#endif /* FLAC__NO_ASM */
#ifndef FLAC__INTEGER_ONLY_LIBRARY

View File

@ -588,550 +588,6 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FL
}
}
FLAC__SSE_TARGET("sse4.1")
void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[])
{
int i;
const __m128i cnt = _mm_cvtsi32_si128(lp_quantization);
if (!data_len)
return;
FLAC__ASSERT(order > 0);
FLAC__ASSERT(order <= 32);
FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm_sra_epi64() so we have to use _mm_srl_epi64() */
if(order <= 12) {
if(order > 8) { /* order == 9, 10, 11, 12 */
if(order > 10) { /* order == 11, 12 */
__m128i qlp[6], dat[6];
__m128i summ, temp;
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0))); // 0 q[1] 0 q[0]
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2))); // 0 q[3] 0 q[2]
qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4))); // 0 q[5] 0 q[4]
qlp[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6))); // 0 q[7] 0 q[6]
qlp[4] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8))); // 0 q[9] 0 q[8]
if (order == 12)
qlp[5] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+10))); // 0 q[11] 0 q[10]
else
qlp[5] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[10])); // 0 0 0 q[10]
dat[5] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-12)), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-12] 0 d[i-11]
dat[4] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-10)), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-10] 0 d[i-9]
dat[3] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-8 )), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-8] 0 d[i-7]
dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-6 )), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-6] 0 d[i-5]
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-4 )), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-4] 0 d[i-3]
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1)); // 0 d[i-2] 0 d[i-1]
summ = _mm_mul_epi32(dat[5], qlp[5]) ;
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[4], qlp[4]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); // ?_64 sum_64
summ = _mm_srl_epi64(summ, cnt); // ?_64 (sum >> lp_quantization)_64 == ?_32 ?_32 ?_32 (sum >> lp_quantization)_32
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ); // ? ? ? d[i]
data[0] = _mm_cvtsi128_si32(temp);
for(i = 1; i < (int)data_len; i++) {
temp = _mm_slli_si128(temp, 8);
dat[5] = _mm_alignr_epi8(dat[5], dat[4], 8); // ? d[i-11] ? d[i-10]
dat[4] = _mm_alignr_epi8(dat[4], dat[3], 8); // ? d[i-9] ? d[i-8]
dat[3] = _mm_alignr_epi8(dat[3], dat[2], 8); // ? d[i-7] ? d[i-6]
dat[2] = _mm_alignr_epi8(dat[2], dat[1], 8); // ? d[i-5] ? d[i-4]
dat[1] = _mm_alignr_epi8(dat[1], dat[0], 8); // ? d[i-3] ? d[i-2]
dat[0] = _mm_alignr_epi8(dat[0], temp, 8); // ? d[i-1] ? d[i ]
summ = _mm_mul_epi32(dat[5], qlp[5]) ;
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[4], qlp[4]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); // ?_64 sum_64
summ = _mm_srl_epi64(summ, cnt); // ?_64 (sum >> lp_quantization)_64 == ?_32 ?_32 ?_32 (sum >> lp_quantization)_32
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ); // ? ? ? d[i]
data[i] = _mm_cvtsi128_si32(temp);
}
}
else { /* order == 9, 10 */
__m128i qlp[5], dat[5];
__m128i summ, temp;
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0)));
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2)));
qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4)));
qlp[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6)));
if (order == 10)
qlp[4] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+8)));
else
qlp[4] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[8]));
dat[4] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-10)), _MM_SHUFFLE(2,0,3,1));
dat[3] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-8 )), _MM_SHUFFLE(2,0,3,1));
dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-6 )), _MM_SHUFFLE(2,0,3,1));
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-4 )), _MM_SHUFFLE(2,0,3,1));
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
summ = _mm_mul_epi32(dat[4], qlp[4]) ;
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
summ = _mm_srl_epi64(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ);
data[0] = _mm_cvtsi128_si32(temp);
for(i = 1; i < (int)data_len; i++) {
temp = _mm_slli_si128(temp, 8);
dat[4] = _mm_alignr_epi8(dat[4], dat[3], 8);
dat[3] = _mm_alignr_epi8(dat[3], dat[2], 8);
dat[2] = _mm_alignr_epi8(dat[2], dat[1], 8);
dat[1] = _mm_alignr_epi8(dat[1], dat[0], 8);
dat[0] = _mm_alignr_epi8(dat[0], temp, 8);
summ = _mm_mul_epi32(dat[4], qlp[4]) ;
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
summ = _mm_srl_epi64(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
data[i] = _mm_cvtsi128_si32(temp);
}
}
}
else if(order > 4) { /* order == 5, 6, 7, 8 */
if(order > 6) { /* order == 7, 8 */
__m128i qlp[4], dat[4];
__m128i summ, temp;
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0)));
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2)));
qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4)));
if (order == 8)
qlp[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+6)));
else
qlp[3] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[6]));
dat[3] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-8 )), _MM_SHUFFLE(2,0,3,1));
dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-6 )), _MM_SHUFFLE(2,0,3,1));
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-4 )), _MM_SHUFFLE(2,0,3,1));
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
summ = _mm_mul_epi32(dat[3], qlp[3]) ;
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
summ = _mm_srl_epi64(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ);
data[0] = _mm_cvtsi128_si32(temp);
for(i = 1; i < (int)data_len; i++) {
temp = _mm_slli_si128(temp, 8);
dat[3] = _mm_alignr_epi8(dat[3], dat[2], 8);
dat[2] = _mm_alignr_epi8(dat[2], dat[1], 8);
dat[1] = _mm_alignr_epi8(dat[1], dat[0], 8);
dat[0] = _mm_alignr_epi8(dat[0], temp, 8);
summ = _mm_mul_epi32(dat[3], qlp[3]) ;
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
summ = _mm_srl_epi64(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
data[i] = _mm_cvtsi128_si32(temp);
}
}
else { /* order == 5, 6 */
__m128i qlp[3], dat[3];
__m128i summ, temp;
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0)));
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2)));
if (order == 6)
qlp[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+4)));
else
qlp[2] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[4]));
dat[2] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-6 )), _MM_SHUFFLE(2,0,3,1));
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-4 )), _MM_SHUFFLE(2,0,3,1));
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
summ = _mm_mul_epi32(dat[2], qlp[2]) ;
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
summ = _mm_srl_epi64(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ);
data[0] = _mm_cvtsi128_si32(temp);
for(i = 1; i < (int)data_len; i++) {
temp = _mm_slli_si128(temp, 8);
dat[2] = _mm_alignr_epi8(dat[2], dat[1], 8);
dat[1] = _mm_alignr_epi8(dat[1], dat[0], 8);
dat[0] = _mm_alignr_epi8(dat[0], temp, 8);
summ = _mm_mul_epi32(dat[2], qlp[2]) ;
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
summ = _mm_srl_epi64(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
data[i] = _mm_cvtsi128_si32(temp);
}
}
}
else { /* order == 1, 2, 3, 4 */
if(order > 2) { /* order == 3, 4 */
__m128i qlp[2], dat[2];
__m128i summ, temp;
qlp[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+0)));
if (order == 4)
qlp[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+2)));
else
qlp[1] = _mm_cvtepu32_epi64(_mm_cvtsi32_si128(qlp_coeff[2]));
dat[1] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-4 )), _MM_SHUFFLE(2,0,3,1));
dat[0] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
summ = _mm_mul_epi32(dat[1], qlp[1]) ;
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
summ = _mm_srl_epi64(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ);
data[0] = _mm_cvtsi128_si32(temp);
for(i = 1; i < (int)data_len; i++) {
temp = _mm_slli_si128(temp, 8);
dat[1] = _mm_alignr_epi8(dat[1], dat[0], 8);
dat[0] = _mm_alignr_epi8(dat[0], temp, 8);
summ = _mm_mul_epi32(dat[1], qlp[1]) ;
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
summ = _mm_srl_epi64(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
data[i] = _mm_cvtsi128_si32(temp);
}
}
else { /* order == 1, 2 */
if(order == 2) {
__m128i qlp0, dat0;
__m128i summ, temp;
qlp0 = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff)));
dat0 = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(data-2 )), _MM_SHUFFLE(2,0,3,1));
summ = _mm_mul_epi32(dat0, qlp0);
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
summ = _mm_srl_epi64(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ);
data[0] = _mm_cvtsi128_si32(temp);
for(i = 1; i < (int)data_len; i++) {
dat0 = _mm_alignr_epi8(dat0, _mm_slli_si128(temp, 8), 8);
summ = _mm_mul_epi32(dat0, qlp0);
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
summ = _mm_srl_epi64(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
data[i] = _mm_cvtsi128_si32(temp);
}
}
else { /* order == 1 */
__m128i qlp0;
__m128i summ, temp;
qlp0 = _mm_cvtsi32_si128(qlp_coeff[0]);
temp = _mm_cvtsi32_si128(data[-1]);
summ = _mm_mul_epi32(temp, qlp0);
summ = _mm_srl_epi64(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[0]), summ);
data[0] = _mm_cvtsi128_si32(temp);
for(i = 1; i < (int)data_len; i++) {
summ = _mm_mul_epi32(temp, qlp0);
summ = _mm_srl_epi64(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
data[i] = _mm_cvtsi128_si32(temp);
}
}
}
}
}
else { /* order > 12 */
__m128i qlp[16];
for(i = 0; i < (int)order/2; i++)
qlp[i] = _mm_shuffle_epi32(_mm_loadl_epi64((const __m128i*)(const void*)(qlp_coeff+i*2)), _MM_SHUFFLE(2,0,3,1)); // 0 q[2*i] 0 q[2*i+1]
if(order & 1)
qlp[i] = _mm_shuffle_epi32(_mm_cvtsi32_si128(qlp_coeff[i*2]), _MM_SHUFFLE(2,0,3,1));
for(i = 0; i < (int)data_len; i++) {
__m128i summ = _mm_setzero_si128(), dat;
FLAC__int32 * const datai = &data[i];
switch((order+1) / 2) {
case 16: /* order == 31, 32 */
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-32)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[15])); /* Falls through. */
case 15:
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-30)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[14])); /* Falls through. */
case 14:
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-28)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[13])); /* Falls through. */
case 13:
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-26)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[12])); /* Falls through. */
case 12:
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-24)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[11])); /* Falls through. */
case 11:
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-22)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[10])); /* Falls through. */
case 10:
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-20)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[9])); /* Falls through. */
case 9:
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-18)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[8])); /* Falls through. */
case 8:
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-16)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[7])); /* Falls through. */
case 7: /* order == 13, 14 */
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-14)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[6]));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-12)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[5]));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-10)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[4]));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-8)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[3]));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-6)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[2]));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-4)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[1]));
dat = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(const void*)(datai-2)));
summ = _mm_add_epi64(summ, _mm_mul_epi32(dat, qlp[0]));
}
summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
summ = _mm_srl_epi64(summ, cnt);
summ = _mm_add_epi32(summ, _mm_cvtsi32_si128(residual[i]));
data[i] = _mm_cvtsi128_si32(summ);
}
}
}
FLAC__SSE_TARGET("sse4.1")
/*
 * SSE4.1 implementation of FLAC__lpc_restore_signal() for LPC orders 8..12
 * with 32-bit accumulation:
 *     data[i] = residual[i] + ((sum_{j<order} qlp_coeff[j]*data[i-j-1]) >> lp_quantization)
 * Orders < 8 delegate to the plain C routine; orders > 12 fall back to the
 * ia32 asm version (if built with NASM) or the plain C routine.
 * NOTE(review): this path keeps partial sums in 32-bit lanes (_mm_mullo_epi32 /
 * _mm_add_epi32), so it is only valid when the full sum fits in 32 bits —
 * presumably guaranteed by the caller's bps/precision check; confirm.
 */
void FLAC__lpc_restore_signal_intrin_sse41(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[])
{
/* orders below 8 are not vectorized here; use the scalar reference routine */
if(order < 8) {
FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data);
return;
}
FLAC__ASSERT(order >= 8);
FLAC__ASSERT(order <= 32);
if(order <= 12) {
int i;
/* shift count for the quantization right-shift, kept in an XMM register */
const __m128i cnt = _mm_cvtsi32_si128(lp_quantization);
if(order > 8) /* order == 9, 10, 11, 12 */
{
__m128i qlp[3], dat[3];
__m128i summ, temp;
qlp[0] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff + 0)); // q[3] q[2] q[1] q[0]
qlp[1] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff + 4)); // q[7] q[6] q[5] q[4]
qlp[2] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff + 8)); // q[11] q[10] q[9] q[8]
/* zero the coefficient lanes beyond `order` so they contribute nothing
   to the dot product (slli/srli pair shifts whole bytes in and out) */
switch (order)
{
case 9:
qlp[2] = _mm_slli_si128(qlp[2], 12); qlp[2] = _mm_srli_si128(qlp[2], 12); break; // 0 0 0 q[8]
case 10:
qlp[2] = _mm_slli_si128(qlp[2], 8); qlp[2] = _mm_srli_si128(qlp[2], 8); break; // 0 0 q[9] q[8]
case 11:
qlp[2] = _mm_slli_si128(qlp[2], 4); qlp[2] = _mm_srli_si128(qlp[2], 4); break; // 0 q[10] q[9] q[8]
}
/* load the 12 preceding samples, reversed so that lane k of dat[]
   lines up with coefficient q[...] in the same lane of qlp[].
   NOTE(review): always reads 12 samples before data[0] even when
   order < 12 (products are zero, but the loads still happen); assumes
   those reads land in the warm-up area — confirm with caller. */
dat[2] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data - 12)), _MM_SHUFFLE(0, 1, 2, 3)); // d[i-12] d[i-11] d[i-10] d[i-9]
dat[1] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data - 8)), _MM_SHUFFLE(0, 1, 2, 3)); // d[i-8] d[i-7] d[i-6] d[i-5]
dat[0] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data - 4)), _MM_SHUFFLE(0, 1, 2, 3)); // d[i-4] d[i-3] d[i-2] d[i-1]
for (i = 0;;) {
/* per-lane products, then a horizontal add of the 4 lanes */
summ = _mm_mullo_epi32(dat[2], qlp[2]);
summ = _mm_add_epi32(summ, _mm_mullo_epi32(dat[1], qlp[1]));
summ = _mm_add_epi32(summ, _mm_mullo_epi32(dat[0], qlp[0]));
summ = _mm_add_epi32(summ, _mm_shuffle_epi32(summ, _MM_SHUFFLE(1,0,3,2)));
summ = _mm_add_epi32(summ, _mm_shufflelo_epi16(summ, _MM_SHUFFLE(1,0,3,2)));
/* arithmetic shift by lp_quantization, then add the residual */
summ = _mm_sra_epi32(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
data[i] = _mm_cvtsi128_si32(temp);
if(++i >= (int)data_len) break;
/* shift the freshly restored sample into lane 0 of the history:
   move temp's low dword to the top, then alignr drags each vector's
   bytes down by 12, pulling 4 bytes in from its neighbour */
temp = _mm_slli_si128(temp, 12);
dat[2] = _mm_alignr_epi8(dat[2], dat[1], 12);
dat[1] = _mm_alignr_epi8(dat[1], dat[0], 12);
dat[0] = _mm_alignr_epi8(dat[0], temp, 12);
}
}
else /* order == 8 */
{
/* same scheme with only two coefficient/history vectors (8 taps) */
__m128i qlp[2], dat[2];
__m128i summ, temp;
qlp[0] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff + 0));
qlp[1] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff + 4));
dat[1] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data - 8)), _MM_SHUFFLE(0, 1, 2, 3));
dat[0] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data - 4)), _MM_SHUFFLE(0, 1, 2, 3));
for (i = 0;;) {
summ = _mm_add_epi32(_mm_mullo_epi32(dat[1], qlp[1]), _mm_mullo_epi32(dat[0], qlp[0]));
summ = _mm_add_epi32(summ, _mm_shuffle_epi32(summ, _MM_SHUFFLE(1,0,3,2)));
summ = _mm_add_epi32(summ, _mm_shufflelo_epi16(summ, _MM_SHUFFLE(1,0,3,2)));
summ = _mm_sra_epi32(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
data[i] = _mm_cvtsi128_si32(temp);
if(++i >= (int)data_len) break;
temp = _mm_slli_si128(temp, 12);
dat[1] = _mm_alignr_epi8(dat[1], dat[0], 12);
dat[0] = _mm_alignr_epi8(dat[0], temp, 12);
}
}
}
else { /* order > 12 */
/* no SIMD path for long predictors: use asm (NASM builds) or plain C */
#ifdef FLAC__HAS_NASM
FLAC__lpc_restore_signal_asm_ia32(residual, data_len, qlp_coeff, order, lp_quantization, data);
#else
FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data);
#endif
}
}
FLAC__SSE_TARGET("ssse3")
/*
 * 16-bit variant of FLAC__lpc_restore_signal() for LPC orders 8..12: packs
 * coefficients and sample history into saturated int16 lanes and uses
 * _mm_madd_epi16 (pairwise 16x16->32 multiply-add) for the dot product.
 * Only valid when samples and coefficients fit in 16 bits (per the
 * "_16" naming convention); _mm_packs_epi32 would saturate otherwise.
 * Orders < 8 delegate to plain C; orders > 12 fall back to MMX asm or C.
 * NOTE(review): target attribute says "ssse3" while the name says sse41;
 * the body uses only SSE2/SSSE3 intrinsics so "ssse3" compiles — verify
 * this mismatch is intentional and consistent with the dispatcher.
 */
void FLAC__lpc_restore_signal_16_intrin_sse41(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[])
{
/* orders below 8 are not vectorized here; use the scalar reference routine */
if(order < 8) {
FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data);
return;
}
FLAC__ASSERT(order >= 8);
FLAC__ASSERT(order <= 32);
if(order <= 12) {
int i;
/* shift count for the quantization right-shift, kept in an XMM register */
const __m128i cnt = _mm_cvtsi32_si128(lp_quantization);
if(order > 8) /* order == 9, 10, 11, 12 */
{
__m128i qlp[2], dat[2];
__m128i summ, temp;
qlp[0] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff+0)); // q[3] q[2] q[1] q[0]
temp = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff+4)); // q[7] q[6] q[5] q[4]
qlp[1] = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff+8)); // q[11] q[10] q[9] q[8]
/* zero the coefficient lanes beyond `order` before packing so the
   unused taps contribute nothing to the madd */
switch(order)
{
case 9:
qlp[1] = _mm_slli_si128(qlp[1], 12); qlp[1] = _mm_srli_si128(qlp[1], 12); break; // 0 0 0 q[8]
case 10:
qlp[1] = _mm_slli_si128(qlp[1], 8); qlp[1] = _mm_srli_si128(qlp[1], 8); break; // 0 0 q[9] q[8]
case 11:
qlp[1] = _mm_slli_si128(qlp[1], 4); qlp[1] = _mm_srli_si128(qlp[1], 4); break; // 0 q[10] q[9] q[8]
}
/* saturating-pack 32-bit coefficients down to 8 x int16 per vector */
qlp[0] = _mm_packs_epi32(qlp[0], temp); // q[7] q[6] q[5] q[4] q[3] q[2] q[1] q[0]
qlp[1] = _mm_packs_epi32(qlp[1], _mm_setzero_si128()); // 0 0 0 0 q[11] q[10] q[9] q[8]
/* load the 12 preceding samples reversed, then pack to int16 so lane k
   lines up with coefficient lane k.
   NOTE(review): always reads 12 samples before data[0] even when
   order < 12; assumes those reads stay in the warm-up area — confirm. */
dat[1] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data-12)), _MM_SHUFFLE(0,1,2,3)); // d[i-12] d[i-11] d[i-10] d[i-9]
temp = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data-8)), _MM_SHUFFLE(0,1,2,3)); // d[i-8] d[i-7] d[i-6] d[i-5]
dat[0] = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data-4)), _MM_SHUFFLE(0,1,2,3)); // d[i-4] d[i-3] d[i-2] d[i-1]
dat[1] = _mm_packs_epi32(dat[1], _mm_setzero_si128()); // 0 0 0 0 d[i-12] d[i-11] d[i-10] d[i-9]
dat[0] = _mm_packs_epi32(dat[0], temp); // d[i-8] d[i-7] d[i-6] d[i-5] d[i-4] d[i-3] d[i-2] d[i-1]
for(i = 0;;) {
/* madd gives 4 pairwise 32-bit sums per vector; fold them down */
summ = _mm_madd_epi16(dat[1], qlp[1]);
summ = _mm_add_epi32(summ, _mm_madd_epi16(dat[0], qlp[0]));
summ = _mm_add_epi32(summ, _mm_shuffle_epi32(summ, _MM_SHUFFLE(1,0,3,2)));
summ = _mm_add_epi32(summ, _mm_shufflelo_epi16(summ, _MM_SHUFFLE(1,0,3,2)));
/* arithmetic shift by lp_quantization, then add the residual */
summ = _mm_sra_epi32(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
data[i] = _mm_cvtsi128_si32(temp);
if(++i >= (int)data_len) break;
/* shift the restored sample (as int16) into the history: alignr by 14
   bytes drags each vector down one 16-bit lane, pulling one lane in
   from its neighbour; temp's low word is moved to the top first */
temp = _mm_slli_si128(temp, 14);
dat[1] = _mm_alignr_epi8(dat[1], dat[0], 14); // 0 0 0 d[i-12] d[i-11] d[i-10] d[i-9] d[i-8]
dat[0] = _mm_alignr_epi8(dat[0], temp, 14); // d[i-7] d[i-6] d[i-5] d[i-4] d[i-3] d[i-2] d[i-1] d[i]
}
}
else /* order == 8 */
{
/* same scheme with a single 8 x int16 coefficient/history vector */
__m128i qlp0, dat0;
__m128i summ, temp;
qlp0 = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff+0)); // q[3] q[2] q[1] q[0]
temp = _mm_loadu_si128((const __m128i*)(const void*)(qlp_coeff+4)); // q[7] q[6] q[5] q[4]
qlp0 = _mm_packs_epi32(qlp0, temp); // q[7] q[6] q[5] q[4] q[3] q[2] q[1] q[0]
temp = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data-8)), _MM_SHUFFLE(0,1,2,3));
dat0 = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i*)(const void*)(data-4)), _MM_SHUFFLE(0,1,2,3));
dat0 = _mm_packs_epi32(dat0, temp); // d[i-8] d[i-7] d[i-6] d[i-5] d[i-4] d[i-3] d[i-2] d[i-1]
for(i = 0;;) {
summ = _mm_madd_epi16(dat0, qlp0);
summ = _mm_add_epi32(summ, _mm_shuffle_epi32(summ, _MM_SHUFFLE(1,0,3,2)));
summ = _mm_add_epi32(summ, _mm_shufflelo_epi16(summ, _MM_SHUFFLE(1,0,3,2)));
summ = _mm_sra_epi32(summ, cnt);
temp = _mm_add_epi32(_mm_cvtsi32_si128(residual[i]), summ);
data[i] = _mm_cvtsi128_si32(temp);
if(++i >= (int)data_len) break;
temp = _mm_slli_si128(temp, 14);
dat0 = _mm_alignr_epi8(dat0, temp, 14); // d[i-7] d[i-6] d[i-5] d[i-4] d[i-3] d[i-2] d[i-1] d[i]
}
}
}
else { /* order > 12 */
/* no 16-bit SIMD path for long predictors: MMX asm (NASM builds) or C */
#ifdef FLAC__HAS_NASM
FLAC__lpc_restore_signal_asm_ia32_mmx(residual, data_len, qlp_coeff, order, lp_quantization, data);
#else
FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data);
#endif
}
}
#endif /* defined FLAC__CPU_IA32 */
FLAC__SSE_TARGET("sse4.1")

View File

@ -45,7 +45,6 @@
#include "protected/stream_decoder.h"
#include "private/bitreader.h"
#include "private/bitmath.h"
#include "private/cpu.h"
#include "private/crc.h"
#include "private/fixed.h"
#include "private/format.h"
@ -129,12 +128,6 @@ typedef struct FLAC__StreamDecoderPrivate {
FLAC__StreamDecoderWriteCallback write_callback;
FLAC__StreamDecoderMetadataCallback metadata_callback;
FLAC__StreamDecoderErrorCallback error_callback;
/* generic 32-bit datapath: */
void (*local_lpc_restore_signal)(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
/* generic 64-bit datapath: */
void (*local_lpc_restore_signal_64bit)(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
/* for use when the signal is <= 16 bits-per-sample, or <= 15 bits-per-sample on a side channel (which requires 1 extra bit): */
void (*local_lpc_restore_signal_16bit)(const FLAC__int32 residual[], uint32_t data_len, const FLAC__int32 qlp_coeff[], uint32_t order, int lp_quantization, FLAC__int32 data[]);
void *client_data;
FILE *file; /* only used if FLAC__stream_decoder_init_file()/FLAC__stream_decoder_init_file() called, else NULL */
FLAC__BitReader *input;
@ -152,7 +145,6 @@ typedef struct FLAC__StreamDecoderPrivate {
size_t metadata_filter_ids_count, metadata_filter_ids_capacity; /* units for both are IDs, not bytes */
FLAC__Frame frame;
FLAC__bool cached; /* true if there is a byte in lookahead */
FLAC__CPUInfo cpuinfo;
FLAC__byte header_warmup[2]; /* contains the sync code and reserved bits */
FLAC__byte lookahead; /* temp storage when we need to look ahead one byte in the stream */
/* unaligned (original) pointers to allocated data */
@ -373,48 +365,6 @@ static FLAC__StreamDecoderInitStatus init_stream_internal_(
return decoder->protected_->initstate = FLAC__STREAM_DECODER_INIT_STATUS_ERROR_OPENING_FILE;
#endif
/*
* get the CPU info and set the function pointers
*/
FLAC__cpu_info(&decoder->private_->cpuinfo);
/* first default to the non-asm routines */
decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal;
decoder->private_->local_lpc_restore_signal_64bit = FLAC__lpc_restore_signal_wide;
decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal;
/* now override with asm where appropriate */
#ifndef FLAC__NO_ASM
if(decoder->private_->cpuinfo.use_asm) {
#ifdef FLAC__CPU_IA32
FLAC__ASSERT(decoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_IA32);
#ifdef FLAC__HAS_NASM
decoder->private_->local_lpc_restore_signal_64bit = FLAC__lpc_restore_signal_wide_asm_ia32; /* OPT_IA32: was really necessary for GCC < 4.9 */
if (decoder->private_->cpuinfo.x86.mmx) {
decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal_asm_ia32;
decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal_asm_ia32_mmx;
}
else {
decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal_asm_ia32;
decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal_asm_ia32;
}
#endif
#if FLAC__HAS_X86INTRIN && ! defined FLAC__INTEGER_ONLY_LIBRARY
# if defined FLAC__SSE4_1_SUPPORTED
if (decoder->private_->cpuinfo.x86.sse41) {
# if !defined FLAC__HAS_NASM /* these are not undoubtedly faster than their MMX ASM counterparts */
decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal_intrin_sse41;
decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal_16_intrin_sse41;
# endif
decoder->private_->local_lpc_restore_signal_64bit = FLAC__lpc_restore_signal_wide_intrin_sse41;
}
# endif
#endif
#elif defined FLAC__CPU_X86_64
FLAC__ASSERT(decoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_X86_64);
/* No useful SSE optimizations yet */
#endif
}
#endif
/* from here on, errors are fatal */
if(!FLAC__bitreader_init(decoder->private_->input, read_callback_, decoder)) {
@ -2848,12 +2798,9 @@ FLAC__bool read_subframe_lpc_(FLAC__StreamDecoder *decoder, uint32_t channel, ui
if(do_full_decode) {
memcpy(decoder->private_->output[channel], subframe->warmup, sizeof(FLAC__int32) * order);
if(bps + subframe->qlp_coeff_precision + FLAC__bitmath_ilog2(order) <= 32)
if(bps <= 16 && subframe->qlp_coeff_precision <= 16)
decoder->private_->local_lpc_restore_signal_16bit(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
else
decoder->private_->local_lpc_restore_signal(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
FLAC__lpc_restore_signal(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
else
decoder->private_->local_lpc_restore_signal_64bit(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
FLAC__lpc_restore_signal_wide(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
}
return true;